In [52]:
import pandas as pd

from keras.preprocessing.text import text_to_word_sequence
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords


# Root folder that holds every dataset used in this notebook.
DATA_ROOT = "/home/youcs/Documents/git/Sentiment_Test/data/"

# Per-dataset directories. Each keeps its trailing slash because load_data
# builds the full path by plain string concatenation (path + file).
MOVIE = DATA_ROOT + "Movie_Review/"
TWEET = DATA_ROOT + "Tweet_Sentiments/"
DISASTER_TWEET = DATA_ROOT + "Disaster_Tweets/"

# File names inside the dataset directories.
TRAIN_MOVIE, TEST_MOVIE = "train.tsv", "test.tsv"
TRAIN_TWEET, TEST_TWEET = "train.csv", "test.csv"




############### Helper functions ###############

def load_data(path, file):
    """Load a delimited text file into a DataFrame.

    Parameters
    ----------
    path : str
        Directory prefix. It is concatenated with ``file`` as-is, so it must
        already end with a path separator.
    file : str
        File name. Names ending in ``.tsv`` or ``.tab`` are read as
        tab-separated; anything else is tried as comma-separated first.

    Returns
    -------
    pandas.DataFrame
        The parsed table.

    Raises
    ------
    Exception
        Any error raised by the pandas reader (e.g. ``FileNotFoundError``)
        propagates unchanged; only a ``ParserError`` from the comma-separated
        attempt triggers the tab-separated fallback.
    """
    full_path = path + file

    # Tab-separated files must not go through read_csv first: when no line
    # contains a comma, read_csv succeeds and silently returns a single
    # mangled column instead of raising, so a fallback would never run.
    if file.lower().endswith((".tsv", ".tab")):
        return pd.read_table(full_path)

    try:
        return pd.read_csv(full_path)
    except pd.errors.ParserError:
        # Keep the previous best-effort behaviour for mislabelled files, but
        # only for genuine parse failures (not KeyboardInterrupt, missing
        # files, etc., which the former bare `except:` also swallowed).
        return pd.read_table(full_path)

def get_phrase(df, sentiment):
    """Return the rows of ``df`` whose ``Sentiment`` column equals ``sentiment``.

    The original index labels of the selected rows are kept.
    """
    matches_label = df.Sentiment == sentiment
    return df[matches_label]

def combine_df(list_of_dfs):
    """Stack the given DataFrames on top of each other with ``pd.concat``.

    Index labels are carried over from the inputs unchanged, so the result
    may contain repeated labels.
    """
    stacked = pd.concat(list_of_dfs)
    return stacked

def tokenize2phrases(phrases):
    """Tokenise every phrase into its own list of words.

    Parameters
    ----------
    phrases : iterable of str
        The texts to tokenise.

    Returns
    -------
    list of list of str
        One ``text_to_word_sequence`` result per input phrase, in input order.
    """
    # Iterate over the argument: the previous body read the notebook-global
    # ``df.Phrase`` and therefore ignored whatever the caller passed in.
    return [text_to_word_sequence(phrase) for phrase in phrases]

def tokenize2words(phrases):
    """Tokenise every phrase and return all words as one flat list.

    Words appear in phrase order and duplicates are kept.
    """
    words = []
    for phrase in phrases:
        words.extend(text_to_word_sequence(phrase))
    return words

def tokenize2unique(phrases):
    """Return the set of distinct words found in ``phrases``.

    Parameters
    ----------
    phrases : iterable of str
        The texts to tokenise.

    Returns
    -------
    set of str
        Every distinct token produced by ``text_to_word_sequence`` over the
        given phrases.
    """
    # Iterate over the argument: the previous body read the notebook-global
    # ``df.Phrase`` and therefore ignored whatever the caller passed in.
    return {
        word
        for phrase in phrases
        for word in text_to_word_sequence(phrase)
    }

def lemmatize_tokenized_phrases(tokenized_phrases):
    """Lemmatise every token of every tokenised phrase.

    Parameters
    ----------
    tokenized_phrases : iterable of list of str
        Phrases that have already been split into tokens.

    Returns
    -------
    list of list of str
        Same nesting as the input, with each token replaced by its WordNet
        lemma.
    """
    # Imported locally because the notebook's import cell never brings
    # ``pos_tag`` into scope; without this the function raised NameError.
    from nltk import pos_tag

    lemmatizer = WordNetLemmatizer()
    lemm_token_phrases = []
    for tokens in tokenized_phrases:
        lemmas = []
        for word, tag in pos_tag(tokens):
            # Map the tagger's tag onto the lemmatizer's part-of-speech code:
            # verbs -> 'v', nouns -> 'n', everything else -> 'a'.
            if tag.startswith('VB'):
                pos = 'v'
            elif tag.startswith('NN'):
                pos = 'n'
            else:
                pos = 'a'
            lemmas.append(lemmatizer.lemmatize(word, pos))
        lemm_token_phrases.append(lemmas)
    return lemm_token_phrases

def remove_stopwords(tokens):
    """Drop English stop words from a list of tokens.

    Parameters
    ----------
    tokens : iterable of str
        Tokens to filter (must be hashable, as strings are).

    Returns
    -------
    list of str
        The tokens that are not in NLTK's English stop-word list, in their
        original order.
    """
    # Build the stop-word collection once. The previous version called
    # ``stopwords.words('english')`` for every single token and then scanned
    # the resulting list linearly.
    english_stopwords = set(stopwords.words('english'))
    return [token for token in tokens if token not in english_stopwords]

In [2]:
# Load the movie-review training file into the working DataFrame.
df = load_data(path=MOVIE, file=TRAIN_MOVIE)

In [3]:
# One frame per Sentiment label 0..4. Judging by the variable names, 0 is the
# most negative class and 4 the most positive.
neg_df, sw_neg_df, neu_df, sw_pos_df, pos_df = (
    get_phrase(df, label) for label in range(5)
)

# Two-class subset built from the two extreme labels (0 and 4) only.
neg_pos_df = combine_df([neg_df, pos_df])

In [8]:
def tokenize2phrases(phrases):
    """Tokenise every phrase into its own list of words.

    Parameters
    ----------
    phrases : iterable of str
        The texts to tokenise.

    Returns
    -------
    list of list of str
        One ``text_to_word_sequence`` result per input phrase, in input order.
    """
    # Iterate over the argument: the previous body read the notebook-global
    # ``df.Phrase`` and therefore ignored whatever the caller passed in.
    return [text_to_word_sequence(phrase) for phrase in phrases]

def tokenize2words(phrases):
    """Tokenise every phrase and return all words as one flat list.

    The header in this cell had no body (a syntax error); the body below
    mirrors the ``tokenize2words`` helper defined in the first cell.
    """
    return [word for phrase in phrases for word in text_to_word_sequence(phrase)]

In [4]:
# t2p: per-phrase token lists as returned by tokenize2phrases.
# t2w: one flat list of every word as returned by tokenize2words.
t2p = tokenize2phrases(phrases=neg_pos_df.Phrase)
t2w = tokenize2words(phrases=neg_pos_df.Phrase)

In [12]:
# Number of distinct words in the flat token list.
num_unique_words = len({word for word in t2w})
# Token count of the longest phrase; used below as the padded sequence length.
max_length = max(len(tokens) for tokens in t2p)

In [16]:
# Plain Python list of the raw phrases in the two-class subset.
lines = list(neg_pos_df.Phrase)

In [44]:
# Normalise each phrase by tokenising it and re-joining the tokens with
# single spaces.
test = [' '.join(text_to_word_sequence(phrase)) for phrase in neg_pos_df.Phrase]

In [53]:
# Fit a Keras Tokenizer on the normalised phrases; the fitted object is
# reused in the next cell to turn the same texts into integer sequences.
tokenize = Tokenizer()
tokenize.fit_on_texts(test)

In [54]:
# Convert every normalised phrase into its sequence of tokenizer indices.
seq = tokenize.texts_to_sequences(test)

In [56]:
# Bring every sequence to length max_length, adding the padding at the end
# (padding='post').
pad = pad_sequences(seq, maxlen=max_length, padding='post')

In [57]:
# Display the padded matrix as a sanity check.
pad

array([[ 108,   42,    2, ...,    0,    0,    0],
       [  42,    2,  156, ...,    0,    0,    0],
       [5092,  176, 6284, ...,    0,    0,    0],
       ...,
       [  13,  747,  956, ...,    0,    0,    0],
       [  96,  112,    2, ...,    0,    0,    0],
       [   2,   69,  609, ...,    0,    0,    0]], dtype=int32)

In [61]:
# Number of padded sequences; compare with len(y_train) below.
len(pad)

16278

In [64]:
# Binary targets for the binary cross-entropy loss used by the model.
# neg_pos_df is built from the Sentiment 0 and Sentiment 4 rows only, so map
# label 4 -> 1 and label 0 -> 0; the raw 0/4 values are not valid targets for
# a single-unit binary classifier. `.values` hands Keras a plain NumPy array
# instead of a Series with a non-contiguous index.
y_train = (neg_pos_df.Sentiment == 4).astype("int32").values

In [65]:
# Number of training labels; should equal len(pad).
len(y_train)

16278

In [67]:
from keras.models import Sequential
from keras.models import Model

from keras.layers import Input
from keras.layers import Embedding
from keras.layers import Conv1D
from keras.layers import Dropout
from keras.layers import MaxPooling1D
from keras.layers import Flatten
from keras.layers import Dense
from keras.layers import concatenate


# The Keras Tokenizer assigns word indices starting at 1 (0 is the padding
# value), so the largest index equals the vocabulary size. The Embedding
# layer's input_dim must therefore be vocabulary size + 1; using the bare
# word count leaves the highest word index out of range.
vocab_size = len(tokenize.word_index) + 1


def _build_channel(kernel_size=4):
    """Build one Embedding -> Conv1D -> Dropout -> MaxPooling1D -> Flatten branch.

    Returns every intermediate tensor so the notebook-level names stay available.
    """
    inputs = Input(shape=(max_length,))
    embedding = Embedding(vocab_size, 100)(inputs)
    conv = Conv1D(filters=32, kernel_size=kernel_size, activation='relu')(embedding)
    drop = Dropout(0.5)(conv)
    pool = MaxPooling1D(pool_size=2)(drop)
    flat = Flatten()(pool)
    return inputs, embedding, conv, drop, pool, flat


# NOTE(review): all three channels use the same kernel size (4), exactly as in
# the original cell. If the channels were meant to look at different n-gram
# widths, pass distinct kernel_size values here - confirm the intent.
inputs1, embedding1, conv1, drop1, pool1, flat1 = _build_channel(kernel_size=4)
inputs2, embedding2, conv2, drop2, pool2, flat2 = _build_channel(kernel_size=4)
inputs3, embedding3, conv3, drop3, pool3, flat3 = _build_channel(kernel_size=4)

merged = concatenate([flat1, flat2, flat3])

dense1 = Dense(10, activation='relu')(merged)
# A single output unit needs 'sigmoid': softmax over one unit always yields
# 1.0, so the model could never predict the negative class.
outputs = Dense(1, activation='sigmoid')(dense1)
model = Model(inputs=[inputs1, inputs2, inputs3], outputs=outputs)

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [68]:
# Train the three-input model, feeding the same padded matrix to each input.
model.fit(x=[pad] * 3, y=y_train, batch_size=16, epochs=7)

Epoch 1/7

KeyboardInterrupt: 