In [49]:
import pandas as pd

from nltk.tag import pos_tag
from nltk.corpus import stopwords
from nltk.tokenize import TreebankWordTokenizer
from nltk.stem import WordNetLemmatizer
from gensim.models.keyedvectors import KeyedVectors
import gensim
from keras.preprocessing.text import text_to_word_sequence

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences


MOVIE = "/home/youcs/Documents/git/Sentiment_Test/data/Movie_Review/"
TWEET = "/home/youcs/Documents/git/Sentiment_Test/data/Tweet_Sentiments/"
DISASTER_TWEET = "/home/youcs/Documents/git/Sentiment_Test/data/Disaster_Tweets/"

TRAIN_MOVIE = "train.tsv"
TEST_MOVIE = "test.tsv"
TRAIN_TWEET = "train.csv"
TEST_TWEET = "test.csv"

############### Helper functions ###############

def load_data(path,file):
    try:
        df = pd.read_csv(path+file)
    except:
        df = pd.read_table(path+file)
    return df

def get_phrase(df, sentiment):
    return df[df.Sentiment == sentiment]

def tokenize2phrases(phrases):
    tokens = [text_to_word_sequence(x) for x in df.Phrase]
    return tokens

def tokenize2words(phrases):
    return [y for x in phrases for y in x.split()]

def tokenize2unique(phrases):
    tokens = [text_to_word_sequence(x) for x in df.Phrase]
    tokens = [i for x in tokens for i in x]
    return set(tokens)

def lemmatize_tokenized_phrases(tokenized_phrases):
    lemmatizer = WordNetLemmatizer()
    lemm_token_phrases = []
    for tokens in tokenized_phrases:
        tokenize = []
        for token in pos_tag(tokens):
            pos = 'a'
            if token[1].startswith('VB'):
                pos = 'v'
            elif token[1].startswith('NN'):
                pos = 'n'
            tokenize.append(lemmatizer.lemmatize(token[0], pos))
        lemm_token_phrases.append(tokenize)
    return lemm_token_phrases

def remove_stopwords(tokens):
    return [x for x in tokens if x not in stopwords.words('english')]

'''
0 - negative
1 - somewhat negative
2 - neutral
3 - somewhat positive
4 - positive
'''

'\n0 - negative\n1 - somewhat negative\n2 - neutral\n3 - somewhat positive\n4 - positive\n'

In [4]:
df = load_data(MOVIE, TRAIN_MOVIE)

In [5]:
neg_df = get_phrase(df,0)
somewhat_neg_df = get_phrase(df,1)
neutral = get_phrase(df,2)
somewhat_pos_df = get_phrase(df,3)
pos_df = get_phrase(df,4)

In [None]:
# num_of_unique_vocab, max_length, 

In [None]:
tokenize = Tokenizer()
tokenize.fit_to_texts()

In [38]:
a = tokenize2words(neg_df.Phrase)

In [None]:
a

In [40]:
b = tokenize2phrases(neg_df.Phrase)

In [50]:
bb = lemmatize_tokenized_phrases(b)

In [51]:
bb

[['a',
  'series',
  'of',
  'escapade',
  'demonstrate',
  'the',
  'adage',
  'that',
  'what',
  'be',
  'good',
  'for',
  'the',
  'goose',
  'be',
  'also',
  'good',
  'for',
  'the',
  'gander',
  'some',
  'of',
  'which',
  'occasionally',
  'amuse',
  'but',
  'none',
  'of',
  'which',
  'amount',
  'to',
  'much',
  'of',
  'a',
  'story'],
 ['a',
  'series',
  'of',
  'escapade',
  'demonstrate',
  'the',
  'adage',
  'that',
  'what',
  'be',
  'good',
  'for',
  'the',
  'goose'],
 ['a', 'series'],
 ['a'],
 ['series'],
 ['of',
  'escapade',
  'demonstrate',
  'the',
  'adage',
  'that',
  'what',
  'be',
  'good',
  'for',
  'the',
  'goose'],
 ['of'],
 ['escapade',
  'demonstrate',
  'the',
  'adage',
  'that',
  'what',
  'be',
  'good',
  'for',
  'the',
  'goose'],
 ['escapade'],
 ['demonstrate',
  'the',
  'adage',
  'that',
  'what',
  'be',
  'good',
  'for',
  'the',
  'goose'],
 ['demonstrate', 'the', 'adage'],
 ['demonstrate'],
 ['the', 'adage'],
 ['the'],
 ['

In [11]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(a)

In [14]:
a

['would have a hard time sitting through this one',
 'have a hard time sitting through this one',
 'Aggressive self-glorification and a manipulative whitewash',
 'self-glorification and a manipulative whitewash',
 'Trouble Every Day is a plodding mess .',
 'is a plodding mess',
 'plodding mess',
 'could hate it for the same reason',
 'hate it',
 'hate',
 "is Oedekerk 's realization of his childhood dream to be in a martial-arts flick , and proves that sometimes the dreams of youth should remain just that",
 'baseball movies that try too hard to be mythic',
 'Hampered -- no , paralyzed -- by a self-indulgent script ... that aims for poetry and ends up sounding like satire .',
 'a self-indulgent script',
 "There 's very little sense to what 's going on here ,",
 'avoid',
 'almost feels as if the movie is more interested in entertaining itself than in amusing us .',
 "The movie 's progression into rambling incoherence gives new meaning to the phrase ` fatal script error . '",
 "The movie 

In [15]:
from collections import Counter

In [16]:
tokens = Counter(a)

In [19]:
df = load_data(MOVIE, TRAIN_MOVIE)
neg = get_phrase(df, 0)
pos = get_phrase(df, 4)
new_df = pd.concat([neg, pos])
new_df.replace(4,1,inplace=True)

neg_token_phrase = [text_to_word_sequence(x) for x in neg.Phrase]
neg_y = [0 for x in range(len(neg_token_phrase))]
pos_token_phrase = [text_to_word_sequence(x) for x in pos.Phrase]
pos_y = [1 for x in range(len(pos_token_phrase))]
all_token_phrase = neg_token_phrase + pos_token_phrase
all_y = neg_y + pos_y

In [9]:
model = gensim.models.Word2Vec(all_tokens, size=50, window=10, min_count=2, workers=8, iter=10)

In [13]:
w1 = 'forced'
model.wv.most_similar(positive=w1)

[('save', 0.9864736199378967),
 ('disappoint', 0.985970675945282),
 ('defeated', 0.983375072479248),
 ('desiccated', 0.9809083938598633),
 ('topple', 0.9802108407020569),
 ('prolonged', 0.9798016548156738),
 ('hint', 0.9792092442512512),
 ('painfully', 0.9789096713066101),
 ('mishmash', 0.9780198931694031),
 ('tunney', 0.9771900177001953)]

In [23]:

from keras.models import Model
from keras.models import Sequential
from keras.layers import Input
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Dropout
from keras.layers import Embedding
from keras.layers import Conv1D
from keras.layers import MaxPooling1D
from keras.layers import concatenate

In [54]:
def create_tokenizer(lines):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer

def max_length(lines):
    return max([len(s.split()) for s in lines])

def encode_text(tokenizer, lines, length):
    encoded =tokenizer.texts_to_sequences(lines)
    padded = pad_sequences(encoded, maxlen=length, padding='post')
    return padded

def define_model(length, vocab_size):
    inputs1 = Input(shape=(length,))
    embedding1 = Embedding(vocab_size, 100)(inputs1)
    conv1 = Conv1D(filters=32, kernel_size=4, activation='relu')(embedding1)
    drop1 = Dropout(0.5)(conv1)
    pool1 = MaxPooling1D(pool_size=2)(drop1)
    flat1 = Flatten()(pool1)
    
    inputs2 = Input(shape=(length,))
    embedding2 = Embedding(vocab_size, 100)(inputs2)
    conv2 = Conv1D(filters=32, kernel_size=4, activation='relu')(embedding2)
    drop2 = Dropout(0.5)(conv2)
    pool2 = MaxPooling1D(pool_size=2)(drop2)
    flat2 = Flatten()(pool2)
    
    inputs3 = Input(shape=(length,))
    embedding3 = Embedding(vocab_size, 100)(inputs3)
    conv3 = Conv1D(filters=32, kernel_size=4, activation='relu')(embedding3)
    drop3 = Dropout(0.5)(conv3)
    pool3 = MaxPooling1D(pool_size=2)(drop3)
    flat3 = Flatten()(pool3)
    
    merged = concatenate([flat1,flat2,flat3])
    
    dense1 = Dense(10, activation='relu')(merged)
    outputs = Dense(1, activation='sigmoid')(dense1)
    model = Model(inputs=[inputs1, inputs2, inputs3], outputs = outputs)
    
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()
    
    return model

In [79]:
negative = df[df.Sentiment==0]
positive = df[df.Sentiment==4]
new_df = pd.concat([negative, positive])
X_train = [x for x in new_df.Phrase]
y_train = [[x] for x in new_df.Sentiment]

In [80]:
len(y_train)

16278

In [77]:
len(X_train)

16278

In [78]:
len(trainX)

16278

In [56]:
tok = create_tokenizer(X_train)

In [57]:
length = max_length(X_train)

In [58]:
vocab_size = len(tok.word_index)+1

In [70]:
vocab_size

10634

In [59]:
trainX = encode_text(tok, X_train, length)

In [75]:
trainX[120]

array([ 124, 1111,   19,    5, 1520,    2,  932,    9,   53,  298,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0], dtype=int32)

In [None]:
model = Sequential()
model.add(Embedding(max_features, 100, input_length=max_words))
model.add(Dropout(0,2))

model.add(Conv1D(filters=64, kernel_size=3, padding='same', activation='relu', strides=1))
model.add(GlobalMaxPooling1D())

model.add(Dense(128, activation='relu'))
model.add(Dropout(0.2))

model.add(Dense(num_classes, activation='softmax'))

model.compile(loss=categorical_crossentropy, optimizer='adam', metrics=['accuracy'])

In [None]:
model = Sequential()
model.add(Embedding(max_features, 100, input_length=max_words))