In [14]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import TfidfVectorizer

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Activation, Dense, Dropout, Embedding, Flatten, Conv1D, MaxPooling1D, LSTM
from keras import utils
from keras.callbacks import ReduceLROnPlateau, EarlyStopping

import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

import gensim
from gensim.models import word2vec

import re
import numpy as np
import os
from collections import Counter
import logging
import time
import pickle
import itertools

# Configure the root logger to emit INFO-and-above records as
# "<timestamp> : <level> : <message>". The gensim progress lines printed under the
# Word2Vec training cell use exactly this format.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [15]:
# Column layout of the header-less CSV (names are supplied here, so no row is
# treated as a header).
col_names = ["Sentiment", "Id", "Date", "Flag", "User", "Text"]

# Only the label and the tweet text are used downstream, so the other four columns
# are never materialised (previously they were loaded and then dropped in place).
# NOTE(review): absolute, machine-specific path -- consider a configurable data dir.
data = pd.read_csv(
    r'D:\CSC590_Design_Project\Data\train.csv',
    names=col_names,
    usecols=["Sentiment", "Text"],
    encoding="ISO-8859-1",
)

# Re-code the raw labels 0 / 2 / 4 as -1 / 0 / 1. Any value missing from the
# mapping becomes NaN (that is how Series.map treats unmapped keys).
sentiment_conv = {0: -1, 2: 0, 4: 1}
data['Sentiment'] = data['Sentiment'].map(sentiment_conv)

In [16]:
# English stop-word list as a set, so the membership test in process_text is O(1).
stops = set(stopwords.words("english"))
# Punkt sentence tokenizer loaded from the NLTK data directory.
# NOTE(review): no visible cell uses this object before `tokenizer` is rebound to a
# Keras Tokenizer further down -- confirm it is still needed.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
# Snowball stemmer for English; used by process_text only when stem=True.
stemmer = SnowballStemmer("english")

def process_text(text, remove_stops=False, stem=False):
    """Normalise a raw tweet and split it into word tokens.

    The input is converted to ``str``, lower-cased and stripped; URLs and
    ``@mentions`` are deleted; every remaining non-letter character is replaced
    by a space; the result is split on whitespace.

    Args:
        text: Raw text (any object; it is passed through ``str``).
        remove_stops: If true, drop tokens present in the module-level ``stops`` set.
        stem: If true, replace every kept token by ``stemmer.stem(token)``.

    Returns:
        list[str]: The surviving tokens, in their original order.
    """
    text = str(text).lower().strip()
    # Raw strings: \S, \. and \w are regex escapes, not Python string escapes.
    text = re.sub(r'https?://\S+|www\.\S+', '', text)  # remove url links
    text = re.sub(r'@\w*', '', text)                   # remove "@user"
    text = re.sub(r'[^a-zA-Z]', ' ', text)             # leave only letters
    words = text.split()
    if remove_stops:
        words = [word for word in words if word not in stops]
    if stem:
        words = [stemmer.stem(word) for word in words]
    return words

# Tokenise every tweet with stop-word removal (no stemming); each cell of the
# 'Text' column becomes a list of tokens.
data['Text'] = data['Text'].apply(process_text, remove_stops=True)

In [17]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split
# reproducible between runs.
train, test = train_test_split(
    data,
    random_state=42,
    test_size=0.2,
)
# Token lists of the training rows, used as the Word2Vec corpus below.
train_sentences = train['Text'].tolist()

In [18]:
# --- Word2Vec hyper-parameters ---
W2V_SIZE = 300        # dimensionality of each word vector
W2V_WINDOW = 7        # maximum distance between the current and a context word
W2V_EPOCH = 32        # passes over the training corpus
W2V_MIN_COUNT = 10    # words seen fewer times than this are dropped from the vocab
W2V_WORKERS = 8       # training threads

# Build the (still empty) model, then learn the vocabulary and train the vectors on
# the training split only.
model = word2vec.Word2Vec(
    workers=W2V_WORKERS,
    min_count=W2V_MIN_COUNT,
    window=W2V_WINDOW,
    vector_size=W2V_SIZE,
)
model.build_vocab(train_sentences)
model.train(
    train_sentences,
    epochs=W2V_EPOCH,
    total_examples=len(train_sentences),
)

2021-05-09 14:49:51,665 : INFO : Word2Vec lifecycle event {'params': 'Word2Vec(vocab=0, vector_size=300, alpha=0.025)', 'datetime': '2021-05-09T14:49:51.665135', 'gensim': '4.0.1', 'python': '3.6.13 (default, Feb 19 2021, 05:17:09) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.18362-SP0', 'event': 'created'}
2021-05-09 14:49:51,666 : INFO : collecting all words and their counts
2021-05-09 14:49:51,669 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2021-05-09 14:49:51,718 : INFO : PROGRESS: at sentence #10000, processed 70182 words, keeping 12971 word types
2021-05-09 14:49:51,764 : INFO : PROGRESS: at sentence #20000, processed 139809 words, keeping 19764 word types
2021-05-09 14:49:51,819 : INFO : PROGRESS: at sentence #30000, processed 209009 words, keeping 25027 word types
2021-05-09 14:49:51,872 : INFO : PROGRESS: at sentence #40000, processed 279257 words, keeping 29591 word types
2021-05-09 14:49:51,925 : INFO : PROGRESS: at sentence #5000

2021-05-09 14:49:54,971 : INFO : PROGRESS: at sentence #690000, processed 4823162 words, keeping 159450 word types
2021-05-09 14:49:55,022 : INFO : PROGRESS: at sentence #700000, processed 4893431 words, keeping 160783 word types
2021-05-09 14:49:55,077 : INFO : PROGRESS: at sentence #710000, processed 4963766 words, keeping 162180 word types
2021-05-09 14:49:55,132 : INFO : PROGRESS: at sentence #720000, processed 5033545 words, keeping 163568 word types
2021-05-09 14:49:55,179 : INFO : PROGRESS: at sentence #730000, processed 5102921 words, keeping 164913 word types
2021-05-09 14:49:55,227 : INFO : PROGRESS: at sentence #740000, processed 5172821 words, keeping 166206 word types
2021-05-09 14:49:55,275 : INFO : PROGRESS: at sentence #750000, processed 5242285 words, keeping 167487 word types
2021-05-09 14:49:55,322 : INFO : PROGRESS: at sentence #760000, processed 5312307 words, keeping 168803 word types
2021-05-09 14:49:55,360 : INFO : PROGRESS: at sentence #770000, processed 538262

2021-05-09 14:49:58,652 : INFO : estimated required memory for 29194 words and 300 dimensions: 84662600 bytes
2021-05-09 14:49:58,653 : INFO : resetting layer weights
2021-05-09 14:49:58,772 : INFO : Word2Vec lifecycle event {'update': False, 'trim_rule': 'None', 'datetime': '2021-05-09T14:49:58.772285', 'gensim': '4.0.1', 'python': '3.6.13 (default, Feb 19 2021, 05:17:09) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.18362-SP0', 'event': 'build_vocab'}
2021-05-09 14:49:58,774 : INFO : Word2Vec lifecycle event {'msg': 'training model with 8 workers on 29194 vocabulary and 300 features, using sg=0 hs=0 sample=0.001 negative=5 window=7', 'datetime': '2021-05-09T14:49:58.774281', 'gensim': '4.0.1', 'python': '3.6.13 (default, Feb 19 2021, 05:17:09) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.18362-SP0', 'event': 'train'}
2021-05-09 14:49:59,812 : INFO : EPOCH 1 - PROGRESS: at 4.26% examples, 334341 words/s, in_qsize 12, out_qsize 3
2021-05-09 14:50:00,855 : INF

2021-05-09 14:50:51,887 : INFO : EPOCH 3 - PROGRESS: at 59.33% examples, 304003 words/s, in_qsize 16, out_qsize 0
2021-05-09 14:50:52,896 : INFO : EPOCH 3 - PROGRESS: at 63.78% examples, 307058 words/s, in_qsize 15, out_qsize 0
2021-05-09 14:50:53,919 : INFO : EPOCH 3 - PROGRESS: at 68.37% examples, 310057 words/s, in_qsize 15, out_qsize 0
2021-05-09 14:50:54,933 : INFO : EPOCH 3 - PROGRESS: at 73.18% examples, 313814 words/s, in_qsize 15, out_qsize 0
2021-05-09 14:50:55,991 : INFO : EPOCH 3 - PROGRESS: at 77.21% examples, 313311 words/s, in_qsize 14, out_qsize 4
2021-05-09 14:50:57,063 : INFO : EPOCH 3 - PROGRESS: at 81.93% examples, 315239 words/s, in_qsize 16, out_qsize 1
2021-05-09 14:50:58,125 : INFO : EPOCH 3 - PROGRESS: at 86.28% examples, 315865 words/s, in_qsize 12, out_qsize 4
2021-05-09 14:50:59,146 : INFO : EPOCH 3 - PROGRESS: at 90.53% examples, 316608 words/s, in_qsize 14, out_qsize 1
2021-05-09 14:51:00,153 : INFO : EPOCH 3 - PROGRESS: at 95.25% examples, 318999 words/s,

2021-05-09 14:51:51,292 : INFO : worker thread finished; awaiting finish of 7 more threads
2021-05-09 14:51:51,296 : INFO : worker thread finished; awaiting finish of 6 more threads
2021-05-09 14:51:51,300 : INFO : worker thread finished; awaiting finish of 5 more threads
2021-05-09 14:51:51,315 : INFO : worker thread finished; awaiting finish of 4 more threads
2021-05-09 14:51:51,321 : INFO : worker thread finished; awaiting finish of 3 more threads
2021-05-09 14:51:51,332 : INFO : worker thread finished; awaiting finish of 2 more threads
2021-05-09 14:51:51,336 : INFO : worker thread finished; awaiting finish of 1 more threads
2021-05-09 14:51:51,379 : INFO : worker thread finished; awaiting finish of 0 more threads
2021-05-09 14:51:51,383 : INFO : EPOCH - 5 : training on 8941546 raw words (8004731 effective words) took 24.1s, 332534 effective words/s
2021-05-09 14:51:52,499 : INFO : EPOCH 6 - PROGRESS: at 3.47% examples, 258845 words/s, in_qsize 15, out_qsize 2
2021-05-09 14:51:53,6

2021-05-09 14:52:45,006 : INFO : EPOCH 8 - PROGRESS: at 34.34% examples, 334122 words/s, in_qsize 15, out_qsize 0
2021-05-09 14:52:46,092 : INFO : EPOCH 8 - PROGRESS: at 39.13% examples, 336489 words/s, in_qsize 16, out_qsize 0
2021-05-09 14:52:47,132 : INFO : EPOCH 8 - PROGRESS: at 43.37% examples, 335516 words/s, in_qsize 12, out_qsize 5
2021-05-09 14:52:48,185 : INFO : EPOCH 8 - PROGRESS: at 48.72% examples, 342212 words/s, in_qsize 15, out_qsize 0
2021-05-09 14:52:49,188 : INFO : EPOCH 8 - PROGRESS: at 53.07% examples, 342692 words/s, in_qsize 16, out_qsize 0
2021-05-09 14:52:50,228 : INFO : EPOCH 8 - PROGRESS: at 57.98% examples, 345454 words/s, in_qsize 15, out_qsize 0
2021-05-09 14:52:51,247 : INFO : EPOCH 8 - PROGRESS: at 62.00% examples, 343373 words/s, in_qsize 15, out_qsize 1
2021-05-09 14:52:52,275 : INFO : EPOCH 8 - PROGRESS: at 66.80% examples, 345439 words/s, in_qsize 16, out_qsize 0
2021-05-09 14:52:53,330 : INFO : EPOCH 8 - PROGRESS: at 71.95% examples, 348305 words/s,

2021-05-09 14:53:42,558 : INFO : worker thread finished; awaiting finish of 6 more threads
2021-05-09 14:53:42,574 : INFO : worker thread finished; awaiting finish of 5 more threads
2021-05-09 14:53:42,584 : INFO : worker thread finished; awaiting finish of 4 more threads
2021-05-09 14:53:42,596 : INFO : worker thread finished; awaiting finish of 3 more threads
2021-05-09 14:53:42,600 : INFO : worker thread finished; awaiting finish of 2 more threads
2021-05-09 14:53:42,614 : INFO : worker thread finished; awaiting finish of 1 more threads
2021-05-09 14:53:42,641 : INFO : worker thread finished; awaiting finish of 0 more threads
2021-05-09 14:53:42,645 : INFO : EPOCH - 10 : training on 8941546 raw words (8004219 effective words) took 20.5s, 391164 effective words/s
2021-05-09 14:53:43,667 : INFO : EPOCH 11 - PROGRESS: at 4.03% examples, 321526 words/s, in_qsize 16, out_qsize 1
2021-05-09 14:53:44,678 : INFO : EPOCH 11 - PROGRESS: at 9.97% examples, 395636 words/s, in_qsize 15, out_qsiz

2021-05-09 14:54:35,779 : INFO : EPOCH 13 - PROGRESS: at 93.33% examples, 453994 words/s, in_qsize 15, out_qsize 0
2021-05-09 14:54:36,739 : INFO : worker thread finished; awaiting finish of 7 more threads
2021-05-09 14:54:36,749 : INFO : worker thread finished; awaiting finish of 6 more threads
2021-05-09 14:54:36,752 : INFO : worker thread finished; awaiting finish of 5 more threads
2021-05-09 14:54:36,757 : INFO : worker thread finished; awaiting finish of 4 more threads
2021-05-09 14:54:36,763 : INFO : worker thread finished; awaiting finish of 3 more threads
2021-05-09 14:54:36,768 : INFO : worker thread finished; awaiting finish of 2 more threads
2021-05-09 14:54:36,802 : INFO : EPOCH 13 - PROGRESS: at 99.89% examples, 457385 words/s, in_qsize 1, out_qsize 1
2021-05-09 14:54:36,806 : INFO : worker thread finished; awaiting finish of 1 more threads
2021-05-09 14:54:36,809 : INFO : worker thread finished; awaiting finish of 0 more threads
2021-05-09 14:54:36,811 : INFO : EPOCH - 13

2021-05-09 14:55:26,745 : INFO : EPOCH 16 - PROGRESS: at 79.12% examples, 444372 words/s, in_qsize 15, out_qsize 0
2021-05-09 14:55:27,766 : INFO : EPOCH 16 - PROGRESS: at 84.61% examples, 443363 words/s, in_qsize 14, out_qsize 2
2021-05-09 14:55:28,771 : INFO : EPOCH 16 - PROGRESS: at 90.53% examples, 445112 words/s, in_qsize 15, out_qsize 0
2021-05-09 14:55:29,801 : INFO : EPOCH 16 - PROGRESS: at 96.15% examples, 444460 words/s, in_qsize 15, out_qsize 0
2021-05-09 14:55:30,251 : INFO : worker thread finished; awaiting finish of 7 more threads
2021-05-09 14:55:30,262 : INFO : worker thread finished; awaiting finish of 6 more threads
2021-05-09 14:55:30,276 : INFO : worker thread finished; awaiting finish of 5 more threads
2021-05-09 14:55:30,280 : INFO : worker thread finished; awaiting finish of 4 more threads
2021-05-09 14:55:30,287 : INFO : worker thread finished; awaiting finish of 3 more threads
2021-05-09 14:55:30,299 : INFO : worker thread finished; awaiting finish of 2 more th

2021-05-09 14:56:19,616 : INFO : EPOCH 19 - PROGRESS: at 78.78% examples, 437656 words/s, in_qsize 15, out_qsize 0
2021-05-09 14:56:20,657 : INFO : EPOCH 19 - PROGRESS: at 85.17% examples, 441147 words/s, in_qsize 15, out_qsize 0
2021-05-09 14:56:21,674 : INFO : EPOCH 19 - PROGRESS: at 91.10% examples, 442690 words/s, in_qsize 15, out_qsize 0
2021-05-09 14:56:22,722 : INFO : EPOCH 19 - PROGRESS: at 97.26% examples, 444268 words/s, in_qsize 16, out_qsize 0
2021-05-09 14:56:23,056 : INFO : worker thread finished; awaiting finish of 7 more threads
2021-05-09 14:56:23,073 : INFO : worker thread finished; awaiting finish of 6 more threads
2021-05-09 14:56:23,074 : INFO : worker thread finished; awaiting finish of 5 more threads
2021-05-09 14:56:23,076 : INFO : worker thread finished; awaiting finish of 4 more threads
2021-05-09 14:56:23,109 : INFO : worker thread finished; awaiting finish of 3 more threads
2021-05-09 14:56:23,130 : INFO : worker thread finished; awaiting finish of 2 more th

2021-05-09 14:57:11,004 : INFO : EPOCH 22 - PROGRESS: at 66.47% examples, 469293 words/s, in_qsize 15, out_qsize 0
2021-05-09 14:57:12,045 : INFO : EPOCH 22 - PROGRESS: at 71.84% examples, 464570 words/s, in_qsize 15, out_qsize 0
2021-05-09 14:57:13,093 : INFO : EPOCH 22 - PROGRESS: at 76.65% examples, 456986 words/s, in_qsize 16, out_qsize 1
2021-05-09 14:57:14,099 : INFO : EPOCH 22 - PROGRESS: at 81.59% examples, 452395 words/s, in_qsize 16, out_qsize 0
2021-05-09 14:57:15,129 : INFO : EPOCH 22 - PROGRESS: at 86.28% examples, 446570 words/s, in_qsize 16, out_qsize 0
2021-05-09 14:57:16,149 : INFO : EPOCH 22 - PROGRESS: at 91.99% examples, 446592 words/s, in_qsize 16, out_qsize 0
2021-05-09 14:57:17,186 : INFO : EPOCH 22 - PROGRESS: at 97.93% examples, 447222 words/s, in_qsize 16, out_qsize 0
2021-05-09 14:57:17,376 : INFO : worker thread finished; awaiting finish of 7 more threads
2021-05-09 14:57:17,390 : INFO : worker thread finished; awaiting finish of 6 more threads
2021-05-09 14

2021-05-09 14:58:01,832 : INFO : EPOCH 25 - PROGRESS: at 70.72% examples, 504933 words/s, in_qsize 16, out_qsize 0
2021-05-09 14:58:02,844 : INFO : EPOCH 25 - PROGRESS: at 77.43% examples, 507043 words/s, in_qsize 14, out_qsize 1
2021-05-09 14:58:03,845 : INFO : EPOCH 25 - PROGRESS: at 84.16% examples, 509272 words/s, in_qsize 16, out_qsize 0
2021-05-09 14:58:04,875 : INFO : EPOCH 25 - PROGRESS: at 90.42% examples, 507593 words/s, in_qsize 15, out_qsize 1
2021-05-09 14:58:05,900 : INFO : EPOCH 25 - PROGRESS: at 97.37% examples, 509836 words/s, in_qsize 16, out_qsize 1
2021-05-09 14:58:06,163 : INFO : worker thread finished; awaiting finish of 7 more threads
2021-05-09 14:58:06,166 : INFO : worker thread finished; awaiting finish of 6 more threads
2021-05-09 14:58:06,172 : INFO : worker thread finished; awaiting finish of 5 more threads
2021-05-09 14:58:06,177 : INFO : worker thread finished; awaiting finish of 4 more threads
2021-05-09 14:58:06,197 : INFO : worker thread finished; awai

2021-05-09 14:58:53,145 : INFO : EPOCH 28 - PROGRESS: at 96.71% examples, 502099 words/s, in_qsize 15, out_qsize 2
2021-05-09 14:58:53,418 : INFO : worker thread finished; awaiting finish of 7 more threads
2021-05-09 14:58:53,450 : INFO : worker thread finished; awaiting finish of 6 more threads
2021-05-09 14:58:53,458 : INFO : worker thread finished; awaiting finish of 5 more threads
2021-05-09 14:58:53,467 : INFO : worker thread finished; awaiting finish of 4 more threads
2021-05-09 14:58:53,477 : INFO : worker thread finished; awaiting finish of 3 more threads
2021-05-09 14:58:53,482 : INFO : worker thread finished; awaiting finish of 2 more threads
2021-05-09 14:58:53,495 : INFO : worker thread finished; awaiting finish of 1 more threads
2021-05-09 14:58:53,506 : INFO : worker thread finished; awaiting finish of 0 more threads
2021-05-09 14:58:53,509 : INFO : EPOCH - 28 : training on 8941546 raw words (8005177 effective words) took 15.8s, 507260 effective words/s
2021-05-09 14:58:5

2021-05-09 14:59:42,136 : INFO : worker thread finished; awaiting finish of 4 more threads
2021-05-09 14:59:42,145 : INFO : worker thread finished; awaiting finish of 3 more threads
2021-05-09 14:59:42,149 : INFO : worker thread finished; awaiting finish of 2 more threads
2021-05-09 14:59:42,157 : INFO : worker thread finished; awaiting finish of 1 more threads
2021-05-09 14:59:42,179 : INFO : worker thread finished; awaiting finish of 0 more threads
2021-05-09 14:59:42,183 : INFO : EPOCH - 31 : training on 8941546 raw words (8004578 effective words) took 15.8s, 506415 effective words/s
2021-05-09 14:59:43,261 : INFO : EPOCH 32 - PROGRESS: at 6.05% examples, 454614 words/s, in_qsize 15, out_qsize 0
2021-05-09 14:59:44,282 : INFO : EPOCH 32 - PROGRESS: at 12.79% examples, 489923 words/s, in_qsize 15, out_qsize 0
2021-05-09 14:59:45,298 : INFO : EPOCH 32 - PROGRESS: at 19.25% examples, 496811 words/s, in_qsize 15, out_qsize 6
2021-05-09 14:59:46,318 : INFO : EPOCH 32 - PROGRESS: at 26.07

In [19]:
# Persist the trained Word2Vec model with pickle.
filename = 'w2v_model.sav'
# The context manager guarantees the handle is flushed and closed even if pickling
# fails; the previous bare open() left the file object to the garbage collector.
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

In [20]:
# Sanity check of the embedding: the ten nearest neighbours of "love".
model.wv.most_similar('love', topn=10)

[('luv', 0.5982000827789307),
 ('loves', 0.5754449963569641),
 ('adore', 0.5405144691467285),
 ('loved', 0.534015417098999),
 ('amazing', 0.5124794840812683),
 ('looove', 0.49945759773254395),
 ('awesome', 0.4703872501850128),
 ('loveee', 0.45295587182044983),
 ('lt', 0.45230111479759216),
 ('lovee', 0.44587135314941406)]

In [7]:
# Rebinds `tokenizer` (previously the NLTK punkt tokenizer) to a Keras Tokenizer and
# builds its word -> integer index from the training split only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train['Text'])

# One more row than there are indexed words: the embedding matrix built later is
# addressed directly by the word_index values.
# NOTE(review): presumably index 0 is the padding id and word indices start at 1 -- confirm.
vocab_size = len(tokenizer.word_index) + 1

In [8]:
# Every tweet is encoded as exactly this many token ids.
SEQUENCE_LENGTH = 300


def _encode_and_pad(texts):
    """Map token lists to integer ids and pad/truncate them to SEQUENCE_LENGTH."""
    id_sequences = tokenizer.texts_to_sequences(texts)
    return pad_sequences(id_sequences, maxlen=SEQUENCE_LENGTH)


x_train = _encode_and_pad(train['Text'])
x_test = _encode_and_pad(test['Text'])

In [9]:
labels = [-1,1,0]

# Learn the label -> integer code mapping from the training labels only, then apply
# that same mapping to both splits.
encoder = LabelEncoder().fit(train['Sentiment'].tolist())

# Column vectors of shape (n_samples, 1).
y_train = encoder.transform(train['Sentiment'].tolist()).reshape(-1, 1)
y_test = encoder.transform(test['Sentiment'].tolist()).reshape(-1, 1)

In [10]:
# Row i holds the Word2Vec vector of the word whose tokenizer index is i; words the
# Word2Vec model does not know keep an all-zero row.
# NOTE: `model` must still be the Word2Vec model here -- a later cell rebinds the
# name to the Keras network.
embedding_matrix = np.zeros((vocab_size, W2V_SIZE))
word_vectors = model.wv
for token, row in tokenizer.word_index.items():
    if token in word_vectors:
        embedding_matrix[row] = word_vectors[token]

In [11]:
# Embedding layer initialised from the Word2Vec matrix and frozen (trainable=False),
# so the pre-trained vectors are not updated while the classifier trains.
embedding_layer = Embedding(
    vocab_size,
    W2V_SIZE,
    weights=[embedding_matrix],
    input_length=SEQUENCE_LENGTH,
    trainable=False,
)

In [12]:
# Classifier: frozen embeddings -> dropout -> single LSTM layer -> one sigmoid unit.
# NOTE: this rebinds `model`, which until now referred to the Word2Vec model.
model = Sequential([
    embedding_layer,
    Dropout(0.5),
    LSTM(100, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid'),
])

In [13]:
SEQUENCE_LENGTH = 300  # re-stated here; same value as in the padding cell
EPOCHS = 8
BATCH_SIZE = 1024

# The metric is requested as 'acc' (not 'accuracy') so that the logged validation key
# is 'val_acc', which is what EarlyStopping monitors below. With 'accuracy', Keras
# versions that log 'val_accuracy' leave the callback without its metric and it never
# stops training.
# NOTE(review): a single sigmoid unit with binary cross-entropy assumes exactly two
# label codes reach y_train -- confirm the data holds no neutral class.
model.compile(loss='binary_crossentropy',
              optimizer="adam",
              metrics=['acc'])

callbacks = [
    # Lower the learning rate when the validation loss has not improved for 5 epochs.
    ReduceLROnPlateau(monitor='val_loss', patience=5, cooldown=0),
    # Stop when validation accuracy has not improved by at least 1e-4 for 5 epochs.
    EarlyStopping(monitor='val_acc', min_delta=1e-4, patience=5),
]

# The last 10% of the training data is held out for validation.
history = model.fit(x_train, y_train,
                    batch_size=BATCH_SIZE,
                    epochs=EPOCHS,
                    validation_split=0.1,
                    verbose=1,
                    callbacks=callbacks)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where








Train on 1152000 samples, validate on 128000 samples
Epoch 1/8

KeyboardInterrupt: 