In [213]:
import numpy as np
import pandas as pd
import matplotlib as plt
import re 
import logging
from nltk.corpus import stopwords
from  nltk.stem import SnowballStemmer
import nltk.data
import nltk
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Activation, Dense, Dropout, Embedding, Flatten, Conv1D, MaxPooling1D, LSTM
from keras import utils
from keras.callbacks import ReduceLROnPlateau, EarlyStopping

ModuleNotFoundError: No module named 'keras'

In [202]:
# Labels assigned to the CSV columns by position, and the numeric code used
# for each sentiment class.
col_names = ['ID','Text','Selected_Text','Sentiment']
sentiment_conv = {'negative':-1,'positive':1,'neutral':0}

# Read train.csv, relabel its columns, keep only the tweet text and its label,
# then encode the label through sentiment_conv (labels missing from the
# mapping would come out as NaN).
data = pd.read_csv('https://media.githubusercontent.com/media/dczzzzzdc/CSC590_Design_Project/main/Data/train.csv')
data.columns = col_names
data = data.drop(columns=['Selected_Text','ID'])
data['Sentiment'] = data['Sentiment'].map(sentiment_conv)

In [203]:
# Shared NLP resources for the text-cleaning code in this cell.
# NOTE(review): `tokenizer` is not used by process_text and the name is
# rebound to a Keras Tokenizer in a later cell — confirm it is still needed.
stemmer = SnowballStemmer("english")
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
# Kept as a set so the per-word membership test in process_text is cheap.
stops = set(stopwords.words("english"))

def process_text(text,remove_stops = False, stem = False):
    """Turn one raw tweet into a list of lowercase, letters-only tokens.

    Cleaning steps, in order: lowercase and strip, remove URLs, remove
    "@user" mentions, replace every non-letter character with a space,
    then split on whitespace.

    Parameters
    ----------
    text : object
        The raw tweet. Non-string values are converted with ``str``.
        ``None`` and float NaN (how pandas represents a missing cell) are
        treated as empty text.
    remove_stops : bool, default False
        When True, drop every token found in the module-level ``stops`` set.
    stem : bool, default False
        When True, pass each kept token through the module-level ``stemmer``.

    Returns
    -------
    list of str
        The cleaned tokens; empty when the input is missing or holds no letters.
    """
    # A missing value must not be stringified: str(nan) -> "nan" and
    # str(None) -> "none" would otherwise leak into the corpus as real words.
    # `text != text` is the dependency-free NaN test.
    if text is None or (isinstance(text, float) and text != text):
        return []

    text = str(text).lower().strip()
    # Raw strings keep \S, \. and \w as regex escapes instead of (invalid)
    # Python string escapes.
    text = re.sub(r'https?://\S+|www\.\S+', '', text) # remove url links
    text = re.sub(r"@[\w]*",'',text) # remove "@user"
    text = re.sub(r'[^a-zA-Z]',' ',text) # leave only characters

    words = []
    for word in text.split():
        if remove_stops and word in stops:
            continue
        words.append(stemmer.stem(word) if stem else word)
    return words

# Replace every raw tweet with its cleaned token list, dropping stop words;
# Series.apply forwards the keyword argument to process_text.
data['Text'] = data['Text'].apply(process_text, remove_stops=True)

In [204]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# split the same on every run.
train, test = train_test_split(data, random_state=42, test_size=0.2)
# Plain Python list holding the Text value of every training row.
train_sentences = list(train['Text'])

In [206]:
# WORD2VEC hyperparameters
W2V_SIZE = 300       # passed as the embedding size (size / vector_size)
W2V_WINDOW = 7       # passed as `window`
W2V_EPOCH = 32       # passed as `epochs` to model.train
W2V_MIN_COUNT = 10   # passed as `min_count`

# Initializing the train model
from gensim.models import word2vec

w2v_params = dict(window=W2V_WINDOW, min_count=W2V_MIN_COUNT, workers=8)
try:
    # gensim >= 4.0 names the embedding-size argument `vector_size`.
    model = word2vec.Word2Vec(vector_size=W2V_SIZE, **w2v_params)
except TypeError:
    # gensim 3.x only knows the older `size` keyword and rejects `vector_size`.
    model = word2vec.Word2Vec(size=W2V_SIZE, **w2v_params)

# The model is created without a corpus, so the vocabulary is built
# explicitly before training on the same sentences.
model.build_vocab(train_sentences)
model.train(train_sentences, total_examples=len(train_sentences), epochs=W2V_EPOCH)

2021-05-08 20:38:34,174 : INFO : collecting all words and their counts
2021-05-08 20:38:34,176 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2021-05-08 20:38:34,208 : INFO : PROGRESS: at sentence #10000, processed 69542 words, keeping 12812 word types
2021-05-08 20:38:34,243 : INFO : PROGRESS: at sentence #20000, processed 139226 words, keeping 19484 word types
2021-05-08 20:38:34,252 : INFO : collected 20544 word types from a corpus of 152813 raw words and 21984 sentences
2021-05-08 20:38:34,254 : INFO : Loading a fresh vocabulary
2021-05-08 20:38:34,275 : INFO : effective_min_count=10 retains 2061 unique words (10% of original 20544, drops 18483)
2021-05-08 20:38:34,277 : INFO : effective_min_count=10 leaves 117725 word corpus (77% of original 152813, drops 35088)
2021-05-08 20:38:34,287 : INFO : deleting the raw counts dictionary of 20544 items
2021-05-08 20:38:34,289 : INFO : sample=0.001 downsamples 62 most-common words
2021-05-08 20:38:34,290 : INFO :

2021-05-08 20:38:37,563 : INFO : worker thread finished; awaiting finish of 7 more threads
2021-05-08 20:38:37,577 : INFO : worker thread finished; awaiting finish of 6 more threads
2021-05-08 20:38:37,579 : INFO : worker thread finished; awaiting finish of 5 more threads
2021-05-08 20:38:37,588 : INFO : worker thread finished; awaiting finish of 4 more threads
2021-05-08 20:38:37,607 : INFO : worker thread finished; awaiting finish of 3 more threads
2021-05-08 20:38:37,612 : INFO : worker thread finished; awaiting finish of 2 more threads
2021-05-08 20:38:37,615 : INFO : worker thread finished; awaiting finish of 1 more threads
2021-05-08 20:38:37,617 : INFO : worker thread finished; awaiting finish of 0 more threads
2021-05-08 20:38:37,620 : INFO : EPOCH - 9 : training on 152813 raw words (104299 effective words) took 0.2s, 474578 effective words/s
2021-05-08 20:38:37,827 : INFO : worker thread finished; awaiting finish of 7 more threads
2021-05-08 20:38:37,848 : INFO : worker thread

2021-05-08 20:38:39,970 : INFO : worker thread finished; awaiting finish of 2 more threads
2021-05-08 20:38:39,983 : INFO : worker thread finished; awaiting finish of 1 more threads
2021-05-08 20:38:39,989 : INFO : worker thread finished; awaiting finish of 0 more threads
2021-05-08 20:38:39,991 : INFO : EPOCH - 18 : training on 152813 raw words (104514 effective words) took 0.2s, 439360 effective words/s
2021-05-08 20:38:40,201 : INFO : worker thread finished; awaiting finish of 7 more threads
2021-05-08 20:38:40,214 : INFO : worker thread finished; awaiting finish of 6 more threads
2021-05-08 20:38:40,219 : INFO : worker thread finished; awaiting finish of 5 more threads
2021-05-08 20:38:40,220 : INFO : worker thread finished; awaiting finish of 4 more threads
2021-05-08 20:38:40,250 : INFO : worker thread finished; awaiting finish of 3 more threads
2021-05-08 20:38:40,254 : INFO : worker thread finished; awaiting finish of 2 more threads
2021-05-08 20:38:40,262 : INFO : worker threa

2021-05-08 20:38:42,678 : INFO : worker thread finished; awaiting finish of 6 more threads
2021-05-08 20:38:42,689 : INFO : worker thread finished; awaiting finish of 5 more threads
2021-05-08 20:38:42,692 : INFO : worker thread finished; awaiting finish of 4 more threads
2021-05-08 20:38:42,702 : INFO : worker thread finished; awaiting finish of 3 more threads
2021-05-08 20:38:42,705 : INFO : worker thread finished; awaiting finish of 2 more threads
2021-05-08 20:38:42,721 : INFO : worker thread finished; awaiting finish of 1 more threads
2021-05-08 20:38:42,728 : INFO : worker thread finished; awaiting finish of 0 more threads
2021-05-08 20:38:42,732 : INFO : EPOCH - 28 : training on 152813 raw words (104409 effective words) took 0.3s, 369586 effective words/s
2021-05-08 20:38:42,913 : INFO : worker thread finished; awaiting finish of 7 more threads
2021-05-08 20:38:42,951 : INFO : worker thread finished; awaiting finish of 6 more threads
2021-05-08 20:38:42,958 : INFO : worker threa

(3341687, 4890016)

In [212]:
# Build the Keras word index from the training tweets. `train_sentences`
# (defined with the train/test split) replaces `df_train.text`, a name this
# notebook never defines. Each entry is already a list of tokens, which
# fit_on_texts accepts as-is.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_sentences)

# +1 because Keras word indices start at 1; index 0 is reserved.
vocab_size = len(tokenizer.word_index) + 1
print("Total words", vocab_size)

NameError: name 'Tokenizer' is not defined