In [1]:
import nltk #natural language tool kit for tokenization, stop words, lemmatization
import vaderSentiment
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import tree, ensemble, model_selection, metrics
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import numpy as np
import string
import pickle
import os

# Data Preprocessing

In [6]:
#load data
current_path = os.getcwd()
pickle_in = open('data.pickle', 'rb')
train_data = pickle.load(pickle_in)

#lowercase
train_data['text'] = train_data['text'].apply(lambda x: x.lower())

#punctuations removal
def punctuation_removal(text):
    return text.translate(str.maketrans('','',string.punctuation))
train_data['text'] = train_data['text'].apply(punctuation_removal)

#stopwords removal
stopwords_lst = stopwords.words('english')
def stopwords_removal(text):
    text_lst = text.split()
    text_lst = [word for word in text_lst if word.lower() not in stopwords_lst]
    text = ' '.join(text_lst)
    return text
train_data['text'] = train_data['text'].apply(stopwords_removal)

#lemmatization
def lemmatize(text):
    lemmatizer = WordNetLemmatizer()
    lemmawords = [lemmatizer.lemmatize(w) for w in text.split()]
    text = ' '.join(lemmawords)
    return text
train_data['text'] = train_data['text'].apply(lemmatize)


#train-test split
train_data = train_data.sample(frac=1).reset_index(drop=True)
train, test = train_test_split(train_data, test_size = 0.2, random_state=2022)

#vectorize text data 
#NOTE: simple way to both tokenize a collection of text documents and build a vocabulary of known words
vect = CountVectorizer()
train_X_dtm = vect.fit_transform(train['text'])
test_X_dtm = vect.transform(test['text'])

#prepare train data and its labels
x_train = train_X_dtm
y_train = train['target']

#prepare test data and its labels
x_test = test_X_dtm
y_test = test['target']

print(f'train: {train_X_dtm.shape}, test: {test_X_dtm.shape}')

train: (17922, 24771), test: (4481, 24771)


# VADER

In [2]:
#function to print sentiments of a sentence
def print_sentiment_score(sentence):
    SI_object = SentimentIntensityAnalyzer()
    
    #get dictionary of sentiments of sentence
    sentiment_dict = SI_object.polarity_scores(sentence)

    #get individual scores
    neg = sentiment_dict['neg']*100
    neu = sentiment_dict['neu']*100
    pos = sentiment_dict['pos']*100
    overall = sentiment_dict['compound']
    
    #overall sentiment result of sentence
    if overall>=0.05:
        result = 'Positive'  
    elif overall<= -0.05:
        result = 'Negative'
    else:
        result = 'Neutral'
        
    print(f"Negative: {neg}\nNeutral: {neu}\nPositive: {pos}\nOverall: {overall}\nResult: {result}")

#function to return overall sentiment score of a sentence
def get_sentiment_score(sentence):
    SI_object = SentimentIntensityAnalyzer()
    
    #get dictionary of sentiments of sentence
    sentiment_dict = SI_object.polarity_scores(sentence)

    #get overall sentiment score
    overall = sentiment_dict['compound']
    
    return overall

In [159]:
s = 'Damn for a second I thought this was about the cabbages guy from ATLA. Anyway, buying GME right now...'
get_sentiment_score(s)

-0.4019

# Random Forest Classifier

In [158]:
#fit model
rforest = ensemble.RandomForestClassifier(n_estimators=100)
rforest.fit(x_train, y_train)

#predict
rf_y_pred_class = rforest.predict(x_test)

#accuracy
rf_accuracy = metrics.accuracy_score(y_test, rf_y_pred_class)
rf_accuracy #0.7426913635349253

0.7426913635349253

## Multinomial Naive Bayes

In [138]:
from sklearn.naive_bayes import MultinomialNB

#fit training data
nb = MultinomialNB()
nb.fit(x_train, y_train)

#predict
nb_y_pred_class = nb.predict(x_test)

#accuracy
nb_accuracy = metrics.accuracy_score(y_test, nb_y_pred_class)
nb_accuracy #0.7176969426467307

0.7176969426467307

## Bagging

In [160]:
#fit training data
bagging = ensemble.BaggingClassifier(tree.DecisionTreeClassifier(), n_estimators=100, max_samples=x_train.shape[0], max_features=x_train.shape[1], random_state=2022)
bagging.fit(x_train, y_train)

#predict
bagging_y_pred_class = bagging.predict(x_test)

#accuracy
bagging_accuracy = metrics.accuracy_score(y_test, bagging_y_pred_class)
bagging_accuracy #0.7228297255076992

0.7228297255076992

# LSTM

In [151]:
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import LSTM, Dense, Conv1D, Embedding, GlobalAveragePooling1D, SpatialDropout1D, Bidirectional, Flatten, Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.optimizers import SGD, Adam
from tensorflow.keras.utils import to_categorical


# preprocess data
train_sentences = list(train['text'])
train_labels = list(train['target'])

training_size = round(len(train_sentences)*1)

training_sentences = train_sentences[0:training_size]
training_labels = train_labels[0:training_size]

valid_sentences = train_sentences[training_size:]
valid_labels = train_labels[training_size:]

test_sentences = list(test['text'])
test_labels = list(test['target'])

# Setting tokenizer properties
vocab_size = 24500
oov_tok = "<oov>"

# Fit the tokenizer on Training data
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(training_sentences)


# Setting the padding properties
sequence_length = 300 #longest text is 271 in dataset
trunc_type='post'
padding_type='post'

# Creating padded sequences from train and test data
training_sentences = tokenizer.texts_to_sequences(training_sentences)
training_padded = pad_sequences(training_sentences, maxlen=sequence_length, padding=padding_type, truncating=trunc_type)

valid_sentences = tokenizer.texts_to_sequences(valid_sentences)
valid_padded = pad_sequences(valid_sentences, maxlen=sequence_length, padding=padding_type, truncating=trunc_type)

test_sequences = tokenizer.texts_to_sequences(test_sentences)
test_padded = pad_sequences(test_sequences, maxlen=sequence_length, padding=padding_type, truncating=trunc_type)

# Setting the model parameters
embedding_dim = 40
early_stopping = EarlyStopping(monitor='val_accuracy', patience = 5, restore_best_weights=True)
reduce_lr_on_plateau = ReduceLROnPlateau(
                                        monitor='val_loss', 
                                        factor=0.1, 
                                        patience=2, 
                                        verbose=1,
                                        mode='min', 
                                        min_delta=0.0001, 
                                        cooldown=0, 
                                        min_lr=0,
                                        )
model = Sequential([
  Embedding(vocab_size, embedding_dim, input_length =sequence_length),
  SpatialDropout1D(0.5),
  # GlobalAveragePooling1D(),
  # Conv1D(128, 1),
  Bidirectional(LSTM(units=128), name='bd_1'),
  # LSTM(units=128),
  # Bidirectional(LSTM(units=64, return_sequences=True), name='bd_1'),
  # Dropout(0.6),
  # Bidirectional(LSTM(units=32, return_sequences=False), name='bd_2'),
  # LSTM(64, dropout = 0.2, recurrent_dropout=0.2),
  # Dense(128, activation='relu'),
  Dropout(0.5),
  Dense(32, activation='relu'),
  Dropout(0.5),
  GlobalAveragePooling1D(),
  Dense(3, activation = 'softmax')
])

# adam = Adam(learning_rate = 1e-5)
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

#convert train data to numpy array
x_train_padded = np.array(training_padded)
train_labels = np.array(train_labels)

x_valid_padded = np.array(valid_padded)
valid_labels =np.array(valid_labels)

x_test_padded = np.array(test_padded)
test_labels = np.array(test_labels)

training_labels = to_categorical(training_labels, 3)
valid_labels = to_categorical(valid_labels, 3)
test_labels = to_categorical(test_labels, 3)

#train model
num_epochs = 15
model.fit(x_train_padded, training_labels, epochs=num_epochs, validation_data=(x_test_padded, test_labels), callbacks=[reduce_lr_on_plateau])


Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15

Epoch 00004: ReduceLROnPlateau reducing learning rate to 0.00010000000474974513.
Epoch 5/15
Epoch 6/15

Epoch 00006: ReduceLROnPlateau reducing learning rate to 1.0000000474974514e-05.
Epoch 7/15
Epoch 8/15

Epoch 00008: ReduceLROnPlateau reducing learning rate to 1.0000000656873453e-06.
Epoch 9/15
Epoch 10/15

Epoch 00010: ReduceLROnPlateau reducing learning rate to 1.0000001111620805e-07.
Epoch 11/15
Epoch 12/15

Epoch 00012: ReduceLROnPlateau reducing learning rate to 1.000000082740371e-08.
Epoch 13/15
Epoch 14/15

Epoch 00014: ReduceLROnPlateau reducing learning rate to 1.000000082740371e-09.
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x7fb67b39f820>

In [152]:
result = model.evaluate(x_test_padded, test_labels)
result 



[0.7980167865753174, 0.7462620139122009]

In [153]:
model.save('LSTM.h5')

In [161]:
model.summary()

Model: "sequential_43"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_44 (Embedding)     (None, 300, 40)           980000    
_________________________________________________________________
spatial_dropout1d_40 (Spatia (None, 300, 40)           0         
_________________________________________________________________
bd_1 (Bidirectional)         (None, 256)               173056    
_________________________________________________________________
dropout_67 (Dropout)         (None, 256)               0         
_________________________________________________________________
dense_90 (Dense)             (None, 32)                8224      
_________________________________________________________________
dropout_68 (Dropout)         (None, 32)                0         
_________________________________________________________________
dense_91 (Dense)             (None, 3)               

## ANN

In [169]:
model2 = Sequential([
    Embedding(vocab_size, embedding_dim, input_length =sequence_length),
    Dropout(0.5),
    Dense(32, activation = 'relu'),
    GlobalAveragePooling1D(),
    Dropout(0.5),
    Dense(3, activation = 'softmax')])

model2.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

num_epochs = 50
model2.fit(x_train_padded, training_labels, epochs=num_epochs, validation_data=(x_test_padded, test_labels), callbacks=[reduce_lr_on_plateau])


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50

Epoch 00012: ReduceLROnPlateau reducing learning rate to 0.00010000000474974513.
Epoch 13/50
Epoch 14/50

Epoch 00014: ReduceLROnPlateau reducing learning rate to 1.0000000474974514e-05.
Epoch 15/50
Epoch 16/50

Epoch 00016: ReduceLROnPlateau reducing learning rate to 1.0000000656873453e-06.
Epoch 17/50
Epoch 18/50

Epoch 00018: ReduceLROnPlateau reducing learning rate to 1.0000001111620805e-07.
Epoch 19/50
Epoch 20/50

Epoch 00020: ReduceLROnPlateau reducing learning rate to 1.000000082740371e-08.
Epoch 21/50
Epoch 22/50

Epoch 00022: ReduceLROnPlateau reducing learning rate to 1.000000082740371e-09.
Epoch 23/50
Epoch 24/50

Epoch 00024: ReduceLROnPlateau reducing learning rate to 1.000000082740371e-10.
Epoch 25/50
Epoch 26/50

Epoch 00026: ReduceLROnPlateau reducing learning rate to 1.000000082740371e-11.
Epoch 27/50
Epoch 28/50

Epoch 00028: ReduceL

<tensorflow.python.keras.callbacks.History at 0x7fb6a8924b20>

In [171]:
model2.evaluate(x_test_padded, test_labels)



[0.6419309973716736, 0.7290783524513245]

In [170]:
model2.summary()

Model: "sequential_51"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_48 (Embedding)     (None, 300, 40)           980000    
_________________________________________________________________
dropout_81 (Dropout)         (None, 300, 40)           0         
_________________________________________________________________
dense_110 (Dense)            (None, 300, 32)           1312      
_________________________________________________________________
global_average_pooling1d_9 ( (None, 32)                0         
_________________________________________________________________
dropout_82 (Dropout)         (None, 32)                0         
_________________________________________________________________
dense_111 (Dense)            (None, 3)                 99        
Total params: 981,411
Trainable params: 981,411
Non-trainable params: 0
_______________________________________________

In [21]:
import tensorflow
tensorflow.keras.utils.to_categorical(train_data['target'], 3)[4] 
#-1: [0,0,1], 0: [1,0,0], 1: [0,1,0]

array([1., 0., 0.], dtype=float32)

In [8]:
train_data['target']

0       -1
1       -1
2       -1
3        1
4        0
        ..
22398    1
22399    0
22400    1
22401   -1
22402    0
Name: target, Length: 22403, dtype: int64