In [None]:
# link drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# import packages
import numpy as np
import re
import nltk
from sklearn.datasets import load_files
nltk.download('stopwords')
import pickle
import csv
import pandas as pd

from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

from sklearn.feature_extraction.text import CountVectorizer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [None]:
# get training data 
train_data = pd.read_csv('/content/drive/My Drive/Colab Notebooks/340W_Project/data/train.tsv', sep='\t', header=0)
train_pid = train_data['PhraseId'].tolist()
train_sid = train_data['SentenceId'].tolist()
train_phrase = train_data['Phrase'].tolist()
train_y = train_data['Sentiment'].tolist()
train_data

# get only the rows of training data with the entire phrase

sentence_dict = dict()
for i in train_sid:
  if i not in sentence_dict:
    sentence_dict[i] = train_sid.index(i)

new_train_sid = []
new_train_pid = []
new_train_phrase = []
new_train_y = []

for i in sentence_dict:
  new_train_sid.append(train_sid[sentence_dict[i]])
  new_train_pid.append(train_pid[sentence_dict[i]])
  new_train_phrase.append(train_phrase[sentence_dict[i]])
  new_train_y.append(train_y[sentence_dict[i]])

train_sid = new_train_sid
train_pid = new_train_pid
train_phrase = new_train_phrase
train_y = new_train_y

print(train_pid[0], train_sid[0], train_phrase[0], train_y[0])

1 1 A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story . 1


In [None]:
# repeat this same process for the test data
# recall that there is no sentiment column here
test_data = pd.read_csv('/content/drive/My Drive/Colab Notebooks/340W_Project/data/test.tsv', sep='\t', header=0)
test_pid = test_data['PhraseId'].tolist()
test_sid = test_data['SentenceId'].tolist()
test_phrase = test_data['Phrase'].tolist()
test_data

# get only the rows of training data with the entire phrase

sentence_dict = dict()
for i in test_sid:
  if i not in sentence_dict:
    sentence_dict[i] = test_sid.index(i)

new_test_sid = []
new_test_pid = []
new_test_phrase = []
new_test_y = []

for i in sentence_dict:
  new_test_sid.append(test_sid[sentence_dict[i]])
  new_test_pid.append(test_pid[sentence_dict[i]])
  new_test_phrase.append(test_phrase[sentence_dict[i]])

test_sid = new_test_sid
test_pid = new_test_pid
test_phrase = new_test_phrase
test_y = new_test_y

print(test_pid[0], test_sid[0], test_phrase[0])

156061 8545 An intermittently pleasing but mostly routine effort .


In [None]:
# create a function that preprocesses the words in each sentence
# the goal of preprocessing is to be able to find all impactful words
# as well as avoid treating similar words as separate
# Ex: "Cat", "CATS", "cats", and "cat" should all be treated the same

def phrase_preprocessor(phrases):
  stemmer = WordNetLemmatizer()

  processed_phrases = []

  for sen in range(len(phrases)):
    # remove special chars
    phrase = re.sub(r'\W', ' ', str(phrases[sen]))

    # remove all single characters
    phrase = re.sub(r'\s+[a-zA-Z]\s+', ' ', phrase)

    # removes single characters from start of phrase
    phrase = re.sub(r'\^[a-zA-Z]\s+', ' ', phrase) 

    # replace multiple spaces with a single space
    phrase = re.sub(r'\s+', ' ', phrase, flags=re.I)

    # make the phrase all lowercse
    phrase = phrase.lower()

    # lemmatization (i.e. remove small differences that dont't matter)
    # Ex: "cats" -> "cat"
    phrase = phrase.split()
    phrase = [stemmer.lemmatize(word) for word in phrase]
    phrase = ' '.join(phrase)

    # append to output list
    processed_phrases.append(phrase)

  return processed_phrases

# add a column of these processed phrases for both the train and test data
test_processed = phrase_preprocessor(test_phrase)
train_processed = phrase_preprocessor(train_phrase)

# see the result on the first element of train_phrase
print(train_processed[0])

a series of escapade demonstrating the adage that what is good for the goose is also good for the gander some of which occasionally amuses but none of which amount to much of story


In [None]:
# convert the text to numbers that can be processed
from nltk.corpus import stopwords

####################
# the hyperparameters in vectorizer WILL need to be tuned
####################
vectorizer = CountVectorizer(max_features=700, min_df=20, max_df=0.7, stop_words=stopwords.words('english'), analyzer='word')

# train data being vectorized
train_vectorized = vectorizer.fit_transform(train_processed).toarray()

# test data being vectorized
test_vectorized = vectorizer.fit_transform(test_processed).toarray()

# view the results in train_vectorized[0]
print(train_processed[0])
print(train_vectorized[0])

a series of escapade demonstrating the adage that what is good for the goose is also good for the gander some of which occasionally amuses but none of which amount to much of story
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 

In [None]:
# find term frequency and inverse document frequency
# this is known as a tfidf score
from sklearn.feature_extraction.text import TfidfTransformer

tfidfconverter = TfidfTransformer()

# train data
train_tfidf = tfidfconverter.fit_transform(train_vectorized).toarray()

# test data
test_tfidf = tfidfconverter.fit_transform(test_vectorized).toarray()

# view results in train_tfidf[0]
print(train_tfidf[0])

[0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.29859554 0.         0.         0.
 0.         0.         0.38226082 0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         

In [None]:
# make a train / validation split on our training data
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(train_tfidf, train_y, test_size=0.2, random_state=0)

In [None]:
# train a random forest classifier on the data
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier # imported this to see if it was better. It was not.

####################
# check for results on ALL training rows
train_pid = train_data['PhraseId'].tolist()
train_sid = train_data['SentenceId'].tolist()
train_phrase = train_data['Phrase'].tolist()
train_y = train_data['Sentiment'].tolist()

train_processed = phrase_preprocessor(train_phrase)

train_vectorized = vectorizer.fit_transform(train_processed).toarray()

train_tfidf = tfidfconverter.fit_transform(train_vectorized).toarray()

X_train, X_test, y_train, y_test = train_test_split(train_tfidf, train_y, test_size=0.2, random_state=0)

####################
# these hyperparameters WILL need to be tweaked
####################
classifier = RandomForestClassifier(n_estimators=1000, random_state=2, verbose=True)
classifier.fit(X_train, y_train) 
y_pred = classifier.predict(X_test)

# check the number of classes, should be five
print(classifier.n_classes_)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 1000 out of 1000 | elapsed: 136.4min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


5


[Parallel(n_jobs=1)]: Done 1000 out of 1000 | elapsed:  1.0min finished


In [None]:
# validate results
# the bottommost number is our overall accuracy

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))

####################
# NOTE: We have not done anything with the test_... data from before. 
# this is because it has no sentiment column for us to compare to.
# if we wish to submit our results to Kaggle, however, we need to use
# our model to make prediction on test_tfidf
####################

[[  368   496   596    40     4]
 [  327  1674  3224   206    22]
 [   82   870 13870  1082    78]
 [   12   164  3348  2479   438]
 [    0    17   521   733   561]]
              precision    recall  f1-score   support

           0       0.47      0.24      0.32      1504
           1       0.52      0.31      0.39      5453
           2       0.64      0.87      0.74     15982
           3       0.55      0.38      0.45      6441
           4       0.51      0.31      0.38      1832

    accuracy                           0.61     31212
   macro avg       0.54      0.42      0.46     31212
weighted avg       0.59      0.61      0.58     31212

0.60720235806741


In [None]:
# create a csv to submit to Kaggle 
# for submission to Kaggle, we need to predict on all phrases, not just whole phrases
# model accuracy may suffer as a result

test_phrase2 = test_data['Phrase'].tolist()
test_processed2 = phrase_preprocessor(test_phrase2)

test_vectorized2 = vectorizer.fit_transform(test_processed2).toarray()

test_tfidf2 = tfidfconverter.fit_transform(test_vectorized2).toarray()

print(test_processed2[0])
print(test_vectorized[0])
print(test_tfidf2[0])

y_pred2 = classifier.predict(test_tfidf2)

d = {'PhraseId':test_data['PhraseId'].tolist(), 'Sentiment':y_pred2}
df = pd.DataFrame(d)

# create the csv and download it (in order to submit to Kaggle)
from google.colab import files
df.to_csv('/content/drive/My Drive/Colab Notebooks/340W_Project/data/submission.csv', index=False)
files.download('/content/drive/My Drive/Colab Notebooks/340W_Project/data/submission.csv')

an intermittently pleasing but mostly routine effort
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0]
[0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.      

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 1000 out of 1000 | elapsed:  2.4min finished


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# examine phrases, their predictions, and true values for the first few rows of data to see what might cause failures
for i in range(20):
  print(train_phrase[i], y_pred[i], y_train[i])
  print(" ")

####################
# Our model still seems to struggle with sentence negation and indirect language.
####################

A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story . 2 2
 
A series of escapades demonstrating the adage that what is good for the goose 2 1
 
A series 2 3
 
A 1 4
 
series 3 4
 
of escapades demonstrating the adage that what is good for the goose 2 2
 
of 2 3
 
escapades demonstrating the adage that what is good for the goose 2 2
 
escapades 2 3
 
demonstrating the adage that what is good for the goose 2 2
 
demonstrating the adage 3 2
 
demonstrating 2 2
 
the adage 2 0
 
the 2 2
 
adage 2 3
 
that what is good for the goose 2 3
 
that 2 2
 
what is good for the goose 2 2
 
what 2 2
 
is good for the goose 2 2
 


In [None]:
# view data

train_data[0:60]


Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2
5,6,1,of escapades demonstrating the adage that what...,2
6,7,1,of,2
7,8,1,escapades demonstrating the adage that what is...,2
8,9,1,escapades,2
9,10,1,demonstrating the adage that what is good for ...,2
