In [None]:
# link drive
from google.colab import drive

# every data path used by this notebook lives underneath this mount point
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
# import packages
import numpy as np
import re
import nltk
from sklearn.datasets import load_files
nltk.download('stopwords')
import pickle
import csv
import pandas as pd

from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

from sklearn.feature_extraction.text import CountVectorizer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [None]:
# directory on the mounted Drive that holds the train/test TSV files
DATA_DIR = '/content/drive/My Drive/Colab Notebooks/340W_Project/data'

# get training data
train_data = pd.read_csv(DATA_DIR + '/train.tsv', sep='\t', header=0)
train_pid = train_data['PhraseId'].tolist()
train_sid = train_data['SentenceId'].tolist()
train_phrase = train_data['Phrase'].tolist()
train_y = train_data['Sentiment'].tolist()

# a bare expression is only rendered when it is the last line of a cell,
# so the training frame has to be displayed explicitly here
display(train_data)

# repeat this same process for the test data
# recall that there is no sentiment column here
test_data = pd.read_csv(DATA_DIR + '/test.tsv', sep='\t', header=0)
test_pid = test_data['PhraseId'].tolist()
test_sid = test_data['SentenceId'].tolist()
test_phrase = test_data['Phrase'].tolist()
test_data

Unnamed: 0,PhraseId,SentenceId,Phrase
0,156061,8545,An intermittently pleasing but mostly routine ...
1,156062,8545,An intermittently pleasing but mostly routine ...
2,156063,8545,An
3,156064,8545,intermittently pleasing but mostly routine effort
4,156065,8545,intermittently pleasing but mostly routine
...,...,...,...
66287,222348,11855,"A long-winded , predictable scenario ."
66288,222349,11855,"A long-winded , predictable scenario"
66289,222350,11855,"A long-winded ,"
66290,222351,11855,A long-winded


In [None]:
# create a function that preprocesses the words in each sentence
# the goal of preprocessing is to be able to find all impactful words
# as well as avoid treating similar words as separate
# Ex: "Cat", "CATS", "cats", and "cat" should all be treated the same

def phrase_preprocessor(phrases, lemmatizer=None):
  """Normalize raw phrases so that surface variants of a word compare equal.

  Each phrase is converted with ``str()`` and then goes through these steps,
  in order:

  1. every non-word character (punctuation, symbols, whitespace) becomes a
     space;
  2. a lone ASCII letter surrounded by whitespace is dropped;
  3. a lone ASCII letter at the very start of the phrase is dropped;
  4. the phrase is lowercased;
  5. the phrase is split on whitespace, each token is lemmatized, and the
     tokens are re-joined with single spaces.

  Args:
    phrases: iterable of phrases (e.g. a list or a pandas Series). Items are
      visited by iteration, so a Series with a non-default index works too.
    lemmatizer: optional object exposing ``lemmatize(word)``. When omitted a
      new ``WordNetLemmatizer`` is created.

  Returns:
    A list with one cleaned string per input phrase, in input order.
  """
  if lemmatizer is None:
    lemmatizer = WordNetLemmatizer()

  non_word = re.compile(r'\W')
  inner_single_letter = re.compile(r'\s+[a-zA-Z]\s+')
  # '^' must be an unescaped anchor: the escaped form r'\^' looks for a literal
  # caret, which can never be present once non-word characters are replaced
  leading_single_letter = re.compile(r'^[a-zA-Z]\s+')

  processed_phrases = []

  for raw_phrase in phrases:
    # remove special chars
    phrase = non_word.sub(' ', str(raw_phrase))

    # remove single characters that sit between two runs of whitespace
    phrase = inner_single_letter.sub(' ', phrase)

    # remove a single character from the start of the phrase
    phrase = leading_single_letter.sub(' ', phrase)

    # make the phrase all lowercase; split() discards repeated and
    # leading/trailing spaces, so no separate whitespace-collapsing pass is needed
    tokens = phrase.lower().split()

    # lemmatization (i.e. remove small differences that don't matter)
    # Ex: "cats" -> "cat"
    processed_phrases.append(' '.join(lemmatizer.lemmatize(token) for token in tokens))

  return processed_phrases

# preprocess the raw phrase lists once for each data set
test_processed = phrase_preprocessor(test_phrase)
train_processed = phrase_preprocessor(train_phrase)

# see the result on the first element of train_phrase
print(train_processed[0])
print(len(train_processed))

# set up corpuses of data
# train_phrase / test_phrase are the 'Phrase' columns converted with .tolist(),
# so running the preprocessor on the columns again would only repeat the same
# (slow) lemmatization pass and yield the same strings; reuse the results
corpus = train_processed

corpus1 = test_processed

print(len(corpus1))

a series of escapade demonstrating the adage that what is good for the goose is also good for the gander some of which occasionally amuses but none of which amount to much of story
156060
66292


In [None]:
# get counts of each sentiment
# Series.value_counts replaces the deprecated top-level pd.value_counts;
# sort_index lists the labels in ascending order rather than in order of
# first appearance in the training file
word_count = train_data['Sentiment'].value_counts(sort=False).sort_index()
word_count

0     7072
1    27273
2    79582
3    32927
4     9206
dtype: int64

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# bag-of-words counts, vocabulary capped at 1500 terms
cv = CountVectorizer(max_features = 1500)

# the vocabulary (and therefore the meaning of every column) is learned from
# the training corpus only
x__train = cv.fit_transform(corpus).toarray()

# the test corpus must be encoded with that same fitted vocabulary;
# calling fit_transform here would rebuild the vocabulary from the test
# phrases, so the columns would no longer match the ones the classifier
# is trained on
x__test = cv.transform(corpus1).toarray()
y = train_y


In [None]:
from sklearn.model_selection import train_test_split

# hold out 40% of the labelled rows for validation; the fixed random_state
# keeps the split the same on every run
X_train, X_test, y_train, y_test = train_test_split(
    x__train,
    y,
    test_size=0.40,
    random_state=0,
)

# peek at the first training feature vector
print(X_train[0])

[0 0 0 ... 0 0 0]


In [None]:
from sklearn.naive_bayes import MultinomialNB

classifier = MultinomialNB()
classifier.fit(X_train, y_train)

# predictions for the held-out validation split and for the unlabeled test set
y_pred = classifier.predict(X_test)
y_real_pred = classifier.predict(x__test)

# compute accuracy
# reuse the validation predictions instead of running predict a second time
preds = y_pred

# count the positions where prediction and true label agree
count = int(np.count_nonzero(preds == np.asarray(y_test)))
print("accuracy: " + str(count/len(X_test)))

accuracy: 0.5851595540176855


In [None]:
# create the csv and download it (in order to submit to Kaggle)
from google.colab import files

# single definition of the output location, used for both writing and downloading
submission_path = '/content/drive/My Drive/Colab Notebooks/340W_Project/data/submission.csv'

# one row per test phrase: its id and the sentiment predicted for it
d = {
    'PhraseId': test_data['PhraseId'].tolist(),
    'Sentiment': y_real_pred,
}
df = pd.DataFrame(d)

df.to_csv(submission_path, index=False)
files.download(submission_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>