# Modeling 

In [3]:
import pandas as pd
import numpy as np

In [4]:
# Load the tab-separated training feature table; its first column becomes the row index.
df = pd.read_csv(
    'trainDataFeatures.tsv',
    sep='\t',
    index_col=0,
)

In [5]:
# Show the last five rows as a quick sanity check of the loaded table.
df.tail(n=5)

Unnamed: 0,id,sentiment,review,review_length,awesome,love,enjoyed,good,amazing,interesting,terrible,bad,awful,boring,stupid,disgusting,poor,ridiculous
24995,3453_3,0,It seems like more consideration has gone into...,580,0,0,0,0,0,0,0,0,0,0,0,0,0,0
24996,5064_1,0,I don't believe they made this film. Completel...,975,0,0,0,2,0,1,0,1,0,0,0,0,0,0
24997,10905_3,0,"Guy is a loser. Can't get girls, needs to buil...",641,0,0,0,0,0,0,0,0,0,0,0,0,0,0
24998,10194_3,0,This 30 minute documentary Buñuel made in the ...,1145,0,0,0,0,0,0,0,0,0,0,0,0,3,0
24999,8478_8,1,I saw this movie as a child and it broke my he...,942,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [10]:
# Feature columns are everything except the identifier, the label and the raw text.
# Selecting by name (rather than the positional slice [3:]) keeps this correct if
# the column order of the feature file ever changes, and fails loudly with a
# KeyError if one of the expected non-feature columns is missing.
columns = df.columns.drop(['id', 'sentiment', 'review'])

In [11]:
# Display the selected feature column names.
columns

Index([u'review_length', u'awesome', u'love', u'enjoyed', u'good', u'amazing',
       u'interesting', u'terrible', u'bad', u'awful', u'boring', u'stupid',
       u'disgusting', u'poor', u'ridiculous'],
      dtype='object')

In [12]:
# Feature matrix: one row per review, one column per entry in `columns`.
X = df[columns].values

In [13]:
# Display the feature matrix (numpy abbreviates large arrays).
X

array([[2302,    0,    0, ...,    0,    0,    0],
       [ 946,    0,    0, ...,    0,    0,    0],
       [2455,    0,    0, ...,    0,    0,    1],
       ..., 
       [ 641,    0,    0, ...,    0,    0,    0],
       [1145,    0,    0, ...,    0,    3,    0],
       [ 942,    0,    0, ...,    0,    0,    0]])

In [14]:
# Target vector: the sentiment label of every training review.
y = np.asarray(df['sentiment'])

In [15]:
# Display the label vector.
y

array([1, 1, 0, ..., 0, 0, 1])

In [27]:
from sklearn.naive_bayes import MultinomialNB

In [35]:
# Multinomial naive Bayes classifier with its default settings spelled out.
nb = MultinomialNB(
    alpha=1.0,
    fit_prior=True,
    class_prior=None,
)

In [36]:
# Fit the classifier on the full training feature matrix and its labels.
nb.fit(X, y)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [37]:
# Predict labels for the same rows the model was trained on.
nb.predict(df[columns].values)

array([0, 1, 0, ..., 1, 0, 1])

In [31]:
# Fraction of training reviews whose predicted label matches the true label.
# This is measured on the data the model was fitted on, so it is a training
# accuracy rather than an estimate of performance on unseen reviews.
(nb.predict(df[columns].values) == df['sentiment']).mean()

0.70164000000000004

In [32]:
# Load the tab-separated test reviews.
test_data = pd.read_csv(
    'testData.tsv',
    sep='\t',
)

In [34]:
# Show the first five test rows.
test_data.head(n=5)

Unnamed: 0,id,review
0,12311_10,Naturally in a film who's main themes are of m...
1,8348_2,This movie is a disaster within a disaster fil...
2,5828_4,"All in all, this is a movie for kids. We saw i..."
3,7186_2,Afraid of the Dark left me with the impression...
4,12128_7,A very accurate depiction of small time mob li...


In [38]:
# Character count of each review, stored under the same column name the training data uses.
test_data['review_length'] = test_data['review'].map(len)

In [39]:
# Do the same feature extraction with the test data

from sklearn.feature_extraction.text import CountVectorizer
vocab_expand = ['awesome','love','enjoyed', 'good', 'amazing', 'interesting', 'terrible', 'bad', 'awful','boring','stupid','disgusting','poor','ridiculous']
# A vocabulary passed as a list fixes the layout: term i of the list is column i
# of the count matrix, and nothing is learned from the documents.
expand_vectorizer = CountVectorizer(vocabulary=vocab_expand)

# With a fixed vocabulary there is nothing to fit, so only transform() is needed.
# toarray() yields a plain ndarray instead of the np.matrix that todense() returns.
bow_expand = expand_vectorizer.transform(test_data.review).toarray()
# Label the columns in vocabulary-list order. Reading the names from
# vocabulary_.keys() is unsafe: dict key order is not guaranteed to follow the
# column indices (it is arbitrary on Python 2), which would attach word counts
# to the wrong column names and feed the model mislabeled features.
words_expand = list(vocab_expand)
bow_expand_df = pd.DataFrame(bow_expand, index=test_data.index, columns=words_expand)
test_data_expand = test_data.join(bow_expand_df)
# Persist the test features for later notebooks.
test_data_expand.to_csv('testDataFeatures.tsv', sep='\t')

In [40]:
# Predict a label for every test review, selecting features by name in training order.
predictions = nb.predict(test_data_expand[columns].values)

In [41]:
# Pair each test review id with its predicted sentiment.
submission = pd.DataFrame({
    "id": test_data["id"],
    "sentiment": predictions,
})

In [42]:
# Write only the id and sentiment columns; index=False keeps the DataFrame's
# row index from being emitted as an extra unnamed leading column.
submission.to_csv('submission_kaggle.tsv', sep='\t', index=False)

In [43]:
# List the working directory to confirm the output files were written.
!ls

0-Exploration.ipynb        3-Validation.ipynb         environment.yml            testData.tsv
1-Feature_extraction.ipynb 4-Ensemble.ipynb           labeledTrainData.tsv       testDataFeatures.tsv
2-Modeling.ipynb           README.md                  submission_kaggle.tsv      trainDataFeatures.tsv
