# Modeling 

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the training features (tab-separated); the file's first column becomes the index.
df = pd.read_csv(
    'trainDataFeatures.tsv',
    index_col=0,
    sep='\t',
)

In [3]:
# Every column from position 3 onward is used as a model feature.
first_feature_pos = 3
columns = df.columns[first_feature_pos:]

In [4]:
# Show the labels of the selected feature columns.
columns

Index([u'review_length', u'good', u'awful', u'bad', u'amazing', u'awesome',
       u'terrible', u'boring', u'interesting'],
      dtype='object')

In [5]:
# Feature matrix: one row per review, one column per entry in `columns`.
X = df[columns].values

In [6]:
# Show the feature matrix (NumPy abbreviates large arrays when displaying them).
X

array([[2302,    0,    0, ...,    3,    0,    1],
       [ 946,    0,    0, ...,    0,    0,    0],
       [2449,    0,    0, ...,    0,    0,    1],
       ..., 
       [ 641,    0,    0, ...,    0,    0,    0],
       [1141,    0,    0, ...,    0,    0,    0],
       [ 942,    0,    0, ...,    0,    0,    0]])

In [7]:
# Target vector taken from the `sentiment` column.
y = np.asarray(df['sentiment'])

In [8]:
# Show the target labels.
y

array([1, 1, 0, ..., 0, 0, 1])

In [9]:
from sklearn.naive_bayes import MultinomialNB

In [10]:
# Multinomial naive Bayes with its default settings written out explicitly.
nb = MultinomialNB(alpha=1.0, fit_prior=True, class_prior=None)

In [11]:
# Fit the classifier on the training features and labels.
nb.fit(X=X, y=y)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [12]:
# Predict on the training features; X was built from df[columns] above.
nb.predict(X)

array([0, 1, 0, ..., 1, 1, 1])

In [13]:
# Fraction of correct predictions on the same rows the model was fitted on,
# i.e. training accuracy rather than a held-out estimate.
(nb.predict(X) == y).mean()

0.67023999999999995

## Make Kaggle submission with test data

In [14]:
# Load the tab-separated test reviews.
test_data = pd.read_csv(
    'testData.tsv',
    sep='\t',
)

In [15]:
# Preview only the first rows instead of rendering the whole frame.
test_data.head(10)

Unnamed: 0,id,review
0,12311_10,Naturally in a film who's main themes are of m...
1,8348_2,This movie is a disaster within a disaster fil...
2,5828_4,"All in all, this is a movie for kids. We saw i..."
3,7186_2,Afraid of the Dark left me with the impression...
4,12128_7,A very accurate depiction of small time mob li...
5,2913_8,"...as valuable as King Tut's tomb! (OK, maybe ..."
6,4396_1,This has to be one of the biggest misfires eve...
7,395_2,"This is one of those movies I watched, and won..."
8,10616_1,The worst movie i've seen in years (and i've s...
9,9074_9,"Five medical students (Kevin Bacon, David Labr..."


In [16]:
# Character count of each review, stored as a new feature column.
review_lengths = test_data['review'].map(len)
test_data['review_length'] = review_lengths

In [17]:
# Do the same feature extraction with the test data

from sklearn.feature_extraction.text import CountVectorizer

# Fixed sentiment vocabulary: the vectorizer counts only these words.
vocab_expand = ['awesome', 'good', 'amazing', 'interesting', 'terrible', 'bad', 'awful','boring']
expand_vectorizer = CountVectorizer(vocabulary=vocab_expand)
bow_expand = expand_vectorizer.fit_transform(test_data.review).todense()

# vocabulary_ maps each term to its column index in the count matrix. Label the
# columns by that index instead of by dict iteration order: on interpreters
# without ordered dicts the key order need not match the column order, which
# would attach counts to the wrong word and silently feed the model wrong
# features when columns are later selected by name.
words_expand = sorted(expand_vectorizer.vocabulary_, key=expand_vectorizer.vocabulary_.get)
bow_expand_df = pd.DataFrame(bow_expand, index=test_data.index, columns=words_expand)
test_data_expand = test_data.join(bow_expand_df)

# Persist the engineered test features.
test_data_expand.to_csv('testDataFeatures.tsv', sep='\t')

In [18]:
# Select the test features by name in the training column order, then classify.
test_features = test_data_expand[columns].values
predictions = nb.predict(test_features)

In [19]:
# Pair each test review id with its predicted sentiment.
submission = pd.DataFrame({
    "id": test_data['id'],
    "sentiment": predictions,
})

In [20]:
# Preview only the first rows of the submission instead of rendering the whole frame.
submission.head(10)

Unnamed: 0,id,sentiment
0,12311_10,1
1,8348_2,0
2,5828_4,1
3,7186_2,1
4,12128_7,1
5,2913_8,1
6,4396_1,1
7,395_2,0
8,10616_1,0
9,9074_9,1
