# Modeling 

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the engineered training features. The file is tab-separated and its
# first column is used as the row index rather than as a data column.
df = pd.read_csv(
    'trainDataFeatures.tsv',
    sep='\t',
    index_col=0,
)

In [4]:
# Preview the first rows to check the id / label / text columns and the
# numeric feature columns that follow them.
df.head()

Unnamed: 0,id,sentiment,review,review_length,awesome,good,amazing,interesting,terrible,bad,awful,boring
0,5814_8,1,With all this stuff going down at the moment w...,2302,0,0,0,0,0,3,0,1
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi...",946,0,0,0,0,0,0,0,0
2,7759_3,0,The film starts with a manager (Nicholas Bell)...,2455,0,0,0,0,0,0,0,1
3,3630_4,0,It must be assumed that those who praised this...,2245,0,1,0,0,0,0,0,0
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...,2233,0,0,0,0,0,0,0,0


In [5]:
# Every column except the identifier, the label and the raw review text is a
# model feature. Selecting by name rather than by position keeps the feature
# set correct if the column order of the TSV ever changes, and raises
# immediately if one of the expected non-feature columns is missing.
NON_FEATURE_COLUMNS = ['id', 'sentiment', 'review']
columns = df.columns.drop(NON_FEATURE_COLUMNS)

In [6]:
# Confirm the selected features: review_length followed by the eight keyword columns.
columns

Index([u'review_length', u'awesome', u'good', u'amazing', u'interesting',
       u'terrible', u'bad', u'awful', u'boring'],
      dtype='object')

In [6]:
# Feature matrix as a plain NumPy array: one row per review, one column per
# entry of `columns` (in that order).
X = df[columns].values

In [7]:
# Inspect the feature matrix; the first column is review_length, the rest are keyword columns.
X

array([[2302,    0,    0, ...,    3,    0,    1],
       [ 946,    0,    0, ...,    0,    0,    0],
       [2455,    0,    0, ...,    0,    0,    1],
       ..., 
       [ 641,    0,    0, ...,    0,    0,    0],
       [1145,    0,    0, ...,    0,    0,    0],
       [ 942,    0,    0, ...,    0,    0,    0]])

In [8]:
# Label vector taken from the sentiment column, in the same row order as X.
y = df['sentiment'].values

In [9]:
# Inspect the labels (the values shown are 0 / 1).
y

array([1, 1, 0, ..., 0, 0, 1])

In [10]:
from sklearn.naive_bayes import MultinomialNB

In [11]:
# Multinomial Naive Bayes with its default settings spelled out:
# additive smoothing alpha=1.0 and class priors estimated from the data.
nb = MultinomialNB(alpha=1.0, fit_prior=True, class_prior=None)

In [12]:
# Fit on the full training matrix; no rows are held out in this notebook.
nb.fit(X, y)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [13]:
# Predict on the same rows the model was fitted on; X already holds
# df[columns] as an array, so it is reused instead of being rebuilt.
nb.predict(X)

array([0, 1, 0, ..., 1, 1, 1])

In [14]:
# Fraction of rows whose predicted label equals the true label.
# This is measured on the data the model was fitted on, so it is an
# in-sample (optimistic) accuracy, not a held-out score.
(nb.predict(X) == y).mean()

0.67023999999999995

## Make Kaggle submission with test data

In [15]:
# Load the unlabelled, tab-separated test reviews.
test_data = pd.read_csv(
    'testData.tsv',
    sep='\t',
)

In [16]:
# Preview only the first rows instead of rendering the entire frame.
test_data.head(10)

Unnamed: 0,id,review
0,12311_10,Naturally in a film who's main themes are of m...
1,8348_2,This movie is a disaster within a disaster fil...
2,5828_4,"All in all, this is a movie for kids. We saw i..."
3,7186_2,Afraid of the Dark left me with the impression...
4,12128_7,A very accurate depiction of small time mob li...
5,2913_8,"...as valuable as King Tut's tomb! (OK, maybe ..."
6,4396_1,This has to be one of the biggest misfires eve...
7,395_2,"This is one of those movies I watched, and won..."
8,10616_1,The worst movie i've seen in years (and i've s...
9,9074_9,"Five medical students (Kevin Bacon, David Labr..."


In [17]:
# len() of each review string, stored under the column name the trained
# model expects as a feature (`review_length`).
test_data['review_length'] = test_data['review'].map(len)

In [18]:
# Do the same feature extraction with the test data.
#
# The vocabulary is fixed up front, so nothing is learned from the test text:
# column i of the count matrix holds the counts for vocab_expand[i].

from sklearn.feature_extraction.text import CountVectorizer
vocab_expand = ['awesome', 'good', 'amazing', 'interesting', 'terrible', 'bad', 'awful', 'boring']
expand_vectorizer = CountVectorizer(vocabulary=vocab_expand)
# toarray() gives a plain ndarray (todense() would return the legacy np.matrix type).
bow_expand = expand_vectorizer.fit_transform(test_data.review).toarray()
# Order the column labels by their index in the count matrix. Using
# vocabulary_.keys() directly depends on dict iteration order, which is
# arbitrary on Python 2 / Python < 3.7 and can attach the wrong word to a
# count column.
words_expand = sorted(expand_vectorizer.vocabulary_, key=expand_vectorizer.vocabulary_.get)
bow_expand_df = pd.DataFrame(bow_expand, index=test_data.index, columns=words_expand)
test_data_expand = test_data.join(bow_expand_df)
# Persist the engineered test features (row index included).
test_data_expand.to_csv('testDataFeatures.tsv', sep='\t')

In [19]:
# Score the test reviews. Indexing by `columns` selects the features by name,
# in the same order that was used to build the training matrix.
predictions = nb.predict(test_data_expand[columns].values)

In [20]:
# Two-column submission frame: review id and predicted sentiment label.
submission = pd.DataFrame({
    "id": test_data["id"],
    "sentiment": predictions,
})

In [21]:
# Write only the id and sentiment columns. Without index=False pandas also
# emits the row index as an extra unnamed first column, which is not part of
# the submission data.
# NOTE(review): the file is kept tab-separated as before; confirm the
# separator the competition expects before uploading.
submission.to_csv('submission_kaggle.tsv', sep='\t', index=False)

In [22]:
# List the working directory to confirm the output files were written.
!ls

0-Exploration.ipynb        labeledTrainData.tsv
1-Feature_extraction.ipynb submission_kaggle.tsv
2-Modeling.ipynb           testData.tsv
3-Validation.ipynb         testDataFeatures.tsv
4-Ensemble.ipynb           trainDataFeatures.tsv
README.md                  trainDataFeatures2.tsv
environment.yml
