In [77]:
import pandas as pd
import numpy as np
# Read options shared by the three tab-separated review files.
# quoting=3 is csv.QUOTE_NONE, so quote characters are read as literal text.
tsv_options = dict(header=0, delimiter="\t", quoting=3, encoding='latin-1')

labeled_train = pd.read_csv('labeledTrainData.tsv', **tsv_options)
sample_submission = pd.read_csv('sampleSubmission.csv')
test = pd.read_csv('testData.tsv', **tsv_options)
unlabeled_train = pd.read_csv('unlabeledTrainData.tsv', **tsv_options)

In [78]:
# Preview the first rows of the labeled training reviews.
labeled_train.head()

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."


In [79]:
# Summarize columns, dtypes and non-null counts of the labeled training data.
labeled_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 3 columns):
id           25000 non-null object
sentiment    25000 non-null int64
review       25000 non-null object
dtypes: int64(1), object(2)
memory usage: 586.0+ KB


In [80]:
# Preview the first rows of the sample submission file.
sample_submission.head()

Unnamed: 0,id,sentiment
0,12311_10,0
1,8348_2,0
2,5828_4,0
3,7186_2,0
4,12128_7,0


In [81]:
# Summarize columns, dtypes and non-null counts of the sample submission.
sample_submission.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 2 columns):
id           25000 non-null object
sentiment    25000 non-null int64
dtypes: int64(1), object(1)
memory usage: 390.7+ KB


In [82]:
# testData is the test set: the task is to predict the sentiment of each review in it.
test.head()

Unnamed: 0,id,review
0,"""12311_10""","""Naturally in a film who's main themes are of ..."
1,"""8348_2""","""This movie is a disaster within a disaster fi..."
2,"""5828_4""","""All in all, this is a movie for kids. We saw ..."
3,"""7186_2""","""Afraid of the Dark left me with the impressio..."
4,"""12128_7""","""A very accurate depiction of small time mob l..."


In [83]:
# Summarize columns, dtypes and non-null counts of the test data.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 2 columns):
id        25000 non-null object
review    25000 non-null object
dtypes: object(2)
memory usage: 390.7+ KB


In [84]:
# Preview the first rows of the unlabeled training reviews.
unlabeled_train.head()

Unnamed: 0,id,review
0,"""9999_0""","""Watching Time Chasers, it obvious that it was..."
1,"""45057_0""","""I saw this film about 20 years ago and rememb..."
2,"""15561_0""","""Minor Spoilers<br /><br />In New York, Joan B..."
3,"""7161_0""","""I went to see this film with a great deal of ..."
4,"""43971_0""","""Yes, I agree with everyone on this site this ..."


In [85]:
# Summarize columns, dtypes and non-null counts of the unlabeled training data.
unlabeled_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
id        50000 non-null object
review    50000 non-null object
dtypes: object(2)
memory usage: 781.3+ KB


**First off, let's build our own sentiment classifier using NLP text preprocessing and Logistic Regression**

In [86]:
import nltk
# Fetch the NLTK stop-word corpus (the download is skipped when it is already up to date).
nltk.download('stopwords')
from nltk.corpus import stopwords
# English stop words, passed to the TF-IDF vectorizer as `stop_words` further down.
stop = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jennyfairy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [87]:
# clean the texts
import re
def preprocessor(text):
    """Normalize a raw review string before tokenization.

    Steps: strip HTML-style tags, collect emoticons, lowercase the text,
    collapse every run of non-word characters into a single space, then
    append the emoticons (with any '-' nose removed) to the end.

    Parameters
    ----------
    text : str
        Raw review text, possibly containing markup such as ``<br />``.

    Returns
    -------
    str
        Cleaned lowercase text, followed by a space and the space-joined
        emoticons (empty when none were found).
    """
    # Raw strings keep regex escapes such as \) and \W from being parsed as
    # invalid Python string escapes, which newer interpreters warn about.
    text = re.sub(r'<[^>]*>', '', text)
    # Emoticons are captured before lowercasing and punctuation removal,
    # otherwise they would be destroyed by the substitution below.
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    text = (re.sub(r'[\W]+', ' ', text.lower()) + ' ' + ' '.join(emoticons).replace('-', ''))
    return text

In [88]:
# split a text into a list of words and stem those words
from nltk.stem import PorterStemmer
# Single stemmer instance reused by tokenizer_porter for every token.
porter = PorterStemmer()
def tokenizer(text):
    """Break ``text`` into tokens on runs of whitespace, without stemming."""
    tokens = text.split()
    return tokens
def tokenizer_porter(text):
    """Split ``text`` on whitespace and Porter-stem every resulting token."""
    stems = []
    for token in text.split():
        stems.append(porter.stem(token))
    return stems

In [89]:
# Hold out 30% of the labeled reviews for evaluation; the fixed random_state
# keeps the split reproducible between runs.
from sklearn.model_selection import train_test_split

X, y = labeled_train['review'], labeled_train['sentiment']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=101
)

In [90]:
# build a TF-IDF + logistic regression pipeline and fit it on the training split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

# Turn each review into a TF-IDF vector using the notebook's own cleaning
# and stemming helpers.
# NOTE(review): the tokenizer emits Porter stems while `stop` holds unstemmed
# NLTK words, so some stop words may slip past the filter -- confirm whether
# this is intended.
tfidf = TfidfVectorizer(
    preprocessor=preprocessor,
    tokenizer=tokenizer_porter,
    stop_words=stop,
)

# Chain vectorizer and classifier so both are fitted on the training split only.
clf = Pipeline(steps=[
    ('vect', tfidf),
    ('clf', LogisticRegression(random_state=0)),
])
clf.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function preproc...nalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [91]:
# Predict sentiment labels for the held-out reviews.
y_test_predict = clf.predict(X_test)

In [92]:
# evaluate this model
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
# Print accuracy, the confusion matrix and the per-class report, in that order.
for metric in (accuracy_score, confusion_matrix, classification_report):
    print(metric(y_test, y_test_predict))

0.8850666666666667
[[3293  503]
 [ 359 3345]]
             precision    recall  f1-score   support

          0       0.90      0.87      0.88      3796
          1       0.87      0.90      0.89      3704

avg / total       0.89      0.89      0.89      7500



**Now that the accuracy score is good enough, let's use this model to predict the sentiment of each review in the test dataset (which is our task)**

In [93]:
# Apply the fitted pipeline to the unlabeled test reviews.
test_predict = clf.predict(test['review'])

In [94]:
# Add the predictions to the test frame as a 'sentiment' column right after 'id'.
# DataFrame.insert raises ValueError when the column already exists, so
# overwrite it in that case to keep this cell safe to re-run.
if 'sentiment' in test.columns:
    test['sentiment'] = test_predict
else:
    test.insert(loc=1, column='sentiment', value=test_predict)

In [95]:
# Show the first ten test reviews together with their predicted sentiment.
test.head(10)

Unnamed: 0,id,sentiment,review
0,"""12311_10""",1,"""Naturally in a film who's main themes are of ..."
1,"""8348_2""",0,"""This movie is a disaster within a disaster fi..."
2,"""5828_4""",1,"""All in all, this is a movie for kids. We saw ..."
3,"""7186_2""",1,"""Afraid of the Dark left me with the impressio..."
4,"""12128_7""",1,"""A very accurate depiction of small time mob l..."
5,"""2913_8""",1,"""...as valuable as King Tut's tomb! (OK, maybe..."
6,"""4396_1""",0,"""This has to be one of the biggest misfires ev..."
7,"""395_2""",1,"""This is one of those movies I watched, and wo..."
8,"""10616_1""",0,"""The worst movie i've seen in years (and i've ..."
9,"""9074_9""",0,"""Five medical students (Kevin Bacon, David Lab..."


**Done and done!!!**