In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os

import gensim
import nltk
import sklearn

import re
import string
from string import digits

# Both files are tab-separated with a header row. quoting=3 is csv.QUOTE_NONE:
# double quotes are treated as ordinary characters, which is why the ids and
# reviews keep their surrounding quote marks in the preview below.
tsv_options = dict(header=0, delimiter="\t", quoting=3)

movie_train = pd.read_csv("labeledTrainData.tsv", **tsv_options)
movie_test = pd.read_csv("testData.tsv", **tsv_options)
movie_train.head()

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."


In [3]:
# Summarize column dtypes and non-null counts to check for missing values.
movie_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 3 columns):
id           25000 non-null object
sentiment    25000 non-null int64
review       25000 non-null object
dtypes: int64(1), object(2)
memory usage: 586.0+ KB


In [4]:
# Preview the unlabeled test set: it carries the id and review columns of the
# training data but no sentiment column.
movie_test.head()

Unnamed: 0,id,review
0,"""12311_10""","""Naturally in a film who's main themes are of ..."
1,"""8348_2""","""This movie is a disaster within a disaster fi..."
2,"""5828_4""","""All in all, this is a movie for kids. We saw ..."
3,"""7186_2""","""Afraid of the Dark left me with the impressio..."
4,"""12128_7""","""A very accurate depiction of small time mob l..."


In [5]:
# Cleaning the dataset
def clean_text(df, text_field):
    """Return a copy of ``df`` with the ``text_field`` column normalised.

    The input frame is left untouched so the raw text stays available and the
    calling cell can be re-run safely.

    Cleaning steps, applied in this order:
      1. lower-case the text;
      2. replace ``<br />`` tags with a space;
      3. strip anything that starts with ``http`` up to the next whitespace;
      4. replace backslashes and underscores with a space;
      5. delete ASCII digits.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text column.
    text_field : str
        Name of the string column to clean.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns; only ``text_field`` differs.
        Missing values in ``text_field`` are passed through unchanged.
    """
    cleaned = df.copy()
    text = cleaned[text_field].str.lower()

    # Break tags must be removed before URLs: a tag glued to the end of a URL
    # ("http://site.com<br />") would otherwise be half-eaten by the \S+ match,
    # leaving a stray " />" in the text.
    text = text.str.replace('<br />', ' ', regex=False)
    text = text.str.replace(r'http\S+', '', regex=True)

    # Backslashes (escaped quotes in the raw file) and underscores become
    # spaces so they separate words instead of joining them.
    text = text.str.replace('\\', ' ', regex=False)
    text = text.str.replace('_', ' ', regex=False)

    # Translation table that maps every ASCII digit to "deleted".
    remove_digits = str.maketrans('', '', digits)
    text = text.str.translate(remove_digits)

    cleaned[text_field] = text
    return cleaned

# Run the same cleaning over the training reviews, then the test reviews.
clean_movie_train, clean_movie_test = (
    clean_text(frame, 'review') for frame in (movie_train, movie_test)
)

In [6]:
# Tokenization
from nltk.tokenize import RegexpTokenizer

# Tokens are maximal runs of word characters; punctuation and whitespace act
# as separators and are dropped.
tokenizer = RegexpTokenizer(r'\w+')

# Tokenize both splits identically and store the token lists in a new
# 'tokens' column next to the review text.
for frame in (clean_movie_train, clean_movie_test):
    frame['tokens'] = frame['review'].apply(tokenizer.tokenize)

In [7]:
# Explore words and sentences
token_lists = clean_movie_train['tokens']

# Flatten every review's tokens into one long list of words.
all_words = [word for tokens in token_lists for word in tokens]
# Number of tokens in each review, in row order.
sentence_lengths = list(map(len, token_lists))
# Distinct words, sorted alphabetically.
Vocab = sorted(set(all_words))

print('%s words total, with a vocabulary size of %s' %(len(all_words), len(Vocab)))

5919546 words total, with a vocabulary size of 73495


In [8]:
# Explore vocabulary
from collections import Counter

# Vocab is sorted, so this shows the alphabetically-first 100 entries.
print(Vocab[:100])

# Frequency of every training token; the cell displays the 100 most common.
count_all_words = Counter(all_words)
count_all_words.most_common(100)

['a', 'aa', 'aaa', 'aaaaaaah', 'aaaaah', 'aaaaatch', 'aaaahhhhhhh', 'aaaand', 'aaaarrgh', 'aaah', 'aaargh', 'aaaugh', 'aaawwwwnnn', 'aachen', 'aada', 'aadha', 'aag', 'aage', 'aaghh', 'aah', 'aahhh', 'aaip', 'aaja', 'aakash', 'aaker', 'aakrosh', 'aaliyah', 'aames', 'aamir', 'aan', 'aankh', 'aankhen', 'aap', 'aapke', 'aapkey', 'aardman', 'aardvarks', 'aargh', 'aaron', 'aarp', 'aarrrgh', 'aatish', 'aauugghh', 'aavjo', 'aaww', 'ab', 'aback', 'abadi', 'abahy', 'abanazer', 'abandon', 'abandoned', 'abandoning', 'abandonment', 'abandons', 'abanks', 'abas', 'abashed', 'abashidze', 'abatement', 'abating', 'abattoirs', 'abba', 'abbad', 'abbas', 'abbasi', 'abbey', 'abbie', 'abbot', 'abbots', 'abbott', 'abbreviated', 'abbu', 'abby', 'abc', 'abcd', 'abdic', 'abdicates', 'abdicating', 'abdomen', 'abdominal', 'abdu', 'abduct', 'abducted', 'abductee', 'abducting', 'abduction', 'abductions', 'abductor', 'abductors', 'abducts', 'abdul', 'abdullah', 'abe', 'abel', 'abercrombie', 'abernathy', 'aberrant', '

[('the', 336755),
 ('and', 164142),
 ('a', 163140),
 ('of', 145865),
 ('to', 135724),
 ('is', 107337),
 ('it', 96471),
 ('in', 93978),
 ('i', 87692),
 ('this', 76007),
 ('that', 73287),
 ('s', 65709),
 ('was', 48209),
 ('as', 46937),
 ('for', 44345),
 ('with', 44130),
 ('movie', 44046),
 ('but', 42623),
 ('film', 40162),
 ('t', 34390),
 ('you', 34268),
 ('on', 34203),
 ('not', 30634),
 ('he', 30155),
 ('are', 29438),
 ('his', 29376),
 ('have', 27732),
 ('be', 26957),
 ('one', 26795),
 ('all', 23985),
 ('at', 23516),
 ('they', 22916),
 ('by', 22549),
 ('an', 21564),
 ('who', 21442),
 ('so', 20617),
 ('from', 20499),
 ('like', 20281),
 ('there', 18866),
 ('her', 18424),
 ('or', 18008),
 ('just', 17774),
 ('about', 17375),
 ('out', 17113),
 ('if', 16809),
 ('has', 16791),
 ('what', 16168),
 ('some', 15749),
 ('good', 15147),
 ('can', 14678),
 ('more', 14255),
 ('she', 14228),
 ('when', 14184),
 ('very', 14068),
 ('up', 13293),
 ('time', 12727),
 ('no', 12716),
 ('even', 12656),
 ('my', 12

In [9]:
#Embedding 1
# TFIDF bag of words
from sklearn.feature_extraction.text import TfidfVectorizer

# Word-level TF-IDF using the same \w+ token definition as the tokenizer cell.
tfidf_vectorizer = TfidfVectorizer(analyzer='word', token_pattern=r'\w+')

# The vectorizer is fitted on the training reviews only and then reused for the
# test reviews. Dict-literal entries are evaluated top to bottom, so the fit
# happens before the test transform.
tfidf = {
    'all_train': tfidf_vectorizer.fit_transform(clean_movie_train['review']),
    'test': tfidf_vectorizer.transform(clean_movie_test['review']),
}

for split_name in ('all_train', 'test'):
    print(tfidf[split_name].shape)

(25000, 73495)
(25000, 73495)


In [10]:
# Embedding 2: word2vec
# (placeholder - the model-loading code below is disabled and nothing uses it yet)
#from gensim.models import Word2Vec
#model = Word2Vec.load("300features_40minwords_10context")

In [11]:
# The Classifier
from sklearn.linear_model import LogisticRegression
# Fit a logistic-regression classifier on the TF-IDF features of the labeled
# training reviews.
classifier = LogisticRegression()

train_target = clean_movie_train['sentiment']
classifier.fit(tfidf['all_train'], train_target)

# Predict a sentiment label for each test review and pair it with the review id.
lr_predict = classifier.predict(tfidf['test'])

submission = pd.DataFrame(data={"id":clean_movie_test["id"], "sentiment":lr_predict})
submission.head()
#submission.to_csv( "submission.csv", index=False, quoting=3 )

Unnamed: 0,id,sentiment
0,"""12311_10""",1
1,"""8348_2""",0
2,"""5828_4""",1
3,"""7186_2""",1
4,"""12128_7""",1
