In [1]:
import pandas as pd
# Read the review dataset from the CSV file in the working directory.
df = pd.read_csv("movie_data.csv")
# Preview the first ten rows (review text plus its sentiment label).
df.head(10)

Unnamed: 0,review,sentiment
0,This movie is just crap. Even though the direc...,0
1,Another detailed work on the subject by Dr Dwi...,1
2,THE CAT O'NINE TAILS (Il Gatto a Nove Code) <b...,0
3,"Like with any movie genre, there are good gang...",0
4,I watched it with my mom and we were like...<b...,0
5,This movie is probably one of 3 worst movies m...,0
6,"this movie is quite bad, aggressive, not playe...",0
7,And a perfect film to watch during the holiday...,1
8,"I like Noel Coward, the wit. I like Noel Cowar...",0
9,"""The Days"" is a typical family drama with a li...",1


In [2]:
# Show one complete review. A single label-based .loc lookup replaces the
# chained df[...][...] indexing and matches the style used in later cells.
df.loc[1, 'review']

'Another detailed work on the subject by Dr Dwivedi takes us back in time to pre-partioned Panjab. Dr Dwivedi chose a difficult subject for his movie debut. He has worked on all meticulous details to bring the story to life. The treatment of the subject is very delicate.<br /><br />Even though we have not been to the region during that time, the sets and costumes look real. Unlike most movies made on partition, this one focuses not on the gory details of violence to attract audience, but on its after-effects. The characters come to life. Priyanshu Chatterjee has given an impressive performance. Manoj Bajpai has acted his heart out showing the plight of a guilt-ridden man. The rest of the cast has done a good job too.'

In [3]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

# Three toy sentences; the third one repeats words from the first two.
docs = np.array([
    'The sun is shining',
    'The weather is sweet',
    'The sun is shining, the weather is sweet, and one and one is two',
])

# Learn the vocabulary from the toy corpus and build the per-document
# term-count matrix in a single step.
count = CountVectorizer()
bag = count.fit_transform(docs)

In [4]:
# Mapping from each learned token to its column index in the count matrix.
print(count.vocabulary_)

{'the': 6, 'sun': 4, 'is': 1, 'shining': 3, 'weather': 8, 'sweet': 5, 'and': 0, 'one': 2, 'two': 7}


In [5]:
# Dense view of the counts: one row per document, one column per vocabulary entry.
print(bag.toarray())

[[0 1 0 1 1 0 1 0 0]
 [0 1 0 0 0 1 1 0 1]
 [2 3 2 1 1 1 2 1 1]]


In [6]:
from sklearn.feature_extraction.text import TfidfTransformer
# Print floats with two decimals so the tf-idf matrix below stays readable.
# This changes NumPy's global print options for the rest of the session.
np.set_printoptions(precision=2)

# Re-weight raw term counts by tf-idf (smoothed IDF, L2-normalised rows).
tfidf = TfidfTransformer(norm='l2', use_idf=True, smooth_idf=True)
print(
    tfidf.fit_transform(
        count.fit_transform(docs)
    ).toarray()
)

[[0.   0.43 0.   0.56 0.56 0.   0.43 0.   0.  ]
 [0.   0.43 0.   0.   0.   0.56 0.43 0.   0.56]
 [0.5  0.45 0.5  0.19 0.19 0.19 0.3  0.25 0.19]]


In [7]:
# Last 50 characters of a raw review: note the leftover HTML markup and punctuation.
df.loc[4, 'review'][-50:]

'TINY bit nicer-looking.<br /><br />My rating: 1/10'

In [8]:
import re
def preprocessor(text):
    """Normalise a raw review string for bag-of-words tokenisation.

    Steps, in order:
      1. Remove anything that looks like an HTML/XML tag (``<...>``).
      2. Collect emoticons such as ``:)``, ``;-(``, ``=D`` or ``:P`` from the
         tag-free text. This happens before lowercasing, so only upper-case
         ``D``/``P`` mouths match.
      3. Lowercase the text and collapse every run of non-word characters
         into a single space.
      4. Append the emoticons, with the nose (``-``) removed, after the words.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        Cleaned text. Emoticons, if any, follow the words and are always
        separated from the last word by a space.
    """
    # Raw strings keep the regex escapes away from Python's own string-escape
    # processing, which warns about unknown escape sequences.
    text = re.sub(r'<[^>]*>', '', text)
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    cleaned = re.sub(r'[\W]+', ' ', text.lower())
    if emoticons:
        # The cleaned text ends in a space only when the input ended in a
        # non-word character. Add the separator otherwise, so the first
        # emoticon is not glued onto the last word (e.g. "it:)").
        if not cleaned.endswith(' '):
            cleaned += ' '
        cleaned += ' '.join(emoticons).replace('-', '')
    return cleaned

In [9]:
# Same review tail as in the earlier cell, now with markup and punctuation stripped.
preprocessor(df.loc[4, 'review'][-50:])

'tiny bit nicer looking my rating 1 10'

In [10]:
# Synthetic example: the tag is dropped and the emoticons are moved to the end.
preprocessor("</a>This :) is a :( test :-)!")

'this is a test :) :( :)'

In [11]:
# Clean every review. This overwrites the raw text in the 'review' column,
# so the original strings are only recoverable by re-reading the CSV.
df['review'] = df['review'].apply(preprocessor)

In [12]:
from nltk.stem.porter import PorterStemmer

# Single shared stemmer instance; tokenizer_porter reads this module-level name.
porter = PorterStemmer()

In [13]:
def tokenizer(text):
    """Split ``text`` on runs of whitespace and return the pieces as a list."""
    tokens = text.split()
    return tokens


In [14]:
def tokenizer_porter(text):
    """Split ``text`` on whitespace and stem each token with ``porter``.

    Returns the stems as a list, in the order the tokens appear in ``text``.
    """
    stems = []
    for token in text.split():
        stems.append(porter.stem(token))
    return stems

In [15]:
from nltk.corpus import stopwords

# English stop-word list from NLTK, kept as a plain list.
stop = stopwords.words('english')

# Stem a sample sentence, keep (at most) its last ten stems, and drop every
# stem that appears in the stop-word list.
[
    stem
    for stem in tokenizer_porter('a running like running and runs a lot')[-10:]
    if stem not in stop
]

['run', 'like', 'run', 'run', 'lot']

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# NOTE(review): the vectorizer is fitted on *all* reviews before the
# train/test split in the next cell, so the vocabulary and IDF weights also
# see the test documents. For a clean held-out estimate, fit on the training
# text only (e.g. vectorizer + classifier inside a sklearn Pipeline).
tfidf = TfidfVectorizer(strip_accents=None,
                        lowercase=False,     # text was already lowercased by preprocessor
                        preprocessor=None,
                        tokenizer=tokenizer_porter,
                        token_pattern=None,  # unused once a tokenizer is given; None avoids the warning
                        use_idf=True,
                        norm='l2',
                        smooth_idf=True)

# Labels as a NumPy array, features as the tf-idf matrix of the cleaned reviews.
y = df.sentiment.values
X = tfidf.fit_transform(df.review)

In [36]:
from sklearn.model_selection import train_test_split

# Ordered 50/50 split: the first half of the rows is used for training and the
# second half for testing. With shuffle=False nothing is shuffled, so
# random_state has no effect here.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.5,
    shuffle=False,
    random_state=1,
)

In [37]:
import pickle
from sklearn.linear_model import LogisticRegressionCV

# Logistic regression with built-in 5-fold cross-validation, scored by
# accuracy. n_jobs=-1 runs the folds in parallel; verbose=3 prints progress.
clf = LogisticRegressionCV(cv=5,
                           scoring='accuracy',
                           random_state=0,
                           n_jobs=-1,
                           verbose=3,
                           max_iter=300).fit(X_train, y_train)

# Persist the fitted model. The context manager guarantees the file is flushed
# and closed even if pickling raises part-way through.
with open('saved_model.sav', 'wb') as saved_model:
    pickle.dump(clf, saved_model)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:  3.7min remaining:  5.5min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  4.6min finished


In [38]:
filename = 'saved_model.sav'
# SECURITY: unpickling can execute arbitrary code embedded in the file, so only
# load model files written by this notebook or another trusted source.
# The context manager ensures the file handle is closed after loading.
with open(filename, 'rb') as model_file:
    saved_clf = pickle.load(model_file)

In [39]:
# Accuracy of the reloaded model on the held-out test half.
saved_clf.score(X_test, y_test)

0.8969381628977386