In [6]:
import pyprind
import pandas as pd
import os
import numpy as np

In [5]:
basepath='aclImdb'

labels = {'pos':1,'neg':0}
pbar=pyprind.ProgBar(50000)
df = pd.DataFrame()
for s in ('test','train'):
    for l in ('pos','neg'):
        path = os.path.join(basepath,s,l)
        for file in sorted(os.listdir(path)):
            with open(os.path.join(path,file),'r',encoding='utf-8') as infile:
                txt=infile.read()
            df=df.append([[txt,labels[l]]],ignore_index=True)
            pbar.update()
df.columns=['review','sentiment']          
                

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:03:17


In [7]:
np.random.seed(0)
df = df.reindex(np.random.permutation(df.index))
df.to_csv('movie_data.csv',index=False,encoding='utf-8')

In [9]:
df = pd.read_csv('movie_data.csv',encoding='utf-8')
df.head(3)

Unnamed: 0,0,1
0,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
1,OK... so... I really like Kris Kristofferson a...,0
2,"***SPOILER*** Do not read this, if you think a...",0


In [25]:
df.columns=['review','sentiment']    

In [10]:
df.shape

(50000, 2)

## Introducing the bag of words model

In [20]:
from sklearn.feature_extraction.text import CountVectorizer
count = CountVectorizer()
docs = np.array(['The sun is shining','The weather is sweet','The sun is shining, the weather is sweet, and one and one is two'])
bag = count.fit_transform(docs)

In [21]:
print(docs)
print(bag)
print(bag.toarray())

['The sun is shining' 'The weather is sweet'
 'The sun is shining, the weather is sweet, and one and one is two']
  (0, 3)	1
  (0, 1)	1
  (0, 4)	1
  (0, 6)	1
  (1, 5)	1
  (1, 8)	1
  (1, 1)	1
  (1, 6)	1
  (2, 7)	1
  (2, 2)	2
  (2, 0)	2
  (2, 5)	1
  (2, 8)	1
  (2, 3)	1
  (2, 1)	3
  (2, 4)	1
  (2, 6)	2
[[0 1 0 1 1 0 1 0 0]
 [0 1 0 0 0 1 1 0 1]
 [2 3 2 1 1 1 2 1 1]]


In [22]:
print(count)
print(count.vocabulary_)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)
{'the': 6, 'sun': 4, 'is': 1, 'shining': 3, 'weather': 8, 'sweet': 5, 'and': 0, 'one': 2, 'two': 7}


In [23]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf = TfidfTransformer(use_idf=True,norm='l2',smooth_idf=True)
np.set_printoptions(precision=2)
print(tfidf.fit_transform(count.fit_transform(docs)).toarray())

[[0.   0.43 0.   0.56 0.56 0.   0.43 0.   0.  ]
 [0.   0.43 0.   0.   0.   0.56 0.43 0.   0.56]
 [0.5  0.45 0.5  0.19 0.19 0.19 0.3  0.25 0.19]]


In [26]:
df.loc[0,'review'][-50:]

'is seven.<br /><br />Title (Brazil): Not Available'

In [27]:
import re
def preprocessor(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)',text)
    text = (re.sub('[\W]+', ' ', text.lower()) + ' '.join(emoticons).replace('-', ''))
    return text

In [29]:
preprocessor(df.loc[0,'review'][-50:])

'is seven title brazil not available'

In [30]:
preprocessor("</a>This :) is :( a test :-)!")

'this is a test :) :( :)'

In [31]:
df['review'] = df['review'].apply(preprocessor)

In [32]:
def tokenizer(text):
    return text.split()
tokenizer('runners like running and thus they run')

['runners', 'like', 'running', 'and', 'thus', 'they', 'run']

In [34]:
from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()
def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]
tokenizer_porter('runners like running and thus they run')

['runner', 'like', 'run', 'and', 'thu', 'they', 'run']

In [35]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\aussie.haryono\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [38]:
from nltk.corpus import stopwords
stop = stopwords.words('english')
porterized = tokenizer_porter('a runner likes running and runs a lot')
[w for w in porterized if w not in stop]

['runner', 'like', 'run', 'run', 'lot']

## Training a logistic regression model for document classification

In [45]:
X_train = df.loc[:1000,'review'].values
y_train = df.loc[:1000,'sentiment'].values
X_test = df.loc[1000:2000,'review'].values
y_test = df.loc[1000:2000,'sentiment'].values

In [41]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

In [47]:
tfidf = TfidfVectorizer(strip_accents=None,lowercase=False,preprocessor=None)
param_grid1 = [{
    'vect__ngram_range':[(1,1)],
    'vect__stop_words':[stop,None],
    'vect__tokenizer':[tokenizer,tokenizer_porter],
    'clf__penalty':['l1','l2'],
    'clf__C':[1.0,10.0,100.0]
},
    {
        'vect__ngram_range':[(1,1)],
        'vect__stop_words':[stop,None],
        'vect__tokenizer':[tokenizer,tokenizer_porter],
        'vect__use_idf':[False],
        'vect__norm':[None],        
        'clf__penalty':['l1','l2'],
        'clf__C':[1.0,10.0,100.0]    
    }
]
param_grid2 = [{
    'vect__ngram_range':[(1,1)],
    'vect__stop_words':[stop],
    'vect__tokenizer':[tokenizer_porter],
    'clf__penalty':['l1','l2'],
    'clf__C':[1.0,10.0,100.0]
}
]


In [48]:
lr_tfidf = Pipeline([('vect',tfidf),
                    ('clf',LogisticRegression(random_state=0))
                    ])
gs_lr_tfidf = GridSearchCV(lr_tfidf,param_grid2,scoring='accuracy',cv=5,verbose=1,n_jobs=1)
gs_lr_tfidf.fit(X_train,y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:  5.2min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=False, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
 ...nalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'vect__ngram_range': [(1, 1)], 'vect__stop_words': [['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's...kenizer_porter at 0x000002442745C730>], 'clf__penalty': ['l1', 'l2'], 'clf__C': [1.0, 10.0, 100.0]}],
       pre_dispatch='2*n_jobs', refit=True, return_tra

In [51]:
print('CV Accuracy: %.3f' % gs_lr_tfidf.best_score_)
clf = gs_lr_tfidf.best_estimator_
print(clf)
print('Test Accuracy: %.3f' % clf.score(X_test,y_test))

CV Accuracy: 0.836
Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=False, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
 ...nalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])
Test Accuracy: 0.830


## Working with bigger data - SGD

In [53]:
import pickle 
import os
dest = os.path.join('pkl_objects')
if not os.path.exists(dest):
    os.makedirs(dest)
pickle.dump(stop,open(os.path.join(dest,'stopwords.pkl'),'wb'),protocol=4)
pickle.dump(clf,open(os.path.join(dest,'classifier.pkl'),'wb'),protocol=4)

In [67]:
from vectorizer import vect
clf = pickle.load(open(os.path.join('pkl_objects','classifier.pkl'),'rb'))

In [59]:
clf

Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=False, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
 ...nalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [93]:
label = {0:'negative',1:'positive'}
example = ['I love this movie']
X = vect.transform(example)
example2 = ['movie','hate','ugly']
print(X)
pred = clf.predict(example2)[0]
pred_proba = clf.predict_proba(example2)
print("Prediction: {}; Probability: {}".format(pred,pred_proba))

  (0, 730602)	-0.7071067811865475
  (0, 1721353)	-0.7071067811865475
Prediction: 0; Probability: [[0.81 0.19]
 [0.52 0.48]
 [0.51 0.49]]
