In [1]:
import numpy as np
import re
import pickle
import nltk
from nltk.corpus import stopwords
from sklearn.datasets import load_files
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\winuser\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Import datasets
reviews =  load_files('txt_sentoken/')

In [3]:
x,y = reviews.data,reviews.target

In [4]:
# Storing as Pickle Files
with open('x.pickle','wb') as f: #'wb' = write,byte
    pickle.dump(x,f)
    
with open('y.pickle','wb') as f:
    pickle.dump(y,f)
    
#unpickling the datasets
with open ('x.pickle','rb') as f: # 'rb' = read,byte
    x=pickle.load(f)
    
with open ('y.pickle','rb') as f:
    y=pickle.load(f)

### Data Pre-Processing


In [10]:
# Creating the clean corpus
corpus = []
for i in range(0,len(x)):
    review = re.sub(r'\W',' ',str(x[i])) #Substitue all non word charachter with space
    review = review.lower()
    review = re.sub(r'\s+[a-z]\s+',' ',review) #Substituting all single charachter with space
    review = re.sub(r'^[a-z]\s+',' ',review) #Removing all single charachter as start of sentence
    review = re.sub(r'\s+',' ',review) #Substitue one or more spaces with one space
    corpus.append(review)

In [12]:
corpus[1]

' good films are hard to find these days ngreat films are beyond rare nproof of life russell crowe one two punch of deft kidnap and rescue thriller is one of those rare gems na taut drama laced with strong and subtle acting an intelligent script and masterful directing together it delivers something virtually unheard of in the film industry these days genuine motivation in story that rings true nconsider the strange coincidence of russell crowe character in proof of life making the moves on distraught wife played by meg ryan character in the film all while the real russell crowe was hitching up with married woman meg ryan in the outside world ni haven seen this much chemistry between actors since mcqueen and mcgraw teamed up in peckinpah masterpiece the getaway nbut enough with the gossip let get to the review nthe film revolves around the kidnapping of peter bowman david morse an american engineer working in south america who is kidnapped during mass ambush of civilians by anti govern

### Transforming Data in BOW model
### TFIDF Model

In [37]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [39]:
vectorizer = TfidfVectorizer(max_features=2000,#Max frequently occuring word
                             min_df=3,# exclude word which are appearing in 3 doucment or less
                             max_df=0.6, # exclude all the different word that appear in 60% of document
                             stop_words=stopwords.words('english'))

In [41]:
# TFIDF bag of word model
X = vectorizer.fit_transform(corpus).toarray()

### Train-Test Split

In [42]:
from sklearn.model_selection import train_test_split

In [43]:
text_train, text_test, sent_train, sent_test = train_test_split(X, y,
                                                    test_size=0.2, 
                                                    random_state=0)

### Model Training

In [44]:
from sklearn.linear_model import  LogisticRegression

In [45]:
classifier = LogisticRegression()

In [46]:
classifier.fit(text_train,sent_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

### Test model performance

In [47]:
sent_pred = classifier.predict(text_test)

In [48]:
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix

In [49]:
print(confusion_matrix(sent_test,sent_pred))

[[168  40]
 [ 21 171]]


In [50]:
print(accuracy_score(sent_test,sent_pred))

0.8475


### Saving the model

In [35]:
# Pickling the classifier
with open('classifier.pickle','wb') as f:
    pickle.dump(classifier,f)

In [51]:
# Pickling the vectorizer
with open('tfidfmodel.pickle','wb') as f:
    pickle.dump(vectorizer,f)

### Import pickle files

In [52]:
# unpickling the classifier and vectorizer

In [53]:
with open('classifier.pickle','rb') as f:
    clf = pickle.load(f)

In [54]:
with open('tfidfmodel.pickle','rb') as f:
    tfidf = pickle.load(f)

In [75]:
sample = ["You are a nice person man, have a good life"]
sample = tfidf.transform(sample).toarray()

In [76]:
print(clf.predict(sample))

[1]
