In [1]:
import numpy as np
import re
import pickle
import nltk
from nltk.corpus import stopwords
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

  from collections import Sequence


In [2]:
reviews = load_files('txt_sentoken/')
X, y = reviews.data, reviews.target

In [3]:
def process_data(review):
    review = review.lower() # to lower case
    review = re.sub(r'\W', ' ', str(review)) # removes all the non-word characters
    review = re.sub(r'\s+[a-z]\s+', ' ',review) # removes all the single characters
    review = re.sub(r'^[a-z]\s+', ' ',review) # removes all the single characters
    review = re.sub(r'\s+', ' ', review) # removes all the extra spaces
    return review

In [4]:
corpus = list(map(process_data, X))

In [5]:
vectorizer = TfidfVectorizer(max_features = 2000, 
                             min_df = 3, 
                             max_df = 0.6, 
                             stop_words = stopwords.words('english'))

X = vectorizer.fit_transform(corpus).toarray()

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [7]:
classifier = LogisticRegression()
classifier.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [8]:
from sklearn.metrics import classification_report, confusion_matrix

In [9]:
y_pred = classifier.predict(X_test)

In [10]:
print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

          0       0.81      0.86      0.83       319
          1       0.86      0.81      0.83       341

avg / total       0.83      0.83      0.83       660



In [11]:
print(confusion_matrix(y_test, y_pred))

[[274  45]
 [ 65 276]]


### Saving classifier

In [12]:
with open('classifier.pickle','wb') as f:
    pickle.dump(classifier, f)

### Saving model

In [13]:
with open('tfidfmodel.pickle','wb') as f:
    pickle.dump(vectorizer, f)