In [1]:
import numpy as np
import re
import pickle
import nltk
from nltk.corpus import stopwords
from sklearn.datasets import load_files

  from collections import Sequence


### Importing the dataset

In [2]:
reviews = load_files('txt_sentoken/')
X, y = reviews.data, reviews.target

In [11]:
X[3]

b'we could paraphrase michelle pfieffer\'s character in dangerous minds and say that beyond rangoon starts with an " a . " \nthat\'s fair enough . \nall movies , like all school children , should be given the benefit of the doubt . \nthe chance to succeed . \nafter all , we like to think that the right combination of talent and effort can do wonders . \nmountains can be moved , and good movies can be made . \nyeah right . \nchildren fail , as do films . \nas does director john boorman\'s latest . \nthe success of beyond rangoon hinges on the believability of patricia arquette ( ed wood , true romance ) as the busty westerner-in-peril wandering about 1988 burma without a passport . \nthough we can stomach the mild plot contrivances that get her there , it\'s a tougher task to overlook the actress . \nshe\'s a lightweight . \nas the first scene ( with narration ! ) \ndemonstrates , she doesn\'t have * nearly * enough range for the emotions that her character--a mother fleeing the memorie

In [12]:
y[3]

0

### Processing the data

In [13]:
def process_data(review):
    review = review.lower() # to lower case
    review = re.sub(r'\W', ' ', str(review)) # removes all the non-word characters
    review = re.sub(r'\s+[a-z]\s+', ' ',review) # removes all the single characters
    review = re.sub(r'^[a-z]\s+', ' ',review) # removes all the single characters
    review = re.sub(r'\s+', ' ', review) # removes all the extra spaces
    return review

In [16]:
corpus = list(map(process_data, X))

In [17]:
corpus[3]

' we could paraphrase michelle pfieffer character in dangerous minds and say that beyond rangoon starts with an nthat fair enough nall movies like all school children should be given the benefit of the doubt nthe chance to succeed nafter all we like to think that the right combination of talent and effort can do wonders nmountains can be moved and good movies can be made nyeah right nchildren fail as do films nas does director john boorman latest nthe success of beyond rangoon hinges on the believability of patricia arquette ed wood true romance as the busty westerner in peril wandering about 1988 burma without passport nthough we can stomach the mild plot contrivances that get her there it a tougher task to overlook the actress nshe a lightweight nas the first scene with narration ndemonstrates she doesn have nearly enough range for the emotions that her character mother fleeing the memories of murdered husband and son is supposed to show nshe may give stronger performance than say ke

### Creating the BOW model

In [18]:
from sklearn.feature_extraction.text import CountVectorizer

In [19]:
vectorizer = CountVectorizer(max_features = 2000, 
                             min_df = 3, 
                             max_df = 0.6, 
                             stop_words = stopwords.words('english'))

X = vectorizer.fit_transform(corpus).toarray()

In [22]:
X.shape

(2000, 2000)

In [25]:
X[:10]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 0, 0],
       ...,
       [0, 0, 0, ..., 2, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0]], dtype=int64)

### Creating the TF-IDF Model from the BOW model

In [26]:
from sklearn.feature_extraction.text import TfidfTransformer

In [27]:
transformer = TfidfTransformer()
X = transformer.fit_transform(X).toarray()

In [28]:
X[:10]

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.06887219, 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.05959645, 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.06582811, 0.        , ..., 0.        , 0.        ,
        0.        ]])

### Creating the TF-IDF model directly

In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [30]:
# vectorizer = TfidfVectorizer(max_features = 2000, 
#                              min_df = 3, 
#                              max_df = 0.6, 
#                              stop_words = stopwords.words('english'))

# X = vectorizer.fit_transform(corpus).toarray()

### Train/Test split

In [31]:
from sklearn.model_selection import train_test_split

In [32]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

### Training the classifier using Logistic Regression

In [33]:
from sklearn.linear_model import LogisticRegression

In [34]:
classifier = LogisticRegression()
classifier.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

### Testing model performance

In [35]:
y_pred = classifier.predict(X_test)

In [36]:
from sklearn.metrics import classification_report, confusion_matrix

In [38]:
print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

          0       0.81      0.86      0.83       319
          1       0.86      0.81      0.83       341

avg / total       0.83      0.83      0.83       660



In [40]:
print(confusion_matrix(y_test, y_pred))

[[274  45]
 [ 65 276]]
