In [1]:
import pandas as pd
import glob
import os

In [2]:
file_list = glob.glob(os.path.join(os.getcwd(), "./aclImdb_v1/aclImdb/train/pos/", "*.txt"))

pos = []

for file_path in file_list:
    with open(file_path) as f_input:
        pos.append(f_input.read())

In [3]:
pos_df = pd.DataFrame(pos, columns = ["reviews"])
pos_df["label"] = 1

nRow, nCol = pos_df.shape
print(f'INFO: There are {nRow} rows and {nCol} columns in the training set.')

INFO: There are 12500 rows and 2 columns in the training set.


In [4]:
file_list = glob.glob(os.path.join(os.getcwd(), "./aclImdb_v1/aclImdb/train/neg/", "*.txt"))

neg = []

for file_path in file_list:
    with open(file_path) as f_input:
        neg.append(f_input.read())

In [5]:
neg_df = pd.DataFrame(neg, columns = ["reviews"])
neg_df["label"] = 0

nRow, nCol = neg_df.shape
print(f'INFO: There are {nRow} rows and {nCol} columns in the training set.')

INFO: There are 12500 rows and 2 columns in the training set.


In [8]:
train = pos_df.append(neg_df)

nRow, nCol = train.shape
print(f'INFO: There are {nRow} rows and {nCol} columns in the training set.')

INFO: There are 25000 rows and 2 columns in the training set.


In [9]:
# Clean of white spaces
train = train.applymap(lambda x: x.strip() if isinstance(x, str) else x)
# Print statistics
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25000 entries, 0 to 12499
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   reviews  25000 non-null  object
 1   label    25000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 585.9+ KB


In [10]:
# Distribution of positive and negative labels
train["label"].value_counts()

1    12500
0    12500
Name: label, dtype: int64

In [11]:
# Removing empty rows from csv 
train.dropna(axis=0,inplace=True)
nRow, nCol = train.shape
print(f'INFO: There are {nRow} rows and {nCol} columns in the training set.')

INFO: There are 25000 rows and 2 columns in the training set.


In [12]:
# Distribution of positive and negative labels
train["label"].value_counts()

1    12500
0    12500
Name: label, dtype: int64

In [13]:
train.head()

Unnamed: 0,reviews,label
0,This is one of the best films we watched in my...,1
1,Spoiler This movie is about such a concept. Wi...,1
2,Richard Attenborough is a director whose name ...,1
3,"Joan Crawford had just begun her ""working girl...",1
4,I have just read the lead comment for this fil...,1


In [77]:
file_list = glob.glob(os.path.join(os.getcwd(), "./aclImdb_v1/aclImdb/test/pos/", "*.txt"))

pos_test = []

for file_path in file_list:
    with open(file_path) as f_input:
        pos_test.append(f_input.read())

In [78]:
pos_test_df = pd.DataFrame(pos_test, columns = ["reviews"])
pos_test_df["label"] = 1

nRow, nCol = pos_test_df.shape
print(f'INFO: There are {nRow} rows and {nCol} columns in the training set.')

INFO: There are 12500 rows and 2 columns in the training set.


In [79]:
file_list = glob.glob(os.path.join(os.getcwd(), "./aclImdb_v1/aclImdb/test/neg/", "*.txt"))

neg_test = []

for file_path in file_list:
    with open(file_path) as f_input:
        neg_test.append(f_input.read())

In [80]:
neg_test_df = pd.DataFrame(neg_test, columns = ["reviews"])
neg_test_df["label"] = 0

nRow, nCol = neg_test_df.shape
print(f'INFO: There are {nRow} rows and {nCol} columns in the training set.')

INFO: There are 12500 rows and 2 columns in the training set.


In [81]:
test = pos_test_df.append(neg_test_df)

nRow, nCol = test.shape
print(f'INFO: There are {nRow} rows and {nCol} columns in the training set.')

INFO: There are 25000 rows and 2 columns in the training set.


In [99]:
# Clean of white spaces
test = test.applymap(lambda x: x.strip() if isinstance(x, str) else x)
# Print statistics
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25000 entries, 0 to 12499
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   reviews         25000 non-null  object
 1   label           25000 non-null  int64 
 2   title_tokenize  25000 non-null  object
dtypes: int64(1), object(2)
memory usage: 781.2+ KB


In [83]:
# Distribution of positive and negative labels
test["label"].value_counts()

1    12500
0    12500
Name: label, dtype: int64

In [84]:
# Removing empty rows from csv 
test.dropna(axis=0,inplace=True)
nRow, nCol = test.shape
print(f'INFO: There are {nRow} rows and {nCol} columns in the training set.')

INFO: There are 25000 rows and 2 columns in the training set.


In [85]:
# Distribution of positive and negative labels
test["label"].value_counts()

1    12500
0    12500
Name: label, dtype: int64

In [86]:
test.head()

Unnamed: 0,reviews,label
0,This movie is definitely one of the finest of ...,1
1,I recently saw this at the 2007 Palm Springs I...,1
2,This delightful movie tells the story of buds....,1
3,This movie was well done but it also made me f...,1
4,"""The Couch Trip"" is one of those silly comedie...",1


In [87]:
# Import for tokenization 
from nltk.tokenize import word_tokenize

In [88]:
# Tokenize
train['title_tokenize'] = train['reviews'].apply(word_tokenize)

train.head()

Unnamed: 0,reviews,label,title_tokenize
0,This is one of the best films we watched in my...,1,"[This, is, one, of, the, best, films, we, watc..."
1,Spoiler This movie is about such a concept. Wi...,1,"[Spoiler, This, movie, is, about, such, a, con..."
2,Richard Attenborough is a director whose name ...,1,"[Richard, Attenborough, is, a, director, whose..."
3,"Joan Crawford had just begun her ""working girl...",1,"[Joan, Crawford, had, just, begun, her, ``, wo..."
4,I have just read the lead comment for this fil...,1,"[I, have, just, read, the, lead, comment, for,..."


In [89]:
# Tokenize
test['title_tokenize'] = test['reviews'].apply(word_tokenize)

test.head()

Unnamed: 0,reviews,label,title_tokenize
0,This movie is definitely one of the finest of ...,1,"[This, movie, is, definitely, one, of, the, fi..."
1,I recently saw this at the 2007 Palm Springs I...,1,"[I, recently, saw, this, at, the, 2007, Palm, ..."
2,This delightful movie tells the story of buds....,1,"[This, delightful, movie, tells, the, story, o..."
3,This movie was well done but it also made me f...,1,"[This, movie, was, well, done, but, it, also, ..."
4,"""The Couch Trip"" is one of those silly comedie...",1,"[``, The, Couch, Trip, '', is, one, of, those,..."


In [90]:
# We will have vector representation before we can do classification
# Do imports
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [91]:
# We will consider 1- and 2- gram
transformer = TfidfTransformer(smooth_idf=False)
count_vectorizer = CountVectorizer(ngram_range=(1, 2))

In [92]:
# fit training data to the count vectorizer
train_counts = count_vectorizer.fit_transform(train['reviews'].values)

#fit the ngrams count to the tfidf transformers
train_tfidf = transformer.fit_transform(train_counts)

In [101]:
# fit training data to the count vectorizer
test_counts = count_vectorizer.fit_transform(test['reviews'].values)

#fit the ngrams count to the tfidf transformers
test_tfidf = transformer.fit_transform(test_counts)

In [109]:
from sklearn.model_selection import train_test_split

#  If train-test size is not initialized, test_size will be set to 0.25 and train_set = 1-test_size
X_train, X_test, y_train, y_test = train_test_split(train_tfidf, targets, random_state=0)

In [103]:
# X_train = train_tfidf
# y_train = train['label'].values
# X_test = test_tfidf
# y_test = test['label'].values

In [110]:
# Import for prediction
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report

In [113]:
# Try titles on random forest
RandomFC= RandomForestClassifier(n_estimators=1000, min_samples_leaf = 100, n_jobs=-1)
RandomFC.fit(X_train, y_train)

RandomForestClassifier(min_samples_leaf=100, n_estimators=1000, n_jobs=-1)

In [114]:
# Print accuracy
print('Accuracy of randomforest classifier on training set: {:.2f}'.format(RandomFC.score(X_train, y_train)))
print('Accuracy of randomforest classifier on test set: {:.2f}'.format(RandomFC.score(X_test, y_test)))
CM = confusion_matrix(y_test, RandomFC.predict(X_test))
print(CM)
CR = classification_report(y_test, RandomFC.predict(X_test))
print(CR)

Accuracy of randomforest classifier on training set: 0.85
Accuracy of randomforest classifier on test set: 0.84
[[2573  528]
 [ 485 2664]]
              precision    recall  f1-score   support

           0       0.84      0.83      0.84      3101
           1       0.83      0.85      0.84      3149

    accuracy                           0.84      6250
   macro avg       0.84      0.84      0.84      6250
weighted avg       0.84      0.84      0.84      6250



In [24]:
# Also try logistic regression
logreg = LogisticRegression(C=1e5)
logreg.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=100000.0)

In [25]:
# Print accuracy
print('Accuracy of Logreg classifier on training set: {:.2f}'.format(logreg.score(X_train, y_train)))
print('Accuracy of Logreg classifier on test set: {:.2f}'.format(logreg.score(X_test, y_test)))
CM = confusion_matrix(y_test, logreg.predict(X_test))
print(CM)

Accuracy of Logreg classifier on training set: 1.00
Accuracy of Logreg classifier on test set: 0.91
[[2804  297]
 [ 269 2880]]
