In [1]:
import pandas as pd
import glob
import os

In [2]:
file_list = glob.glob(os.path.join(os.getcwd(), "./aclImdb_v1/aclImdb/train/pos/", "*.txt"))

pos = []

for file_path in file_list:
    with open(file_path) as f_input:
        pos.append(f_input.read())

In [3]:
pos_df = pd.DataFrame(pos, columns = ["reviews"])
pos_df["label"] = 1
pos_df["data_type"] = "train"

nRow, nCol = pos_df.shape
print(f'INFO: There are {nRow} rows and {nCol} columns in the training set.')

INFO: There are 12500 rows and 3 columns in the training set.


In [4]:
file_list = glob.glob(os.path.join(os.getcwd(), "./aclImdb_v1/aclImdb/train/neg/", "*.txt"))

neg = []

for file_path in file_list:
    with open(file_path) as f_input:
        neg.append(f_input.read())

In [5]:
neg_df = pd.DataFrame(neg, columns = ["reviews"])
neg_df["label"] = 0
neg_df["data_type"] = "train"

nRow, nCol = neg_df.shape
print(f'INFO: There are {nRow} rows and {nCol} columns in the training set.')

INFO: There are 12500 rows and 3 columns in the training set.


In [6]:
train = pos_df.append(neg_df)

nRow, nCol = train.shape
print(f'INFO: There are {nRow} rows and {nCol} columns in the training set.')

INFO: There are 25000 rows and 3 columns in the training set.


In [7]:
# Clean of white spaces
train = train.applymap(lambda x: x.strip() if isinstance(x, str) else x)
# Print statistics
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25000 entries, 0 to 12499
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   reviews    25000 non-null  object
 1   label      25000 non-null  int64 
 2   data_type  25000 non-null  object
dtypes: int64(1), object(2)
memory usage: 781.2+ KB


In [8]:
# Distribution of positive and negative labels
train["label"].value_counts()

1    12500
0    12500
Name: label, dtype: int64

In [9]:
# Removing empty rows from csv 
train.dropna(axis=0,inplace=True)
nRow, nCol = train.shape
print(f'INFO: There are {nRow} rows and {nCol} columns in the training set.')

INFO: There are 25000 rows and 3 columns in the training set.


In [10]:
# Distribution of positive and negative labels
train["label"].value_counts()

1    12500
0    12500
Name: label, dtype: int64

In [11]:
train.head()

Unnamed: 0,reviews,label,data_type
0,This is one of the best films we watched in my...,1,train
1,Spoiler This movie is about such a concept. Wi...,1,train
2,Richard Attenborough is a director whose name ...,1,train
3,"Joan Crawford had just begun her ""working girl...",1,train
4,I have just read the lead comment for this fil...,1,train


In [12]:
file_list = glob.glob(os.path.join(os.getcwd(), "./aclImdb_v1/aclImdb/test/pos/", "*.txt"))

pos_test = []

for file_path in file_list:
    with open(file_path) as f_input:
        pos_test.append(f_input.read())

In [13]:
pos_test_df = pd.DataFrame(pos_test, columns = ["reviews"])
pos_test_df["label"] = 1
pos_test_df["data_type"] = "test"

nRow, nCol = pos_test_df.shape
print(f'INFO: There are {nRow} rows and {nCol} columns in the training set.')

INFO: There are 12500 rows and 3 columns in the training set.


In [14]:
file_list = glob.glob(os.path.join(os.getcwd(), "./aclImdb_v1/aclImdb/test/neg/", "*.txt"))

neg_test = []

for file_path in file_list:
    with open(file_path) as f_input:
        neg_test.append(f_input.read())

In [15]:
neg_test_df = pd.DataFrame(neg_test, columns = ["reviews"])
neg_test_df["label"] = 0
neg_test_df["data_type"] = "test"

nRow, nCol = neg_test_df.shape
print(f'INFO: There are {nRow} rows and {nCol} columns in the training set.')

INFO: There are 12500 rows and 3 columns in the training set.


In [16]:
test = pos_test_df.append(neg_test_df)

nRow, nCol = test.shape
print(f'INFO: There are {nRow} rows and {nCol} columns in the training set.')

INFO: There are 25000 rows and 3 columns in the training set.


In [17]:
# Clean of white spaces
test = test.applymap(lambda x: x.strip() if isinstance(x, str) else x)
# Print statistics
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25000 entries, 0 to 12499
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   reviews    25000 non-null  object
 1   label      25000 non-null  int64 
 2   data_type  25000 non-null  object
dtypes: int64(1), object(2)
memory usage: 781.2+ KB


In [18]:
# Distribution of positive and negative labels
test["label"].value_counts()

1    12500
0    12500
Name: label, dtype: int64

In [19]:
# Removing empty rows from csv 
test.dropna(axis=0,inplace=True)
nRow, nCol = test.shape
print(f'INFO: There are {nRow} rows and {nCol} columns in the training set.')

INFO: There are 25000 rows and 3 columns in the training set.


In [20]:
# Distribution of positive and negative labels
test["label"].value_counts()

1    12500
0    12500
Name: label, dtype: int64

In [21]:
test.head()

Unnamed: 0,reviews,label,data_type
0,This movie is definitely one of the finest of ...,1,test
1,I recently saw this at the 2007 Palm Springs I...,1,test
2,This delightful movie tells the story of buds....,1,test
3,This movie was well done but it also made me f...,1,test
4,"""The Couch Trip"" is one of those silly comedie...",1,test


In [22]:
data = train.append(test)

nRow, nCol = data.shape
print(f'INFO: There are {nRow} rows and {nCol} columns in the training set.')

INFO: There are 50000 rows and 3 columns in the training set.


In [23]:
data["data_type"].value_counts()

train    25000
test     25000
Name: data_type, dtype: int64

In [24]:
# Import for tokenization 
from nltk.tokenize import word_tokenize

In [25]:
# Tokenize
data['title_tokenize'] = data['reviews'].apply(word_tokenize)

data.head()

Unnamed: 0,reviews,label,data_type,title_tokenize
0,This is one of the best films we watched in my...,1,train,"[This, is, one, of, the, best, films, we, watc..."
1,Spoiler This movie is about such a concept. Wi...,1,train,"[Spoiler, This, movie, is, about, such, a, con..."
2,Richard Attenborough is a director whose name ...,1,train,"[Richard, Attenborough, is, a, director, whose..."
3,"Joan Crawford had just begun her ""working girl...",1,train,"[Joan, Crawford, had, just, begun, her, ``, wo..."
4,I have just read the lead comment for this fil...,1,train,"[I, have, just, read, the, lead, comment, for,..."


In [26]:
# We will have vector representation before we can do classification
# Do imports
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [27]:
# We will consider 1- and 2- gram
transformer = TfidfTransformer(smooth_idf=False)
count_vectorizer = CountVectorizer(ngram_range=(1, 2))

In [28]:
# fit training data to the count vectorizer
data_counts = count_vectorizer.fit_transform(data['reviews'].values)

#fit the ngrams count to the tfidf transformers
data_tfidf = transformer.fit_transform(data_counts)

In [44]:
from sklearn.model_selection import train_test_split

#  If train-test size is not initialized, test_size will be set to 0.25 and train_set = 1-test_size
X_train, X_test, y_train, y_test = train_test_split(data_tfidf, data['label'].values, random_state=0, test_size=0.50)

In [103]:
# X_train = data_tfidf
# y_train = train['label'].values
# X_test = test_tfidf
# y_test = test['label'].values

In [46]:
# Import for prediction
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report

In [47]:
# Try titles on random forest
RandomFC= RandomForestClassifier(n_estimators=1000, min_samples_leaf = 100, n_jobs=-1)
RandomFC.fit(X_train, y_train)

RandomForestClassifier(min_samples_leaf=100, n_estimators=1000, n_jobs=-1)

In [48]:
# Print accuracy
print('Accuracy of randomforest classifier on training set: {:.2f}'.format(RandomFC.score(X_train, y_train)))
print('Accuracy of randomforest classifier on test set: {:.2f}'.format(RandomFC.score(X_test, y_test)))
CM = confusion_matrix(y_test, RandomFC.predict(X_test))
print(CM)
CR = classification_report(y_test, RandomFC.predict(X_test))
print(CR)

Accuracy of randomforest classifier on training set: 0.85
Accuracy of randomforest classifier on test set: 0.84
[[10219  2254]
 [ 1846 10681]]
              precision    recall  f1-score   support

           0       0.85      0.82      0.83     12473
           1       0.83      0.85      0.84     12527

    accuracy                           0.84     25000
   macro avg       0.84      0.84      0.84     25000
weighted avg       0.84      0.84      0.84     25000



In [49]:
# Also try logistic regression
logreg = LogisticRegression(C=1e5)
logreg.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=100000.0)

In [50]:
# Print accuracy
print('Accuracy of Logreg classifier on training set: {:.2f}'.format(logreg.score(X_train, y_train)))
print('Accuracy of Logreg classifier on test set: {:.2f}'.format(logreg.score(X_test, y_test)))
CM = confusion_matrix(y_test, logreg.predict(X_test))
print(CM)

Accuracy of Logreg classifier on training set: 1.00
Accuracy of Logreg classifier on test set: 0.91
[[11274  1199]
 [ 1064 11463]]
