In [1]:
import pandas as pd
import numpy as np
import glob, os, string, re, spacy
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB

## Import datasets

In [3]:
# glob returns paths in arbitrary, OS-dependent order; sorting keeps the row
# order (and therefore every downstream result) reproducible across machines.
train_pos_files = sorted(glob.glob("train/pos/*.txt"))
train_neg_files = sorted(glob.glob("train/neg/*.txt"))

# Matches HTML-like tags such as "<br />"; compiled once rather than per file.
tag_pattern = re.compile('<.*?>')

# Positive training reviews: first line of each file with tags replaced by a space.
train_pos_ls = []
for path in train_pos_files:
    # `with` guarantees the handle is closed, and the builtin `str` is no
    # longer rebound at notebook scope.
    with open(path, "r", encoding="utf8") as handle:
        # NOTE(review): only the first line is read - assumes each file stores
        # its review on a single line; confirm against the dataset.
        train_pos_ls.append(tag_pattern.sub(' ', handle.readline()))

# Negative training reviews, loaded the same way.
train_neg_ls = []
for path in train_neg_files:
    with open(path, "r", encoding="utf8") as handle:
        train_neg_ls.append(tag_pattern.sub(' ', handle.readline()))
    


In [4]:
# Column names of the review frames (the original list misspelled 'review').
labels = ['review', 'label']

# Positive reviews are labelled 1 and negative reviews -1; the scalar label is
# broadcast to every row of its frame.
df_train_pos = pd.DataFrame({'review': train_pos_ls, 'label': 1}, columns=labels)
df_train_neg = pd.DataFrame({'review': train_neg_ls, 'label': -1}, columns=labels)

# ignore_index=True gives the combined frame a unique 0..n-1 index instead of
# repeating each half's own 0..k-1 labels.
df_train = pd.concat([df_train_pos, df_train_neg], ignore_index=True)

In [5]:
# glob returns paths in arbitrary, OS-dependent order; sorting keeps the row
# order (and therefore every downstream result) reproducible across machines.
test_pos_files = sorted(glob.glob("test/pos/*.txt"))
test_neg_files = sorted(glob.glob("test/neg/*.txt"))

# Matches HTML-like tags such as "<br />"; compiled once rather than per file.
tag_pattern = re.compile('<.*?>')

# Positive test reviews: first line of each file with tags replaced by a space.
test_pos_ls = []
for path in test_pos_files:
    # `with` guarantees the handle is closed, and the builtin `str` is no
    # longer rebound at notebook scope.
    with open(path, "r", encoding="utf8") as handle:
        # NOTE(review): only the first line is read - assumes each file stores
        # its review on a single line; confirm against the dataset.
        test_pos_ls.append(tag_pattern.sub(' ', handle.readline()))

# Negative test reviews, loaded the same way.
test_neg_ls = []
for path in test_neg_files:
    with open(path, "r", encoding="utf8") as handle:
        test_neg_ls.append(tag_pattern.sub(' ', handle.readline()))
    

In [6]:
# Column names of the review frames (the original list misspelled 'review').
labels = ['review', 'label']

# Positive reviews are labelled 1 and negative reviews -1; the scalar label is
# broadcast to every row of its frame.
df_test_pos = pd.DataFrame({'review': test_pos_ls, 'label': 1}, columns=labels)
df_test_neg = pd.DataFrame({'review': test_neg_ls, 'label': -1}, columns=labels)

# ignore_index=True gives the combined frame a unique 0..n-1 index instead of
# repeating each half's own 0..k-1 labels.
df_test = pd.concat([df_test_pos, df_test_neg], ignore_index=True)
df_test.head()

Unnamed: 0,review,label
0,I went and saw this movie last night after bei...,1
1,Actor turned director Bill Paxton follows up h...,1
2,As a recreational golfer with some knowledge o...,1
3,"I saw this film in a sneak preview, and it is ...",1
4,Bill Paxton has taken the true story of the 19...,1


In [7]:
# Shared resources for the text pre-processing function defined below.

# English stop words from NLTK, stored as a set.
stops = set(stopwords.words('english'))
# WordNet lemmatizer applied to every token that is kept.
lemma = WordNetLemmatizer()

# Alternative that was tried: spaCy's stop-word list. NLTK's list performed
# better, so the spaCy variant stays disabled.
# nlp = spacy.load('en_core_web_sm')
# spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS
            
# Translation table that deletes every character listed in string.punctuation.
_PUNCT_TABLE = dict.fromkeys(map(ord, string.punctuation))


def text_prep(text):
    """Turn a raw review into a space-separated string of lemmatized tokens.

    Steps, in order: delete the characters in ``string.punctuation``,
    lowercase, split on whitespace, drop tokens found in the module-level
    ``stops`` set, and lemmatize each remaining token as a verb with the
    module-level ``lemma`` object.

    Punctuation is deleted, not replaced by a space, so "well-made" becomes
    "wellmade" and "don't" becomes "dont" before the stop-word check.
    """
    without_punct = text.translate(_PUNCT_TABLE)
    tokens = without_punct.lower().split()
    kept = [lemma.lemmatize(token, pos='v') for token in tokens if token not in stops]
    return " ".join(kept)


## Data Preprocessing

In [8]:
# Clean every training review with text_prep and store the result in a new
# column, then preview the cleaned text next to its label.
df_train['prep_review'] = df_train['review'].apply(text_prep)
df_train[['prep_review', 'label']].head()

Unnamed: 0,prep_review,label
0,bromwell high cartoon comedy run time program ...,1
1,homelessness houselessness george carlin state...,1
2,brilliant overact lesley ann warren best drama...,1
3,easily underrate film inn brook cannon sure fl...,1
4,typical mel brook film much less slapstick mov...,1


In [9]:
# Apply the same cleaning to the test reviews so both splits are tokenised
# identically, then preview the cleaned text next to its label.
df_test['prep_review'] = df_test['review'].apply(text_prep)
df_test[['prep_review', 'label']].head()

Unnamed: 0,prep_review,label
0,go saw movie last night coax friends mine ill ...,1
1,actor turn director bill paxton follow promise...,1
2,recreational golfer knowledge sport history pl...,1
3,saw film sneak preview delightful cinematograp...,1
4,bill paxton take true story 1913 us golf open ...,1


In [10]:
# Training targets: the 1 / -1 sentiment labels.
y_train = df_train['label']

# Fit the TF-IDF vectorizer on the training reviews and build the training
# matrix in the same pass.
# (Also tried TfidfVectorizer(ngram_range = (1,3)); it did not improve accuracy.)
tfidf = TfidfVectorizer()
x_train = tfidf.fit_transform(df_train['prep_review'])

In [11]:
# Test targets: the 1 / -1 sentiment labels.
y_test = df_test['label']

# Reuse the vectorizer fitted on the training data (transform only, no refit)
# to build the test matrix.
x_test = tfidf.transform(df_test['prep_review'])

## Prediction Models

### Logistic Regression

In [12]:
# Fit logistic regression on the TF-IDF training matrix (fit returns the
# estimator itself, so construction and fitting are chained).
LR = LogisticRegression(solver='lbfgs', n_jobs=-1).fit(x_train, y_train)

# Despite its name, LR_clf holds the predicted test labels, not a classifier.
LR_clf = LR.predict(x_test)

In [13]:
# Score of the fitted logistic regression on the data it was trained on.
LR.score(x_train, y_train)

0.93528

In [14]:
# Accuracy of the logistic-regression predictions against the test labels.
accuracy_score(y_test, LR_clf)

0.88336

### Linear Support Vector Classifier

In [15]:
# Linear SVM on the TF-IDF training matrix. random_state is fixed so the fit
# is repeatable across runs; 42 matches the seed the random forest in this
# notebook already uses.
LSVM = LinearSVC(random_state=42)
LSVM.fit(x_train, y_train)

# Despite its name, LSVM_clf holds the predicted test labels, not a classifier.
LSVM_clf = LSVM.predict(x_test)


In [16]:
# Score of the fitted linear SVM on the data it was trained on.
LSVM.score(x_train, y_train)

0.99128

In [17]:
# Accuracy of the linear-SVM predictions against the test labels.
accuracy_score(y_test, LSVM_clf)

0.87264

### AdaBoost Classifier

In [18]:
# AdaBoost with 100 boosting rounds on the TF-IDF training matrix.
# random_state is fixed so the fit is repeatable across runs; 42 matches the
# seed the random forest in this notebook already uses.
ADA = AdaBoostClassifier(n_estimators=100, random_state=42)
ADA.fit(x_train, y_train)

# Despite its name, ADA_clf holds the predicted test labels, not a classifier.
ADA_clf = ADA.predict(x_test)


In [19]:
# Score of the fitted AdaBoost model on the data it was trained on.
ADA.score(x_train, y_train)

0.84076

In [20]:
# Accuracy of the AdaBoost predictions against the test labels.
accuracy_score(y_test, ADA_clf)

0.83188

### Random Forest Classifier

In [20]:
# Random forest of 100 trees, seeded for repeatability and trained on all
# available cores (fit returns the estimator itself, so the calls are chained).
RFC = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    n_jobs=-1,
).fit(x_train, y_train)

# Despite its name, RFC_clf holds the predicted test labels, not a classifier.
RFC_clf = RFC.predict(x_test)


In [21]:
# Score of the fitted random forest on the data it was trained on.
RFC.score(x_train, y_train)

1.0

In [22]:
# Accuracy of the random-forest predictions against the test labels.
accuracy_score(y_test, RFC_clf)

0.84712

### Multinomial Naive Bayes Classifier

In [21]:
# Multinomial naive Bayes on the TF-IDF training matrix (fit returns the
# estimator itself, so construction and fitting are chained).
MNB = MultinomialNB().fit(x_train, y_train)

# Despite its name, MNB_clf holds the predicted test labels, not a classifier.
MNB_clf = MNB.predict(x_test)


In [22]:
# Score of the fitted naive Bayes model on the data it was trained on.
MNB.score(x_train, y_train)

0.9172

In [25]:
# Accuracy of the naive Bayes predictions against the test labels.
accuracy_score(y_test, MNB_clf)

0.83308

In [26]:
# tfidf_NN = TfidfVectorizer(max_features = 1000)
# x_train_NN = tfidf_NN.fit_transform(df_train['prep_review'])
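# NOTE(review): the labels are -1/1, but the sigmoid + binary_crossentropy
# network below expects 0/1 targets and its rounded predictions are 0/1.
# Remap the labels (e.g. (label + 1) // 2) before re-enabling this experiment.
# y_train_NN = df_train['label']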
# x_test_NN = tfidf_NN.transform(df_test['prep_review'])
# y_test_NN = df_test['label']
# x_train_NN.shape

In [27]:
# from keras.models import Sequential
# from keras.layers import LSTM, Convolution1D, Flatten, Dropout, Dense

# model = Sequential()
# model.add(Dense(256, input_shape=(1000,) , activation='relu'))
# model.add(Dropout(0.3))
# model.add(Dense(200, activation='relu'))
# model.add(Dropout(0.3))
# model.add(Dense(160, activation='relu'))
# model.add(Dropout(0.3))
# model.add(Dense(120, activation='relu'))
# model.add(Dropout(0.3))
# model.add(Dense(80, activation='relu'))
# model.add(Dropout(0.3))
# model.add(Dense(1, activation='sigmoid'))
# model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# model.summary()


In [28]:
# model.fit(x_train_NN, y_train_NN, batch_size=128, epochs=10, verbose=1)


In [29]:
# loss, accuracy = model.evaluate(x_train_NN, y_train_NN)
# print (loss, accuracy)

In [30]:
# predictions = model.predict(x_test_NN)
# # round predictions
# rounded = [round(x[0]) for x in predictions]
# predictions = rounded
# score = accuracy_score(y_test_NN ,predictions)
# print(score)
