In [61]:
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.utils import shuffle
# LIbrary to Clean the texts
import re
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer 
lemmatizer = WordNetLemmatizer() 
ps = PorterStemmer()


# Importing the dataset
TrainDataset = pd.read_csv('Train.csv')
TrainDataset=shuffle(TrainDataset)
TestDataset = pd.read_csv('Test.csv')
TestDataset=shuffle(TestDataset)

#load the train and test split
X_train=TrainDataset['Review']
y_train=TrainDataset.iloc[:,1].values
X_test=TestDataset['Review']
y_test=TestDataset.iloc[:,1].values

corpus = []

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/elisontuscano/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/elisontuscano/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [62]:
def getCleanReview(review):
    #only keep alphabets remove rest
    review = re.sub('[^a-zA-Z]', ' ', review)
    #turn all reviews into lowercase
    review = review.lower()
    review = review.split()
    #remove stopwords and do stemming
    review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
    #Lemmatize and merge the review together after making all the changes
    review = ' '.join([lemmatizer.lemmatize(word) for word in review])
    return review

In [63]:
#review = getCleanReview( X_train[0])
len(X_train)

25000

In [64]:
for i in range(0, len(X_train)):
    review = getCleanReview(X_train[i])
    corpus.append(review)
len(corpus)

25000

In [70]:
# Creating the Bag of Words model
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizor=TfidfVectorizer(stop_words='english', max_df=0.7 ,max_features=1500)
X = tfidf_vectorizor.fit_transform(corpus).toarray()
y = y_train
y.shape

(25000,)

In [94]:
df=[]
for i in range(0,len(X_train)):
    df.append([corpus[i],y_train[i]])

In [99]:
concatdf=pd.DataFrame(df)
concatdf.to_csv('ConvertedTrain.csv',index=None)

In [71]:
# Fitting Naive Bayes to the Training set
from sklearn.naive_bayes import GaussianNB
NaiveClassifier = GaussianNB()
NaiveClassifier.fit(X, y)

# Applying k-Fold Cross Validation
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator = NaiveClassifier, X = X, y = y, cv = 10)
accuracies.mean()

0.50404

In [75]:
#checking the accuracy on test data
testcorpus=[]
for i in range(0, len(X_test)):
    review = getCleanReview(str(X_test[i]))
    testcorpus.append(review)
X_test=tfidf_vectorizor.transform(testcorpus).toarray()

# Predicting the Test set results
y_pred = NaiveClassifier.predict(X_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

In [76]:
accuracy =((cm[0][0]+cm[0][1])/float(cm[0][0]+cm[0][1]+cm[1][0]+cm[1][1]))*100
print('accuracy is : {} %'.format(accuracy))

accuracy is : 50.0 %


In [86]:
X_test.shape

(25000, 1500)

In [49]:
from sklearn.externals import joblib
#saving the model
joblib.dump(NaiveClassifier,'model/NaiveBayes_model.sav')
joblib.dump(tfidf_vectorizor,'model/tfidf_model.sav')



['model/tfidf_model.sav']

Now lets check whether we can improve this with a better algorithm
lets try SvM

In [38]:
from sklearn.svm import SVC
SVMClassifier = SVC(kernel = 'linear', random_state = 0)
SVMClassifier.fit(X, y)

# Applying k-Fold Cross Validation
accuracies = cross_val_score(estimator = SVMClassifier, X = X, y = y, cv = 10)
accuracies.mean()

0.49350000000000005

In [40]:
# Predicting the Test set results
y_pred = SVMClassifier.predict(X_test)

# Making the Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
accuracy =((cm[0][0]+cm[0][1])/float(cm[0][0]+cm[0][1]+cm[1][0]+cm[1][1]))*100
print('accuracy is : {} %'.format(accuracy))

accuracy is : 50.0 %


In [50]:
#saving the model
joblib.dump(SVMClassifier,'model/SVM_model.sav')

['model/SVM_model.sav']

In [42]:
from sklearn.ensemble import RandomForestClassifier
RandomClassifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
RandomClassifier.fit(X, y)

# Applying k-Fold Cross Validation
accuracies = cross_val_score(estimator = RandomClassifier, X = X, y = y, cv = 10)
accuracies.mean()

0.5065

In [43]:
# Predicting the Test set results
y_pred = RandomClassifier.predict(X_test)

# Making the Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
accuracy =((cm[0][0]+cm[0][1])/float(cm[0][0]+cm[0][1]+cm[1][0]+cm[1][1]))*100
print('accuracy is : {} %'.format(accuracy))

accuracy is : 50.0 %


In [52]:
#saving the model
joblib.dump(RandomClassifier,'model/RandomForest_model.sav')

['model/RandomForest_model.sav']

In [45]:
# Fitting K-NN to the Training set
from sklearn.neighbors import KNeighborsClassifier
KNNClassifier = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
KNNClassifier.fit(X, y)

# Applying k-Fold Cross Validation
accuracies = cross_val_score(estimator = KNNClassifier, X = X, y = y, cv = 10)
accuracies.mean()

0.488

In [46]:
# Predicting the Test set results
y_pred = KNNClassifier.predict(X_test)

# Making the Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
accuracy =((cm[0][0]+cm[0][1])/float(cm[0][0]+cm[0][1]+cm[1][0]+cm[1][1]))*100
print('accuracy is : {} %'.format(accuracy))

accuracy is : 50.0 %


In [53]:
#saving the model
joblib.dump(KNNClassifier,'model/KNN_model.sav')

['model/KNN_model.sav']

In [47]:
from sklearn.linear_model import LogisticRegression
LogisticClassifier = LogisticRegression(random_state = 0)
LogisticClassifier.fit(X, y)

# Applying k-Fold Cross Validation
accuracies = cross_val_score(estimator = LogisticClassifier, X = X, y = y, cv = 10)
accuracies.mean()

0.47800000000000004

In [48]:
# Predicting the Test set results
y_pred = LogisticClassifier.predict(X_test)

# Making the Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
accuracy =((cm[0][0]+cm[0][1])/float(cm[0][0]+cm[0][1]+cm[1][0]+cm[1][1]))*100
print('accuracy is : {} %'.format(accuracy))

accuracy is : 50.0 %


In [54]:
#saving the model
joblib.dump(LogisticClassifier,'model/LogisticRegression_model.sav')

['model/LogisticRegression_model.sav']