In [74]:
import pandas as pd
import os
import numpy as np  
import re  
import nltk  
from sklearn.datasets import load_files  
nltk.download('stopwords')  
import pickle  
from nltk.corpus import stopwords  
from sklearn.ensemble import RandomForestClassifier

[nltk_data] Downloading package stopwords to /Users/jpan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [75]:
os.chdir(r'/Users/jpan/Environments/macys_reviews/')

baddf = pd.read_csv("macys_597.csv", sep='\t')
baddf = baddf.drop(columns=['Unnamed: 0', 'Ratings'])
wholedf = pd.read_csv("macys_5000.csv", sep='\t')
wholedf = wholedf.drop(columns=['Unnamed: 0'])
gooddf = pd.read_csv("macys_4403.csv")
gooddf = gooddf.drop(columns=['Unnamed: 0', 'Ratings'])

In [76]:
# Read each comment in the dataframe to either pos or neg folder
os.chdir(r'/Users/jpan/Environments/macys_reviews/txt_sentoken/neg')
for i in range(597):
    fh = open(str(i+1) + '_bad.txt', 'w')
    fh.write(''.join(list(baddf['Comments'][i])))
    fh.close()
os.chdir(r'/Users/jpan/Environments/macys_reviews/txt_sentoken/pos')
for i in range(4403):
    fh = open(str(i+1) + '_good.txt', 'w')
    fh.write(''.join(list(gooddf['Comments'][i])))
    fh.close()

In [142]:
macys_data = load_files(r"/Users/jpan/Environments/macys_reviews/txt_sentoken")  
X, y = macys_data.data, macys_data.target  

In [143]:
# Clean text
documents = []

from nltk.stem import WordNetLemmatizer

stemmer = WordNetLemmatizer()

for sen in range(0, len(X)):  
    # Remove all the special characters
    document = re.sub(r'\W', ' ', str(X[sen]))

    # remove all single characters
    document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)

    # Remove single characters from the start
    document = re.sub(r'\^[a-zA-Z]\s+', ' ', document) 

    # Substituting multiple spaces with single space
    document = re.sub(r'\s+', ' ', document, flags=re.I)

    # Removing prefixed 'b'
    document = re.sub(r'^b\s+', '', document)

    # Converting to Lowercase
    document = document.lower()

    # Lemmatization
    document = document.split()

    document = [stemmer.lemmatize(word) for word in document]
    document = ' '.join(document)

    documents.append(document)

In [270]:
# Convert text documents into corresponding numerical features using Bag-of-Words
from sklearn.feature_extraction.text import CountVectorizer  
vectorizer = CountVectorizer(max_features=500, min_df=5, max_df=0.7, stop_words=stopwords.words('english'))  
X = vectorizer.fit_transform(documents).toarray()

In [271]:
# Convert values obtained using the bag of words model into TFIDF values
from sklearn.feature_extraction.text import TfidfTransformer  
tfidfconverter = TfidfTransformer()  
X = tfidfconverter.fit_transform(X).toarray() 

# from sklearn.feature_extraction.text import TfidfVectorizer  
# tfidfconverter = TfidfVectorizer(max_features=500, min_df=5, max_df=0.7, stop_words=stopwords.words('english'))  
# X = tfidfconverter.fit_transform(documents).toarray()  

In [186]:
# tfidfconverter

TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.float64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=0.7, max_features=500, min_df=5,
        ngram_range=(1, 1), norm=u'l2', preprocessor=None, smooth_idf=True,
        stop_words=[u'i', u'me', u'my', u'myself', u'we', u'our', u'ours', u'ourselves', u'you', u"you're", u"you've", u"you'll", u"you'd", u'your', u'yours', u'yourself', u'yourselves', u'he', u'him', u'his', u'himself', u'she', u"she's", u'her', u'hers', u'herself', u'it', u"it's", u'its', u'itself', u'th...', u"shouldn't", u'wasn', u"wasn't", u'weren', u"weren't", u'won', u"won't", u'wouldn', u"wouldn't"],
        strip_accents=None, sublinear_tf=False,
        token_pattern=u'(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [272]:
# Divide our data into training and testing sets
from sklearn.model_selection import train_test_split  
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)  

In [273]:
# Train our machine learning model using the random forest algorithm
classifier = RandomForestClassifier(n_estimators=1000, random_state=0)  
classifier.fit(X_train, y_train) 

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [274]:
y_pred = classifier.predict(X_test) 

In [275]:
# Evaluating model

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(confusion_matrix(y_test,y_pred))  
print(classification_report(y_test,y_pred))  
print(accuracy_score(y_test, y_pred)) 

# 91.1% accuracy

[[ 50  54]
 [ 30 867]]
              precision    recall  f1-score   support

           0       0.62      0.48      0.54       104
           1       0.94      0.97      0.95       897

   micro avg       0.92      0.92      0.92      1001
   macro avg       0.78      0.72      0.75      1001
weighted avg       0.91      0.92      0.91      1001

0.916083916083916


In [276]:
# Store model

with open('macys_reviews_classifier', 'wb') as picklefile:  
    pickle.dump(classifier,picklefile)

In [277]:
# Call model

os.chdir(r'/Users/jpan/Environments/macys_reviews')

with open('macys_reviews_classifier', 'rb') as training_model:  
    model = pickle.load(training_model)

In [164]:
y_pred2 = model.predict(X_test)

print(confusion_matrix(y_test, y_pred2))  
print(classification_report(y_test, y_pred2))  
print(accuracy_score(y_test, y_pred2))  

[[ 50  54]
 [ 30 867]]
              precision    recall  f1-score   support

           0       0.62      0.48      0.54       104
           1       0.94      0.97      0.95       897

   micro avg       0.92      0.92      0.92      1001
   macro avg       0.78      0.72      0.75      1001
weighted avg       0.91      0.92      0.91      1001

0.916083916083916


In [110]:
y_pred2

array([1, 1, 1, ..., 1, 1, 1])