In [29]:
#Importing
import numpy as np
import pandas as pd
import bs4 as bs
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
nltk.download("stopwords")
nltk.download("wordnet")
eng_stopwords = stopwords.words("english")
import warnings
warnings.simplefilter('ignore')

[nltk_data] Downloading package stopwords to /Users/ethan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/ethan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [30]:
#Preparing for modeling
def Evaluate(Y_Test, Predictions) :
    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
    Accuracy = accuracy_score(Y_Test, Predictions)
    Precision = precision_score(Y_Test, Predictions)
    Recall = recall_score(Y_Test, Predictions)
    F1 = f1_score(Y_Test, Predictions)
    Metric = {'Accuracy Score' : round(Accuracy,2),
              'Precision Score' : round(Precision,2),
              'Recall Score' : round(Recall,2),
              'F1 Score' : round(F1,2)}
    print(f'Accuracy Score : {Accuracy * 100:.2f}%')
    print(f'Precision Score : {Precision * 100:.2f}%')
    print(f'Recall Score : {Recall * 100:.2f}%')
    print(f'F1 Score : {F1 * 100:.2f}%')
    return Metric


In [31]:
#Preparing dataset
data=pd.read_csv('reviews.csv', encoding='ISO-8859-1')
print("dimension of data: {}".format(data.shape))
data.drop('Id', axis=1, inplace=True)
data.drop('ProductId', axis=1, inplace=True)
data.drop('UserId', axis=1, inplace=True)
data.drop('ProfileName', axis=1, inplace=True)
data.drop('HelpfulnessNumerator', axis=1, inplace=True)
data.drop('HelpfulnessDenominator', axis=1, inplace=True)
data.drop('Time', axis=1, inplace=True)
data.drop('Summary', axis=1, inplace=True)
data.head()

dimension of data: (568454, 10)


Unnamed: 0,Score,Text
0,5,I have bought several of the Vitality canned d...
1,1,Product arrived labeled as Jumbo Salted Peanut...
2,4,This is a confection that has been around a fe...
3,2,If you are looking for the secret ingredient i...
4,5,Great taffy at a great price. There was a wid...


In [32]:
#Preparing data for review
data = data.head(10000)
text_input = data["Text"]
y = data['Score']

ps = PorterStemmer()
vectorizer = CountVectorizer(
        ngram_range=(1, 1),
        analyzer="word",
        tokenizer=None,
        preprocessor=None,
        stop_words=None,
        max_features=100)

wnl = WordNetLemmatizer()

cleaned_reviews = []

#Iterating through reviews 
for i, review in enumerate(text_input):
    if (i + 1) % 1000 == 0:
        print("Done with %d reviews" % (i + 1))
    review = bs.BeautifulSoup(review).text
    review = re.sub("[^a-zA-Z]", " ", review)
    review = review.lower().split()
    eng_stopwords = set(stopwords.words("english"))
    clean_review = []
    for word in review:
        if word not in eng_stopwords:
            word = wnl.lemmatize(word)
            clean_review.append(word)
    review_processed = " ".join(clean_review)
    cleaned_reviews.append(review_processed)

Done with 1000 reviews
Done with 2000 reviews
Done with 3000 reviews
Done with 4000 reviews
Done with 5000 reviews
Done with 6000 reviews
Done with 7000 reviews
Done with 8000 reviews
Done with 9000 reviews
Done with 10000 reviews


In [33]:
X = vectorizer.fit_transform(cleaned_reviews).toarray()

In [34]:
print(X[0])

[0 0 0 0 0 0 2 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 1 0 0
 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3 1 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [35]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)
print (X_train.shape, X_test.shape)

(7000, 100) (3000, 100)


In [36]:
model = RandomForestClassifier()
model.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [37]:
print("Training set score: {:.3f}".format(model.score(X_train, y_train)))
print("Test set score: {:.3f}".format(model.score(X_test, y_test)))
predicted = model.predict(X_test)
expected = y_test

Training set score: 0.994
Test set score: 0.665
