In [1]:
#loading reviews and sentiments in to the dataframe
import pandas as pd
filename = 'C:/Users/AHMED/Downloads/final-output-sentiments.csv'
reviews = pd.read_csv(filename)
print(reviews) 

                                             reviewText  sentiment
0     The bra is very nice and pretty.  The fabric i...          1
1     Good, loved the sale. Amazing discount price a...          1
2     Comfortable and fit was on point. Like the fac...          1
3     Ordered my &#34;normal size&#34; and they are ...          0
4     If I bought this I don't know what happened to...          0
5     I got this for my son who is 6'2 and weighs ab...          1
6     I hope the watch lives up to the Timex reputat...          1
7     This is a beautiful watch, nicer in person tha...          1
8     I rodered this shoe in a 5.5w BIG GIRL and rec...          0
9     I am extremely upset with my new BM8180-03E be...          0
10    Great bra for the money, I just wish there whe...          1
11    Love how it fits, material feels good and stro...          1
12    Size: Perfect fit. Shipment: very good. Qualit...          1
13    Fit was quite nice, but the center snap/button...       

In [2]:
#checking the column headings
reviews.columns

Index(['reviewText', 'sentiment'], dtype='object')

In [4]:
#count for both classes
reviews['sentiment'].value_counts()

1    4000
0    4000
Name: sentiment, dtype: int64

In [5]:
#testing the datatypes of the two columns
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype

#is_string_dtype(reviews['reviewText'])
#is_numeric_dtype(reviews['sentiment'])


In [7]:
#data cleansing
import re
for index, row in reviews.iterrows():
    review = row['reviewText']
    review = re.sub("[^a-zA-Z' ]+", '', str(review)).lower()
    #review = ''.join([i for i in str(review) if not i.isdigit()])
    #review = re.sub("[!@#$+%*:()-]", '', str(review))
    reviews.at[index, 'reviewText'] = review

In [8]:
def PrintReviews():
    print(reviews)

In [9]:
#finding out the unique words
def UniqueWords():
    uniqueWords = list(reviews['reviewText'].str.split(' ', expand=True).stack().unique())
    print(len(uniqueWords))

In [10]:
#removing words that are not found in the english dictionary (optional)
from nltk.corpus import wordnet
for index, row in reviews.iterrows():
    review = row['reviewText']
    review = ' '.join([w for w in str(review).split() if wordnet.synsets(w)])
    reviews.at[index, 'reviewText'] = review


In [11]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\AHMED\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [12]:
#getting a list of stopWords
from nltk.corpus import stopwords
oldStopWords = stopwords.words('english')
exceptions = ['no', 'nor', 'not','don', "don't", 't', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]
stopWords = [word for word in oldStopWords if word not in exceptions]

In [13]:
#removing stop words
for index, row in reviews.iterrows():
    review = row['reviewText']
    wordList = review.split() 
    filteredWords = [word for word in wordList if word not in stopWords]
    review = ' '.join(filteredWords)
    reviews.at[index, 'reviewText'] = review

In [14]:
UniqueWords()

11028


In [17]:
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
porter = PorterStemmer()

def StemSentence(reviews):
    for index, row in reviews.iterrows():
        review = row['reviewText']
        wordsTokens = word_tokenize(review)
        stemmedSentence=[]
        for word in wordsTokens:
            stemmedSentence.append(porter.stem(word))
            stemmedSentence.append(" ")
        review = "".join(stemmedSentence)
        reviews.at[index, 'reviewText'] = review

In [18]:
StemSentence(reviews)
UniqueWords()

6980


In [19]:
PrintReviews()

                                             reviewText  sentiment
0     bra nice pretti fabric stiff rather itchi wash...          1
1     good love sale amaz discount price excel qualiti           1
2     comfort fit point like fact not visibl top ple...          1
3     order normal size full size width small mayb b...          0
4                              bought know happen wear           0
5     got son weigh coat fit great say comfort warm ...          1
6     hope watch live reput cost much still run open...          1
7     beauti watch nicer person pictur love two tone...          1
8     shoe w big girl babi shoe shoe look like pictu...          0
9     extrem upset new second hand not align mark ar...          0
10         great bra money wish color choos size great           1
11    love fit materi feel good strong beef pocket w...          1
12    size perfect fit shipment good color exactli p...          1
13    fit quit nice center broke first time put know...       

In [20]:
reviews.columns

Index(['reviewText', 'sentiment'], dtype='object')

In [21]:
df = pd.DataFrame()
df['reviews'] = reviews['reviewText'].values
df['sentiment'] = reviews['sentiment'].values

In [22]:
df.head()

Unnamed: 0,reviews,sentiment
0,bra nice pretti fabric stiff rather itchi wash...,1
1,good love sale amaz discount price excel qualiti,1
2,comfort fit point like fact not visibl top ple...,1
3,order normal size full size width small mayb b...,0
4,bought know happen wear,0


In [23]:
df['reviews'].dtypes

dtype('O')

In [24]:
#Lets start training Models
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import RegexpTokenizer

In [25]:
# Bag of Words Model
token = RegexpTokenizer(r'[a-zA-Z0-9]+')

cv = CountVectorizer(ngram_range = (1,1),tokenizer = token.tokenize)
#print(cv)

In [26]:
#fit the reviews with the model
text_counts= cv.fit_transform(df['reviews'])

In [27]:
#split the training data into 70% , while the test into 30%
from sklearn.model_selection import train_test_split

X_train_ngram, X_test_ngram, y_train_ngram, y_test_ngram = train_test_split(
    text_counts, df['sentiment'], test_size=0.3, random_state=1)

In [28]:
#Text classification using TF-IDF
from sklearn.naive_bayes import MultinomialNB
#Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics

# Model Generation Using Multinomial Naive Bayes
clf = MultinomialNB().fit(X_train_ngram, y_train_ngram)


predicted= clf.predict(X_test_ngram)

print("MultinomialNB Accuracy:",metrics.accuracy_score(y_test_ngram, predicted))

MultinomialNB Accuracy: 0.8766666666666667


In [29]:
#tf-idf

In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer

tf=TfidfVectorizer()

text_tf= tf.fit_transform(df['reviews'])

In [31]:
from sklearn.model_selection import train_test_split
X_train_tf, X_test_tf, y_train_tf, y_test_tf = train_test_split(
    text_tf, df['sentiment'], test_size=0.3, random_state=123)

In [32]:
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

# Model Generation Using Multinomial Naive Bayes

clf = MultinomialNB().fit(X_train_tf, y_train_tf)
predicted= clf.predict(X_test_tf)
print("MultinomialNB Accuracy:",metrics.accuracy_score(y_test_tf, predicted))

MultinomialNB Accuracy: 0.8783333333333333


In [None]:
#new model training

In [None]:
#SVM for n-gram bow model

In [33]:
from sklearn import svm


In [34]:
clf = svm.SVC().fit(X_train_ngram, y_train_ngram)
predicted_SVM = clf.predict(X_test_ngram)
print("SVM:",metrics.accuracy_score(y_test_ngram, predicted_SVM))



SVM: 0.6483333333333333


In [None]:
#SVM for tf-idf model

In [35]:
clf = svm.SVC().fit(X_train_tf, y_train_tf)
predicted_SVM = clf.predict(X_test_tf)
print("SVM:",metrics.accuracy_score(y_test_tf, predicted_SVM))

SVM: 0.49541666666666667


In [None]:
#KNN for ngram 

In [36]:
#Import Library
from sklearn.neighbors import KNeighborsClassifier
#Assumed you have, X (predictor) and Y (target) for training data set and x_test(predictor) of test_dataset
# Create KNeighbors classifier object model 
model = KNeighborsClassifier(n_neighbors=10) # default value for n_neighbors is 5
# Train the model using the training sets and check score
model.fit(X_train_ngram, y_train_ngram)
#Predict Output
predictedm= model.predict(X_test_ngram)

In [37]:
print("KNN:",metrics.accuracy_score(y_test_ngram, predictedm))

KNN: 0.7033333333333334


In [38]:
#tf-idf knn

#Import Library
from sklearn.neighbors import KNeighborsClassifier
#Assumed you have, X (predictor) and Y (target) for training data set and x_test(predictor) of test_dataset
# Create KNeighbors classifier object model 
model = KNeighborsClassifier(n_neighbors=10) # default value for n_neighbors is 5
# Train the model using the training sets and check score
model.fit(X_train_tf, y_train_tf)
#Predict Output
predictedm= model.predict(X_test_tf)

In [39]:
print("KNN:",metrics.accuracy_score(y_test_tf, predictedm))

KNN: 0.7958333333333333


In [None]:
#random forest , n-gram

In [41]:
#Import Library
from sklearn.ensemble import RandomForestClassifier
#Assumed you have, X (predictor) and Y (target) for training data set and x_test(predictor) of test_dataset
# Create Random Forest object
model = RandomForestClassifier()
# Train the model using the training sets and check score
model.fit(X_train_ngram, y_train_ngram)
#Predict Output
predicted_model = model.predict(X_test_ngram)



In [42]:
print("Rand Forest:",metrics.accuracy_score(y_test_ngram, predicted_model))

Rand Forest: 0.8020833333333334


In [None]:
#tf-idf random forest

In [43]:
from sklearn.ensemble import RandomForestClassifier
#Assumed you have, X (predictor) and Y (target) for training data set and x_test(predictor) of test_dataset
# Create Random Forest object
model = RandomForestClassifier()
# Train the model using the training sets and check score
model.fit(X_train_tf, y_train_tf)
#Predict Output
predicted_model = model.predict(X_test_tf)



In [44]:
print("Rand Forest:",metrics.accuracy_score(y_test_tf, predicted_model))

Rand Forest: 0.7941666666666667


In [None]:
#G-Boost N gram

In [45]:
#Import Library
from sklearn.ensemble import GradientBoostingClassifier
#Assumed you have, X (predictor) and Y (target) for training data set and x_test(predictor) of test_dataset
# Create Gradient Boosting Classifier object
model= GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0)
# Train the model using the training sets and check score
model.fit(X_train_ngram, y_train_ngram)
#Predict Output
predicted_mode = model.predict(X_test_ngram)

In [46]:
print("GBoost:",metrics.accuracy_score(y_test_ngram, predicted_mode))

GBoost: 0.8304166666666667


In [None]:
#G-Boost tf-idf

In [47]:
#Import Library
from sklearn.ensemble import GradientBoostingClassifier
#Assumed you have, X (predictor) and Y (target) for training data set and x_test(predictor) of test_dataset
# Create Gradient Boosting Classifier object
model= GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0)
# Train the model using the training sets and check score
model.fit(X_train_tf, y_train_tf)
#Predict Output
predicted_mode = model.predict(X_test_tf)

In [48]:
print("GBoost:",metrics.accuracy_score(y_test_tf, predicted_mode))

GBoost: 0.8204166666666667
