In [1]:
##### ~~~ Library Imports ~~~ #####
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import os
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score, precision_score, recall_score
import scipy as sp
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [2]:
##### ~~~ Importing and Cleaing Training and Testing Data ~~~ #####
# Reading in Training Data
cwd = os.getcwd()
train_bug = pd.read_json(cwd+'/training_data/Bug_tt.json')
train_feature = pd.read_json(cwd+'/training_data/Feature_tt.json')
train_rating = pd.read_json(cwd+'/training_data/Rating_tt.json')
train_userex = pd.read_json(cwd+'/training_data/UserExperience_tt.json')

In [3]:
# Labeling Training Data with Reveiw Types
train_bug['Bug'] = np.where(train_bug['label']=="Bug", 1, 0)
train_feature['Feature'] = np.where(train_feature['label']=="Feature", 1, 0)
train_rating['Rating'] = np.where(train_rating['label']=="Rating", 1, 0)
train_userex['Userex'] = np.where(train_userex['label']=="UserExperience",1,0)

# Filling NAs with zeros for Training Data 
train_bug = train_bug.fillna(0)
train_feature = train_feature.fillna(0)
train_rating = train_rating.fillna(0)
train_userex = train_userex.fillna(0)

In [4]:
# Reading in Testing Data and filling NAs with zeros
test_data = pd.read_excel('data553_CH+AK&CH+CL.xlsx')
test_data=test_data.fillna(0)

In [5]:
##### ~~~ Bigrams ~~~ #####
# Function to process, fit, and run the bigrams model on the testing data
def get_bigram(train_df,type,test_df):
    count_vect = CountVectorizer(ngram_range=(2,2))
    bg_train_counts = count_vect.fit_transform(train_df.comment)
    bg_test_counts = count_vect.transform(test_df.text)
    if type == 'Feature':
        model_bigram = GaussianNB().fit(bg_train_counts.todense(),train_df.Feature)
        bigram_pred = model_bigram.predict(bg_test_counts.todense())
    elif type == 'Bug':
        model_bigram = GaussianNB().fit(bg_train_counts.toarray(),train_df.Bug)
        bigram_pred = model_bigram.predict(bg_test_counts.toarray())
    elif type == 'Rating':
        model_bigram = GaussianNB().fit(bg_train_counts.todense(),train_df.Rating)
        bigram_pred = model_bigram.predict(bg_test_counts.todense())
    elif type == 'Userex':
        model_bigram = GaussianNB().fit(bg_train_counts.todense(),train_df.Userex)
        bigram_pred = model_bigram.predict(bg_test_counts.todense())
    f1 = f1_score(test_df[type],bigram_pred)
    recall = recall_score(test_df[type],bigram_pred)
    precision = precision_score(test_df[type],bigram_pred)
    metrics = [type,precision,recall,f1]
    return metrics

In [6]:
# Running the Bigrams function for each review type and training set
bg_feature_score = get_bigram(train_feature,'Feature',test_data)
bg_bug_score = get_bigram(train_bug,'Bug',test_data)
bg_userex_score = get_bigram(train_userex,'Userex',test_data)
bg_rating_score = get_bigram(train_rating,'Rating',test_data)

In [7]:
# Displaying the Bigrams Classification performance results on the test data
print(bg_feature_score)
print(bg_bug_score)
print(bg_userex_score)
print(bg_rating_score)

['Feature', 0.14534883720930233, 0.6944444444444444, 0.2403846153846154]
['Bug', 0.28776978417266186, 0.7142857142857143, 0.41025641025641024]
['Userex', 0.41040462427745666, 0.6173913043478261, 0.4930555555555556]
['Rating', 0.746606334841629, 0.6818181818181818, 0.712742980561555]


In [8]:
##### ~~~ Bigrams + BOW - Stop Words + lemmatize ~~~ #####
# Lemmatizing Function
def lemmatizeData(commentSeries):
    lemmatizer = WordNetLemmatizer()
    tokens = [token for token in commentSeries.split(" ") if token != ""]
    output = ' '.join([lemmatizer.lemmatize(word) for word in tokens])
    return output

In [9]:
# Function to process, fit, and run the Bigrams + BOW - Stop Words + Lemmatize model on the testing data
def get_bbsl(train_df,type,test_df):
    bbsl_train_counts = train_df.comment.apply(lemmatizeData)
    bbsl_test_counts = test_df.text.apply(lemmatizeData)
    stop_words = set(stopwords.words('english'))
    count_vect = CountVectorizer(stop_words=stop_words, ngram_range=(1,2))
    bbsl_train_counts = count_vect.fit_transform(train_df.comment)
    bbsl_test_counts = count_vect.transform(test_df.text)
    if type == 'Feature':
        model_bbsl = GaussianNB().fit(bbsl_train_counts.todense(),train_df.Feature)
        bbsl_pred = model_bbsl.predict(bbsl_test_counts.todense())
    elif type == 'Bug':
        model_bbsl = GaussianNB().fit(bbsl_train_counts.toarray(),train_df.Bug)
        bbsl_pred = model_bbsl.predict(bbsl_test_counts.toarray())
    elif type == 'Rating':
        model_bbsl = GaussianNB().fit(bbsl_train_counts.todense(),train_df.Rating)
        bbsl_pred = model_bbsl.predict(bbsl_test_counts.todense())
    elif type == 'Userex':
        model_bbsl = GaussianNB().fit(bbsl_train_counts.todense(),train_df.Userex)
        bbsl_pred = model_bbsl.predict(bbsl_test_counts.todense())
    f1 = f1_score(test_df[type],bbsl_pred)
    recall = recall_score(test_df[type],bbsl_pred)
    precision = precision_score(test_df[type],bbsl_pred)
    metrics = [type,precision,recall,f1]
    return metrics

In [10]:
# Running the Bigrams + BOW - Stop Words + Lemmatize function for each review type and training set
bbsl_feature_score = get_bbsl(train_feature,'Feature',test_data)
bbsl_bug_score = get_bbsl(train_bug,'Bug',test_data)
bbsl_userex_score = get_bbsl(train_userex,'Userex',test_data)
bbsl_rating_score = get_bbsl(train_rating,'Rating',test_data)

In [11]:
# Displaying the Bigrams + BOW - Stop Words + Lemmatize Classification performance results on the test data
print(bbsl_feature_score)
print(bbsl_bug_score)
print(bbsl_userex_score)
print(bbsl_rating_score)

['Feature', 0.13157894736842105, 0.5555555555555556, 0.2127659574468085]
['Bug', 0.28104575163398693, 0.7678571428571429, 0.41148325358851673]
['Userex', 0.4258064516129032, 0.5739130434782609, 0.4888888888888889]
['Rating', 0.776255707762557, 0.7024793388429752, 0.737527114967462]


In [12]:
##### ~~~ Bigrams + Ratings + Lemmatize ~~~ #####
# Lemmatizing training and test data
lemmatized_bug = train_bug.copy()
lemmatized_feature = train_feature.copy()
lemmatized_userex = train_userex.copy()
lemmatized_rating = train_rating.copy()
lemmatized_test = test_data.copy()

lemmatized_feature["comment"] = lemmatized_feature["comment"].apply(lemmatizeData)
lemmatized_bug["comment"] = lemmatized_bug["comment"].apply(lemmatizeData)
lemmatized_userex["comment"] = lemmatized_userex["comment"].apply(lemmatizeData)
lemmatized_rating["comment"] = lemmatized_rating["comment"].apply(lemmatizeData)
lemmatized_test["text"] = lemmatized_test["text"].apply(lemmatizeData)

In [13]:
# Function fit, and run the Bigrams + Ratings + Lemmatize model on the testing data
def get_bow_lemmatize_ratings(train_df,type,test_df):
    count_vect = CountVectorizer()
    bow_train = sp.sparse.hstack((count_vect.fit_transform(train_df.comment),train_df[['rating']].values),format='csr')
    bow_test = sp.sparse.hstack((count_vect.transform(test_df.text),test_df[['score_x']].values),format='csr')
    if type == 'Feature':
        model_bow = GaussianNB().fit(bow_train.todense(),train_df.Feature)
        bow_pred = model_bow.predict(bow_test.todense())
    elif type == 'Bug':
        model_bow = GaussianNB().fit(bow_train.todense(),train_df.Bug)
        bow_pred = model_bow.predict(bow_test.todense())
    elif type == 'Rating':
        model_bow = GaussianNB().fit(bow_train.todense(),train_df.Rating)
        bow_pred = model_bow.predict(bow_test.todense())
    elif type == 'Userex':
        model_bow = GaussianNB().fit(bow_train.todense(),train_df.Userex)
        bow_pred = model_bow.predict(bow_test.todense())
    f1 = f1_score(test_df[type],bow_pred)
    recall = recall_score(test_df[type],bow_pred)
    precision = precision_score(test_df[type],bow_pred)
    metrics = [type,precision,recall,f1]
    return metrics

In [14]:
# Running the Bigrams + Ratings + Lemmatize function for each review type and training set
blr_feature_score = get_bow_lemmatize_ratings(lemmatized_feature, 'Feature', lemmatized_test)
blr_bug_score = get_bow_lemmatize_ratings(lemmatized_bug,'Bug',lemmatized_test)
blr_userex_score = get_bow_lemmatize_ratings(lemmatized_userex,'Userex',lemmatized_test)
blr_rating_score = get_bow_lemmatize_ratings(lemmatized_rating, 'Rating',lemmatized_test)

In [15]:
# Displaying the Bigrams + Ratings + Lemmatize Classification performance results on the test data
print(blr_feature_score)
print(blr_bug_score)
print(blr_userex_score)
print(blr_rating_score)

['Feature', 0.1276595744680851, 0.5, 0.20338983050847456]
['Bug', 0.25, 0.6428571428571429, 0.36]
['Userex', 0.44680851063829785, 0.5478260869565217, 0.49218749999999994]
['Rating', 0.8009049773755657, 0.731404958677686, 0.7645788336933046]


In [16]:
##### ~~~ Summary of Results ~~~ #####
# Storing Bigrams results as a dataframe
d = {'Feature': bg_feature_score, 'Bug': bg_bug_score, 'User_Experience': bg_userex_score, 'Ratings': bg_rating_score}
df = pd.concat([pd.Series(v, name=k) for k, v in d.items()], axis=1)
df = df.drop(0)
df.index = ["Precision", "Recall", "F1"]
df.transpose()

Unnamed: 0,Precision,Recall,F1
Feature,0.145349,0.694444,0.240385
Bug,0.28777,0.714286,0.410256
User_Experience,0.410405,0.617391,0.493056
Ratings,0.746606,0.681818,0.712743


In [17]:
# Storing Bigrams + BOW - Stop Words + Lemmatize results as a dataframe
d = {'Feature': bbsl_feature_score, 'Bug': bbsl_bug_score, 'User_Experience': bbsl_userex_score, 'Ratings': bbsl_rating_score}
df = pd.concat([pd.Series(v, name=k) for k, v in d.items()], axis=1)
df = df.drop(0)
df.index = ["Precision", "Recall", "F1"]
df.transpose()

Unnamed: 0,Precision,Recall,F1
Feature,0.131579,0.555556,0.212766
Bug,0.281046,0.767857,0.411483
User_Experience,0.425806,0.573913,0.488889
Ratings,0.776256,0.702479,0.737527


In [18]:
# Storing Bigrams + Ratings + Lemmatize results as a dataframe
d = {'Feature': blr_feature_score, 'Bug': blr_bug_score, 'User_Experience': blr_userex_score, 'Ratings': blr_rating_score}
df = pd.concat([pd.Series(v, name=k) for k, v in d.items()], axis=1)
df = df.drop(0)
df.index = ["Precision", "Recall", "F1"]
df.transpose()

Unnamed: 0,Precision,Recall,F1
Feature,0.12766,0.5,0.20339
Bug,0.25,0.642857,0.36
User_Experience,0.446809,0.547826,0.492187
Ratings,0.800905,0.731405,0.764579


In [19]:
##### ~~~ Summary of Number of Reviews per Category ~~~ #####
print("Feature: ", test_data.Feature.sum())
print("Bug: ", test_data.Bug.sum())
print("Userex: ", test_data.Userex.sum())
print("Rating: ", test_data.Rating.sum())

Feature:  36.0
Bug:  56.0
Userex:  115.0
Rating:  242.0
