In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import os
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score, precision_score, recall_score

### Libraries added by Connor
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer # Need to discuss stemming vs. lemmatization (they say lemmatization, but results are consistent with stemming)
from nltk.tokenize import word_tokenize # Need to discuss stemming vs. lemmatization (they say lemmatization, but results are consistent with stemming)
from nltk.stem import WordNetLemmatizer

In [2]:
cwd = os.getcwd()
train_bug = pd.read_json(cwd+'/training_data/Bug_tt.json')
train_feature = pd.read_json(cwd+'/training_data/Feature_tt.json')
train_rating = pd.read_json(cwd+'/training_data/Rating_tt.json')
train_userex = pd.read_json(cwd+'/training_data/UserExperience_tt.json')

In [3]:
train_bug['Bug'] = np.where(train_bug['label']=="Bug", 1, 0)
train_feature['Feature'] = np.where(train_feature['label']=="Feature", 1, 0)
train_rating['Rating'] = np.where(train_rating['label']=="Rating", 1, 0)
train_userex['Userex'] = np.where(train_userex['label']=="UserExperience",1,0)

In [4]:
test_data = pd.read_excel('data553_CH+AK&CH+CL.xlsx')
test_data=test_data.fillna(0)

In [5]:
def get_bigram(train_df,type,test_df):
    count_vect = CountVectorizer(ngram_range=(2,2))
    bg_train_counts = count_vect.fit_transform(train_df.comment)
    bg_test_counts = count_vect.transform(test_df.text)
    if type == 'Feature':
        model_bigram = GaussianNB().fit(bg_train_counts.todense(),train_df.Feature)
        bigram_pred = model_bigram.predict(bg_test_counts.todense())
    elif type == 'Bug':
        model_bigram = GaussianNB().fit(bg_train_counts.toarray(),train_df.Bug)
        bigram_pred = model_bigram.predict(bg_test_counts.toarray())
    elif type == 'Rating':
        model_bigram = GaussianNB().fit(bg_train_counts.todense(),train_df.Rating)
        bigram_pred = model_bigram.predict(bg_test_counts.todense())
    elif type == 'Userex':
        model_bigram = GaussianNB().fit(bg_train_counts.todense(),train_df.Userex)
        bigram_pred = model_bigram.predict(bg_test_counts.todense())
    f1 = f1_score(test_df[type],bigram_pred)
    recall = recall_score(test_df[type],bigram_pred)
    precision = precision_score(test_df[type],bigram_pred)
    metrics = [type,precision,recall,f1]
    return metrics

In [6]:
bg_feature_score = get_bigram(train_feature,'Feature',test_data)
bg_bug_score = get_bigram(train_bug,'Bug',test_data)
bg_userex_score = get_bigram(train_userex,'Userex',test_data)
bg_rating_score = get_bigram(train_rating,'Rating',test_data)

In [7]:
print(bg_feature_score)
print(bg_bug_score)
print(bg_userex_score)
print(bg_rating_score)

['Feature', 0.14534883720930233, 0.6944444444444444, 0.2403846153846154]
['Bug', 0.28776978417266186, 0.7142857142857143, 0.41025641025641024]
['Userex', 0.41040462427745666, 0.6173913043478261, 0.4930555555555556]
['Rating', 0.746606334841629, 0.6818181818181818, 0.712742980561555]


**Output just Christians Reviews:**

* ['Feature', 0.0872093023255814, 0.6, 0.15228426395939088]
* ['Bug', 0.1510791366906475, 0.6774193548387096, 0.24705882352941175]
* ['Userex', 0.2543352601156069, 0.6197183098591549, 0.36065573770491804]
* ['Rating', 0.5158371040723982, 0.7169811320754716, 0.6]

In [8]:
##### ~~~ Bigrams + BOW - Stop Words + Stemming ~~~ #####
# Stemming Function (NOTE: From inspecting the output data from the "lemmatization" completed by the experimenters we see that the results are more consistent with stemming than lemmatization)
def stemData(commentSeries):
    lancaster = LancasterStemmer()
    tokens = [token for token in commentSeries.split(" ") if token != ""]
    output = ' '.join([lancaster.stem(word) for word in tokens])
    return output

# Lematizing Function (Not using but included for now so that we can discuss if we want to do stemming or lemmatization)
def lemmatizeData(commentSeries):
    lemmatizer = WordNetLemmatizer()
    tokens = [token for token in commentSeries.split(" ") if token != ""]
    output = ' '.join([lemmatizer.lemmatize(word) for word in tokens])
    return output

In [9]:
# Function will be called get_BBSS for Bigrams + BOW - Stop Words + stemming
def get_bbss(train_df,type,test_df):
    bbss_train_counts = train_df.comment.apply(lemmatizeData)
    bbss_test_counts = test_df.text.apply(lemmatizeData)
    stop_words = set(stopwords.words('english'))
    count_vect = CountVectorizer(stop_words=stop_words, ngram_range=(1,2))
    bbss_train_counts = count_vect.fit_transform(train_df.comment)
    bbss_test_counts = count_vect.transform(test_df.text)
    if type == 'Feature':
        model_bbss = GaussianNB().fit(bbss_train_counts.todense(),train_df.Feature)
        bbss_pred = model_bbss.predict(bbss_test_counts.todense())
    elif type == 'Bug':
        model_bbss = GaussianNB().fit(bbss_train_counts.toarray(),train_df.Bug)
        bbss_pred = model_bbss.predict(bbss_test_counts.toarray())
    elif type == 'Rating':
        model_bbss = GaussianNB().fit(bbss_train_counts.todense(),train_df.Rating)
        bbss_pred = model_bbss.predict(bbss_test_counts.todense())
    elif type == 'Userex':
        model_bbss = GaussianNB().fit(bbss_train_counts.todense(),train_df.Userex)
        bbss_pred = model_bbss.predict(bbss_test_counts.todense())
    f1 = f1_score(test_df[type],bbss_pred)
    recall = recall_score(test_df[type],bbss_pred)
    precision = precision_score(test_df[type],bbss_pred)
    metrics = [type,precision,recall,f1]
    return metrics

In [10]:
bbss_feature_score = get_bbss(train_feature,'Feature',test_data)
bbss_bug_score = get_bbss(train_bug,'Bug',test_data)
bbss_userex_score = get_bbss(train_userex,'Userex',test_data)
bbss_rating_score = get_bbss(train_rating,'Rating',test_data)

In [11]:
print(bbss_feature_score)
print(bbss_bug_score)
print(bbss_userex_score)
print(bbss_rating_score)

['Feature', 0.13157894736842105, 0.5555555555555556, 0.2127659574468085]
['Bug', 0.28104575163398693, 0.7678571428571429, 0.41148325358851673]
['Userex', 0.4258064516129032, 0.5739130434782609, 0.4888888888888889]
['Rating', 0.776255707762557, 0.7024793388429752, 0.737527114967462]


**Output Before Changing from Just Bigrams to Bigrams and Unigrams:**

* ['Feature', 0.19047619047619047, 0.3333333333333333, 0.24242424242424246]
* ['Bug', 0.3548387096774194, 0.39285714285714285, 0.3728813559322034]
* ['Userex', 0.5555555555555556, 0.34782608695652173, 0.42780748663101603]
* ['Rating', 0.6719242902208202, 0.8801652892561983, 0.7620751341681575]

In [16]:
print("Feature: ", test_data.Feature.sum())
print("Bug: ", test_data.Bug.sum())
print("Userex: ", test_data.Userex.sum())
print("Rating: ", test_data.Rating.sum())

Feature:  36.0
Bug:  56.0
Userex:  115.0
Rating:  242.0
