In [1]:
##----- Importing all required Packages -----##
import os
import re
import numpy as np
import pandas as pd
import json
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from nltk.stem import WordNetLemmatizer
from nltk.stem import LancasterStemmer
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import f1_score, precision_score, recall_score
import scipy as sp

In [2]:
##----- Loading all training data -----##
# Each training set is a JSON file under ./training_data, resolved against the
# directory the kernel is currently running in.
cwd = os.getcwd()
train_bug, train_feature, train_rating, train_userex = [
    pd.read_json(cwd + relative_path)
    for relative_path in (
        '/training_data/Bug_tt.json',
        '/training_data/Feature_tt.json',
        '/training_data/Rating_tt.json',
        '/training_data/UserExperience_tt.json',
    )
]

In [3]:
##----- Creating flags as required & setting NaN values to 0 -----##
# For every training frame: add a 0/1 indicator column (1 where `label` equals
# the frame's own category, 0 otherwise), then replace all NaN values with 0.
train_bug['Bug'] = np.where(train_bug['label'].eq("Bug"), 1, 0)
train_bug = train_bug.fillna(0)

train_feature['Feature'] = np.where(train_feature['label'].eq("Feature"), 1, 0)
train_feature = train_feature.fillna(0)

train_rating['Rating'] = np.where(train_rating['label'].eq("Rating"), 1, 0)
train_rating = train_rating.fillna(0)

train_userex['Userex'] = np.where(train_userex['label'].eq("UserExperience"), 1, 0)
train_userex = train_userex.fillna(0)

In [4]:
##----- Creating function for Lemmatizing Comments/Text -----##
def lemmatizeData(commentSeries):
    """Lemmatize every space-separated token of one comment.

    Despite its name, `commentSeries` is a single cell value, not a Series:
    the notebook calls this function element-wise through `Series.apply`.

    Parameters
    ----------
    commentSeries : str or scalar
        Text of one comment. Non-string values are converted with `str()`
        first, so the `0` placeholder that the notebook's `fillna(0)` calls
        write into empty cells is processed instead of raising
        `AttributeError` on `.split`.

    Returns
    -------
    str
        The lemmatized tokens joined by single spaces; an empty string when
        the input contains no tokens.
    """
    if not isinstance(commentSeries, str):
        commentSeries = str(commentSeries)
    lemmatizer = WordNetLemmatizer()
    # Splitting on a literal space (not arbitrary whitespace) produces empty
    # strings for runs of spaces; those are dropped before lemmatizing.
    tokens = [token for token in commentSeries.split(" ") if token != ""]
    return ' '.join(lemmatizer.lemmatize(word) for word in tokens)

In [5]:
##----- Reading in Testing data -----##
# Load the evaluation workbook and replace every empty (NaN) cell with 0 in a
# single expression.
test_data = pd.read_excel('Connor_Christian_ClassificationChecked.xlsx').fillna(0)

In [6]:
##----- Lemmatizing training and test data -----##
# `assign` returns a new frame, so the un-lemmatized originals stay untouched.
# Training frames keep their text in "comment"; the test frame uses "text".
lemmatized_bug = train_bug.assign(
    comment=train_bug["comment"].apply(lemmatizeData)
)
lemmatized_feature = train_feature.assign(
    comment=train_feature["comment"].apply(lemmatizeData)
)
lemmatized_userex = train_userex.assign(
    comment=train_userex["comment"].apply(lemmatizeData)
)
lemmatized_rating = train_rating.assign(
    comment=train_rating["comment"].apply(lemmatizeData)
)
lemmatized_test = test_data.assign(
    text=test_data["text"].apply(lemmatizeData)
)

In [7]:
def get_bow_lemmatize_ratings(train_df,type,test_df):
    """Fit a bag-of-words + rating GaussianNB classifier and score it.

    The vocabulary is learned from `train_df.comment` only and then applied to
    `test_df.text`. The numeric rating (`rating` in the training frame,
    `score_x` in the test frame) is appended as one extra feature column.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Must provide the columns `comment`, `rating` and the 0/1 target
        column named by `type`.
    type : str
        Target label: one of 'Feature', 'Bug', 'Rating', 'Userex'. It names
        the target column in both frames.
    test_df : pandas.DataFrame
        Must provide the columns `text`, `score_x` and the column named by
        `type` holding the true labels.

    Returns
    -------
    list
        `[type, precision, recall, f1]` computed on `test_df`.

    Raises
    ------
    ValueError
        If `type` is not one of the supported labels.
    """
    # `type` shadows the builtin; the parameter name is kept for callers and
    # aliased locally for readability.
    label = type
    supported_labels = ('Feature', 'Bug', 'Rating', 'Userex')
    if label not in supported_labels:
        # Fail before any fitting instead of hitting an unbound `bow_pred`.
        raise ValueError(
            "Unsupported type %r; expected one of %s"
            % (label, ', '.join(supported_labels))
        )

    count_vect = CountVectorizer()
    train_counts = count_vect.fit_transform(train_df.comment)
    test_counts = count_vect.transform(test_df.text)
    bow_train = sp.sparse.hstack((train_counts, train_df[['rating']].values), format='csr')
    bow_test = sp.sparse.hstack((test_counts, test_df[['score_x']].values), format='csr')

    # GaussianNB needs dense input. `.toarray()` yields a plain ndarray, whereas
    # `.todense()` yields numpy.matrix, which recent scikit-learn releases reject.
    model_bow = GaussianNB().fit(bow_train.toarray(), train_df[label])
    bow_pred = model_bow.predict(bow_test.toarray())

    y_true = test_df[label]
    f1 = f1_score(y_true, bow_pred)
    recall = recall_score(y_true, bow_pred)
    precision = precision_score(y_true, bow_pred)
    return [label, precision, recall, f1]

In [8]:
# One binary classifier per category, each scored on the same lemmatized test set.
blr_feature_score = get_bow_lemmatize_ratings(
    train_df=lemmatized_feature, type='Feature', test_df=lemmatized_test
)
blr_bug_score = get_bow_lemmatize_ratings(
    train_df=lemmatized_bug, type='Bug', test_df=lemmatized_test
)
blr_userex_score = get_bow_lemmatize_ratings(
    train_df=lemmatized_userex, type='Userex', test_df=lemmatized_test
)
blr_rating_score = get_bow_lemmatize_ratings(
    train_df=lemmatized_rating, type='Rating', test_df=lemmatized_test
)

In [9]:
# Each score is a list [label, precision, recall, f1]; print one per line.
print(
    blr_feature_score,
    blr_bug_score,
    blr_userex_score,
    blr_rating_score,
    sep='\n',
)

['Feature', 0.1276595744680851, 0.5, 0.20338983050847456]
['Bug', 0.25, 0.6428571428571429, 0.36]
['Userex', 0.44680851063829785, 0.5478260869565217, 0.49218749999999994]
['Rating', 0.8009049773755657, 0.731404958677686, 0.7645788336933046]
