In [2]:
import numpy as np
import pandas as pd
import re
import nltk
import random
from nltk.metrics import edit_distance
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
import pickle

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [3]:
# Load the labelled reviews, keeping only the text and the three label columns
df = (
    pd.read_csv('Dataset.csv', encoding='latin-1')
    [['review', 'category_final', 'sentiment_final', 'req_final']]
)
df.head()

Unnamed: 0,review,category_final,sentiment_final,req_final
0,The notification badges are showing up on my i...,requirement,neutral,functional
1,Why do they make changes we don't need? Now th...,other,very negative,
2,Story is useless n annoying to user. \nCan't s...,other,negative,
3,This app is always having an update for someth...,requirement,negative,functional
4,Just spent an hour trying to upload photos and...,requirement,negative,non-functional


In [4]:
# Check the frame's (rows, columns) at this point in the notebook
df.shape

(3000, 4)

In [5]:
# Lower-case the category, sentiment and requirement-type labels
for label_col in ('category_final', 'sentiment_final', 'req_final'):
    df[label_col] = df[label_col].str.lower()

In [6]:
# Replace the text labels with numerical values.
# The result of .replace() is assigned back to the column instead of calling
# df[col].replace(..., inplace=True): that form mutates a temporary Series,
# emits a FutureWarning on pandas 2.x and leaves df unchanged under
# pandas Copy-on-Write.
df['category_final'] = df['category_final'].replace(
    ['noise', 'requirement', 'bug report', 'other'],
    [0, 1, 2, 3],
)

# Sentiment collapses to three classes: 0 = (very) negative, 1 = neutral,
# 2 = (very) positive; 'postive' / 'positve' are misspelled labels mapped to 2
df['sentiment_final'] = df['sentiment_final'].replace(
    ['very negative', 'negative', 'neutral', 'positive', 'very positive', 'postive', 'positve'],
    [0, 0, 1, 2, 2, 2, 2],
)

# Requirement type: 1 = functional, 2 = non-functional.
# Empty req type becomes 0 (if it's not a requirement, it has no req type)
df['req_final'] = (
    df['req_final']
    .replace(['NaN', 'functional', 'non-functional'], [0, 1, 2])
    .replace(np.nan, 0)
)
df.head()

Unnamed: 0,review,category_final,sentiment_final,req_final
0,The notification badges are showing up on my i...,1,1.0,1.0
1,Why do they make changes we don't need? Now th...,3,0.0,0.0
2,Story is useless n annoying to user. \nCan't s...,3,0.0,0.0
3,This app is always having an update for someth...,1,0.0,1.0
4,Just spent an hour trying to upload photos and...,1,0.0,2.0


In [7]:
# Re-check the frame's (rows, columns) at this point in the notebook
df.shape

(3000, 4)

In [8]:
# Keep only the reviews that have a sentiment label, then renumber the rows from 0
has_sentiment = df['sentiment_final'].notna()
df = df[has_sentiment].reset_index(drop=True)
df.shape

(2796, 4)

In [9]:
# Preview the first rows of the current frame
df.head()

Unnamed: 0,review,category_final,sentiment_final,req_final
0,The notification badges are showing up on my i...,1,1.0,1.0
1,Why do they make changes we don't need? Now th...,3,0.0,0.0
2,Story is useless n annoying to user. \nCan't s...,3,0.0,0.0
3,This app is always having an update for someth...,1,0.0,1.0
4,Just spent an hour trying to upload photos and...,1,0.0,2.0


In [10]:
# Lower-case the review text and pull out the three label columns
review = df['review'].str.lower()
category = df['category_final']
sentiment = df['sentiment_final']
requirement = df['req_final']

# Replace every character other than a-z (digits, punctuation, line breaks, ...)
# with a space. Iterating over the values rather than looking up review[i]
# avoids depending on the index labels being exactly 0..n-1.
# str() turns a missing review into the text 'nan'.
formatted_review = [re.sub('[^a-z]', ' ', str(text)) for text in review]
review = formatted_review
review[0]

'the notification badges are showing up on my iphone   plus but when i open app there not there  this has happened since last update   also quit interrupting videos with stupid ads  annoying '

In [11]:
# Shuffled 80/20 split with a fixed seed; reviews and all three label sets go
# through the same call so they are split together
(review_train, review_test,
 category_train, category_test,
 sentiment_train, sentiment_test,
 req_train, req_test) = train_test_split(
    review, category, sentiment, requirement,
    train_size=0.8, test_size=0.2, random_state=9, shuffle=True,
)

In [12]:
def model_evaluation(model, x_train, x_test, y_train, y_test):
    """Fit ``model`` on the training data and score it on the test data.

    Prints the test accuracy and returns ``(model, accuracy)``, where ``model``
    is the same object that was passed in, now fitted.
    """
    model.fit(x_train, y_train)
    accuracy = accuracy_score(y_test, model.predict(x_test))
    print('Accuracy score = {}'.format(accuracy))
    return model, accuracy

In [13]:
def get_best_params(param_grid, vectorizer, classifier, x_train, y_train):
    """Grid-search a vectorizer + classifier pipeline and return the best parameters.

    The pipeline steps are named 'vect' and 'clf', so ``param_grid`` keys must
    use the ``vect__`` / ``clf__`` prefixes. Candidates are compared by accuracy
    with 5-fold cross-validation, using n_jobs=-1.
    Returns the ``best_params_`` dict of the fitted search.
    """
    steps = [('vect' , vectorizer), ('clf', classifier)]
    search = GridSearchCV(Pipeline(steps), param_grid, scoring='accuracy', cv=5, n_jobs=-1)
    search.fit(x_train, y_train)
    return search.best_params_

In [14]:
def to_vector(vectorizer, review_train, review_test):
    """Vectorize the train and test reviews and return them as dense arrays.

    The vectorizer is fitted on ``review_train`` only and then applied to
    ``review_test``. The size of the learned vocabulary is printed.
    Returns ``(x_train, x_test)``.
    """
    train_matrix = vectorizer.fit_transform(review_train)
    x_train = train_matrix.toarray()
    print('Number of features = {}'.format(len(vectorizer.vocabulary_)))
    test_matrix = vectorizer.transform(review_test)
    return x_train, test_matrix.toarray()

In [15]:
def get_best_model(model_a, vect_a, accuracy_a, model_b, vect_b, accuracy_b):
    """Pick the more accurate of two models together with its vectorizer.

    Returns ``(vectorizer, model)``. Candidate A is chosen only when its
    accuracy is strictly greater; on a tie candidate B is returned.
    """
    if accuracy_a > accuracy_b:
        return vect_a, model_a
    return vect_b, model_b

In [16]:
# TF-IDF settings searched for both classifiers
vect_grid = {
    'vect__ngram_range': [(1,1), (1,2), (1,3), (1,4), (2,2), (2,3), (2,4)],
    'vect__min_df': [0.005, 0.0005, 0.00005],
}

# Parameters for the vectorizer and LR classifier.
# Not every solver supports every penalty, so the grid is split per penalty and
# lists only solvers that can be fitted with it. Crossing all penalties with all
# solvers produces candidates that raise during fit (each costing 5 CV fits and
# a warning). 'elasticnet' is left out because it cannot be fitted without an
# l1_ratio, which this search does not set.
# NOTE(review): recent scikit-learn releases spell "no penalty" as None rather
# than the string 'none' - confirm against the installed version.
param_grid = [
    {**vect_grid,
     'clf__penalty': ['l1'],
     'clf__solver': ['liblinear', 'saga']},
    {**vect_grid,
     'clf__penalty': ['l2'],
     'clf__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']},
    {**vect_grid,
     'clf__penalty': ['none'],
     'clf__solver': ['newton-cg', 'lbfgs', 'sag', 'saga']},
]

# Parameters for the vectorizer and SVC classifier
param_grid_svc = [
    {**vect_grid,
     'clf__tol': [0.001, 0.0001, 0.00001, 0.000001]}
]


In [17]:
# Cast the sentiment labels to integer class ids
sentiment_train = sentiment_train.astype('int')
sentiment_test = sentiment_test.astype('int')

# Best parameters for LR and vectorizer for predicting SENTIMENT
best_lr_sent = get_best_params(
    param_grid=param_grid,
    vectorizer=TfidfVectorizer(),
    classifier=LogisticRegression(),
    x_train=review_train,
    y_train=sentiment_train,
)
best_lr_sent

{'clf__penalty': 'l2',
 'clf__solver': 'newton-cg',
 'vect__min_df': 0.0005,
 'vect__ngram_range': (1, 2)}

In [18]:
# Rebuild the LR classifier and its TF-IDF vectorizer from the grid-search winners
lr = LogisticRegression(
    penalty=best_lr_sent['clf__penalty'],
    solver=best_lr_sent['clf__solver'],
)
vect_lr_sent = TfidfVectorizer(
    ngram_range=best_lr_sent['vect__ngram_range'],
    min_df=best_lr_sent['vect__min_df'],
)

# Vectorize the reviews, then fit on the train split and score on the test split
x_train, x_test = to_vector(vect_lr_sent, review_train, review_test)
model_lr_sent, lr_sent_accuracy = model_evaluation(
    lr, x_train, x_test, sentiment_train, sentiment_test
)

Number of features = 11466
Accuracy score = 0.6910714285714286


In [19]:
# Best parameters for SVC and vectorizer for predicting SENTIMENT
best_svc_sent = get_best_params(
    param_grid=param_grid_svc,
    vectorizer=TfidfVectorizer(),
    classifier=LinearSVC(),
    x_train=review_train,
    y_train=sentiment_train,
)
best_svc_sent

{'clf__tol': 0.001, 'vect__min_df': 0.0005, 'vect__ngram_range': (1, 3)}

In [20]:
# Rebuild the SVC classifier and its TF-IDF vectorizer from the grid-search winners
svc = LinearSVC(tol=best_svc_sent['clf__tol'])
vect_svc_sent = TfidfVectorizer(
    ngram_range=best_svc_sent['vect__ngram_range'],
    min_df=best_svc_sent['vect__min_df'],
)

# Vectorize the reviews, then fit on the train split and score on the test split
x_train, x_test = to_vector(vect_svc_sent, review_train, review_test)
model_svc_sent, svc_sent_accuracy = model_evaluation(
    svc, x_train, x_test, sentiment_train, sentiment_test
)

Number of features = 16198
Accuracy score = 0.6732142857142858


In [21]:
# Keep whichever sentiment model scored higher on the test split, with its vectorizer
final_vect, final_model = get_best_model(model_lr_sent, vect_lr_sent, lr_sent_accuracy, model_svc_sent, vect_svc_sent, svc_sent_accuracy)

# Use 'with' so each file handle is closed as soon as its pickle is written
with open('vect_sent.pickle', 'wb') as vect_file:
    pickle.dump(final_vect, vect_file)
with open('sentiment_classifier.pickle', 'wb') as model_file:
    pickle.dump(final_model, model_file)

In [22]:
# Best parameters for LR and vectorizer for predicting CATEGORY
best_lr_cat = get_best_params(
    param_grid=param_grid,
    vectorizer=TfidfVectorizer(),
    classifier=LogisticRegression(),
    x_train=review_train,
    y_train=category_train,
)
best_lr_cat

{'clf__penalty': 'none',
 'clf__solver': 'saga',
 'vect__min_df': 0.0005,
 'vect__ngram_range': (1, 2)}

In [23]:
# Rebuild the LR classifier and its TF-IDF vectorizer from the grid-search winners
lr = LogisticRegression(
    penalty=best_lr_cat['clf__penalty'],
    solver=best_lr_cat['clf__solver'],
)
vect_lr_cat = TfidfVectorizer(
    ngram_range=best_lr_cat['vect__ngram_range'],
    min_df=best_lr_cat['vect__min_df'],
)

# Vectorize the reviews, then fit on the train split and score on the test split
x_train, x_test = to_vector(vect_lr_cat, review_train, review_test)
model_lr_cat, lr_cat_accuracy = model_evaluation(
    lr, x_train, x_test, category_train, category_test
)


Number of features = 11466
Accuracy score = 0.7428571428571429


In [24]:
# Best parameters for SVC and vectorizer for predicting CATEGORY
best_svc_cat = get_best_params(param_grid_svc, TfidfVectorizer(), LinearSVC(), review_train, category_train)
best_svc_cat

{'clf__tol': 0.001, 'vect__min_df': 0.0005, 'vect__ngram_range': (1, 2)}

In [25]:
# Rebuild the SVC classifier and its TF-IDF vectorizer from the grid-search winners
svc = LinearSVC(tol=best_svc_cat['clf__tol'])
vect_svc_cat = TfidfVectorizer(
    ngram_range=best_svc_cat['vect__ngram_range'],
    min_df=best_svc_cat['vect__min_df'],
)

# Vectorize the reviews, then fit on the train split and score on the test split
x_train, x_test = to_vector(vect_svc_cat, review_train, review_test)
model_svc_cat, svc_cat_accuracy = model_evaluation(
    svc, x_train, x_test, category_train, category_test
)

Number of features = 11466
Accuracy score = 0.7482142857142857


In [26]:
# Keep whichever category model scored higher on the test split, with its vectorizer
final_vect, final_model = get_best_model(model_lr_cat, vect_lr_cat, lr_cat_accuracy, model_svc_cat, vect_svc_cat, svc_cat_accuracy)

# Use 'with' so each file handle is closed as soon as its pickle is written
with open('vect_cat.pickle', 'wb') as vect_file:
    pickle.dump(final_vect, vect_file)
with open('category_classifier.pickle', 'wb') as model_file:
    pickle.dump(final_model, model_file)