In [None]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
import json
import numpy as np
def retrieve_data(fileName):
  """Load and return the parsed contents of a JSON file.

  Args:
    fileName: Path of the JSON file to read.

  Returns:
    The deserialized value produced by `json.load` for that file.

  Raises:
    OSError: If the file cannot be opened.
    json.JSONDecodeError: If the file does not contain valid JSON.
  """
  # The context manager closes the handle even when parsing fails. JSON text
  # is UTF-8, so decode it explicitly rather than with the platform default.
  with open(fileName, encoding="utf-8") as f:
    return json.load(f)

def extract_review_and_label(data, label):
  """Split loaded records into parallel review and label lists.

  Args:
    data: Sequence of dicts, each holding the review text under the
      'stopwords_removal_lemmatization' key.
    label: Class id assigned to every record in `data`.

  Returns:
    A `(reviews, labels)` tuple of two lists of equal length.
  """
  reviews = []
  labels = []
  for record in data:
    reviews.append(record['stopwords_removal_lemmatization'])
    labels.append(label)
  return reviews, labels

In [None]:
# Load each category file and tag its reviews with a class id:
# 0 = feature, 1 = user experience, 2 = rating, 3 = bug.
feature_data = retrieve_data("Feature.json")
feature_reviews, feature_labels = extract_review_and_label(feature_data, 0)

user_experience_data = retrieve_data("UserExperience.json")
user_experience_reviews, user_experience_labels = extract_review_and_label(user_experience_data, 1)

rating_data = retrieve_data("Rating.json")
rating_reviews, rating_labels = extract_review_and_label(rating_data, 2)

bug_data = retrieve_data("Bug.json")
bug_reviews, bug_labels = extract_review_and_label(bug_data, 3)

# Stack the four categories into one review array and one label array.
reviews = np.concatenate((feature_reviews, user_experience_reviews, rating_reviews, bug_reviews))
labels = np.concatenate((feature_labels, user_experience_labels, rating_labels, bug_labels))

# Shuffle reviews and labels with the same permutation so pairs stay aligned.
# A locally seeded generator keeps the order (and therefore the fold
# membership used later) identical across runs without touching NumPy's
# global random state.
randomize = np.arange(len(reviews))
np.random.RandomState(42).shuffle(randomize)
reviews = reviews[randomize]
labels = labels[randomize]

In [None]:
import re 
from nltk import WordNetLemmatizer
from nltk.stem.porter import *
from nltk.corpus import stopwords
nltk.download('wordnet')
nltk.download('stopwords')
from sklearn.feature_extraction.text import TfidfVectorizer 
import pandas as pd

def ignore_top_n_tfidf(k,reviews):
  """Select k non-stopword terms from a review set, ranked by TF-IDF.

  All reviews are merged into one document, a `TfidfVectorizer` is fitted on
  that single document, and the vocabulary is sorted by score in descending
  order. English stopwords are dropped and the last `k` remaining terms are
  returned.

  NOTE(review): the name says "top", but slicing `[-k:]` from a descending
  ranking yields the k lowest-scoring terms. That behaviour is kept as-is;
  confirm which end of the ranking is intended.

  Args:
    k: Number of terms to return. Values <= 0 yield an empty list.
    reviews: Iterable of review strings.

  Returns:
    A list with at most `k` terms.
  """
  # `[-0:]` would return the whole vocabulary, so handle k <= 0 explicitly.
  if k <= 0:
    return []

  tfidf_vectorizer = TfidfVectorizer(use_idf=True)

  # One join instead of repeated string concatenation; the vectorizer
  # tokenizes the text itself, so surrounding whitespace is irrelevant.
  combined = " ".join(reviews)
  tfidf_vectors = tfidf_vectorizer.fit_transform([combined])

  # `get_feature_names` is absent from newer scikit-learn releases; fall back
  # to it only when `get_feature_names_out` is unavailable.
  if hasattr(tfidf_vectorizer, "get_feature_names_out"):
    terms = tfidf_vectorizer.get_feature_names_out()
  else:
    terms = tfidf_vectorizer.get_feature_names()

  df = pd.DataFrame(tfidf_vectors.T.toarray(), index=terms, columns=["tfidf"])
  df = df.sort_values(by=["tfidf"],ascending=False)

  # Read the stopword list once; the original re-read it for every term.
  english_stopwords = set(stopwords.words("english"))
  to_ignore = [w for w in df.index if w not in english_stopwords][-k:]
  return to_ignore

def _flatten_ignore_terms(terms):
  """Collect words from a flat or one-level-nested iterable into a set."""
  flat = set()
  for item in terms:
    if isinstance(item, str):
      flat.add(item)
    else:
      flat.update(item)
  return flat

def preprocess(review, ignore=None):
  """Normalize one review into a space-separated string of unique words.

  Non-letter characters become spaces, the text is lower-cased, duplicate
  words are removed (keeping first-occurrence order), and words of length
  <= 2 or present in the ignore collection are dropped.

  Args:
    review: Raw review text.
    ignore: Optional iterable of words to drop, either flat or a list of
      word lists. Defaults to the module-level `to_ignore`.

  Returns:
    The cleaned review as a single string (possibly empty).
  """
  if ignore is None:
    ignore = to_ignore
  # Flatten so membership works whether the collection holds words or lists
  # of words; a plain `in` on a list of lists never matches a string.
  ignored = _flatten_ignore_terms(ignore)

  letters_only = re.sub('[^a-zA-Z]',' ',review).lower()
  # dict.fromkeys de-duplicates while preserving order. A set would make the
  # word order depend on string hashing, which differs between interpreter
  # runs.
  unique_words = dict.fromkeys(letters_only.split())
  return " ".join(w for w in unique_words if len(w) > 2 and w not in ignored)

# Words that `preprocess` drops: ten terms selected from each category.
to_ignore = []
for dataset in [feature_reviews, user_experience_reviews, rating_reviews,bug_reviews]:
  # extend, not append: appending would nest one list per category, and a
  # string is never `in` a list of lists, so nothing would be filtered.
  to_ignore.extend(ignore_top_n_tfidf(10,dataset))

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Clean every review, keeping the order of `reviews`.
results = [preprocess(review_text) for review_text in reviews]

In [None]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
# One TaggedDocument per cleaned review, tagged with its position in
# `results` as a string.
tagged_data = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(results)]
max_epochs = 100
vec_size = 300
alpha = 0.025
# Final learning rate: 0.025 - 100 * 0.0002, the value the previous manual
# schedule ended on.
min_alpha = 0.005

model = Doc2Vec(vector_size=vec_size,
                alpha=alpha,
                min_alpha=min_alpha,
                min_count=1,
                dm=1,
                epochs=max_epochs)

model.build_vocab(tagged_data)

# Train with a single call and let gensim decay the learning rate from
# `alpha` to `min_alpha` itself. Calling train() in a loop with
# epochs=model.epochs runs max_epochs * model.epochs passes in total, and
# hand-adjusting alpha between calls bypasses the built-in schedule.
model.train(tagged_data,
            total_examples=model.corpus_count,
            epochs=model.epochs)

model.save("d2v.model")
print("Model Saved")

def extract_features():
  """Return the trained document-vector matrix of the module-level `model`.

  Reads `model.dv.vectors` when the model exposes a `dv` attribute and falls
  back to `model.docvecs.vectors_docs` otherwise, so both attribute layouts
  are supported.

  Returns:
    Whatever array the model stores under the attribute described above.
  """
  doc_vectors = getattr(model, "dv", None)
  if doc_vectors is not None:
    return doc_vectors.vectors
  return model.docvecs.vectors_docs

iteration 0
iteration 1
iteration 2
iteration 3
iteration 4
iteration 5
iteration 6
iteration 7
iteration 8
iteration 9
iteration 10
iteration 11
iteration 12
iteration 13
iteration 14
iteration 15
iteration 16
iteration 17
iteration 18
iteration 19
iteration 20
iteration 21
iteration 22
iteration 23
iteration 24
iteration 25
iteration 26
iteration 27
iteration 28
iteration 29
iteration 30
iteration 31
iteration 32
iteration 33
iteration 34
iteration 35
iteration 36
iteration 37
iteration 38
iteration 39
iteration 40
iteration 41
iteration 42
iteration 43
iteration 44
iteration 45
iteration 46
iteration 47
iteration 48
iteration 49
iteration 50
iteration 51
iteration 52
iteration 53
iteration 54
iteration 55
iteration 56
iteration 57
iteration 58
iteration 59
iteration 60
iteration 61
iteration 62
iteration 63
iteration 64
iteration 65
iteration 66
iteration 67
iteration 68
iteration 69
iteration 70
iteration 71
iteration 72
iteration 73
iteration 74
iteration 75
iteration 76
iteration

In [None]:
# Feature matrix comes from the trained document vectors; targets are the
# shuffled class labels.
X_train = extract_features()
y_train = labels

In [None]:
from sklearn.metrics import accuracy_score
import warnings; warnings.simplefilter('ignore')
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

def calculate_accuracy_score(y_true, y_predicted):
  """Return the accuracy of `y_predicted` measured against `y_true`.

  Thin wrapper that forwards both arguments unchanged to `accuracy_score`.
  """
  return accuracy_score(y_true, y_predicted)

def cross_validation(features, target, index, n_folds=5):
  """Split data into training and validation parts for one fold.

  Each fold is a contiguous slice of `len(features) // n_folds` rows. When
  the length is not divisible by `n_folds`, the trailing remainder rows are
  never held out and stay in the training part of every fold.

  Args:
    features: 2-D array of samples (rows) by features (columns).
    target: 1-D array of labels aligned with `features`.
    index: Zero-based fold number to hold out; must be in [0, n_folds).
    n_folds: Total number of folds (default 5).

  Returns:
    A `(train, train_target, validation, validation_target)` tuple.

  Raises:
    ValueError: If `index` is outside [0, n_folds).
  """
  # An out-of-range index would otherwise yield an empty or mis-sliced
  # validation set without any error.
  if not 0 <= index < n_folds:
    raise ValueError("index must satisfy 0 <= index < n_folds")

  slice_length = len(features) // n_folds
  start = slice_length * index
  stop = start + slice_length
  held_out = slice(start, stop)

  train = np.delete(features, held_out, 0)
  train_target = np.delete(target, held_out)
  validation = features[start:stop, :]
  validation_target = target[start:stop]
  return train, train_target, validation, validation_target

# Per-class weights shared by all three classifiers.
class_weights = {0:12.66,1:6.15,2:1.5,3:10.1}

# Rows: 0 = SVM, 1 = decision tree, 2 = logistic regression. Columns: fold.
accuracies = np.zeros((3, 5))
for i in range(5):
  train_set, train_target, validation_set, validation_target = cross_validation(X_train, y_train, i)

  # SVM
  svclassifier = SVC(kernel="linear",class_weight=class_weights, random_state = 42)
  svclassifier.fit(train_set, train_target)
  y_pred = svclassifier.predict(validation_set)
  accuracy = calculate_accuracy_score(validation_target, y_pred)
  accuracies[0][i] = accuracy

  # DECISION TREE
  decisiontree = DecisionTreeClassifier(random_state = 42,max_depth=3, min_samples_leaf=5, class_weight=class_weights)
  # Fit on the training fold only: fitting on X_train/y_train would include
  # the validation rows and inflate the score.
  decisiontree.fit(train_set, train_target)
  y_pred = decisiontree.predict(validation_set)
  accuracy = calculate_accuracy_score(validation_target, y_pred)
  accuracies[1][i] = accuracy

  #LR
  lr = LogisticRegression(random_state=42, class_weight=class_weights)
  # Same rule as above: never train on the held-out fold.
  lr.fit(train_set, train_target)
  y_pred = lr.predict(validation_set)
  accuracy = calculate_accuracy_score(validation_target, y_pred)
  accuracies[2][i] = accuracy

In [None]:
# Rows: SVM, decision tree, logistic regression; one column per fold.
accuracies

array([[0.57640751, 0.63806971, 0.57774799, 0.61126005, 0.6233244 ],
       [0.47453083, 0.47855228, 0.48793566, 0.46514745, 0.50938338],
       [0.70509383, 0.73190349, 0.69839142, 0.71581769, 0.7386059 ]])