In [1]:
import pandas as pd

In [2]:
l_raw = pd.read_excel("data/LData.xlsx", encoding='utf-8', index_col='Unnamed: 0')
l_raw.head(1)

Unnamed: 0,app_name,user_id,user_name,date,country,version,score,topic,review,url,review_id,category_final,sentiment_final,req_final
52505.0,Facebook,246193109,Help is herr,2017-06-21 00:00:00,United States,97,1,Notifications not showing up,The notification badges are showing up on my i...,https://itunes.apple.com/WebObjects/MZStore.wo...,53163.0,requirement,neutral,functional


In [3]:
l_raw.info()
l = l_raw

<class 'pandas.core.frame.DataFrame'>
Float64Index: 3000 entries, 52505.0 to 45413.0
Data columns (total 14 columns):
app_name           3000 non-null object
user_id            3000 non-null object
user_name          2993 non-null object
date               3000 non-null object
country            3000 non-null object
version            3000 non-null object
score              3000 non-null int64
topic              2994 non-null object
review             2996 non-null object
url                2850 non-null object
review_id          2850 non-null float64
category_final     3000 non-null object
sentiment_final    2796 non-null object
req_final          1081 non-null object
dtypes: float64(1), int64(1), object(12)
memory usage: 351.6+ KB


# DATA CLEANING

* Reindex

In [4]:
l = l.reset_index(drop=True)
l.head()

Unnamed: 0,app_name,user_id,user_name,date,country,version,score,topic,review,url,review_id,category_final,sentiment_final,req_final
0,Facebook,246193109,Help is herr,2017-06-21 00:00:00,United States,97,1,Notifications not showing up,The notification badges are showing up on my i...,https://itunes.apple.com/WebObjects/MZStore.wo...,53163.0,requirement,neutral,functional
1,Facebook,43034279,javamdnss,2017-06-16 00:00:00,United States,97,1,Hate it!,Why do they make changes we don't need? Now th...,https://itunes.apple.com/WebObjects/MZStore.wo...,53905.0,other,very negative,
2,Facebook,496978255,,2017-05-27 00:00:00,Hong Kong,94,1,Useless function n poor experience,Story is useless n annoying to user. \nCan't s...,https://itunes.apple.com/WebObjects/MZStore.wo...,47401.0,other,negative,
3,Facebook,139595037,Gilbertiggy,2017-05-26 00:00:00,United Kingdom,94,1,To many updates!,This app is always having an update for someth...,https://itunes.apple.com/WebObjects/MZStore.wo...,42233.0,requirement,negative,functional
4,Facebook,180832062,Princess Lou 24,2017-06-01 00:00:00,United Kingdom,94,1,Photo albums,Just spent an hour trying to upload photos and...,https://itunes.apple.com/WebObjects/MZStore.wo...,42066.0,requirement,negative,non-functional


* Drop unneccessary columns

In [5]:
l = l.drop(['user_id', 'url', 'review_id', 'version', 'user_name', 'app_name', 'date', 'req_final'], axis=1)
l.head(1)

Unnamed: 0,country,score,topic,review,category_final,sentiment_final
0,United States,1,Notifications not showing up,The notification badges are showing up on my i...,requirement,neutral


* Rename label columns

In [6]:
l = l.rename(columns = {
    "category_final": "category",
    "sentiment_final": "sentiment"
}, errors="raise")
l.head(1)

Unnamed: 0,country,score,topic,review,category,sentiment
0,United States,1,Notifications not showing up,The notification badges are showing up on my i...,requirement,neutral


* Remove NaN rows

In [7]:
l = l.dropna()
l.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2795 entries, 0 to 2999
Data columns (total 6 columns):
country      2795 non-null object
score        2795 non-null int64
topic        2795 non-null object
review       2795 non-null object
category     2795 non-null object
sentiment    2795 non-null object
dtypes: int64(1), object(5)
memory usage: 152.9+ KB


In [8]:
# https://docs.python.org/2/library/string.html
import string
import nltk
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import re

In [9]:
def remove_punctuation(data, column):
    """
    Uses string.punctuation list to remove unwarranted characeters
    !"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
    """
    punctuation_table = str.maketrans('', '', string.punctuation)
    stripped = [review.translate(punctuation_table) for review in data[column]]
    return stripped

def lowercase(data, column):
    review_arr = data[column].to_list()
    return [review.lower() for review in review_arr]

def asciionly(data, column):
    new_reviews = []
    for review in data[column]:
        review = re.sub('[^A-Za-z0-9\s]', '', review)
        new_reviews.append(review)
    return new_reviews

def new_line_to_space(data, column):
    new_reviews = []
    for review in data[column]:
        nr = review.replace('\n', ' ')
        nr = nr.replace('\t', '') # remove tab as well
        new_reviews.append(nr)
    return new_reviews

def remove_single_characters(data, column):
    new_reviews = []
    for review in data[column]:
        words = review.split(" ")
        new_words = []
        for index, word in enumerate(words):
            if (len(word) > 1):
                new_words.append(word)
        new_review = " ".join(word for word in new_words)
        new_reviews.append(new_review)
    return new_reviews

def remove_non_english(data, column):
    words = set(nltk.corpus.words.words())
    new_reviews = []
    for review in data[column]:
        new_review = " ".join(w for w in nltk.wordpunct_tokenize(review) if w.lower() in words or not w.isalpha())
        new_reviews.append(new_review)
    return new_reviews

def remove_stopwords(data, column):
    stop_words = stopwords.words('english')
    new_reviews = []
    for review in data[column]:
        new_words = []
        for word in review.split(" "):
            if word not in stop_words:
                new_words.append(word)
        new_review = " ".join(w for w in new_words)
        new_reviews.append(new_review)  
    return new_reviews

def tokenize_words(data, column):
    tokenized_reviews = []
    for review in data[column]:
        tokenized_reviews.append(word_tokenize(review))
    return tokenized_reviews

def stem(data, column):
    porter = PorterStemmer()
    new_reviews = []
    for review_tokens in data[column]:
        stemmed = [porter.stem(word) for word in review_tokens]
        new_reviews.append(stemmed)
    return new_reviews

def lemmatize(data, column):
    wnt = WordNetLemmatizer()
    new_reviews = []
    for review_tokens in data[column]:
        lemmas = []   
        for word in review_tokens:
            lemma = wnt.lemmatize(word, pos='v')
            lemmas.append(lemma)
        new_reviews.append(lemmas)
    return new_reviews

In [10]:
# testing
review = "The notification badges are showing up on my iphone when i don't want them to and stuff I can't believe this"
rdi = { 'review' : [review] }
te = pd.DataFrame(rdi)

In [11]:
te['review'] = tokenize_words(te, 'review')
te['review'][0]

['The',
 'notification',
 'badges',
 'are',
 'showing',
 'up',
 'on',
 'my',
 'iphone',
 'when',
 'i',
 'do',
 "n't",
 'want',
 'them',
 'to',
 'and',
 'stuff',
 'I',
 'ca',
 "n't",
 'believe',
 'this']

In [12]:
type(te['review'][0])

list

In [13]:
te = lemmatize(te, 'review')
te

[['The',
  'notification',
  'badge',
  'be',
  'show',
  'up',
  'on',
  'my',
  'iphone',
  'when',
  'i',
  'do',
  "n't",
  'want',
  'them',
  'to',
  'and',
  'stuff',
  'I',
  'ca',
  "n't",
  'believe',
  'this']]

In [14]:
l.head(1)

Unnamed: 0,country,score,topic,review,category,sentiment
0,United States,1,Notifications not showing up,The notification badges are showing up on my i...,requirement,neutral


In [15]:
norm_l = l.copy()
norm_l.head()

Unnamed: 0,country,score,topic,review,category,sentiment
0,United States,1,Notifications not showing up,The notification badges are showing up on my i...,requirement,neutral
1,United States,1,Hate it!,Why do they make changes we don't need? Now th...,other,very negative
2,Hong Kong,1,Useless function n poor experience,Story is useless n annoying to user. \nCan't s...,other,negative
3,United Kingdom,1,To many updates!,This app is always having an update for someth...,requirement,negative
4,United Kingdom,1,Photo albums,Just spent an hour trying to upload photos and...,requirement,negative


In [16]:
def normalize(data, column):
    data[column] = lowercase(data, column)
    data[column] = remove_punctuation(data, column)
    data[column] = asciionly(data, column)
    data[column] = new_line_to_space(data, column)
    data[column] = remove_single_characters(data, column)
    data[column] = remove_stopwords(data, column)
    data[column] = tokenize_words(data, column)
#     data[column] = stem(data, column)
    data[column] = lemmatize(data, column)
    return data[column]

In [17]:
norm_l['review'] = normalize(norm_l, 'review')
norm_l.head(1)

Unnamed: 0,country,score,topic,review,category,sentiment
0,United States,1,Notifications not showing up,"[notification, badge, show, iphone, plus, open...",requirement,neutral


In [18]:
r = norm_l['review'][321]
r

['cant', 'remove', 'reactions', 'interface', 'unusable', 'stupid']

# CLASSIFIER TRAINING

In [136]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.preprocessing import LabelEncoder

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import model_selection, naive_bayes, svm
from sklearn.linear_model import SGDClassifier

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn import metrics

from sklearn.model_selection import cross_val_score

In [20]:
def collapse_labels(labels):
    new_labels = []
    pos = ['positive', 'very positive', 'positve', 'postive']
    neg = ['negative', 'very negative']
    for label in labels:
        label = label.lower()
        if label in pos:
            label = "positive"
        elif label in neg:
            label = "negative"
        new_labels.append(label)
    return pd.Series(new_labels)

def stringify_reviews(reviews):
    new_reviews = []
    for review in reviews:
        new_reviews.append(str(review))
    return new_reviews

## SENTIMENT

In [21]:
labels = collapse_labels(norm_l['sentiment'])
reviews = stringify_reviews(norm_l['review'])

In [181]:
from sklearn.pipeline import Pipeline
import numpy as np

tfidf_vectorizer = TfidfVectorizer(max_features=5000)
count_vectorizer = CountVectorizer(min_df=25, ngram_range=(1,2))

def train_clfs(reviews, labels, mf=5000):
    """
    Encode target labels with value between 0 and n_classes-1.
    """
    encoder = LabelEncoder()
    encoded_labels = encoder.fit_transform(labels)
    
    """
    Split training and test data (70/30)
    """
    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        reviews,
        encoded_labels,
        random_state=8,
        train_size=0.7
    )
    
    """
    TF-IDF
    """
    tfidf_vectorizer.fit(reviews) # on the entire vocabulary
    
    X_train_tfidf = tfidf_vectorizer.transform(X_train)
    X_test_tfidf = tfidf_vectorizer.transform(X_test)
    
    """
    Models
    """
    nb = naive_bayes.MultinomialNB().fit(X_train_tfidf, y_train)
    nb_y_preds = nb.predict(X_test_tfidf)
    print_performance("multinomial naive bayes",
                     y_test, nb_y_preds)
    
    SVM = svm.SVC(C=1.0, kernel='linear', gamma='auto').fit(X_train_tfidf, y_train)
    SVM_y_preds = SVM.predict(X_test_tfidf)
    print_performance("svm", y_test, SVM_y_preds)
        
    count_vectorizer.fit(X_train)
    X_train_vect = count_vectorizer.transform(X_train)
    X_test_vect = count_vectorizer.transform(X_test)
    
    lr = LogisticRegression(solver='liblinear', multi_class='auto').fit(X_train_vect, y_train)
    lr_y_preds = lr.predict(X_test_vect)
    print_performance("logistic regression", y_test, lr_y_preds)
    
    rf = RandomForestClassifier(max_depth=50).fit(X_train_tfidf, y_train)
    X_vect_test_tfidf = tfidf_vectorizer.transform(X_test)
    rf_y_preds = rf.predict(X_vect_test_tfidf)
    print_performance("random forest", y_test, rf_y_preds)
    
    sgd = SGDClassifier(loss='hinge', penalty='l2',
                             alpha=1e-3, random_state=42,
                             max_iter=5, tol=None).fit(X_train_tfidf, y_train)
    sgd_y_preds = sgd.predict(X_vect_test_tfidf)
    print_performance("sgd", y_test, sgd_y_preds)
    
def print_performance(title, test, predicted):
    print(f'/\/\/\/\/ {title}')
    print("Recall\t\t",recall_score(test, predicted, average="weighted")*100)
    print()
    print("Precision\t", precision_score(test, predicted, average='weighted')*100)
    print()
    print("F1\t\t", f1_score(test, predicted, average='weighted')*100, "\n")
    

text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(loss='hinge', penalty='l2',
                         alpha=1e-3, random_state=42,
                         max_iter=8, tol=None)),
])
    
def train_clfs2(reviews, labels, mf=5000):
    encoder = LabelEncoder()
    e_labels = encoder.fit_transform(labels)
    
    X_train, X_test, y_train, y_test = model_selection.train_test_split(reviews, e_labels,
                                                                        random_state=42, train_size=0.7)
    
    text_clf.fit(X_train, y_train)
    
    y_pred = text_clf.predict(X_test)
    print(np.mean(y_pred == y_test))
    print(metrics.classification_report(y_test, y_pred, target_names=['positive', 'neutral', 'negative']))

In [182]:
def get_tfidf(reviews, X_train):
    tfidf_vectorizer.fit(reviews)
    X_train_tfidf = tfidf_vectorizer.transform(X_train)
    return X_train_tfidf
    
def get_count_vect(X_train, X_test):
    count_vectorizer.fit(X_train)
    X_train_vect = count_vectorizer.transform(X_train)
    return X_train_vect

In [183]:
train_clfs2(reviews, labels)

0.6567342073897497
              precision    recall  f1-score   support

    positive       0.64      0.63      0.63       299
     neutral       0.55      0.47      0.50       232
    negative       0.74      0.82      0.78       308

    accuracy                           0.66       839
   macro avg       0.64      0.64      0.64       839
weighted avg       0.65      0.66      0.65       839



In [184]:
train_clfs(reviews, labels)

/\/\/\/\/ multinomial naive bayes
Recall		 66.15017878426698

Precision	 66.70088015702542

F1		 64.49623885060626 

/\/\/\/\/ svm
Recall		 67.34207389749702

Precision	 67.40061457203662

F1		 67.36049779767703 

/\/\/\/\/ logistic regression
Recall		 61.14421930870083

Precision	 60.73714073131784

F1		 60.877959094766396 

/\/\/\/\/ random forest
Recall		 65.07747318235995

Precision	 64.39140090253483

F1		 64.29030139797496 

/\/\/\/\/ sgd
Recall		 67.46126340882003

Precision	 66.68513014297989

F1		 66.74201356098311 



In [None]:
# add extra features blindly
lf = l.copy()
lf.head(1)

In [186]:
f_reviews = []

"""
Take the country, score, and topic field, and concatenate with the review string.
"""
for index, row in lf.iterrows():
    country = str(row['country'])
    score = str(row['score'])
    topic = str(row['topic'])
    review = str(row['review'])
    features = country + " " + score + " " + topic + " " + review
    f_reviews.append(features)
    
lf['review'] = f_reviews
lf['review'] = normalize(lf, 'review')

f_reviews = stringify_reviews(lf['review'])

In [187]:
train_clfs2(f_reviews, labels)

0.6734207389749702
              precision    recall  f1-score   support

    positive       0.65      0.66      0.65       299
     neutral       0.58      0.46      0.51       232
    negative       0.74      0.85      0.79       308

    accuracy                           0.67       839
   macro avg       0.66      0.66      0.65       839
weighted avg       0.66      0.67      0.67       839



In [188]:
train_clfs(f_reviews, labels)

/\/\/\/\/ multinomial naive bayes
Recall		 67.34207389749702

Precision	 67.09115623856893

F1		 65.60951060580464 

/\/\/\/\/ svm
Recall		 68.89153754469606

Precision	 69.14623635671052

F1		 69.00342921910872 

/\/\/\/\/ logistic regression
Recall		 63.17044100119189

Precision	 62.88423495372807

F1		 62.9456187191635 

/\/\/\/\/ random forest
Recall		 67.93802145411205

Precision	 67.38084277536915

F1		 67.44584728565385 

/\/\/\/\/ sgd
Recall		 70.20262216924911

Precision	 69.5406349130178

F1		 69.6127972478808 



# HYPERPARAM TUNING

In [197]:
from sklearn.model_selection import GridSearchCV
from sklearn import metrics

parameters = {
    'vect__ngram_range': [(1,1), (1, 2)],
    'tfidf__use_idf': (True, False),
    'clf__alpha': (1e-2, 1e-3),
}

gs_clf = GridSearchCV(text_clf, parameters, cv=5, n_jobs=1)

In [198]:
encoder = LabelEncoder()
e_labels = encoder.fit_transform(labels)

X_train, X_test, y_train, y_test = model_selection.train_test_split(f_reviews, e_labels,
                                                                    random_state=42, train_size=0.6)

In [199]:
gs_clf = gs_clf.fit(X_train, y_train)

In [200]:
gs_clf.best_score_

0.6696588486140725

In [201]:
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, gs_clf.best_params_[param_name]))

clf__alpha: 0.001
tfidf__use_idf: True
vect__ngram_range: (1, 1)


# CROSS VALIDATION

In [202]:
scores = cross_val_score(text_clf, f_reviews, labels, cv=5)

In [203]:
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.64 (+/- 0.04)


# CONFUSION MATRIX

In [205]:
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_predict
y_train_pred = cross_val_predict(text_clf, X_train, y_train, cv=5)
confusion_matrix(y_train, y_train_pred)

array([[407, 109,  67],
       [171, 241,  97],
       [ 65,  45, 475]])

In [None]:
# SCIKIT-LEARN MOVIE REVIEW DATASET (SENTIMENT ANALYSIS)