### Load some modules

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import os
sub_dir = '/Users/dominicbates/Documents/GitHub/app-review-classifier/'
os.chdir(sub_dir)
import app_review_classifier.text_processing as processing

### Load review data

In [2]:
df = pd.read_csv('/Users/dominicbates/Documents/GitHub/app-review-classifier/data/classified_sample_2nd_sep.csv')

In [3]:
m_sample = (df['Your Name'] == 'Dom')
cleaned_df = df[m_sample] 

# Remove nulls
cleaned_df = cleaned_df[pd.notnull(cleaned_df['Category'])]
cleaned_df = cleaned_df[pd.notnull(cleaned_df['Sentiment'])].reset_index(drop=True)


In [8]:
cleaned_df.head()

Unnamed: 0,Date,Rating,Title,App,OS,Review,uniqueid,row_created_date,Category,Sentiment,Your Name
0,15-Apr-20,5,Beautiful App - Login Woes,The Economist,iOS,"Love the app, beautiful and love the audio int...",apple_The Economist_sirbeeth,01/09/2022 10:00,B,M,Dom
1,05-Sep-19,1,World in Brief is repeated in Today’s Agenda,Espresso,iOS,"The Espresso was good earlier, reading it used...",apple_Espresso_Abyss312,01/09/2022 10:00,J,N,Dom
2,20-Sep-21,5,Being Brief,The Economist,iOS,The Economist is to be complimented on its “Mo...,apple_The Economist_DVJake,01/09/2022 12:00,J,P,Dom
3,20-Aug-22,5,ESSENTIAL,The Economist,iOS,Good writing is good thinking with ink (or pix...,apple_The Economist_jhatpro,01/09/2022 12:00,J,P,Dom
4,08-Mar-21,1,,The Economist,Android,You need to pay for everything?!,google_The Economist_Swadesh,01/09/2022 10:00,O,N,Dom


### Do text processing and embedding

In [74]:
def split_sample(df, frac_train = 0.9, frac_validate = 0.1):
  
    # Shuffle dataframe
    df_shuffled = df.sample(frac=1, random_state=123).reset_index(drop=True)
    
    # Indexes
    train_index = int(len(df)*frac_train)
    validate_index = int(len(df)*(frac_train+frac_validate))

    # Sample dataframe
    df_train = df_shuffled[:train_index]
    df_validate = df_shuffled[train_index:validate_index]
    df_test = df_shuffled[validate_index:]
    
    return df_train, df_validate, df_test


def process_all_samples(cleaner, embedder, df_train, df_validate, df_test):
    train_X = embedder_w2v.apply(cleaner.process_raw_text(df_train['Review']))
    validate_X = embedder_w2v.apply(cleaner.process_raw_text(df_validate['Review']))
    test_X = embedder_w2v.apply(cleaner.process_raw_text(df_test['Review']))
    return train_X, validate_X, test_X



In [34]:
cleaner = processing.TextCleaner(config = {'stop_words':True,
                                           'ngrams':True,
                                           'lemmatization':True})

# Train text_cleaner on full sample
cleaned_text_all = cleaner.process_raw_text(df['Review'],train_ngrams = True)

# Train embedders on full sample
embedder_w2v = processing.Embedder(method='word2vec')
embedder_tfidf = processing.Embedder(method='tfidf')
embedder_w2v.fit(cleaned_text_all)
embedder_tfidf.fit(cleaned_text_all)


Cleaning up text and removing stopwords...
- Text cleaned in: 0.19 seconds

Training ngrams...
- Training done in: 1.11 seconds

Loading spacy model...
- Model loaded in: 0.52 seconds

Lemmatizing...
- Lemmatizing done in: 25.4 seconds
Fitting word2vec model...
- Setting up model...
- Done!
- Building Vocab...
- Built!
- Training Model...
- Trained!
Fitting tfidf model...
- Done!


In [110]:
df_train, df_validate, df_test = split_sample(cleaned_df, 0.8, 0)
X_train, X_validate, X_test = process_all_samples(cleaner, embedder_w2v, df_train, df_validate, df_test)
y_train = df_train['Category'].values
y_validate = df_validate['Category'].values
y_test = df_test['Category'].values

Cleaning up text and removing stopwords...
- Text cleaned in: 0.01 seconds

Loading spacy model...
- Model loaded in: 0.54 seconds

Lemmatizing...
- Lemmatizing done in: 1.1 seconds
Cleaning up text and removing stopwords...
- Text cleaned in: 0.0 seconds

Loading spacy model...
- Model loaded in: 0.53 seconds

Lemmatizing...
- Lemmatizing done in: 0.0 seconds
Cleaning up text and removing stopwords...
- Text cleaned in: 0.0 seconds

Loading spacy model...
- Model loaded in: 0.52 seconds

Lemmatizing...
- Lemmatizing done in: 0.35 seconds


### Train some models

In [111]:
# import xgboost as xgb
from xgboost import XGBClassifier

In [151]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

def print_performance(vals_y, pred_y, output_accuracy=False, output_confusion=False):
    print('\nConfusion matrix:')
    conf = confusion_matrix(vals_y, pred_y)
    print(conf)

    print('\nAccuracy:')
    acc = accuracy_score(vals_y, pred_y)
    print(acc)

    if output_accuracy is True:
        return acc
    elif output_confusion is True:
        return conf

In [113]:
model = XGBClassifier()
model.fit(X_train, y_train)

XGBClassifier(objective='multi:softprob')

In [118]:
model.predict(X_train)
print_performance(model.predict(X_train), y_train)
print_performance(model.predict(X_test), y_test)

Confusion matrix:
[[ 65   0   0   0   0]
 [  0  75   0   0   0]
 [  0   0  47   0   0]
 [  0   0   0  19   0]
 [  0   1   0   0 123]]

Accuracy:
0.996969696969697
Confusion matrix:
[[10  3  1  2  2]
 [ 6 13  0  1  0]
 [ 0  0  9  0  3]
 [ 0  1  0  2  2]
 [ 0  0  3  1 24]]

Accuracy:
0.6987951807228916


In [150]:
n=3
print(df_test['Review'].iloc[n])
print('\nActual:',y_test[n])
print('\nPred:',model.predict(X_test)[n])
    


Please merge Kindle subscription with electronic one, add espresso on Kindle. This standard app doesnt show espresso. Moreover espresso is shown only 2PM, while settings set to Europe and I'm in Europe. (reinstalling app doesn't help)

Actual: O

Pred: A


In [None]:
I just tried building a quick model to predict sentiment (using word2vec embedding + xgboost) and it actually does a decent job already without any optimisation, and a tiny sample! So I'm fairly hopeful, it'll work once we get some more reviews


