### Load some modules

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import os
# NOTE(review): absolute, machine-specific path; anyone else running this
# notebook has to edit it first.
sub_dir = '/Users/dominicbates/Documents/GitHub/app-review-classifier/'
# Make `sub_dir` the working directory before importing the project module
# below — presumably so the local `app_review_classifier` package resolves
# from the current directory (TODO confirm).
os.chdir(sub_dir)
import app_review_classifier.text_processing as processing

### Load review data

In [13]:
def split_sample(df, frac_train = 0.9, frac_validate = 0.1, random_state = 123):
    """
    Shuffle a dataframe and cut it into train / validate / test partitions.

    The rows are shuffled once, the index is reset to 0..n-1, and the
    shuffled frame is then sliced into three consecutive pieces. Whatever is
    left after the train and validate slices becomes the test partition.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split. It is not modified.
    frac_train : float
        Fraction of rows placed in the train partition (default 0.9).
    frac_validate : float
        Fraction of rows placed in the validate partition (default 0.1).
    random_state : int or None
        Seed forwarded to ``DataFrame.sample``. The default (123) reproduces
        the shuffle this function has always used.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_train, df_validate, df_test)``.

    Raises
    ------
    ValueError
        If either fraction is negative, or if the two fractions add up to
        more than 1 (which would otherwise silently truncate the validate
        partition and leave the test partition empty).
    """
    # Reject fractions that cannot describe a valid partition
    if frac_train < 0 or frac_validate < 0:
        raise ValueError('frac_train and frac_validate must be non-negative')
    # Small tolerance so sums such as 0.7 + 0.3 are not rejected by float error
    if frac_train + frac_validate > 1 + 1e-9:
        raise ValueError('frac_train + frac_validate must not exceed 1')

    # Shuffle dataframe
    df_shuffled = df.sample(frac=1, random_state=random_state).reset_index(drop=True)
    n_rows = len(df_shuffled)

    # Indexes (row positions where one partition ends and the next begins)
    train_index = int(n_rows*frac_train)
    validate_index = min(n_rows, int(n_rows*(frac_train+frac_validate)))

    # Sample dataframe (explicit positional slicing)
    df_train = df_shuffled.iloc[:train_index]
    df_validate = df_shuffled.iloc[train_index:validate_index]
    df_test = df_shuffled.iloc[validate_index:]

    return df_train, df_validate, df_test

In [2]:
# Read the classified review sample CSV into `df`.
# NOTE(review): absolute, machine-specific path — the same directory is already
# held in `sub_dir` above, so this could be derived from it.
df = pd.read_csv('/Users/dominicbates/Documents/GitHub/app-review-classifier/data/classified_sample_2nd_sep.csv')

In [3]:
# Boolean mask: rows whose 'Your Name' column equals 'Dom'
m_sample = (df['Your Name'] == 'Dom')

# Keep the masked rows, drop any that lack a Category or a Sentiment value,
# then renumber the index from zero
cleaned_df = (
    df.loc[m_sample]
      .dropna(subset=['Category', 'Sentiment'])
      .reset_index(drop=True)
)


In [8]:
# Preview the first rows of the filtered sample
cleaned_df.head()

Unnamed: 0,Date,Rating,Title,App,OS,Review,uniqueid,row_created_date,Category,Sentiment,Your Name
0,15-Apr-20,5,Beautiful App - Login Woes,The Economist,iOS,"Love the app, beautiful and love the audio int...",apple_The Economist_sirbeeth,01/09/2022 10:00,B,M,Dom
1,05-Sep-19,1,World in Brief is repeated in Today’s Agenda,Espresso,iOS,"The Espresso was good earlier, reading it used...",apple_Espresso_Abyss312,01/09/2022 10:00,J,N,Dom
2,20-Sep-21,5,Being Brief,The Economist,iOS,The Economist is to be complimented on its “Mo...,apple_The Economist_DVJake,01/09/2022 12:00,J,P,Dom
3,20-Aug-22,5,ESSENTIAL,The Economist,iOS,Good writing is good thinking with ink (or pix...,apple_The Economist_jhatpro,01/09/2022 12:00,J,P,Dom
4,08-Mar-21,1,,The Economist,Android,You need to pay for everything?!,google_The Economist_Swadesh,01/09/2022 10:00,O,N,Dom


### Do text processing and embedding

In [5]:
# Options handed to the text cleaner: all three switches are turned on
cleaner_config = {
    'stop_words': True,
    'ngrams': True,
    'lemmatization': True,
}
cleaner = processing.TextCleaner(config=cleaner_config)

# First pass: every review in `df`, with n-gram training requested
cleaned_text_all = cleaner.process_raw_text(
    df['Review'],
    train_ngrams=True,
)

# Second pass: only the reviews in `cleaned_df`, default arguments
cleaned_text = cleaner.process_raw_text(
    cleaned_df['Review'],
)



Cleaning up text and removing stopwords...
- Text cleaned in: 0.18 seconds

Training ngrams...
- Training done in: 0.98 seconds

Loading spacy model...
- Model loaded in: 0.61 seconds

Lemmatizing...
- Lemmatizing done in: 23.57 seconds
Cleaning up text and removing stopwords...
- Text cleaned in: 0.01 seconds

Loading spacy model...
- Model loaded in: 0.51 seconds

Lemmatizing...
- Lemmatizing done in: 1.27 seconds


In [6]:
# word2vec embedder, fitted on the cleaned text of the full review set
embedder_w2v = processing.Embedder(method='word2vec')
embedder_w2v.fit(cleaned_text_all)

# tf-idf embedder, fitted on the same cleaned text
embedder_tfidf = processing.Embedder(method='tfidf')
embedder_tfidf.fit(cleaned_text_all)


Fitting word2vec model...
- Setting up model...
- Done!
- Building Vocab...
- Built!
- Training Model...
- Trained!
Fitting tfidf model...
- Done!


In [30]:
# Split sample
df_train, ignore, df_test = split_sample(cleaned_df, 0.9, 0)

train_X = embedder_w2v.apply(cleaner.process_raw_text(df_train['Review']))
test_X = embedder_w2v.apply(cleaner.process_raw_text(df_test['Review']))

Cleaning up text and removing stopwords...
- Text cleaned in: 0.01 seconds

Loading spacy model...
- Model loaded in: 0.55 seconds

Lemmatizing...
- Lemmatizing done in: 1.21 seconds
Cleaning up text and removing stopwords...
- Text cleaned in: 0.0 seconds

Loading spacy model...
- Model loaded in: 0.57 seconds

Lemmatizing...
- Lemmatizing done in: 0.15 seconds


In [12]:
# Scratch check: display a shuffled copy of the full frame (the result is not
# assigned to anything).
# NOTE(review): no random_state is passed here, so this cell does not fix the
# row order itself; consider removing the cell or previewing with .head().
df.sample(frac=1).reset_index(drop=True)

Unnamed: 0,Date,Rating,Title,App,OS,Review,uniqueid,row_created_date,Category,Sentiment,Your Name
0,08-May-20,4,,The Economist,Android,Very informative.,google_The Economist_philippe lam,01/09/2022 10:00,,,
1,02-Aug-19,5,,Espresso,Android,best news app ever,google_Espresso_A Google user,01/09/2022 10:00,,,
2,27-Jun-18,5,,Espresso,Android,A well-degined app that allows me to start my ...,google_Espresso_A Google user,01/09/2022 10:00,,,
3,11-Mar-22,4,,Espresso,Android,"Despite occasional bugs, I like the app and en...",google_Espresso_Hesam Shahriari,01/09/2022 10:00,,,
4,23-Jul-22,2,No Split View Support,The Economist,iOS,The app doesn’t support split view / multitask...,apple_The Economist_StevenNYC,01/09/2022 10:00,,,
...,...,...,...,...,...,...,...,...,...,...,...
9110,25-Jan-19,5,Quick read,Espresso,iOS,A good quick but thorough update on the world ...,apple_Espresso_big AL G,01/09/2022 10:00,,,
9111,20-Nov-21,3,,The Economist,Android,Reports are not fair and balanced. They promot...,google_The Economist_Ivan Shim,01/09/2022 10:00,J,N,Dom
9112,06-Jul-20,1,Waste of money,The Economist,iOS,Just got billed $189.99 for renewal. For past ...,apple_The Economist_BlueGrouse,01/09/2022 12:00,,,
9113,01-Aug-22,1,Racist and Propagandist media,The Economist,iOS,They used to be good at telling economical iss...,apple_The Economist_Dead waffle,01/09/2022 10:00,,,
