### Load modules

In [128]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from gensim.models.word2vec import Word2Vec
from gensim.models.phrases import Phrases, Phraser
from gensim.utils import simple_preprocess
import spacy

# import spacy
%config InlineBackend.figure_format = 'retina'
plt.style.use('ggplot')

### Load data and create fake categories

In [2]:
reviews_df = pd.read_parquet('/Users/dominicbates/Documents/GitHub/app-review-classifier/data/reviews_sample.parquet')


In [3]:
reviews_df['classification'] = ''
reviews_df.loc[(reviews_df['rating']>=4), ['classification']] = 'Positive'
reviews_df.loc[(reviews_df['rating']<=2), ['classification']] = 'Negative'
reviews_df.loc[(reviews_df['rating']==3), ['classification']] = 'Mixed'


In [453]:
reviews_df.sample(500,random_state=1)['review'].iloc[66]


# # Main topics:
# - Bug reports "This thing is broken"
# - 1/2 word positive review, e.g. "Great", "Nice App"
# - Positive review "Amazing journalism... great app..."
# - Negative feedback "Too expensive... not very good..."
# - Suggestions "I wish..."
# - Mixed "Great app, but X..." "Love the economist, but..."

# Coule have:
# Categories:
# Bug report, positive feedback, negative feedback (non-bug report), short comment, mixed

# Or could have:
# Sentiment: positive, negative, mixed
# Type: bug report, feedback, short comment, mixed...

# Could aslso split by journalism vs app vs subscription/price?
# What happens is it's like great app, but this specific thing is broken?

'Great overview; down to the point.'

In [459]:
reviews_df.sample(500,random_state=1)[['date','rating','app','review']].to_csv('/Users/dominicbates/Documents/GitHub/app-review-classifier/data/sample_review_data.csv', index=False)



In [458]:
reviews_df.sample(200,random_state=1)[['date','rating','app','review']]

Unnamed: 0,date,rating,app,review
316,2021-01-04 13:03:44,1,The Economist (Apple),I have been using the app for several years an...
1117,2021-05-14 21:35:36,1,The Economist (Apple),Version 3.0.3 is garbage: unresponsive for a f...
10390,2018-08-24 09:34:28,5,Espresso (Google),Great
2223,2021-09-19 15:26:18,5,The Economist (Apple),Super
14985,2014-11-06 16:32:18,5,Espresso (Google),Well Done
...,...,...,...,...
11219,2017-10-16 18:27:48,5,Espresso (Google),good mix of topics short and brief to the point
13922,2016-03-26 10:09:28,5,Espresso (Google),A useful different take on world news
10491,2018-07-23 17:53:14,4,Espresso (Google),Nice interesting selection of daily news snipp...
13519,2016-04-15 20:05:38,5,Espresso (Google),The reading time exactly matches my commute. M...


### Set up samples

In [4]:
m_train = (reviews_df['date'] < '2021-06-01')
m_valid = (reviews_df['date'] >= '2021-06-01') & (reviews_df['date'] < '2022-01-01')
m_test = (reviews_df['date'] >= '2022-01-01')

print('Total data size:',len(reviews_df))
print('----------------------')
print('Training size:',m_train.sum())
print('Validation size:',m_valid.sum())
print('Test size:',m_test.sum())

df_train = reviews_df[m_train].reset_index(drop=True)
df_valid = reviews_df[m_valid].reset_index(drop=True)
df_test = reviews_df[m_test].reset_index(drop=True)


Total data size: 14986
----------------------
Training size: 11911
Validation size: 1619
Test size: 1456


### Embedder class

In [216]:
# # Default embedder configs
# default_configs = {'tfidf':{'min_df':5, # Min number of occurances of word to consider
#                             'ngram_range':(1, 1)}, # Ngram range
                   
#                    'word2vec':{'min_count':5, # Min number of occurances of word to consider
#                                'vector_size':100, # Dimension of embedding matrix (i.e. number of embedding features)
#                                'window':5, # Size of window to consider
#                                'workers':3, # Number of partitions during training
#                                'sg':1}, # Training algorothm (CBOW(0) or skip gram(1))
                   
#                    'glove':{}}


# # Custom stop words
# my_stop_words = ({'namely', 'between', 'up', 'whither', 'them', 'beside', 'your', 'about', 'hence', 'former', 'ours', 'itself', 'or', 'these', 'their', 'those', 'has', 're', 'next', 'hereupon', 'whether', 'latter', 'towards', 'over', 'yourselves', 'himself', 'beforehand', 'you', 'wherever', 'another', 'than', 'do', 'around', 'him', 'upon', 'been', 'an', 'me', 'toward', 'within', 'of', 'whole', 'ca', 'once', 'nor', 'thru', 'seeming', 'already', 'keep', 'so', 'mine', 'others', 'until', 'move', 'ourselves', 'other', 'where', 'thereupon', 'she', 'am', 'without', 'again', 'hereby', 'be', 'someone', 'sometime', 'used', 'go', 'everyone', 'some', 'then', 'see', 'to', 'seemed', 'i', 'become', 'whatever', 'and', 'what', 'that', 'thence', 'too', 'whenever', 'whereupon', 'can', 'his', 'just', 'due', 'thereby', 'done', 'name', 'none', 'part', 'noone', 'since', 'doing', 'meanwhile', 'via', 'herself', 'also', 'amount', 'seems', 'say', 'get', 'through', 'show', 'made', 'such', 'a', 'as', 'hers', 'the', 'my', 'whereafter', 'in', 'themselves', 'which', 'something', 'put', 'it', 'by', 'may', 'who', 'various', 'whence', 'throughout', 'during', 'hereafter', 'had', 'after', 'under', 'few', 'using', 'whom', 'will', 'though', 'its', 'might', 'across', 'most', 'above', 'how', 'regarding', 'being', 'our', 'afterwards', 'behind', 'make', 'almost', 'each', 'side', 'along', 'much', 'while', 'any', 'elsewhere', 'many', 'this', 'own', 'us', 'would', 'does', 'latterly', 'anything', 'when', 'are', 'with', 'onto', 'even', 'did', 'thereafter', 'yours', 'all', 'is', 'third', 'if', 'somewhere', 'nothing', 'because', 'wherein', 'whoever', 'somehow', 'either', 'every', 'out', 'whose', 'front', 'take', 'both', 'they', 'for', 'empty', 'anyone', 'back', 'formerly', 'whereby', 'full', 'here', 'into', 'myself', 'we', 'became', 'from', 'seem', 'anywhere', 'besides', 'herein', 'ever', 'at', 'her', 'must', 'therein', 'nobody','well', 'give', 'per', 'indeed', 'down', 'still', 'on', 'could', 'although', 'amongst', 'there', 'was', 'else', 'first', 'further', 'have', 'several', 'yourself', 'beyond', 'now'}
#                  | {'ill','i','id','ive','im','mine'} | 
#                  {'you','youll','youre','your','youd','youve','yours'} | 
#                  {'he','hell','hes','hed','his'} | 
#                  {'she','shell','shed','hers'} |
#                  {'they','theyre','theyd','their','theirs','theyve'} |
#                  {'weve','wed','well','our','ours'} |
#                  {'isnt','wont','shant','d','x'})



# class Embedder:
#     '''
#     Class for performing embedding (TFIDF) on clean text
#     '''
#     def __init__(self,
#                  method='tfidf',
#                  config = None):

#         # Check method exists
#         possible_methods = ['tfidf','word2vec','glove']
#         if method not in possible_methods:
#             raise ValueError('Method not recognised. try one of: '+str(possible_methods))
#         else:
#             self.method = method
            
#         # Set config params
#         if config is None:
#             self.config = default_configs[method]
#         else:
#             self.config = config
            
#         # Set up models
#         if self.method is 'tfidf':
#             self.embedder = TfidfVectorizer(min_df=self.config['min_df'], 
#                                             ngram_range=self.config['ngram_range'])
#         elif self.method is 'word2vec':
#             self.embedder = None # Gets created when training
#         elif self.method is 'glove':
#             print('not done yet')
        
        
#     def remove_stopwords_row(self, doc, min_n=3):
#         '''
#         Removes stop words from single row (e.g. 'here is some text')
#         '''
#         txt = [token for token in doc if token not in my_stop_words]
#         # return None if not enough words left
#         if len(txt) >= min_n:
#             return ' '.join(txt)
#         else:
#             return None
    
    
#     def remove_stopwords(self, text_df):

#         '''
#         Cleans whole text column and removes stop words e.g. df['cleaned_text'] = remove_stopwords(df['raw_text'])
#         '''
#         print('Cleaning up text...')
#         t1 = time.time()
#         cleaned_text = [re.sub("[^a-zA-Z ]", '', str(row)).lower() for row in text_df] # Doesn't do anything for this dataset (remove non alpha-numeric?)
#         cleaned_text = [re.sub(' +', ' ', str(row)).lower().lstrip(' ') for row in cleaned_text] # Remove multiple spaces
#         cleaned_text = [self.remove_stopwords_row(doc.split(' ')) for doc in cleaned_text] # Remove stop words
#         print('Text cleaned in: {} seconds'.format(round((time.time() - t1), 2)))

#         return cleaned_text

    
#     def lemmatization(self, texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):

#         # Load model
#         print('Loading spacy model...')
#         nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
#         print('Model loaded!')
        
#         texts_out = []
#         for sent in texts:
#             doc = nlp(" ".join(sent)) 
#             texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
        
#         return texts_out
        
        
        
#     def process_raw_text(self, raw_text, ngram_threshold=200):

#         def sentence_to_words(sentences):
#             for sentence in sentences:
#                 yield(simple_preprocess(str(sentence), deacc=True, min_len=1, max_len=20))  # deacc=True removes punctuations

#         # Remove stop words
#         cleaned_text = test_embedder.remove_stopwords(df_train['review'])

#         # Turn to list of words and remove punctuation
#         words = list(sentence_to_words(cleaned_text))

#         # Build the bigram and trigram models
#         bigram = Phrases(words, min_count=1, threshold=ngram_threshold) # higher threshold fewer phrases.
#         trigram = Phrases(bigram[words], min_count=1, threshold=ngram_threshold)  
#         bigram_mod = Phraser(bigram)
#         trigram_mod = Phraser(trigram)
        
#         # Get final word list
#         words_and_ngrams = [trigram_mod[bigram_mod[doc]] for doc in words]
        
#         # Lemmatise
#         final_words_lemmatised = self.lemmatization(words_and_ngrams)

#         return final_words_lemmatised
        
        
        
#     def fit(self, text):
#         if self.method is 'tfidf':
#             self.embedder.fit(text)
#         elif self.method is 'word2vec':
#             self.embedder = Word2Vec(text, 
#                                      min_count=self.config['min_count'],
#                                      vector_size = self.config['vector_size'],
#                                      workers = self.config['workers'],
#                                      window = self.config['window'],
#                                      sg = self.config['sg'])
#             print('not done yet')
#         elif self.method is 'glove':
#             print('not done yet')
            
            
#     def apply(self, text):
#         if self.method is 'tfidf':
#             return self.embedder.transform(text)
#         elif self.method is 'word2vec':
#             print('not done yet')
#         elif self.method is 'glove':
#             print('not done yet')
        

In [375]:
# # Default embedder configs
# default_configs = {'tfidf':{'min_df':5, # Min number of occurances of word to consider
#                             'ngram_range':(1, 1)}, # Ngram range
                   
#                    'word2vec':{'min_count':5, # Min number of occurances of word to consider
#                                'vector_size':100, # Dimension of embedding matrix (i.e. number of embedding features)
#                                'window':5, # Size of window to consider
#                                'workers':3, # Number of partitions during training
#                                'sg':1}, # Training algorothm (CBOW(0) or skip gram(1))
                   
#                    'glove':{}}


# Custom stop words
my_stop_words = ({'namely','between','up','whither','them','beside','your','about','hence','former','ours','itself','or','these','their','those','has','re','next','hereupon','whether','latter','towards','over','yourselves','himself','beforehand','you','wherever','another','than','do','around','him','upon','been','an','me','toward','within','of','whole','ca','once','nor','thru','seeming','already','keep','so','mine','others','until','move','ourselves','other','where','thereupon','she','am','without','again','hereby','be','someone','sometime','used','go','everyone','some','then','see','to','seemed','i','become','whatever','and','what','that','thence','too','whenever','whereupon','can','his','just','due','thereby','done','name','none','part','noone','since','doing','meanwhile','via','herself','also','amount','seems','say','get','through','show','made','such','a','as','hers','the','my','whereafter','in','themselves','which','something','put','it','by','may','who','various','whence','throughout','during','hereafter','had','after','under','few','using','whom','will','though','its','might','across','most','above','how','regarding','being','our','afterwards','behind','make','almost','each','side','along','much','while','any','elsewhere','many','this','own','us','would','does','latterly','anything','when','are','with','onto','even','did','thereafter','yours','all','is','third','if','somewhere','nothing','because','wherein','whoever','somehow','either','every','out','whose','front','take','both','they','for','empty','anyone','back','formerly','whereby','full','here','into','myself','we','became','from','seem','anywhere','besides','herein','ever','at','her','must','therein','nobody','well','give','per','indeed','down','still','on','could','although','amongst','there','was','else','first','further','have','several','yourself','beyond','now'}
                 | {'ill','i','id','ive','im','mine', 'you','youll','youre','your','youd','youve','yours', 'he','hell','hes','hed','his','she','shell','shed','hers','they','theyre','theyd','their','theirs','theyve','weve','wed','well','our','ours','isnt','wont','shant','d','x'})



class TextCleaner:
    def __init__(self, 
                 config=None):
        
        self.bigram_model = None
        self.trigram_model = None
        
        # Set config params
        if config is None:
            self.config = default_configs['textcleaner']
        else:
            self.config = config
        self.use_stop_words = self.config['stop_words']
        self.use_ngrams = self.config['ngrams']
        self.use_lemmatization = self.config['lemmatization']
        
    def remove_stopwords_row(self, doc, min_n=3):
        '''
        Removes stop words from single row (e.g. 'here is some text')
        '''
        txt = [token for token in doc if token not in my_stop_words]
        # return None if not enough words left
        if len(txt) >= min_n:
            return ' '.join(txt)
        else:
            return None
    
    
    def remove_stopwords(self, text_df):

        '''
        Cleans whole text column and removes stop words e.g. df['cleaned_text'] = remove_stopwords(df['raw_text'])
        '''
        print('\nCleaning up text and removing stopwords...')
        t1 = time.time()
        cleaned_text = [re.sub("[^a-zA-Z ]", '', str(row)).lower() for row in text_df] # Doesn't do anything for this dataset (remove non alpha-numeric?)
        cleaned_text = [re.sub(' +', ' ', str(row)).lower().lstrip(' ') for row in cleaned_text] # Remove multiple spaces
        cleaned_text = [self.remove_stopwords_row(doc.split(' ')) for doc in cleaned_text] # Remove stop words
        print('Text cleaned in: {} seconds'.format(round((time.time() - t1), 2)))

        return cleaned_text

    
    def lemmatization(self, texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):

        # Load model
        print('\nLoading spacy model...')
        t1 = time.time()
        nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
        print('Model loaded in: {} seconds'.format(round((time.time() - t1), 2)))
        
        # Lemmatize text
        print('\nLemmasizing...')
        t1 = time.time()
        
        texts_out = []
        for sent in texts:
            doc = nlp(" ".join(sent)) 
            texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
        
        print('Lemmatizing done in: {} seconds'.format(round((time.time() - t1), 2)))
        return texts_out
        
        
    def train_ngrams(self, words, ngram_threshold):
        # Build the bigram and trigram models
        print('\nTraining ngrams...')
        t1 = time.time()

        bigram = Phrases(words, min_count=1, threshold=ngram_threshold) # higher threshold fewer phrases.
        trigram = Phrases(bigram[words], min_count=1, threshold=ngram_threshold)  
        self.bigram_model = Phraser(bigram)
        self.trigram_model = Phraser(trigram)

        print('Training done in: {} seconds'.format(round((time.time() - t1), 2)))


    def process_raw_text(self, raw_text, 
                               train_ngrams = False, 
                               ngram_threshold=200):

        def sentence_to_words(sentences):
            for sentence in sentences:
                yield(simple_preprocess(str(sentence), deacc=True, min_len=1, max_len=20))  # deacc=True removes punctuations

        # Remove stop words (if required)
        if self.use_stop_words == True:
            cleaned_text = self.remove_stopwords(raw_text)
        else:
            cleaned_text = raw_text

        # Turn to list of words and remove punctuation
        words = list(sentence_to_words(cleaned_text))

        # Retrain ngrams (if required)
        if train_ngrams == True:
            self.train_ngrams(words, ngram_threshold)
            
        # Apply ngrams (if required)
        if self.use_ngrams == True:
            if self.bigram_model is None:
                raise ValueError('Bigram/Trigram models not trained yet! Set train_ngrams=True when calling process_raw_text() to train and save the ngram models')
            else:
                words = [self.trigram_model[self.bigram_model[doc]] for doc in words]
        
        # Lemmatise (if required)
        if self.use_lemmatization == True:
            final_words = self.lemmatization(words)
        else:
            final_words = words

        return final_words


        
#     def fit(self, text):
#         if self.method is 'tfidf':
#             self.embedder.fit(text)
#         elif self.method is 'word2vec':
#             self.embedder = Word2Vec(text, 
#                                      min_count=self.config['min_count'],
#                                      vector_size = self.config['vector_size'],
#                                      workers = self.config['workers'],
#                                      window = self.config['window'],
#                                      sg = self.config['sg'])
#             print('not done yet')
#         elif self.method is 'glove':
#             print('not done yet')
            
            
#     def apply(self, text):
#         if self.method is 'tfidf':
#             return self.embedder.transform(text)
#         elif self.method is 'word2vec':
#             print('not done yet')
#         elif self.method is 'glove':
#             print('not done yet')
        

In [474]:
# Default embedder configs
default_configs = {'textcleaner':{'stop_words':True,
                                  'ngrams':True,
                                  'lemmatization':False},
    
                   'tfidf':{'min_df':5, # Min number of occurances of word to consider
                            'ngram_range':(1, 1)}, # Ngram range
                   
                   'word2vec':{'min_count':5, # Min number of occurances of word to consider
                               'vector_size':50, # Dimension of embedding matrix (i.e. number of embedding features)
                               'window':5, # Size of window to consider
                               'workers':4, # Number of partitions during training
                               'sg':1}} # Training algorothm (CBOW(0) or skip gram(1))


class Embedder:
    '''
    Class for performing embedding on cleaned text. Can use TfidfVectorizer or word2vec
    '''
    def __init__(self,
                 method='tfidf',
                 config = None):

        # Check method exists
        possible_methods = ['tfidf','word2vec']
        if method not in possible_methods:
            raise ValueError('Method not recognised. try one of: '+str(possible_methods))
        else:
            self.method = method
            
        # Set config params
        if config is None:
            self.config = default_configs[method]
        else:
            self.config = config
            
        # Set up models
        if self.method is 'tfidf':
            self.embedder = TfidfVectorizer(min_df=self.config['min_df'], 
                                            ngram_range=self.config['ngram_range'])
        elif self.method is 'word2vec':
            self.embedder = None # Gets created when training
        elif self.method is 'glove':
            print('not done yet')
        
        
    def fit(self, sentences):
        
        if self.method is 'tfidf':
            self.embedder.fit([' '.join(sentence) for sentence in sentences]) # Requires single string with spaces
            
        elif self.method is 'word2vec':
            print('Word2Vec: Setting up model...')
            self.embedder = Word2Vec(min_count=self.config['min_count'],
                                     vector_size = self.config['vector_size'],
                                     workers = self.config['workers'],
                                     window = self.config['window'],
                                     sg = self.config['sg'])
            print('Done!\nWord2Vec: Building Vocab...')
            self.embedder.build_vocab(sentences, progress_per=1000)
            print('Done!\nWord2Vec: Training Model...')
            self.embedder.train(sentences, total_examples=self.embedder.corpus_count, epochs=50, report_delay=1)
            print('Done!')
            
            
    def apply(self, sentences):
        
        if self.method is 'tfidf':
            return self.embedder.transform([' '.join(sentence) for sentence in sentences]) # Requires single string with spaces
        
        elif self.method is 'word2vec':
            
            words = set(test_embedder.embedder.wv.index_to_key)

            # Get vectors of each word
            word_vectors = np.array([np.array([test_embedder.embedder.wv[i] for i in ls if i in words])
                                     for ls in input_sentences], dtype=object)
            # Average this for all sentences
            sentence_vectors = []
            for v in word_vectors:
                if v.size > 1:
                    sentence_vectors.append(v.mean(axis=0))
                else:
                    sentence_vectors.append(np.zeros(self.config['vector_size'], dtype=float))


            return np.array(sentence_vectors)


In [475]:
# test_embedder.fit(df_train['review'])
# test_embedder.apply(df_train['review'])

# Clean text
import logging
logging.basicConfig(level=logging.INFO)
text_cleaner = TextCleaner()
input_sentences = text_cleaner.process_raw_text(df_train['review'],
                                                train_ngrams = True)
# test_embedder = Embedder(method = 'tfidf')
# test_embedder.fit(input_sentences)
# test_embedder.apply(input_sentences)
test_embedder = Embedder(method = 'word2vec')
test_embedder.fit(input_sentences)
test_embedder.apply(input_sentences)

Cleaning up text...
Text cleaned in: 0.18 seconds


INFO:gensim.models.phrases:collecting all words and their counts
INFO:gensim.models.phrases:PROGRESS: at sentence #0, processed 0 words and 0 word types
INFO:gensim.models.phrases:PROGRESS: at sentence #10000, processed 117964 words and 77352 word types
INFO:gensim.models.phrases:collected 84906 token types (unigram + bigrams) from a corpus of 134591 words and 11911 sentences
INFO:gensim.models.phrases:merged Phrases<84906 vocab, min_count=1, threshold=200, max_vocab_size=40000000>
INFO:gensim.utils:Phrases lifecycle event {'msg': 'built Phrases<84906 vocab, min_count=1, threshold=200, max_vocab_size=40000000> in 0.17s', 'datetime': '2022-08-17T15:25:47.056761', 'gensim': '4.2.0', 'python': '3.7.4 (default, Sep  7 2019, 18:27:02) \n[Clang 10.0.1 (clang-1001.0.46.4)]', 'platform': 'Darwin-19.6.0-x86_64-i386-64bit', 'event': 'created'}
INFO:gensim.models.phrases:collecting all words and their counts
INFO:gensim.models.phrases:PROGRESS: at sentence #0, processed 0 words and 0 word types
I

Word2Vec: Setting up model...
Done!
Word2Vec: Building Vocab...
Done!
Word2Vec: Training Model...


INFO:gensim.models.word2vec:EPOCH 0: training on 132105 raw words (94147 effective words) took 0.1s, 815806 effective words/s
INFO:gensim.models.word2vec:EPOCH 1: training on 132105 raw words (93926 effective words) took 0.1s, 763732 effective words/s
INFO:gensim.models.word2vec:EPOCH 2: training on 132105 raw words (94088 effective words) took 0.1s, 783121 effective words/s
INFO:gensim.models.word2vec:EPOCH 3: training on 132105 raw words (93956 effective words) took 0.1s, 756126 effective words/s
INFO:gensim.models.word2vec:EPOCH 4: training on 132105 raw words (94070 effective words) took 0.1s, 771198 effective words/s
INFO:gensim.models.word2vec:EPOCH 5: training on 132105 raw words (94185 effective words) took 0.1s, 781740 effective words/s
INFO:gensim.models.word2vec:EPOCH 6: training on 132105 raw words (94089 effective words) took 0.1s, 806710 effective words/s
INFO:gensim.models.word2vec:EPOCH 7: training on 132105 raw words (94154 effective words) took 0.1s, 813055 effective 

Done!


array([[-0.08024027, -0.00156436,  0.0378052 , ..., -0.17058206,
         0.13666224,  0.0883866 ],
       [-0.14573821, -0.07508348, -0.09012739, ..., -0.08947746,
         0.28290763,  0.2316771 ],
       [-0.20022322, -0.17697008, -0.14460512, ..., -0.19816059,
         0.05982444,  0.1092599 ],
       ...,
       [-0.11474831,  0.03621515, -0.05771099, ..., -0.03151458,
         0.16252038,  0.05168508],
       [-0.05519113, -0.05267667, -0.17226692, ..., -0.21060862,
         0.23082063,  0.12517489],
       [-0.01745497,  0.00426032, -0.00174709, ...,  0.01497765,
        -0.00139504, -0.00324988]])

In [355]:
# size = 50

# words = set(test_embedder.embedder.wv.index_to_key)

# word_vectors = np.array([np.array([test_embedder.embedder.wv[i] for i in ls if i in words])
#                          for ls in input_sentences], dtype=object)
# sentence_vectors = []
# for v in word_vectors:
#     if v.size > 1:
#         sentence_vectors.append(v.mean(axis=0))
#     else:
#         sentence_vectors.append(np.zeros(50, dtype=float))



# X_test_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words])
#                          for ls in X_test])

In [476]:
# np.array(sentence_vectors)

In [494]:
# 50 embedding size
test_embedder.embedder.wv.most_similar('absolute')


[('experience', 0.6580408215522766),
 ('app', 0.6275638341903687),
 ('religiously', 0.6201767921447754),
 ('cause', 0.6189985871315002),
 ('gotten', 0.6082072854042053),
 ('implementation', 0.5763404369354248),
 ('content', 0.5664331912994385),
 ('intuitive', 0.5533095598220825),
 ('largely', 0.5527818202972412),
 ('nearly', 0.5476468205451965)]

In [237]:
test_cleaner = TextCleaner()
# test_embedder.fit(df_train['review'])
# test_embedder.apply(df_train['review'])

In [241]:
test_cleaner.process_raw_text(df_train['review'],
                               apply_stop_words = False,
                               apply_ngrams = False,
                               apply_lemmatization = False,
                               train_ngrams = False)


UnboundLocalError: local variable 'cleaned_text' referenced before assignment

In [197]:


n_gram_threshold = 200

def sent_to_words(sentences):
    for sentence in sentences:
        yield(simple_preprocess(str(sentence), deacc=True, min_len=1, max_len=20))  # deacc=True removes punctuations


# Remove stop words
cleaned_text = test_embedder.clean_text(df_train['review'])
        
# Turn to list of words, and remove punctuation
data_words = list(sent_to_words(cleaned_text))

# Build the bigram and trigram models
bigram = Phrases(data_words, min_count=1, threshold=n_gram_threshold) # higher threshold fewer phrases.
trigram = Phrases(bigram[data_words], min_count=1, threshold=n_gram_threshold)  

bigram_mod = Phraser(bigram)
trigram_mod = Phraser(trigram)
final_words = [trigram_mod[bigram_mod[doc]] for doc in data_words]



Cleaning up text...
Text cleaned in: 0.2 seconds


In [205]:
final_words[24]

['old',
 'app',
 'gold_standard',
 'magazine',
 'apps',
 'functionality',
 'perfect',
 'let',
 'immerse',
 'weeks',
 'magazine',
 'no',
 'distractions',
 'felt',
 'closer',
 'print',
 'version',
 'menu_bar',
 'top',
 'always',
 'visible',
 'allowing',
 'click',
 'week',
 'want',
 'away',
 'scroll',
 'downread',
 'shouldnt',
 'refer',
 'weekthats',
 'section',
 'magazine',
 'confusing',
 'call',
 'magazine',
 'app',
 'never',
 'listen',
 'audio',
 'version',
 'why',
 'page',
 'dont',
 'why',
 'mess',
 'perfection',
 'use',
 'classic',
 'app',
 'but',
 'apparently',
 'doesnt',
 'recognize',
 'economist',
 'subscription',
 'frustrating']

In [183]:
bigram_mod.__dict__

{'threshold': 300,
 'min_count': 1,
 'delimiter': '_',
 'scoring': <function gensim.models.phrases.original_scorer(worda_count, wordb_count, bigram_count, len_vocab, min_count, corpus_word_count)>,
 'connector_words': frozenset(),
 'phrasegrams': {'college_student': 3537.75,
  'washington_post': 1787.494736842105,
  'york_times': 412.7375,
  'images_advance': 303.23571428571427,
  'portrait_mode': 317.49038461538464,
  'restore_purchase': 851.187969924812,
  'tried_deleting': 449.71398305084745,
  'deleting_reinstalling': 795.99375,
  'israel_crimes': 7075.5,
  'social_media': 450.25909090909096,
  'cover_cover': 318.2648896293211,
  'went_wrong': 548.9612068965517,
  'top_notch': 333.75,
  'conversion_rates': 1415.1,
  'minor_inconveniences': 975.9310344827586,
  'customer_service': 365.25442477876106,
  'check_marks': 379.04464285714283,
  'accidentally_click': 355.4677033492823,
  'web_browser': 312.15441176470586,
  'big_fan': 402.25145772594755,
  'gold_standard': 816.403846153846

In [7]:
# https://www.kaggle.com/code/pierremegret/gensim-word2vec-tutorial/notebook



In [8]:
# df_train['review']

In [13]:
# test = [row for row in df_train['review']] # Generator object () only calculates when called vs [] which calculates now
# Doesn't do anything for our data - checked this


In [16]:

# def cleaning(doc, min_n=3):
#     # Lemmatizes and removes stopwords
#     # doc needs to be a spacy Doc object
#     txt = [token.lemma_ for token in doc if not token.is_stop]
#     # Word2Vec uses context words to learn the vector representation of a target word,
#     # if a sentence is only one or two words long,
#     # the benefit for the training is very small
#     if len(txt) >= min_n:
#         return ' '.join(txt)


# def cleaning(doc, min_n=3):
#     # Lemmatizes and removes stopwords
#     # doc needs to be a spacy Doc object
#     txt = [token for token in doc if token not in my_stop_words]
#     # Word2Vec uses context words to learn the vector representation of a target word,
#     # if a sentence is only one or two words long,
#     # the benefit for the training is very small
#     if len(txt) >= min_n:
#         return ' '.join(txt)

In [38]:
# Custom stop words
my_stop_words = ({'namely', 'between', 'up', 'whither', 'them', 'beside', 'your', 'about', 'hence', 'former', 'ours', 'itself', 'or', 'these', 'their', 'those', 'has', 're', 'next', 'hereupon', 'whether', 'latter', 'towards', 'over', 'yourselves', 'himself', 'beforehand', 'you', 'wherever', 'another', 'than', 'do', 'around', 'him', 'upon', 'been', 'an', 'me', 'toward', 'within', 'of', 'whole', 'ca', 'once', 'nor', 'thru', 'seeming', 'already', 'keep', 'so', 'mine', 'others', 'until', 'move', 'ourselves', 'other', 'where', 'thereupon', 'she', 'am', 'without', 'again', 'hereby', 'be', 'someone', 'sometime', 'used', 'go', 'everyone', 'some', 'then', 'see', 'to', 'seemed', 'i', 'become', 'whatever', 'and', 'what', 'that', 'thence', 'too', 'whenever', 'whereupon', 'can', 'his', 'just', 'due', 'thereby', 'done', 'name', 'none', 'part', 'noone', 'since', 'doing', 'meanwhile', 'via', 'herself', 'also', 'amount', 'seems', 'say', 'get', 'through', 'show', 'made', 'such', 'a', 'as', 'hers', 'the', 'my', 'whereafter', 'in', 'themselves', 'which', 'something', 'put', 'it', 'by', 'may', 'who', 'various', 'whence', 'throughout', 'during', 'hereafter', 'had', 'after', 'under', 'few', 'using', 'whom', 'will', 'though', 'its', 'might', 'across', 'most', 'above', 'how', 'regarding', 'being', 'our', 'afterwards', 'behind', 'make', 'almost', 'each', 'side', 'along', 'much', 'while', 'any', 'elsewhere', 'many', 'this', 'own', 'us', 'would', 'does', 'latterly', 'anything', 'when', 'are', 'with', 'onto', 'even', 'did', 'thereafter', 'yours', 'all', 'is', 'third', 'if', 'somewhere', 'nothing', 'because', 'wherein', 'whoever', 'somehow', 'either', 'every', 'out', 'whose', 'front', 'take', 'both', 'they', 'for', 'empty', 'anyone', 'back', 'formerly', 'whereby', 'full', 'here', 'into', 'myself', 'we', 'became', 'from', 'seem', 'anywhere', 'besides', 'herein', 'ever', 'at', 'her', 'must', 'therein', 'nobody','well', 'give', 'per', 'indeed', 'down', 'still', 'on', 'could', 'although', 'amongst', 'there', 'was', 'else', 'first', 'further', 'have', 'several', 'yourself', 'beyond', 'now'}
 | {'ill','i','id','ive','im','mine'} | 
 {'you','youll','youre','your','youd','youve','yours'} | 
 {'he','hell','hes','hed','his'} | 
 {'she','shell','shed','hers'} |
 {'they','theyre','theyd','their','theirs','theyve'} |
 {'weve','wed','well','our','ours'} |
 {'isnt','wont','shant','shall','d','x','is'})

    
# def load_spacy_model():
    
#     nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser']) # disable Named Entity Recognition for speed

#     # Custom stop words
#     my_stop_words = {'namely', 'between', 'up', 'whither', 'them', 'beside', 'your', 'about', 'hence', 'always', 'former', 'ours', 'itself', 'or', 'these', 'their', 'those', 'has', 're', 'next', 'hereupon', 'whether', 'latter', 'towards', 'over', 'yourselves', 'himself', 'beforehand', 'yet', 'you', 'more', 'wherever', 'another', 'than', 'do', 'off', 'around', 'him', 'upon', 'been', 'an', 'me', 'toward', 'within', 'of', 'whole', 'ca', 'once', 'nor', 'thru', 'seeming', 'already', 'keep', 'so', 'mine', 'others', 'one', 'until', 'move', 'ourselves', 'other', 'where', 'thereupon', 'she', 'am', 'without', 'again', 'hereby', 'rather', 'be', 'someone', 'thus', 'sometime', 'used', 'go', 'everyone', 'some', 'then', 'see', 'to', 'seemed', 'i', 'become', 'whatever', 'perhaps', 'and', 'what', 'that', 'thence', 'too', 'whenever', 'whereupon', 'can', 'his', 'just', 'due', 'thereby', 'done', 'name', 'whereas', 'none', 'part', 'noone', 'since', 'doing', 'therefore', 'meanwhile', 'via', 'herself', 'also', 'amount', 'seems', 'bottom', 'say', 'get', 'through', 'show', 'made', 'top', 'such', 'a', 'as', 'hers', 'the', 'my', 'whereafter', 'in', 'themselves', 'which', 'something', 'put', 'it', 'by', 'may', 'who', 'various', 'whence', 'throughout', 'during', 'hereafter', 'had', 'only', 'after', 'under', 'few', 'otherwise', 'using', 'whom', 'will', 'though', 'its', 'might', 'below', 'across', 'most', 'above', 'how', 'regarding', 'being', 'our', 'afterwards', 'behind', 'make', 'not', 'almost', 'each', 'side', 'along', 'much', 'while', 'any', 'but', 'elsewhere', 'many', 'this', 'own', 'us', 'would', 'does', 'latterly', 'unless', 'anything', 'when', 'are', 'enough', 'with', 'onto', 'even', 'did', 'thereafter', 'yours', 'moreover', 'all', 'is', 'third', 'anyway', 'however', 'nevertheless', 'if', 'somewhere', 'nothing', 'because', 'wherein', 'whoever', 'somehow', 'either', 'every', 'out', 'whose', 'front', 'take', 'both', 'they', 'for', 'empty', 'anyone', 'back', 'formerly', 'whereby', 'alone', 'full', 'here', 'into', 'myself', 'except', 'we', 'became', 'from', 'seem', 'anywhere', 'besides', 'herein', 'ever', 'at', 'her', 'must', 'therein', 'nobody','well', 'give', 'per', 'indeed', 'down', 'still', 'on', 'could', 'although', 'anyhow', 'amongst', 'there', 'often', 'was', 'else', 'first', 'further', 'no', 'have', 'several', 'yourself', 'beyond', 'now'}
#     my_stop_words = (my_stop_words | 
#      {'ill','i','id','ive','im','mine'} | 
#      {'you','youll','youre','your','youd','youve','yours'} | 
#      {'he','hell','hes','hed','his'} | 
#      {'she','shell','shed','hers'} |
#      {'they','theyre','theyd','their','theirs','theyve'} |
#      {'weve','wed','well','our','ours'} |
#      {'isnt','wont','shant','d'})

#     original_stop_words = nlp.Defaults.stop_words

#     nlp.Defaults.stop_words -= original_stop_words
#     nlp.Defaults.stop_words |= my_stop_words

#     return nlp



In [32]:

# # # Load model
# # t1 = time.time()
# # # print('Loading models...')
# # # nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser']) # disable Named Entity Recognition for speed
# # # nlp.Defaults.stop_words = my_stop_words
# # nlp = load_spacy_model()
# # print('Models loaded in: {} mins'.format(round((time.time() - t1) / 60, 2)))

# # pipe(): Process texts as a stream, and yield Doc objects in order. This is usually more efficient than processing texts one-by-one.
# print('Cleaning up text...')
# t2 = time.time()
# #brief_cleaning = [re.sub("[^A-Za-z']+", ' ', str(row)).lower() for row in df_train['review']] # Doesn't do anything for this dataset (remove non alpha-numeric?)
# brief_cleaning = [re.sub("[^a-zA-Z ]", '', str(row)).lower() for row in df_train['review']] # Doesn't do anything for this dataset (remove non alpha-numeric?)
# next_cleaning = [re.sub(' +', ' ', str(row)).lower().lstrip(' ') for row in brief_cleaning] # Doesn't do anything for this dataset (remove non alpha-numeric?)
# final_text = [cleaning(doc.split(' ')) for doc in next_cleaning]



# # txt = [cleaning(doc) for doc in nlp.pipe(next_cleaning, batch_size=5000, n_process=-1)]
# # txt = [cleaning(doc.split(' ')) for doc in next_cleaning]


# print('Text cleaned in: {} mins'.format(round((time.time() - t2) / 60, 2)))


Models loaded in: 0.01 mins
Cleaning up text...
Text cleaned in: 0.0 mins


In [54]:
def clean_text(text_df):

    # pipe(): Process texts as a stream, and yield Doc objects in order. This is usually more efficient than processing texts one-by-one.
    print('Cleaning up text...')
    t1 = time.time()
    brief_cleaning = [re.sub("[^a-zA-Z ]", '', str(row)).lower() for row in text_df] # Doesn't do anything for this dataset (remove non alpha-numeric?)
    next_cleaning = [re.sub(' +', ' ', str(row)).lower().lstrip(' ') for row in brief_cleaning] # Remove multiple spaces
    final_text = [cleaning(doc.split(' ')) for doc in next_cleaning] # Remove stop words
    print('Text cleaned in: {} seconds'.format(round((time.time() - t1), 2)))

    return final_text



df_train['cleaned_text'] = clean_text(df_train['review']) 


# df_train['review']
df_train['cleaned_text']


Cleaning up text...
Text cleaned in: 0.19 seconds


0        never personally thought writing review news a...
1        wish download images advance offline access li...
2        disabled health issues past couple years walk ...
3        good news source very two stars however really...
4        love listening economist audio edition particu...
                               ...                        
11906              download more reviews tell base review 
11907    great app big fan digestible news often week t...
11908    expect economist app very fluid responsive sig...
11909    happy economist taking finishability mantra cr...
11910                                                 None
Name: cleaned_text, Length: 11911, dtype: object

In [103]:
sent = [row.split(' ') for row in df_train['cleaned_text'] if row is not None]
phrases = Phrases(sent, min_count=100)
df_train['final_text'] = phrases[df_train['cleaned_text']]


ValueError: Length of values (10645) does not match length of index (11911)

In [104]:
phrases[df_train['cleaned_text']]

['never personally thought writing review news app year old college student generally skeptical news organizations taking class heavy usage economist articles why professor uses lesson plans factual multiple months contemplating not subscription finally pulled trigger app seamless minus features wish app like search option viewing history easier access friends articles wish half star less but stars enjoy content feel trust economist deliver news digestible palatable way enjoy able audibly listen articles understanding whats going world tired traditional american news companies like nbc fox cbs cnn washington post new york times try economist economist offers new level detail honesty factuality trust appreciate comes news reporting intend resubscribe more introductory subscription economist far understand not editorials opinion pieces offer thought news reporting deeper understanding current events',
 'wish download images advance offline access like text audio like classic app least im

In [97]:
phrases[df_train['cleaned_text']][8]

'only recently subscribed economist initial impression favorable reporting more balanced accustomed financial times european publication read daily basis value european perspective american experiment evolving years sense desperation political left america want received davos try convince world abandoned principles revolution favor woke globalism truth americans never heard davos not interested globalist ideas president obama but hear american media'

In [89]:
df_train['cleaned_text'].values[3]

'good news source very two stars however really expensive year unless complain threaten cancel not sure whats harder divorce cancelling news subscription cant easily app contact app look faqs log point find change subscription theres button says cancel subscription asked why given sales pitch writing based reason order forward cancellation required chat sales representative joins chat asked provide address why wanted cancel tried offering discount finally rep cancelled subscription entire process took minutes making hard cancel purpose people giving money bad business'

In [116]:
from collections import defaultdict
word_freq = defaultdict(int)
for sent in df_train['cleaned_text']:
    if sent is not None:
        for i in sent.split(' '):
            word_freq[i] += 1

sorted(word_freq, key=word_freq.get, reverse=True)[:100]

['app',
 'news',
 'great',
 'economist',
 'but',
 'read',
 'day',
 'morning',
 'not',
 'articles',
 'good',
 'world',
 'daily',
 'content',
 'very',
 'way',
 'audio',
 'espresso',
 'like',
 'more',
 'love',
 'start',
 'time',
 'quick',
 'one',
 'subscription',
 'reading',
 'no',
 'excellent',
 'easy',
 'stories',
 'use',
 'article',
 'only',
 'concise',
 'short',
 'version',
 'nice',
 'new',
 'really',
 'update',
 'perfect',
 'summary',
 'work',
 'events',
 'dont',
 'best',
 'always',
 'magazine',
 'brief',
 'better',
 'issues',
 'doesnt',
 'edition',
 'days',
 'informative',
 'please',
 'cant',
 'information',
 'works',
 'need',
 'global',
 'weekly',
 'download',
 'important',
 'current',
 'quality',
 'written',
 'source',
 'ads',
 'want',
 'coffee',
 'why',
 'find',
 'going',
 'issue',
 'enjoy',
 'access',
 'interesting',
 'listen',
 'simple',
 'little',
 'fix',
 'minutes',
 'useful',
 'open',
 'should',
 'gives',
 'apps',
 'before',
 'free',
 'right',
 'top',
 'however',
 'point',
 

In [111]:
df_train['cleaned_text'][0]

'never personally thought writing review news app year old college student generally skeptical news organizations taking class heavy usage economist articles why professor uses lesson plans factual multiple months contemplating not subscription finally pulled trigger app seamless minus features wish app like search option viewing history easier access friends articles wish half star less but stars enjoy content feel trust economist deliver news digestible palatable way enjoy able audibly listen articles understanding whats going world tired traditional american news companies like nbc fox cbs cnn washington post new york times try economist economist offers new level detail honesty factuality trust appreciate comes news reporting intend resubscribe more introductory subscription economist far understand not editorials opinion pieces offer thought news reporting deeper understanding current events'

In [96]:
# spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS
# spacy_stopwords.remove('not')
nlp.

set()