### Import modules

In [1]:
from utils import * 

import numpy as np
import pandas as pd
from pprint import pprint
import os
import matplotlib.pyplot as plt
from collections import defaultdict

# Gensim
from gensim.test.utils import datapath
from gensim.test.utils import common_texts, get_tmpfile

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.option_context('display.max_colwidth', 500);

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)


config = get_config('config.yaml')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\enlik\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  EPS = np.finfo(np.float).eps


In [2]:
# Import ldamallet model
ldamallet = gensim.models.wrappers.LdaMallet.load(datapath('model'))
ldamallet = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(ldamallet)

In [8]:
dictionary = pd.read_pickle('preprocessed_data/dictionary.pkl')
# topics = ["Platform/Device", "User Experience", "Value", "Service", "Trouble-shooting", "Shows"]
topics = ["Trouble-shooting", "User-Experience", "Cost-Value", "Time", "Service", "Others"]

In [9]:
def make_prediction_ldamallet(input):
    """
    Output:
    Returns (list of topics, list of probs) in a descending order of probabilities
    """
    clean_text = NLPpipe().preprocess(pd.Series(input))
    term_doc_new = [dictionary.doc2bow(text) for text in clean_text]
    if input is not None:
        percentages = [perc for topic, perc in ldamallet[term_doc_new][0]]
        indices = np.argsort(percentages)[::-1]
        return list(zip([topics[index] for index in indices], [100*np.round(percentages[index], 3) for index in indices]))

    return None

In [10]:
df_val = pd.read_pickle('raw_data/all_reviews_p2_6000_val.pkl')
# df_val
reviews = df_val.review.values.tolist()
# reviews
review = reviews[0]
review

"Your drivers are great BUT your support is no good. Why can't it be like Uber where you can actually speak to someone. Instead of getting computer generated responses or like in my case, no response at all. The three stars are for the support. Also I have received a single discount to this account since I installed it three months ago. I'm very disappointed in Bolt, I thought they were there for their clients."

In [11]:
pairs_ldamallet = make_prediction_ldamallet(review)

Making bigrams...
Lemmatizing...


In [12]:
pairs_ldamallet

[('Cost-Value', 62.1),
 ('Others', 19.0),
 ('Time', 13.3),
 ('Trouble-shooting', 1.9),
 ('Service', 1.9),
 ('User-Experience', 1.9)]

## Create LDA Mallet prediction pickle file

In [15]:
df_val_pp = df_val.review.astype(str)
df_val_pp

0       Your drivers are great BUT your support is no ...
1       Loving the app, but can only give it a medium ...
2       The response time with regards to customer ser...
3       This has been happening for some time now and ...
4       The drivers almost always never have change an...
                              ...                        
5995    So comfortable, reliable,fast and more especia...
5996    Great experience...currently on a great discou...
5997    If a driver doesn't make the cut for Uber, the...
5998    Very efficient and convenient as well as the r...
5999    Excellent,helpful and makes life simple,very u...
Name: review, Length: 6000, dtype: object

In [16]:
cleaned_docs = remove_things(df_val_pp)

lists_of_words = list(sentences_to_words(cleaned_docs))
lists_of_words_no_stops = remove_stopwords(lists_of_words)

ngrams = make_bigrams(lists_of_words_no_stops)

data_lemmatized = lemmatize(ngrams, allowed_postags=['NOUN'])

# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
term_doc = [id2word.doc2bow(text) for text in texts]

# View
print(term_doc[:1])

tf_idf = models.TfidfModel(term_doc, smartirs='ntc')[term_doc]
tf_idf[0]

Making bigrams...
Lemmatizing...
[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 2), (8, 1), (9, 2), (10, 1)]]


[(0, 0.22024834744227595),
 (1, 0.26782861291908583),
 (2, 0.23095828765815235),
 (3, 0.40723575553825214),
 (4, 0.1750796523957451),
 (5, 0.06195743698186419),
 (6, 0.2500162927546846),
 (7, 0.4379534161292862),
 (8, 0.2317592951692477),
 (9, 0.4238999553133572),
 (10, 0.3719802619622322)]

In [17]:
df_topic_sents_keywords = format_topics_sentences(ldamodel=ldamallet, corpus=term_doc, texts=data_lemmatized, df=df_val)

Getting main topic for document...
0  1000  2000  3000  4000  5000  

In [19]:
df_dominant_topic = find_dominant_topic_in_each_doc(df_topic_sents_keywords, df=df_val)
print("Finding the dominant topic in each document")
df_dominant_topic.head(5).style.set_properties(subset=['review'], **{'width': '600px'})

Finding the dominant topic in each document


Unnamed: 0,Dominant_Topic,Perc_Contribution,Keywords,Text,rating,review
0,2.0,0.549,"driver, app, ride, trip, service, money, charge, amount, card, payment","['driver', 'support', 'computer', 'response', 'case', 'response', 'star', 'support', 'discount', 'account', 'month', 'thought', 'client']",3,"Your drivers are great BUT your support is no good. Why can't it be like Uber where you can actually speak to someone. Instead of getting computer generated responses or like in my case, no response at all. The three stars are for the support. Also I have received a single discount to this account since I installed it three months ago. I'm very disappointed in Bolt, I thought they were there for their clients."
1,1.0,0.4886,"driver, car, service, passenger, time, app, customer, rider, trip, route","['rating', 'moment', 'driver', 'rating', 'trip', 'driver', 'apartment', 'ride', 'review', 'drive', 'destination']",3,"Loving the app, but can only give it a medium rating at the moment as the drivers you give a bad rating on still gets your trip. I had a bad experience with a driver outside my apartment, but he always gets my rides as he's parked there when with my one star and bad review (refused to drive to the destination, rude and aggressive)."
2,2.0,0.8523,"driver, app, ride, trip, service, money, charge, amount, card, payment","['response', 'thank', 'reimbursement', 'part', 'customer', 'promo', 'part', 'situation', 'part', 'company', 'extra_money', 'people', 'card', 'dodgy', 'stick', 'amount']",4,The response time with regards to customer service was great and thank you. Kindly improve on the reimbursement part because it's not every customer who'll appreciate a promo. That part put some of us in a very sticky situation financially especially now that we have COVID-19. The last part would be for your company not to always take extra money just because people are paying with cards. That's dodgy. Stick to the same amount whether I'm paying cash or with a card.
3,0.0,0.5496,"app, location, ride, option, update, time, work, issue, phone, number","['time', 'show', 'estimate', 'amount', 'destination', 'location', 'price', 'way', 'uninstall', 'rubbish', 'app']",1,This has been happening for some time now and i always keep quiet. You will show me an estimate of the amount for the destination and when i get to my location the price i will see will be way higher than what you showed. I will stop using you. Just decided to give this feedback before i uninstall this rubbish app
4,2.0,0.5462,"driver, app, ride, trip, service, money, charge, amount, card, payment","['driver', 'option', 'change', 'time', 'transfer', 'trust', 'card', 'cash', 'please_fix', 'problem', 'way']",3,"The drivers almost always never have change and I'm left with no option but to leave my 100-300 naira change with them which I obviously don't intend to. It's not all the time we're able to transfer and not everyone trust using their cards with the app, for those of us that use cash please fix this problem whatever way, it's really annoying."


In [20]:
df_dominant_topic

Unnamed: 0,Dominant_Topic,Perc_Contribution,Keywords,Text,rating,review
0,2.0,0.5490,"driver, app, ride, trip, service, money, charg...","[driver, support, computer, response, case, re...",3,Your drivers are great BUT your support is no ...
1,1.0,0.4886,"driver, car, service, passenger, time, app, cu...","[rating, moment, driver, rating, trip, driver,...",3,"Loving the app, but can only give it a medium ..."
2,2.0,0.8523,"driver, app, ride, trip, service, money, charg...","[response, thank, reimbursement, part, custome...",4,The response time with regards to customer ser...
3,0.0,0.5496,"app, location, ride, option, update, time, wor...","[time, show, estimate, amount, destination, lo...",1,This has been happening for some time now and ...
4,2.0,0.5462,"driver, app, ride, trip, service, money, charg...","[driver, option, change, time, transfer, trust...",3,The drivers almost always never have change an...
...,...,...,...,...,...,...
5995,4.0,0.4979,"service, love, ride, driver, app, price, time,...",[discount],5,"So comfortable, reliable,fast and more especia..."
5996,4.0,0.4980,"service, love, ride, driver, app, price, time,...",[discount],5,Great experience...currently on a great discou...
5997,2.0,0.3115,"driver, app, ride, trip, service, money, charg...","[driver, cut, join, ride, hitch, app, money, s...",1,"If a driver doesn't make the cut for Uber, the..."
5998,2.0,0.3565,"driver, app, ride, trip, service, money, charg...","[well, rate]",5,Very efficient and convenient as well as the r...


In [27]:
lda_preds = df_dominant_topic.Dominant_Topic.values.tolist()
# lda_preds = lda_preds.values.to_list()
lda_preds

[2.0,
 1.0,
 2.0,
 0.0,
 2.0,
 3.0,
 5.0,
 3.0,
 1.0,
 2.0,
 2.0,
 1.0,
 5.0,
 0.0,
 2.0,
 1.0,
 3.0,
 4.0,
 2.0,
 2.0,
 3.0,
 0.0,
 4.0,
 2.0,
 1.0,
 3.0,
 0.0,
 5.0,
 4.0,
 5.0,
 2.0,
 5.0,
 5.0,
 2.0,
 2.0,
 0.0,
 4.0,
 2.0,
 1.0,
 2.0,
 3.0,
 4.0,
 3.0,
 2.0,
 4.0,
 1.0,
 5.0,
 2.0,
 0.0,
 2.0,
 5.0,
 2.0,
 4.0,
 5.0,
 0.0,
 4.0,
 3.0,
 3.0,
 2.0,
 3.0,
 5.0,
 5.0,
 1.0,
 1.0,
 5.0,
 5.0,
 1.0,
 4.0,
 2.0,
 0.0,
 1.0,
 1.0,
 2.0,
 2.0,
 0.0,
 0.0,
 4.0,
 2.0,
 0.0,
 2.0,
 3.0,
 4.0,
 3.0,
 1.0,
 2.0,
 5.0,
 4.0,
 3.0,
 3.0,
 2.0,
 4.0,
 2.0,
 3.0,
 2.0,
 1.0,
 0.0,
 1.0,
 4.0,
 2.0,
 3.0,
 2.0,
 2.0,
 3.0,
 2.0,
 5.0,
 4.0,
 2.0,
 5.0,
 3.0,
 2.0,
 4.0,
 0.0,
 4.0,
 0.0,
 1.0,
 2.0,
 2.0,
 2.0,
 1.0,
 2.0,
 4.0,
 2.0,
 3.0,
 5.0,
 4.0,
 2.0,
 0.0,
 3.0,
 0.0,
 1.0,
 3.0,
 5.0,
 3.0,
 5.0,
 1.0,
 2.0,
 1.0,
 5.0,
 4.0,
 0.0,
 5.0,
 4.0,
 2.0,
 5.0,
 1.0,
 2.0,
 3.0,
 4.0,
 2.0,
 2.0,
 2.0,
 4.0,
 1.0,
 3.0,
 3.0,
 4.0,
 3.0,
 5.0,
 2.0,
 2.0,
 0.0,
 5.0,
 5.0,
 3.0,
 1.0,
 3.0,
 2.0

In [32]:
# pd.to_pickle(lda_preds, 'prediction/LDA_preds.pkl')
lda_preds.to_csv('prediction/LDA_preds.csv')

AttributeError: 'list' object has no attribute 'to_csv'

In [31]:
LDA_preds_test_read = pd.read_pickle('prediction/LDA_preds.pkl')
LDA_preds_test_read

[2.0,
 1.0,
 2.0,
 0.0,
 2.0,
 3.0,
 5.0,
 3.0,
 1.0,
 2.0,
 2.0,
 1.0,
 5.0,
 0.0,
 2.0,
 1.0,
 3.0,
 4.0,
 2.0,
 2.0,
 3.0,
 0.0,
 4.0,
 2.0,
 1.0,
 3.0,
 0.0,
 5.0,
 4.0,
 5.0,
 2.0,
 5.0,
 5.0,
 2.0,
 2.0,
 0.0,
 4.0,
 2.0,
 1.0,
 2.0,
 3.0,
 4.0,
 3.0,
 2.0,
 4.0,
 1.0,
 5.0,
 2.0,
 0.0,
 2.0,
 5.0,
 2.0,
 4.0,
 5.0,
 0.0,
 4.0,
 3.0,
 3.0,
 2.0,
 3.0,
 5.0,
 5.0,
 1.0,
 1.0,
 5.0,
 5.0,
 1.0,
 4.0,
 2.0,
 0.0,
 1.0,
 1.0,
 2.0,
 2.0,
 0.0,
 0.0,
 4.0,
 2.0,
 0.0,
 2.0,
 3.0,
 4.0,
 3.0,
 1.0,
 2.0,
 5.0,
 4.0,
 3.0,
 3.0,
 2.0,
 4.0,
 2.0,
 3.0,
 2.0,
 1.0,
 0.0,
 1.0,
 4.0,
 2.0,
 3.0,
 2.0,
 2.0,
 3.0,
 2.0,
 5.0,
 4.0,
 2.0,
 5.0,
 3.0,
 2.0,
 4.0,
 0.0,
 4.0,
 0.0,
 1.0,
 2.0,
 2.0,
 2.0,
 1.0,
 2.0,
 4.0,
 2.0,
 3.0,
 5.0,
 4.0,
 2.0,
 0.0,
 3.0,
 0.0,
 1.0,
 3.0,
 5.0,
 3.0,
 5.0,
 1.0,
 2.0,
 1.0,
 5.0,
 4.0,
 0.0,
 5.0,
 4.0,
 2.0,
 5.0,
 1.0,
 2.0,
 3.0,
 4.0,
 2.0,
 2.0,
 2.0,
 4.0,
 1.0,
 3.0,
 3.0,
 4.0,
 3.0,
 5.0,
 2.0,
 2.0,
 0.0,
 5.0,
 5.0,
 3.0,
 1.0,
 3.0,
 2.0

In [29]:
# df_netflix_bert_pred = pd.read_pickle('../sandbox/netflix_app-jung-akim/prediction/BERT_preds.pkl')
df_netflix_bert_pred = pd.read_pickle('prediction/BERT_preds.pkl')
df_netflix_bert_pred

[2.0,
 1.0,
 2.0,
 3.0,
 2.0,
 2.0,
 0.0,
 3.0,
 2.0,
 5.0,
 0.0,
 2.0,
 1.0,
 1.0,
 3.0,
 0.0,
 4.0,
 5.0,
 1.0,
 1.0,
 4.0,
 0.0,
 4.0,
 2.0,
 1.0,
 4.0,
 3.0,
 4.0,
 1.0,
 3.0,
 4.0,
 1.0,
 2.0,
 4.0,
 5.0,
 5.0,
 4.0,
 3.0,
 1.0,
 5.0,
 4.0,
 5.0,
 5.0,
 5.0,
 4.0,
 1.0,
 1.0,
 1.0,
 2.0,
 0.0,
 1.0,
 2.0,
 0.0,
 4.0,
 3.0,
 0.0,
 5.0,
 1.0,
 1.0,
 5.0,
 1.0,
 0.0,
 2.0,
 0.0,
 3.0,
 2.0,
 4.0,
 2.0,
 2.0,
 0.0,
 2.0,
 0.0,
 5.0,
 4.0,
 3.0,
 4.0,
 0.0,
 2.0,
 0.0,
 3.0,
 5.0,
 3.0,
 5.0,
 0.0,
 1.0,
 5.0,
 4.0,
 0.0,
 5.0,
 1.0,
 4.0,
 4.0,
 4.0,
 3.0,
 0.0,
 0.0,
 4.0,
 0.0,
 1.0,
 5.0,
 2.0,
 1.0,
 4.0,
 1.0,
 4.0,
 5.0,
 5.0,
 3.0,
 2.0,
 2.0,
 2.0,
 0.0,
 4.0,
 4.0,
 1.0,
 0.0,
 4.0,
 2.0,
 1.0,
 1.0,
 3.0,
 4.0,
 2.0,
 5.0,
 2.0,
 0.0,
 0.0,
 2.0,
 5.0,
 4.0,
 3.0,
 5.0,
 3.0,
 3.0,
 2.0,
 4.0,
 1.0,
 0.0,
 3.0,
 3.0,
 3.0,
 4.0,
 5.0,
 4.0,
 5.0,
 2.0,
 1.0,
 1.0,
 2.0,
 4.0,
 4.0,
 5.0,
 3.0,
 3.0,
 5.0,
 2.0,
 4.0,
 3.0,
 1.0,
 5.0,
 5.0,
 3.0,
 3.0,
 0.0,
 4.0,
 2.0,
 1.0

In [22]:
df_netflix_lda_pred = pd.read_pickle('../sandbox/netflix_app-jung-akim/prediction/LDA_preds.pkl')
df_netflix_lda_pred

[4,
 0,
 4,
 0,
 0,
 1,
 0,
 5,
 4,
 0,
 0,
 4,
 1,
 1,
 4,
 1,
 2,
 3,
 0,
 1,
 2,
 4,
 5,
 0,
 0,
 2,
 0,
 5,
 0,
 5,
 2,
 3,
 5,
 3,
 2,
 2,
 2,
 1,
 5,
 1,
 0,
 0,
 0,
 1,
 3,
 1,
 4,
 5,
 0,
 5,
 3,
 0,
 2,
 0,
 2,
 0,
 0,
 2,
 5,
 0,
 0,
 4,
 0,
 5,
 2,
 2,
 0,
 0,
 4,
 4,
 4,
 1,
 0,
 1,
 0,
 3,
 4,
 4,
 4,
 5,
 0,
 5,
 0,
 2,
 0,
 0,
 4,
 4,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 2,
 3,
 1,
 4,
 0,
 1,
 4,
 5,
 4,
 0,
 0,
 0,
 4,
 4,
 5,
 0,
 0,
 0,
 4,
 2,
 5,
 4,
 3,
 1,
 5,
 4,
 4,
 4,
 0,
 0,
 3,
 2,
 4,
 3,
 0,
 3,
 3,
 0,
 2,
 0,
 0,
 1,
 1,
 3,
 0,
 5,
 1,
 1,
 1,
 2,
 3,
 0,
 5,
 0,
 5,
 5,
 1,
 0,
 0,
 0,
 2,
 5,
 2,
 4,
 2,
 2,
 0,
 1,
 1,
 4,
 2,
 1,
 5,
 0,
 2,
 0,
 4,
 0,
 0,
 0,
 4,
 2,
 0,
 0,
 3,
 0,
 1,
 4,
 0,
 1,
 5,
 4,
 5,
 2,
 1,
 0,
 0,
 2,
 0,
 1,
 0,
 4,
 1,
 3,
 1,
 2,
 0,
 4,
 4,
 1,
 2,
 4,
 0,
 5,
 1,
 5,
 0,
 1,
 1,
 0,
 0,
 5,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 2,
 2,
 0,
 2,
 0,
 5,
 3,
 0,
 4,
 0,
 1,
 0,
 0,
 0,
 2,
 0,
 0,
 2,
 0,
 0,
 1,
 4,
 5,
