# imports and functions

In [5]:
import sys
import os
#%cd /content/drive/My Drive/Full stack data school/progetto
!pip install gensim -q
!pip install nltk -q

In [6]:
import pandas as pd
import numpy as np
import pickle 
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *

import warnings
# NOTE(review): this silences every warning for the rest of the notebook,
# including deprecation and pandas copy warnings raised by later cells.
warnings.filterwarnings("ignore")

import nltk
# download the WordNet corpus (needed by the WordNetLemmatizer used in lemmatize_stemming)
nltk.download('wordnet')

# English Snowball stemmer shared by lemmatize_stemming()
stemmer = SnowballStemmer("english")

[nltk_data] Downloading package wordnet to /Users/Debora/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [None]:
def lemmatize_stemming(text):
    """Reduce a single token to its stem.

    The token is first lemmatized as a verb (``pos='v'``) with WordNet and the
    resulting lemma is then stemmed with the module-level ``stemmer``.
    """
    verb_lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    return stemmer.stem(verb_lemma)

# Tokenize and lemmatize
def preprocess(text):
    """Tokenize ``text`` and return the lemmatized, stemmed tokens worth keeping.

    Tokens are produced by gensim's ``simple_preprocess``; gensim stop words and
    tokens of three characters or fewer are discarded.
    """
    stop_words = gensim.parsing.preprocessing.STOPWORDS
    return [
        lemmatize_stemming(token)
        for token in gensim.utils.simple_preprocess(text)
        if len(token) > 3 and token not in stop_words
    ]

# preprocess to train a new lda model
def lda_preprocess_train_new_model(preprocessed_list_of_strings, no_below = 50, no_above = 1, keep_n = 400):
  """Build the dictionary and bag-of-words corpus needed to train an LDA model.

  Args:
    preprocessed_list_of_strings: iterable of token lists, one per document.
    no_below, no_above, keep_n: forwarded unchanged to
      ``Dictionary.filter_extremes`` to prune the vocabulary.

  Returns:
    Tuple ``(bow_corpus, dictionary)``: the list of ``doc2bow`` vectors (one per
    document) and the filtered gensim dictionary they were built with.
  """
  dictionary = gensim.corpora.Dictionary(preprocessed_list_of_strings)
  dictionary.filter_extremes(no_below = no_below, no_above = no_above, keep_n = keep_n)
  bow_corpus = list(map(dictionary.doc2bow, preprocessed_list_of_strings))
  return bow_corpus, dictionary

# load data

In [None]:
# load googleplaystore_user_reviews.csv and drop every row that has a missing value
df = pd.read_csv('DATA/raw/googleplaystore_user_reviews.csv').dropna()

# load the pre-processed app metadata (clean_app_data.csv)
apps = pd.read_csv('DATA/pre-processed/clean_app_data.csv')

# train new model

In [None]:
# filter the dataset according to your criteria (here: reviews labelled 'Negative')
filtered_df = df.loc[df['Sentiment'] == 'Negative']

# tokenize every review, then build the dictionary / bag-of-words corpus for training
clean_text = filtered_df['Translated_Review'].apply(preprocess)
bow_corpus, dictionary = lda_preprocess_train_new_model(clean_text)

In [None]:
########### train model ###############

# Seed the model so that re-running the notebook starts from the same random
# state instead of producing a different set of topics on every run.
lda_model = gensim.models.LdaMulticore(bow_corpus,
                                       num_topics = 5,
                                       id2word = dictionary,
                                       passes = 10,
                                       random_state = 42)

# print the word distribution of every topic (-1 = all topics)
for idx, topic in lda_model.print_topics(-1):
    print("Topic: {} \nWords: {}".format(idx, topic))
    print("\n")

In [None]:
########## save model ############

# with open('new_model.model', 'wb') as f: 
#     pickle.dump(lda_model, f)

# with open('dictionary.model', 'wb') as f: 
#     pickle.dump(dictionary, f)

# use trained model

In [None]:
# NOTE(review): pickle.load can execute arbitrary code stored in the file;
# only load model files that come from a trusted source.

# trained LDA model
with open('lda_topics.model', 'rb') as model_file:
    model = pickle.load(model_file)

# dictionary used to turn token lists into bag-of-words vectors
with open('lda_dictionary.model', 'rb') as dictionary_file:
    lda_dictionary = pickle.load(dictionary_file)

# mapping used below to translate model.print_topic(index, 5) into a readable label
with open('topics_dict.model', 'rb') as topics_file:
    topics_dict = pickle.load(topics_file)

# topic 0: booking problems, customer service useless
# topic 1: UI problems, app not responsive, slow
# topic 2: in-app purchase necessary to play the game
# topic 3: account connection problem (google account?)
# topic 4: update problems

In [None]:
unseen = 'This is a review, booking the accomodation was fake.'

bow_vector = lda_dictionary.doc2bow(preprocess(unseen))

# topics of the unseen review, most probable first
ranked_topics = sorted(model[bow_vector], key=lambda pair: pair[1], reverse=True)
for index, score in ranked_topics:
    print("Score: {}\t Topic: {}".format(score, topics_dict[model.print_topic(index, 5)]))

Score: 0.46366870403289795	 Topic: booking problems, customer service problems
Score: 0.3845175504684448	 Topic: UI problems, app not responsive
Score: 0.05170731619000435	 Topic: update and version problems
Score: 0.05007593706250191	 Topic: in-app purchase necessary to play the game, bad game experience
Score: 0.05003046244382858	 Topic: account connection problems


# additional analysis

In [None]:
# lookup table: app name -> category (for a repeated app name the last row wins)
apps_category = dict(zip(apps['App'], apps['Category']))

def get_category(apps_category, app):
  """Return the category stored for ``app`` in ``apps_category``.

  Apps that are not a key of the mapping yield the string ``'unknown'``.
  """
  return apps_category.get(app, 'unknown')

# attach the category of each reviewed app ('unknown' when the app is not in `apps`)
df['Category'] = df['App'].map(lambda app_name: get_category(apps_category, app_name))

In [None]:
# using trained model
def get_label(lda_dictionary, model, topics_dict, string):
  """Return the human-readable label of the most probable topic of ``string``.

  Args:
    lda_dictionary: object whose ``doc2bow`` turns a token list into a bag of words.
    model: trained topic model; ``model[bow]`` yields ``(topic_index, score)`` pairs.
    topics_dict: maps ``model.print_topic(index, 5)`` to a label.
    string: raw review text.

  Returns:
    The label of the highest-scoring topic. Only that topic is looked up in
    ``topics_dict``, so labels for the other topics are not required.
  """
  bow_vector = lda_dictionary.doc2bow(preprocess(string))
  # max() keeps the first of several equally scored topics, exactly like the
  # stable descending sort it replaces
  best_index, _ = max(model[bow_vector], key=lambda pair: pair[1])
  return topics_dict[model.print_topic(best_index, 5)]

def get_score(lda_dictionary, model, topics_dict, string):
  """Return the score of the most probable topic of ``string``.

  ``topics_dict`` is accepted for signature parity with ``get_label`` but is
  not used here.
  """
  tokens = preprocess(string)
  topic_distribution = model[lda_dictionary.doc2bow(tokens)]
  # highest score first
  ranked = sorted(topic_distribution, key=lambda pair: pair[1], reverse=True)
  return ranked[0][1]


def insight(df, category, sentiment_polarity, lda_dictionary, model, topics_dict):
  """Count, per app, how many negative reviews fall under each topic label.

  Args:
    df: reviews with ``App``, ``Translated_Review`` and ``Sentiment_Polarity`` columns.
    category: currently NOT used. The category filter is disabled, so every
      category is included whatever value is passed.
    sentiment_polarity: reviews with ``Sentiment_Polarity`` less than or equal
      to this threshold are analysed.
    lda_dictionary: object whose ``doc2bow`` turns a token list into a bag of words.
    model: trained topic model; ``model[bow]`` yields ``(topic_index, score)`` pairs.
    topics_dict: maps ``model.print_topic(index, 5)`` to a label.

  Returns:
    DataFrame indexed by ``(App, topic_label)`` with a single ``Total`` column
    holding the number of reviews. ``df`` itself is left untouched.
  """
  def _top_topic(review):
    # One inference pass per review, so label and score always describe the
    # same topic (two separate model[...] calls are not guaranteed to return
    # identical distributions).
    bow_vector = lda_dictionary.doc2bow(preprocess(review))
    index, score = max(model[bow_vector], key=lambda pair: pair[1])
    return topics_dict[model.print_topic(index, 5)], score

  # bad_apps = df[ (df.Category == category) & (df.Sentiment_Polarity <= sentiment_polarity) ] # category filter (disabled)
  # .copy() so the new columns are written to an independent frame, not to a slice of df
  bad_apps = df[df.Sentiment_Polarity <= sentiment_polarity].copy()
  top_topics = [_top_topic(review) for review in bad_apps.Translated_Review]
  bad_apps['topic_score'] = [score for _, score in top_topics]
  bad_apps['topic_label'] = [label for label, _ in top_topics]
  # name the counts explicitly: the default name of value_counts() differs between pandas versions
  return bad_apps.groupby('App').topic_label.value_counts().rename('Total').to_frame()

# filter data as you want, only taking negative reviews, not neutral reviews
# bad_games = df[(df.Category == 'tools') & (df.Sentiment_Polarity <= -0.1)]


# bad_games['topic_score'] = bad_games.Translated_Review.apply(lambda x: get_score(lda_dictionary, model, topics_dict, x))
# bad_games['topic_label'] = bad_games.Translated_Review.apply(lambda x: get_label(lda_dictionary, model, topics_dict, x))

# use your dataframe instead of bad_game
# results = pd.DataFrame(bad_games.groupby('App').topic_label.value_counts()).rename(columns = {'topic_label': 'Total'})
# results.head(10)

In [None]:
# the six most frequent app categories
categories = apps.Category.value_counts().index[:6].tolist()
print(categories)

# NOTE(review): insight() as written does not filter on `category`, so each
# entry currently holds the same table computed from the whole dataframe.
bad_apps_dict = {}
for el in categories:
  bad_apps_dict[el] = insight(df = df, category = el, sentiment_polarity = -0.1,
                             lda_dictionary = lda_dictionary, model = model,
                             topics_dict = topics_dict)
  display(bad_apps_dict[el])

In [None]:
# per category: total number of reviews for each topic label, summed over all apps
for category in bad_apps_dict:
  val = bad_apps_dict[category]
  print(category)
  display(val.groupby(by = 'topic_label').sum())
  print('\n')


# case study Vogue

In [None]:
# insight() does not filter on `category`, so pass None explicitly instead of
# relying on `el`, a loop variable leaked from an earlier cell.
all_temp = insight(df = df,
                   category = None,
                   sentiment_polarity = -0.1,
                   lda_dictionary = lda_dictionary,
                   model = model,
                   topics_dict = topics_dict)

In [None]:
# preview the first rows of the per-app topic counts
all_temp.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Total
App,topic_label,Unnamed: 2_level_1
10 Best Foods for You,"UI problems, app not responsive",6
10 Best Foods for You,"booking problems, customer service problems",2
10 Best Foods for You,update and version problems,2
104 找工作 - 找工作 找打工 找兼職 履歷健檢 履歷診療室,"UI problems, app not responsive",1
11st,"booking problems, customer service problems",2
11st,update and version problems,2
11st,"UI problems, app not responsive",1
1800 Contacts - Lens Store,"booking problems, customer service problems",4
1800 Contacts - Lens Store,"in-app purchase necessary to play the game, bad game experience",2
1LINE – One Line with One Touch,"in-app purchase necessary to play the game, bad game experience",6


In [None]:
# index entries of all_temp (the App / topic_label pairs shown above) as an array
temp = np.array(all_temp.index.tolist())

In [None]:
# number of (app, topic) rows
all_temp.shape[0]

1942

In [None]:
# flat table with one row per (app, topic) pair
# NOTE(review): the name `all` shadows the Python builtin; it is kept because
# later cells refer to it.
all = pd.DataFrame({'app': temp[:, 0], 'topic': temp[:, 1]})
all['total'] = all_temp.values

In [None]:
all[all.topic == 'UI problems, app not responsive'].sort_values(by = 'total').tail(15)
# apps with the most 'UI problems, app not responsive' reports, largest count last
# (in the output below the top entry is 'Free Dating App - Meet Local Singles - Flirt Chat' with 14)

Unnamed: 0,app,topic,total
1486,Free Foreclosure Real Estate Search by USHUD.com,"UI problems, app not responsive",9
668,"CBS Sports App - Scores, News, Stats & Watch Live","UI problems, app not responsive",9
868,Colorfy: Coloring Book for Adults - Free,"UI problems, app not responsive",10
1511,FreshBooks Classic,"UI problems, app not responsive",10
1428,Fly Delta,"UI problems, app not responsive",10
270,Apartments.com Rental Search,"UI problems, app not responsive",10
1679,Google PDF Viewer,"UI problems, app not responsive",11
490,Be A Legend: Soccer,"UI problems, app not responsive",11
55,ABC News - US & World News,"UI problems, app not responsive",11
1476,Free Dating App - Meet Local Singles - Flirt Chat,"UI problems, app not responsive",14


In [None]:
# app under study; show its row in the app metadata
app = 'Fashion in Vogue'
apps.loc[apps.App == app]

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
1316,Fashion in Vogue,lifestyle,3.8,1797,6.8,100000.0,Free,0.0,Everyone,lifestyle,2016-09-27,20.0,43.0


In [None]:
# value(s) of the Reviews column recorded for the app
apps.loc[apps.App == app, 'Reviews'].values

array([1797])

In [None]:
# share of the app's negative reviews that are attributed to 'UI problems, app not responsive'
# (the count is looked up by its 'Negative' label; a positional [1] only works
# when 'Negative' happens to be the second most frequent sentiment)
bad_ux = all[(all.app == app) & (all.topic =='UI problems, app not responsive')].total.values/df[df.App == app].Sentiment.value_counts()['Negative']

In [None]:
# fraction of the app's reviews in each sentiment class
app_reviews = df[df.App == app]
all_pos_neg_reviews = app_reviews.Sentiment.value_counts() / len(app_reviews)

In [None]:
# installs x share of negative reviews x share of those caused by the topic;
# 'Negative' is selected by label and the 1-element array is unpacked explicitly
estimated_users = all_pos_neg_reviews['Negative']*apps[apps.App == app].Installs.values*bad_ux
print(' estimated reached users', int(estimated_users[0]) )

 estimated reached users 13274


In [None]:
# share of the app's reviews that are negative, selected by label instead of position
negative_share = all_pos_neg_reviews['Negative']
# share of all reviews that are negative because of the studied topic
topic_share = bad_ux[0]*negative_share
print(f' users complaining for any reason {round(negative_share*100, 2)}%',
      f' which is {round(topic_share*100, 2)}% of the total \n\n' )

tot_vogue_downloads = 100000*2 + 10000*7
# the share is rounded to the same precision as the percentage printed above,
# replacing the hand-copied constant that used to be hard-coded here
print('there are 2 vogue apps (germany, italy) with 100.000+ downloads and 7 vogue apps(spain, uk, russia, france, india, greece+australia, poland) with 10.000+ downloads\n '
      f'for a total of {tot_vogue_downloads}+ downloads, which will give us {round(tot_vogue_downloads*round(topic_share, 4), 2)}')

 users complaining for any reason 30.97%  which is 13.27% of the total 


there are 2 vogue apps (germany, italy) with 100.000+ downloads and 7 vogue apps(spain, uk, russia, france, india, greece+australia, poland) with 10.000+ downloads
 for a total of 270000+ downloads, which will give us 35829.0


# case study agar.io

In [None]:
# insight() does not filter on `category`, so pass None explicitly instead of
# relying on `el`, a loop variable leaked from an earlier cell.
all_temp = insight(df = df,
                   category = None,
                   sentiment_polarity = -0.1,
                   lda_dictionary = lda_dictionary,
                   model = model,
                   topics_dict = topics_dict)

In [None]:
# preview the first rows of the per-app topic counts
all_temp.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Total
App,topic_label,Unnamed: 2_level_1
10 Best Foods for You,"UI problems, app not responsive",6
10 Best Foods for You,"booking problems, customer service problems",2
10 Best Foods for You,update and version problems,2
104 找工作 - 找工作 找打工 找兼職 履歷健檢 履歷診療室,"UI problems, app not responsive",1
11st,"booking problems, customer service problems",2
11st,update and version problems,2
11st,"UI problems, app not responsive",1
1800 Contacts - Lens Store,"booking problems, customer service problems",4
1800 Contacts - Lens Store,"in-app purchase necessary to play the game, bad game experience",2
1LINE – One Line with One Touch,"in-app purchase necessary to play the game, bad game experience",6


In [None]:
# index entries of all_temp (the App / topic_label pairs shown above) as an array
temp = np.array(all_temp.index.tolist())

In [None]:
# number of (app, topic) rows
all_temp.shape[0]

1942

In [None]:
# flat table with one row per (app, topic) pair
# NOTE(review): the name `all` shadows the Python builtin; it is kept because
# later cells refer to it.
all = pd.DataFrame({'app': temp[:, 0], 'topic': temp[:, 1]})
all['total'] = all_temp.values

In [None]:
all[all.topic == 'in-app purchase necessary to play the game, bad game experience'].sort_values(by = 'total').tail(10)
# ten apps with the most reports for this topic, largest count last
# e.g. the app Block Puzzle has 40 reports of the in-app purchase problems

Unnamed: 0,app,topic,total
1590,Gardenscapes,"in-app purchase necessary to play the game, ba...",22
899,Cooking Fever,"in-app purchase necessary to play the game, ba...",24
136,Agar.io,"in-app purchase necessary to play the game, ba...",24
32,8 Ball Pool,"in-app purchase necessary to play the game, ba...",34
967,DEAD TARGET: FPS Zombie Apocalypse Survival Games,"in-app purchase necessary to play the game, ba...",34
585,Bowmasters,"in-app purchase necessary to play the game, ba...",38
543,Block Puzzle,"in-app purchase necessary to play the game, ba...",40
734,Candy Crush Soda Saga,"in-app purchase necessary to play the game, ba...",56
239,Angry Birds Classic,"in-app purchase necessary to play the game, ba...",64
732,Candy Crush Saga,"in-app purchase necessary to play the game, ba...",78


In [None]:
# app under study; show its row in the app metadata
app = 'Agar.io'
apps.loc[apps.App == app]

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
1541,Agar.io,game,4.2,3816799,32.0,100000000.0,Free,0.0,Everyone,action,2018-07-23,228.0,403.0


In [None]:
# value(s) of the Reviews column recorded for the app
apps.loc[apps.App == app, 'Reviews'].values

array([3816799])

In [None]:
# share of the app's negative reviews that are attributed to
# 'in-app purchase necessary to play the game, bad game experience'
# (the count is looked up by its 'Negative' label; a positional [1] only works
# when 'Negative' happens to be the second most frequent sentiment)
bad_ux = all[(all.app == app) & (all.topic =='in-app purchase necessary to play the game, bad game experience')].total.values/df[df.App == app].Sentiment.value_counts()['Negative']

In [None]:
# fraction of the app's reviews in each sentiment class
app_reviews = df[df.App == app]
all_pos_neg_reviews = app_reviews.Sentiment.value_counts() / len(app_reviews)

In [None]:
# installs x share of negative reviews x share of those caused by the topic;
# 'Negative' is selected by label and the 1-element array is unpacked explicitly
estimated_users = all_pos_neg_reviews['Negative']*apps[apps.App == app].Installs.values*bad_ux
print(' estimated reached users', int(estimated_users[0]) )

 estimated reached users 17647058


In [None]:
# share of the app's reviews that are negative, selected by label instead of position
negative_share = all_pos_neg_reviews['Negative']
# same estimate as above, but scaled by the number of reviews instead of installs
reviewers_reached = negative_share*apps[apps.App == app].Reviews.values*bad_ux
print(f' users complaining for any reason {round(negative_share*100, 2)}%', 'users reached by addressing the poroblem',
      int(reviewers_reached[0]),
      f' which is {round(bad_ux[0]*negative_share*100, 2)}% of the total' )

 users complaining for any reason 44.12% users reached by addressing the poroblem 673552  which is 17.65% of the total


In [None]:
# NOTE(review): 17.65 is the percentage printed by the previous cell; the
# meaning of the 5.6 factor is not stated anywhere in the notebook — TODO document it.
17.65*5.6

98.83999999999999