<a href="https://colab.research.google.com/github/enliktjioe/master-thesis-2021/blob/main/sandbox/Topic_Modelling_using_LDA_Uber.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## References

- https://medium.com/analytics-vidhya/play-store-app-reviews-textual-data-topic-modelling-using-lda-f24bdbd2910d

In [1]:
# !pip install google_play_scraper
# !pip install sklearn

## Libraries needed

In [2]:
import pandas as pd
from google_play_scraper.features.reviews import Sort, reviews_all, reviews
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

## Mining app review from Bolt (Google Play Store)

In [3]:
result = reviews_all('com.ubercab',sleep_milliseconds=0,lang='en', country='us')

## Create dataframe of the reviews

In [4]:
df = pd.DataFrame(result)
unique_users  = len(df['userName'].unique())
unknown_users = len(df[df['userName']=='A Google user'])
total_reviews = len(df)
mean = df['score'].mean()
print(f'Total textual reviews: {len(result)} \n')
print(f'Total unique users : {unique_users}')
print(f'Total unknown users: {unknown_users}')
print(f'Total users who gave multiple reviews: {total_reviews - unique_users - unknown_users}\n')
print(f'Average rating for this app based on the textual reviews: {round(mean,2)} \n')

Total textual reviews: 999521 

Total unique users : 749788
Total unknown users: 142458
Total users who gave multiple reviews: 107275

Average rating for this app based on the textual reviews: 3.62 



## Extract all reviews with rating below 4

In [5]:
df_tm = df[df['score']<=3]
df_tm = df_tm[df_tm.content.str.len()>=30]
print(f'Remaining textual reviews: {len(df_tm)} \n')


Remaining textual reviews: 272456 



## Get Relevant Columns for Topic Modelling


In [6]:
df_tm = df_tm[['reviewId','content']].drop_duplicates()
df_tm.dropna(inplace=True)
df_tm = df_tm.reset_index().drop(columns='index')
print(f'Remaining textual reviews: {len(df_tm)} \n')


Remaining textual reviews: 272456 



## Create document term matrix of the reviews

In [7]:
cv = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
dtm = cv.fit_transform(df_tm['content'])
dtm

<272456x42596 sparse matrix of type '<class 'numpy.int64'>'
	with 3677245 stored elements in Compressed Sparse Row format>

## Using LDA for topic modelling

In [8]:
LDA = LatentDirichletAllocation(n_components=5,random_state=1)
LDA.fit(dtm)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=5, n_jobs=None,
                          perp_tol=0.1, random_state=1, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

### Extract the topics and corresponding top20 (high frequency) words


In [9]:
for index,topic in enumerate(LDA.components_):
    print(f'topic #{index} : ')
    print([cv.get_feature_names()[i] for i in topic.argsort()[-20:]])

topic #0 : 
['try', 'don', 'worst', 'new', 'just', 'working', 'sign', 'help', 'able', 'login', 'time', 'email', 'tried', 'password', 'use', 'number', 'phone', 'uber', 'account', 'app']
topic #1 : 
['paytm', 'time', 'pay', 'location', 'doesn', 'add', 'issue', 'use', 'credit', 'ride', 'update', 'support', 'cash', 'service', 'option', 'customer', 'uber', 'card', 'payment', 'app']
topic #2 : 
['ko', 'ka', 'nhi', 'ho', 'nahi', 'es', 'los', 'para', 'ki', 'por', 'hi', 'não', 'en', 'app', 'el', 'se', 'la', 'uber', 'hai', 'que']
topic #3 : 
['experience', 'money', 'charge', 'customer', 'bad', 'location', 'pay', 'cancelled', 'worst', 'app', 'cab', 'service', 'charged', 'cancel', 'trip', 'time', 'drivers', 'ride', 'uber', 'driver']
topic #4 : 
['way', 'people', 'driver', 'high', 'time', 'taxi', 'company', 'don', 'good', 'lyft', 'fare', 'just', 'car', 'like', 'service', 'use', 'price', 'app', 'drivers', 'uber']


## Combine the topic modelling results with the base dataset

In [10]:
topic_results = LDA.transform(dtm)
df_topic_results = pd.DataFrame(topic_results, columns=[
    '0_Booking',
    '1_Payment/Promo' ,
    '2_App'            ,
    '3_Services'  ,
    '4_Others' 
])
df_result = pd.merge(df_tm, df_topic_results,  how='inner', left_index=True, right_index=True )
df_output = pd.merge(df, df_result,  how='left', on=[ 'reviewId','content' ])

df_output.to_csv('app_reviews_bms.csv')

## Example DataViz using [Tableau](https://public.tableau.com/profile/pankaj.kumar.shukla#!/vizhome/AppReviews_v2/GooglePlayStoreappreviewanalysis)


![dataviz_topic_modelling_lda_1.jpg](attachment:dataviz_topic_modelling_lda_1.jpg)



![dataviz_topic_modelling_lda_2.jpg](attachment:dataviz_topic_modelling_lda_2.jpg)