In [1]:
import pyspark
from pyspark.sql.functions import col,when,size,min, max,length
from pyspark.sql import SparkSession

In [2]:
# Create (or reuse) the Spark session for this notebook, running in local mode.
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName('topic_modelling')
    .getOrCreate()
)

In [3]:
# Load the raw app metadata from its Parquet part file.
df_parquet = spark.read.parquet(
    "part-00000-a8a717e8-bf83-42e3-a0b7-084971053d99-c000.snappy.parquet"
)

In [4]:
# First look at the dataset: column types, number of rows, per-column summary.
df_parquet.printSchema()
row_count = df_parquet.count()
print(row_count)
df_parquet.describe().show()

root
 |-- app_name: string (nullable = true)
 |-- bundle: string (nullable = true)
 |-- description: string (nullable = true)
 |-- genres: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- os_platform: string (nullable = true)

865
+-------+--------------------+--------------------+--------------------+-----------+
|summary|            app_name|              bundle|         description|os_platform|
+-------+--------------------+--------------------+--------------------+-----------+
|  count|                 865|                 865|                 865|        865|
|   mean|              2048.0|1.0872837703941176E9|                null|       null|
| stddev|                 NaN|  3.16782382724795E8|                null|       null|
|    min|1000 Pantun Rayua...|          1003730047|"Best Game for Ca...|    ANDROID|
|    max|🦁🐼Jungle Animal...| xyz.goro3goro.igo19|🥉
Aplikasi ini m...|        IOS|
+-------+--------------------+--------------------+-------------

In [5]:
# Preview the first 10 app names without truncating long values.
df_parquet.select('app_name').show(n=10, truncate=False)

+--------------------------------------------------+
|app_name                                          |
+--------------------------------------------------+
|Black Box - Movie Listing                         |
|Knife Throwing Max                                |
|爆料公社                                          |
|Lidow - Photo Editor & Collage                    |
|Vidstitch Frames for Instagram                    |
|Where's my Cat? -Escape Game-                     |
|Martial Art Wallpaper                             |
|더 스파이크                                       |
|Image detector                                    |
|Horoscope - Face Secret - See Future - Make me old|
+--------------------------------------------------+
only showing top 10 rows



In [6]:
# Number of genres per app: the aggregate below shows the list size ranges from 1 to 3.
# NOTE: `min` / `max` are the pyspark.sql.functions aggregates imported at the
# top of the notebook, not the Python builtins.
df_size_genres = df_parquet.select(size('genres').alias("size"))
df_size_genres.agg(min('size'), max('size')).show()

+---------+---------+
|min(size)|max(size)|
+---------+---------+
|        1|        3|
+---------+---------+



In [7]:
# Shortest and longest 'description' values, measured in characters.
df_description_length = df_parquet.select(length('description').alias('length'))
df_description_length.agg(min('length'), max('length')).show()

+-----------+-----------+
|min(length)|max(length)|
+-----------+-----------+
|          9|       4000|
+-----------+-----------+



In [8]:
# Distinct values of the os_platform column.
df_parquet.select('os_platform').distinct().show()

+-----------+
|os_platform|
+-----------+
|    ANDROID|
|        IOS|
+-----------+



In [10]:
# Null check for every column: each table printed below should be empty.
for column_name in ("description", "genres", "app_name", "bundle", "os_platform"):
    df_parquet.filter(col(column_name).isNull()).show()

+--------+------+-----------+------+-----------+
|app_name|bundle|description|genres|os_platform|
+--------+------+-----------+------+-----------+
+--------+------+-----------+------+-----------+

+--------+------+-----------+------+-----------+
|app_name|bundle|description|genres|os_platform|
+--------+------+-----------+------+-----------+
+--------+------+-----------+------+-----------+

+--------+------+-----------+------+-----------+
|app_name|bundle|description|genres|os_platform|
+--------+------+-----------+------+-----------+
+--------+------+-----------+------+-----------+

+--------+------+-----------+------+-----------+
|app_name|bundle|description|genres|os_platform|
+--------+------+-----------+------+-----------+
+--------+------+-----------+------+-----------+

+--------+------+-----------+------+-----------+
|app_name|bundle|description|genres|os_platform|
+--------+------+-----------+------+-----------+
+--------+------+-----------+------+-----------+



In [19]:
# Create a `lang_code` column holding the detected language code of each description.
from langdetect import DetectorFactory, detect

# langdetect is non-deterministic by default; pinning the seed before the first
# detection keeps the language codes stable across Restart & Run All.
DetectorFactory.seed = 0

# Detection is done on a pandas copy rather than through a Spark UDF (the UDF
# variant was tried earlier; pandas `.apply` is the simpler route here).
# `detect` is applied directly so this cell does not depend on a helper that
# is only defined in a later cell.
df_parquet_pd = df_parquet.toPandas()
df_parquet_pd['lang_code'] = df_parquet_pd['description'].apply(detect)
df_parquet_pd

Unnamed: 0,app_name,bundle,description,genres,os_platform,lang_code
0,Black Box - Movie Listing,958957112,Black Box Movie Listing App is a collection li...,"[Entertainment, Magazines & Newspapers]",IOS,en
1,Knife Throwing Max,1491396285,you will have fun in this game\n100+ challengi...,"[Games_Sports, Games_Role Playing]",IOS,en
2,爆料公社,1387765782,全台最大社群「爆料公社」APP 上線啦，我們把社群解放了，不只有勁爆的爆料內容，還貼心的加上...,[Social Networking],IOS,zh-tw
3,Lidow - Photo Editor & Collage,894532288,layout/grid/collage、Square/no crop for instagr...,[Photo & Video],IOS,en
4,Vidstitch Frames for Instagram,712908978,"■ #Vidstitch\n■ Featured on 148apps, Stelapps,...","[Photo & Video, Social Networking]",IOS,en
...,...,...,...,...,...,...
860,Barber Hair Salon & Beard Makeover,com.hmg.haircutgames.beardsalon.barberhaircut,Barber hair makeover salon game is for those w...,"[ENTERTAINMENT, FAMILY_PRETEND]",ANDROID,en
861,DJ Remix Offline 2020,com.mpro.djremixoffline2020,Dj Remix Offline 2020 Mp3 merupakan aplikasi m...,[MUSIC_AND_AUDIO],ANDROID,id
862,Sahih Bukhari (English),com.bangladroid.sahihbukhari,We have made this application from the book of...,[BOOKS_AND_REFERENCE],ANDROID,en
863,Two Minute English,com.astrobix.twominuteenglish,Improve your spoken English rapidly with the h...,[EDUCATION],ANDROID,en


In [13]:
def getCode(x):
    """Return the language code that langdetect's `detect` reports for text `x`."""
    return detect(x)


# Quick sanity check on a short English sentence.
sent = "this is eng"
getCode(sent)

'en'

In [31]:
# Number of rows per detected language code.
df_parquet_pd.groupby('lang_code').count()

Unnamed: 0_level_0,app_name,bundle,description,genres,os_platform
lang_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ar,10,10,10,10,10
bn,15,15,15,15,15
cs,1,1,1,1,1
de,1,1,1,1,1
en,686,686,686,686,686
es,2,2,2,2,2
fr,4,4,4,4,4
hr,2,2,2,2,2
id,82,82,82,82,82
it,2,2,2,2,2


In [36]:
# Rows whose language code is missing (an empty frame means every row got a code).
df_parquet_pd.loc[df_parquet_pd['lang_code'].isna()]

Unnamed: 0,app_name,bundle,description,genres,os_platform,lang_code


In [48]:
# Inspect the rows whose description was detected as English.
df_parquet_pd[df_parquet_pd['lang_code'].eq('en')]

Unnamed: 0,app_name,bundle,description,genres,os_platform,lang_code
0,Black Box - Movie Listing,958957112,Black Box Movie Listing App is a collection li...,"[Entertainment, Magazines & Newspapers]",IOS,en
1,Knife Throwing Max,1491396285,you will have fun in this game\n100+ challengi...,"[Games_Sports, Games_Role Playing]",IOS,en
3,Lidow - Photo Editor & Collage,894532288,layout/grid/collage、Square/no crop for instagr...,[Photo & Video],IOS,en
4,Vidstitch Frames for Instagram,712908978,"■ #Vidstitch\n■ Featured on 148apps, Stelapps,...","[Photo & Video, Social Networking]",IOS,en
5,Where's my Cat? -Escape Game-,1259238703,The cat is hiding and won't come out!\nFind th...,"[Entertainment, Games_Family, Games_Simulation]",IOS,en
...,...,...,...,...,...,...
859,Mining Dictionary,best2018apps.miningdictionary,Thousands of Mining Words and Terms\nIf you ha...,[BOOKS_AND_REFERENCE],ANDROID,en
860,Barber Hair Salon & Beard Makeover,com.hmg.haircutgames.beardsalon.barberhaircut,Barber hair makeover salon game is for those w...,"[ENTERTAINMENT, FAMILY_PRETEND]",ANDROID,en
862,Sahih Bukhari (English),com.bangladroid.sahihbukhari,We have made this application from the book of...,[BOOKS_AND_REFERENCE],ANDROID,en
863,Two Minute English,com.astrobix.twominuteenglish,Improve your spoken English rapidly with the h...,[EDUCATION],ANDROID,en


In [47]:
# Inspect the rows whose description was NOT detected as English.
df_parquet_pd[df_parquet_pd['lang_code'].ne('en')]

Unnamed: 0,app_name,bundle,description,genres,os_platform,lang_code
2,爆料公社,1387765782,全台最大社群「爆料公社」APP 上線啦，我們把社群解放了，不只有勁爆的爆料內容，還貼心的加上...,[Social Networking],IOS,zh-tw
7,더 스파이크,com.daerisoft.thespikelive,게임의 난이도가 조금 어려울수도 있습니다.\n연습을 조금 하시면 더 쉽게 적응하실 ...,[GAME_SPORTS],ANDROID,ko
12,Cara Menggunakan Jam Imoo Anak,com.jamimoo.guide,Imoo Watch Phone hadir dengan berbagai fitur m...,[BOOKS_AND_REFERENCE],ANDROID,id
13,สวัสดี วันเสาร์,com.thdev.saturday,แอพที่กำลังฮิตตอนนี้ เป็นแอพที่รวบรวมคำสวัสดี ...,[SOCIAL],ANDROID,th
20,Lagu Dangdut Hamdan ATT Offline,com.zakiy.hamdan,Aplikasi Ini Bersifat Offline Sehingga Tidak M...,[MUSIC_AND_AUDIO],ANDROID,id
...,...,...,...,...,...,...
842,O'zbek - Rusda Tarjimon,com.linguaapps.translator.uzbek.ru,O'zbek - Rusda Tarjimon dasturi - bepul va ul...,[BOOKS_AND_REFERENCE],ANDROID,id
854,Muu Ke Tan Gai,com.vnmacstudio.muuketangai,"Bạn là một chàng trai tài năng, điển trai nhưn...",[BOOKS_AND_REFERENCE],ANDROID,vi
855,الأم في الفقه الشافعي,so.ateya.ahmed.elom_shafay,مصدر الكتاب : موقع نداء الايمان\n[موقع نداء ال...,[BOOKS_AND_REFERENCE],ANDROID,ar
858,Lagu Galau Offline 2020,com.kermaxdevteam.lagugalaupatahhati,Kumpulan lagu galau patah hati terbaru 2020 pa...,[MUSIC_AND_AUDIO],ANDROID,id


## Building LDA model - topic modelling from below
### Using only the EN partition of the data

In [52]:
# Keep only the English-language rows; the LDA model is built on this subset.
is_english = df_parquet_pd['lang_code'] == 'en'
df_parquet_en = df_parquet_pd.loc[is_english]
df_parquet_en.shape

(686, 6)

In [76]:
# Move the English-only pandas frame back into Spark for the text pipeline.
df_spark = spark.createDataFrame(data=df_parquet_en)

In [77]:
# Preview the English-only Spark DataFrame (show() prints the first rows with
# long values truncated).
df_spark.show()

+--------------------+--------------------+--------------------+--------------------+-----------+---------+
|            app_name|              bundle|         description|              genres|os_platform|lang_code|
+--------------------+--------------------+--------------------+--------------------+-----------+---------+
|Black Box - Movie...|           958957112|Black Box Movie L...|[Entertainment, M...|        IOS|       en|
|  Knife Throwing Max|          1491396285|you will have fun...|[Games_Sports, Ga...|        IOS|       en|
|Lidow - Photo Edi...|           894532288|layout/grid/colla...|     [Photo & Video]|        IOS|       en|
|Vidstitch Frames ...|           712908978|■ #Vidstitch
■ Fe...|[Photo & Video, S...|        IOS|       en|
|Where's my Cat? -...|          1259238703|The cat is hiding...|[Entertainment, G...|        IOS|       en|
|Martial Art Wallp...|com.martialart.wa...|Martial arts are ...|   [PERSONALIZATION]|    ANDROID|       en|
|      Image detector|com.sa

## Each description is treated as a separate document: an LDA model is applied to it and the top 3 topics are extracted

In [112]:
# Step 1. Text cleansing: replace every punctuation character listed in REGEX
# with a single space and store the result in `description_clean`.
from pyspark.sql.functions import regexp_replace
REGEX = '[,\\-.!?@#$%^&*+/]'
df_spark = df_spark.withColumn(
    "description_clean",
    regexp_replace('description', REGEX, ' '),
)
df_spark.show()

+--------------------+--------------------+--------------------+--------------------+-----------+---------+--------------------+
|            app_name|              bundle|         description|              genres|os_platform|lang_code|   description_clean|
+--------------------+--------------------+--------------------+--------------------+-----------+---------+--------------------+
|Black Box - Movie...|           958957112|Black Box Movie L...|[Entertainment, M...|        IOS|       en|Black Box Movie L...|
|  Knife Throwing Max|          1491396285|you will have fun...|[Games_Sports, Ga...|        IOS|       en|you will have fun...|
|Lidow - Photo Edi...|           894532288|layout/grid/colla...|     [Photo & Video]|        IOS|       en|layout grid colla...|
|Vidstitch Frames ...|           712908978|■ #Vidstitch
■ Fe...|[Photo & Video, S...|        IOS|       en|■  Vidstitch
■ Fe...|
|Where's my Cat? -...|          1259238703|The cat is hiding...|[Entertainment, G...|        IOS|

In [119]:
# Step 2. Tokenization: turn `description_clean` into a list of lower-cased
# tokens stored in `description_token`.
from pyspark.ml.feature import Tokenizer
tokenizer = Tokenizer(inputCol='description_clean', outputCol='description_token')
# Tokenizer.transform raises if the output column already exists, which made
# this cell fail when re-run. Dropping any copy left by a previous run first
# keeps the cell idempotent (drop is a no-op when the column is absent).
df_spark = tokenizer.transform(df_spark.drop('description_token'))
df_spark.show()

+--------------------+--------------------+--------------------+--------------------+-----------+---------+--------------------+--------------------+
|            app_name|              bundle|         description|              genres|os_platform|lang_code|   description_clean|   description_token|
+--------------------+--------------------+--------------------+--------------------+-----------+---------+--------------------+--------------------+
|Black Box - Movie...|           958957112|Black Box Movie L...|[Entertainment, M...|        IOS|       en|Black Box Movie L...|[black, box, movi...|
|  Knife Throwing Max|          1491396285|you will have fun...|[Games_Sports, Ga...|        IOS|       en|you will have fun...|[you, will, have,...|
|Lidow - Photo Edi...|           894532288|layout/grid/colla...|     [Photo & Video]|        IOS|       en|layout grid colla...|[layout, grid, co...|
|Vidstitch Frames ...|           712908978|■ #Vidstitch
■ Fe...|[Photo & Video, S...|        IOS|   

In [121]:
# Step 3. Remove stop words: filter `description_token` with the remover's
# default stop-word list (inspect it with stopwords.getStopWords()) and store
# the result in `description_no_stop`.
from pyspark.ml.feature import StopWordsRemover
stopwords = StopWordsRemover(inputCol="description_token", outputCol="description_no_stop")
# StopWordsRemover.transform raises if the output column already exists, which
# made this cell fail when re-run. Dropping any copy left by a previous run
# first keeps the cell idempotent (drop is a no-op when the column is absent).
df_spark = stopwords.transform(df_spark.drop("description_no_stop"))
df_spark.show()

+--------------------+--------------------+--------------------+--------------------+-----------+---------+--------------------+--------------------+--------------------+
|            app_name|              bundle|         description|              genres|os_platform|lang_code|   description_clean|   description_token| description_no_stop|
+--------------------+--------------------+--------------------+--------------------+-----------+---------+--------------------+--------------------+--------------------+
|Black Box - Movie...|           958957112|Black Box Movie L...|[Entertainment, M...|        IOS|       en|Black Box Movie L...|[black, box, movi...|[black, box, movi...|
|  Knife Throwing Max|          1491396285|you will have fun...|[Games_Sports, Ga...|        IOS|       en|you will have fun...|[you, will, have,...|[fun, game, 100, ...|
|Lidow - Photo Edi...|           894532288|layout/grid/colla...|     [Photo & Video]|        IOS|       en|layout grid colla...|[layout, grid, co

In [128]:
# Bring the cleaned, tokenized Spark DataFrame into pandas; the scikit-learn
# cells further down read from this pandas copy.
df_pd_desc_final = df_spark.toPandas()

In [148]:
# TODO: only the first description is processed so far; extend to every row.
# `row1` is the stop-word-free token list of the first description.
row1 = df_pd_desc_final.loc[0, 'description_no_stop']
len(row1)

326

In [136]:
# Step 4. Build a document-term matrix of raw term counts.
# CountVectorizer produces counts, not TF-IDF weights.
# NOTE(review): `row1` is the token list of a single description, so each
# token is passed to the vectorizer as its own one-word "document" — confirm
# this is the intended granularity.
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_df=0.95) # ignore terms whose document frequency is strictly higher than 95%
dtm = cv.fit_transform(row1)

In [142]:
# Step 5. Fit an LDA model on the count matrix from Step 4.
# n_components is the number of topics the model learns (5 here), and
# random_state pins the result. The displayed array is the document-topic
# distribution returned by fit_transform (one row per input document).
from sklearn.decomposition import LatentDirichletAllocation
LDA = LatentDirichletAllocation(
    random_state=123,
    n_components=5,
)
LDA.fit_transform(dtm)

array([[0.10000588, 0.10000713, 0.59997482, 0.10000644, 0.10000573],
       [0.10000517, 0.59997861, 0.10000551, 0.10000567, 0.10000504],
       [0.10000371, 0.1000045 , 0.10000395, 0.10000406, 0.59998378],
       ...,
       [0.10000926, 0.10001124, 0.59996031, 0.10001015, 0.10000903],
       [0.59990089, 0.10002764, 0.1000243 , 0.10002497, 0.10002221],
       [0.10001013, 0.10001229, 0.10001081, 0.10001111, 0.59995566]])

In [144]:
# Shape of the fitted topic-term matrix: (number of topics, number of terms).
LDA.components_.shape

(5, 141)

In [151]:
# Number of terms kept by the vectorizer; it should equal the second dimension
# of LDA.components_. `vocabulary_` is used instead of get_feature_names(),
# which newer scikit-learn releases removed in favour of
# get_feature_names_out(); the vocabulary size is the same count on every version.
len(cv.vocabulary_)

141

In [153]:
# Type of one topic row. `topic` is only bound by the loop in the following
# cell, so the first row of components_ is inspected directly to keep this
# cell runnable on a fresh kernel.
type(LDA.components_[0])

numpy.ndarray

In [155]:
# Print each topic's index followed by its row of per-term weights.
for index, topic in enumerate(LDA.components_):
    print(index, topic, sep="\n")

0
[0.20003705 0.20003705 0.20004667 3.19983625 0.20003705 0.20003297
 0.20003751 0.20004266 0.20003432 0.20004152 1.19980185 0.20003751
 0.20004667 1.19980185 1.19980185 0.2000333  0.20004266 0.20003605
 0.20003258 1.19980185 0.20004667 0.20003526 0.20003101 1.19980185
 1.19980185 0.20004667 0.20003857 1.19980185 2.19982796 0.20004052
 1.19980185 1.19980185 0.20004667 0.20004667 0.20004266 1.19980185
 0.20003605 1.19980185 0.20004152 0.20004152 0.20004667 0.20003605
 0.20004052 0.20004266 0.20003705 0.20003751 2.19982796 0.20003751
 1.19980185 0.20003751 0.20004152 1.19980185 0.20004667 0.20004266
 0.20004667 2.19982796 0.20003751 0.20004266 1.19980185 0.20003751
 0.20004152 0.20003258 0.20004266 0.20003751 0.20004266 0.20004152
 1.19980185 0.20003264 0.20004052 0.20003751 0.20004152 0.20004667
 5.19984267 0.20003432 0.20004667 0.20003706 6.19984424 0.20004152
 0.20004152 0.20003388 0.20004152 0.20004667 0.20004266 0.20003857
 0.20003751 0.20004152 0.20004266 0.20004152 0.20004052 0.20

In [None]:
# Step 6. Combine the result back into the dataset. Insert the result into three separate columns.


## Build Text cleaning pipeline

## Convert Pandas DataFrame to Spark DataFrame

In [43]:
# Rebuild the Spark DataFrame from the pandas frame.
# NOTE: this rebinds `df_parquet`, replacing the DataFrame loaded from Parquet
# at the top of the notebook.
df_parquet = spark.createDataFrame(df_parquet_pd)

In [44]:
# Display the schema summary of the converted Spark DataFrame. The previous
# cell stores it in `df_parquet`; `spark_df` is never defined in this notebook
# and raised NameError on a fresh kernel.
df_parquet

DataFrame[app_name: string, bundle: string, description: string, genres: array<string>, os_platform: string, lang_code: string]