In [1]:
import pyspark
from pyspark.sql.functions import col,when,size,min, max,length
from pyspark.sql import SparkSession

In [2]:
# Start (or reuse) a Spark session running locally, named after this notebook's task.
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName('keyword_extraction_model')
    .getOrCreate()
)

In [17]:
# Load the raw app records from the local snappy-compressed parquet part file.
df_parquet = spark.read.parquet(
    "part-00000-a8a717e8-bf83-42e3-a0b7-084971053d99-c000.snappy.parquet"
)

In [18]:
# Helper used to fill the detected-language-code column.
from langdetect import DetectorFactory, detect
from langdetect.lang_detect_exception import LangDetectException

# langdetect is non-deterministic by default; a fixed seed makes repeated runs
# of the notebook assign the same code to the same text.
DetectorFactory.seed = 0

# Code returned when no language can be detected for a text.
UNKNOWN_LANG_CODE = 'unknown'


def getCode(x):
    """Return the langdetect language code for the text ``x``.

    Parameters
    ----------
    x : str
        Text to classify, e.g. ``getCode("this is eng")``.

    Returns
    -------
    str
        The code reported by ``langdetect.detect``, or ``UNKNOWN_LANG_CODE``
        when ``x`` is not a string, is blank, or langdetect cannot find any
        usable features in it (it raises ``LangDetectException`` then).
    """
    # Missing or blank descriptions would otherwise abort the whole .apply() call.
    if not isinstance(x, str) or not x.strip():
        return UNKNOWN_LANG_CODE
    try:
        return detect(x)
    except LangDetectException:
        return UNKNOWN_LANG_CODE

In [19]:
# Add a `lang_code` column holding the detected language code of each description.
from langdetect import detect

# NOTE: a Spark UDF variant of this step (F.udf around detect, applied with
# withColumn on df_parquet.description) was tried and left disabled; converting
# to pandas and applying the function there is the simpler route.
df_parquet_pd = df_parquet.toPandas().assign(
    lang_code=lambda frame: frame['description'].apply(getCode)
)
df_parquet_pd


Unnamed: 0,app_name,bundle,description,genres,os_platform,lang_code
0,Black Box - Movie Listing,958957112,Black Box Movie Listing App is a collection li...,"[Entertainment, Magazines & Newspapers]",IOS,en
1,Knife Throwing Max,1491396285,you will have fun in this game\n100+ challengi...,"[Games_Sports, Games_Role Playing]",IOS,en
2,爆料公社,1387765782,全台最大社群「爆料公社」APP 上線啦，我們把社群解放了，不只有勁爆的爆料內容，還貼心的加上...,[Social Networking],IOS,zh-tw
3,Lidow - Photo Editor & Collage,894532288,layout/grid/collage、Square/no crop for instagr...,[Photo & Video],IOS,en
4,Vidstitch Frames for Instagram,712908978,"■ #Vidstitch\n■ Featured on 148apps, Stelapps,...","[Photo & Video, Social Networking]",IOS,en
...,...,...,...,...,...,...
860,Barber Hair Salon & Beard Makeover,com.hmg.haircutgames.beardsalon.barberhaircut,Barber hair makeover salon game is for those w...,"[ENTERTAINMENT, FAMILY_PRETEND]",ANDROID,en
861,DJ Remix Offline 2020,com.mpro.djremixoffline2020,Dj Remix Offline 2020 Mp3 merupakan aplikasi m...,[MUSIC_AND_AUDIO],ANDROID,id
862,Sahih Bukhari (English),com.bangladroid.sahihbukhari,We have made this application from the book of...,[BOOKS_AND_REFERENCE],ANDROID,en
863,Two Minute English,com.astrobix.twominuteenglish,Improve your spoken English rapidly with the h...,[EDUCATION],ANDROID,en


## Map language codes to language names

In [20]:
# Lookup table that is joined to the app data on `lang_code` in a later cell.
import pandas as pd

code_ref = pd.read_excel(io='lang_code.xlsx')

In [7]:
# code_ref

## Join language name table

In [21]:
# Attach the code_ref columns to every app row via `lang_code` (left join keeps
# apps whose code has no entry in the lookup table).
# Columns that an earlier run of this cell already brought in from code_ref are
# dropped first, so re-running the cell does not create suffixed duplicates
# (e.g. language_name_x / language_name_y) that would break the later cells.
_code_ref_cols = [c for c in code_ref.columns if c != 'lang_code']
df_parquet_pd = pd.merge(
    df_parquet_pd.drop(columns=_code_ref_cols, errors='ignore'),
    code_ref,
    how='left',
    on='lang_code',
)

In [9]:
# df_parquet_pd

In [9]:
# df = df_parquet_pd[df_parquet_pd.language_name.isnull()]

In [22]:
# Row counts per language name, largest group first (ordered by the app_name count).
(
    df_parquet_pd
    .groupby('language_name')
    .count()
    .sort_values(by='app_name', ascending=False)
)

Unnamed: 0_level_0,app_name,bundle,description,genres,os_platform,lang_code
language_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
English,686,686,686,686,686,686
Indonesian,83,83,83,83,83,83
Vietnamese,16,16,16,16,16,16
Bengali,16,16,16,16,16,16
Chinese,12,12,12,12,12,12
Arabic,10,10,10,10,10,10
Korean,9,9,9,9,9,9
Thai,9,9,9,9,9,9
French,4,4,4,4,4,4
Japanese,4,4,4,4,4,4


## Split out the en, cn, and id languages and apply the existing models; the rest will be handled with the default en model

In [23]:
df_spark = spark.createDataFrame(df_parquet_pd)

# Null-safe membership tests for the three languages that have a dedicated model.
# A plain ==/!= comparison evaluates to NULL when the column is NULL, and filter()
# drops NULL rows, so apps without a lang_code / language_name would silently
# vanish from the "rest" bucket. eqNullSafe always yields True or False.
is_en = col("lang_code").eqNullSafe('en')
is_cn = col("language_name").eqNullSafe('Chinese')
is_id = col("lang_code").eqNullSafe('id')

df_spark_en = df_spark.filter(is_en)
df_spark_cn = df_spark.filter(is_cn)
df_spark_id = df_spark.filter(is_id)
# Everything that matched none of the three tests above.
df_spark_rest = df_spark.filter(~(is_en | is_cn | is_id))

num_en = df_spark_en.count()
num_cn = df_spark_cn.count()
num_id = df_spark_id.count()
num_rest = df_spark_rest.count()
num_total = df_spark.count()

print("English:",num_en,"Chinese:",num_cn,"Bahasa:",num_id,"Rest languages:",num_rest)
# Printed as a fraction in [0, 1]; guarded so an empty input does not raise ZeroDivisionError.
coverage = (num_en+num_cn+num_id)/num_total if num_total else 0.0
print("percentage covered:",coverage)

English: 686 Chinese: 12 Bahasa: 83 Rest languages: 84
percentage covered: 0.9028901734104047


## Text Pre-processing
- Remove punctuation
- Tokenization
- Stemming
- Remove stopwords

In [24]:
# English part
import tfidf_model

# Passed to every language model below and reused later as the number of
# keyword-N columns to create.
topn = 10

tfidf_english_model = tfidf_model.tfidf_eng_model(df_spark_en, topn)
df_pd_en = tfidf_english_model.get_pd_keyword()

In [25]:
# Chinese part
# Builds the Chinese model of the project-local `tfidf_model` module (imported in
# an earlier cell) from the Chinese rows, passing the same `topn` as the other languages.
tfidf_chinese_model = tfidf_model.tfidf_cn_model(df_spark_cn,topn)
# The result is later passed to pd.concat, so it is expected to be a pandas
# DataFrame; presumably one row per app with its keyword list -- confirm in tfidf_model.
df_pd_cn = tfidf_chinese_model.get_pd_keyword()

In [26]:
# Bahasa part
# Builds the Bahasa (lang_code 'id') model of the project-local `tfidf_model`
# module (imported in an earlier cell), passing the same `topn` as the other languages.
tfidf_bahasa_model = tfidf_model.tfidf_id_model(df_spark_id,topn)
# The result is later passed to pd.concat, so it is expected to be a pandas
# DataFrame; presumably one row per app with its keyword list -- confirm in tfidf_model.
df_pd_id = tfidf_bahasa_model.get_pd_keyword()

In [27]:
# Stack the English, Chinese and Bahasa keyword frames into one frame with a
# fresh 0..n-1 index.
frame = [
    df_pd_en,
    df_pd_cn,
    df_pd_id,
]
result = pd.concat(objs=frame, ignore_index=True)

In [14]:
# Inspect which columns the combined keyword frame carries before trimming it.
result.columns

Index(['app_name', 'bundle', 'description', 'genres', 'os_platform',
       'lang_code', 'language_name', 'description_clean', 'description_token',
       'description_lemm', 'description_no_stop', 'description_final',
       'description_keyword_list', 'description_stem'],
      dtype='object')

In [28]:
# Keep only the original app fields, the language code and the keyword list;
# the intermediate text-processing columns are left out.
result_new = result.loc[
    :,
    [
        "app_name",
        "bundle",
        "description",
        "genres",
        "os_platform",
        "lang_code",
        "description_keyword_list",
    ],
]

In [29]:
df_desc_final_spark = spark.createDataFrame(result_new)

# Create the keyword-1 ... keyword-<topn> columns, one per position of
# description_keyword_list. They are added in a single select() because calling
# withColumn() in a loop adds one projection per call and inflates the query plan.
keyword_columns = [
    df_desc_final_spark['description_keyword_list'][i].alias("keyword-" + str(i + 1))
    for i in range(topn)
]
df_desc_final_spark = df_desc_final_spark.select('*', *keyword_columns)

# save the final keyword spreadsheet to local
# (utf_8_sig writes a BOM at the start of the file)
df_desc_final = df_desc_final_spark.toPandas()
df_desc_final.to_csv('desc_keyword_final.csv',encoding='utf_8_sig')

# print output result
print("Keyword list for English/Chinese/Bahasa is generated.")

Keyword list for English/Chinese/Bahasa is generated.


In [None]:
# # Rest part
# tfidf_rest_model = tfidf_model.tfidf_rest_model(df_spark_rest,spark)
# tfidf_rest_model.getKeyword()

In [None]:
# Param:

#     regex
#     tokenizer
#     stemmer/lemmatizer
#     stopword list
#     spark_dataset_language_portion
#     topn


# import nltk
# from nltk.corpus import stopwords

# def get_param(code):
#     param_dict = {}
    
#     if code == 'en':
#         REGEX = '[_,\\-.!?@#$%^&*+/\d]'
#         stopword_list = stopwords.words('english')

#     elif code == 'zh-tw' or code == 'zh-cn':
#         REGEX = ''
#         stopword_list = []
#     elif code == 'id':
#         REGEX = ''
#         stopword_list = []
#     else:
#         REGEX = ''
#         stopword_list = []
        
#     param_dict['regex'] = REGEX
#     param_dict['stopword_list'] = stopword_list
    
#     return param_dict