In [1]:
import pyspark
from pyspark.sql.functions import col,when,size,min, max,length
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.builder.master('local[*]').appName('topic_modelling').getOrCreate()

In [3]:
df_parquet = spark.read.parquet("part-00000-a8a717e8-bf83-42e3-a0b7-084971053d99-c000.snappy.parquet")

In [4]:
# create a column and put in the detected language code
from langdetect import detect
getCode = lambda x:detect(x)
sent = "this is eng"
getCode(sent)

'en'

In [5]:
# create a column and put in the detected language code
from langdetect import detect
# app_name_result = df_parquet.select(col('description')).collect()
# # create udf function to add a new column
# import pyspark.sql.functions as F
# from pyspark.sql.types import *

# def detect_lang(value):
#   return detect(value)

# #convert to a UDF Function by passing in the function and return type of function
# udf_detect_lang = F.udf(detect_lang, StringType())
# df_with_code = df_parquet.withColumn('lang_code', udf_detect_lang(df_parquet.description))
# df_with_code.show(5)

# easier method to convert to pandas then apply then function
df_parquet_pd = df_parquet.toPandas()
df_parquet_pd['lang_code'] = df_parquet_pd['description'].apply(getCode)
df_parquet_pd


Unnamed: 0,app_name,bundle,description,genres,os_platform,lang_code
0,Black Box - Movie Listing,958957112,Black Box Movie Listing App is a collection li...,"[Entertainment, Magazines & Newspapers]",IOS,en
1,Knife Throwing Max,1491396285,you will have fun in this game\n100+ challengi...,"[Games_Sports, Games_Role Playing]",IOS,en
2,爆料公社,1387765782,全台最大社群「爆料公社」APP 上線啦，我們把社群解放了，不只有勁爆的爆料內容，還貼心的加上...,[Social Networking],IOS,zh-tw
3,Lidow - Photo Editor & Collage,894532288,layout/grid/collage、Square/no crop for instagr...,[Photo & Video],IOS,en
4,Vidstitch Frames for Instagram,712908978,"■ #Vidstitch\n■ Featured on 148apps, Stelapps,...","[Photo & Video, Social Networking]",IOS,en
...,...,...,...,...,...,...
860,Barber Hair Salon & Beard Makeover,com.hmg.haircutgames.beardsalon.barberhaircut,Barber hair makeover salon game is for those w...,"[ENTERTAINMENT, FAMILY_PRETEND]",ANDROID,en
861,DJ Remix Offline 2020,com.mpro.djremixoffline2020,Dj Remix Offline 2020 Mp3 merupakan aplikasi m...,[MUSIC_AND_AUDIO],ANDROID,id
862,Sahih Bukhari (English),com.bangladroid.sahihbukhari,We have made this application from the book of...,[BOOKS_AND_REFERENCE],ANDROID,en
863,Two Minute English,com.astrobix.twominuteenglish,Improve your spoken English rapidly with the h...,[EDUCATION],ANDROID,en


In [6]:
# load with only English data
df_parquet_en = df_parquet_pd.loc[df_parquet_pd['lang_code']=='en']
df_parquet_en.shape

(687, 6)

In [8]:
df_spark = spark.createDataFrame(df_parquet_en)

In [51]:
# Step 1. Text cleasing with punctuations
from pyspark.sql.functions import regexp_replace
REGEX = '[,\\-.!?@#$%^&*+/\d]'
df_spark = df_spark.withColumn("description_clean",regexp_replace(df_spark.description,REGEX,' '))
df_spark.show()

+--------------------+--------------------+--------------------+--------------------+-----------+---------+--------------------+--------------------+--------------------+
|            app_name|              bundle|         description|              genres|os_platform|lang_code|   description_clean|   description_token| description_no_stop|
+--------------------+--------------------+--------------------+--------------------+-----------+---------+--------------------+--------------------+--------------------+
|Black Box - Movie...|           958957112|Black Box Movie L...|[Entertainment, M...|        IOS|       en|Black Box Movie L...|[black, box, movi...|[black, box, movi...|
|  Knife Throwing Max|          1491396285|you will have fun...|[Games_Sports, Ga...|        IOS|       en|you will have fun...|[you, will, have,...|[fun, game, 100, ...|
|Lidow - Photo Edi...|           894532288|layout/grid/colla...|     [Photo & Video]|        IOS|       en|layout grid colla...|[layout, grid, co

In [57]:
# Step 2. Tokenization
# df_spark = df_spark.drop("description_token")
from pyspark.ml.feature import Tokenizer
tokenizer = Tokenizer(inputCol='description_clean',outputCol='description_token')
df_spark = tokenizer.transform(df_spark)
df_spark.show()

+--------------------+--------------------+--------------------+--------------------+-----------+---------+--------------------+--------------------+--------------------+
|            app_name|              bundle|         description|              genres|os_platform|lang_code|   description_clean| description_no_stop|   description_token|
+--------------------+--------------------+--------------------+--------------------+-----------+---------+--------------------+--------------------+--------------------+
|Black Box - Movie...|           958957112|Black Box Movie L...|[Entertainment, M...|        IOS|       en|Black Box Movie L...|[black, box, movi...|[black, box, movi...|
|  Knife Throwing Max|          1491396285|you will have fun...|[Games_Sports, Ga...|        IOS|       en|you will have fun...|[fun, game, 100, ...|[you, will, have,...|
|Lidow - Photo Edi...|           894532288|layout/grid/colla...|     [Photo & Video]|        IOS|       en|layout grid colla...|[layout, grid, co

In [58]:
# Step 3. Remove stopword
# df_spark = df_spark.drop("description_no_stop")
from pyspark.ml.feature import StopWordsRemover
stopwords = StopWordsRemover(inputCol="description_token",outputCol="description_no_stop")
# stopwords.getStopWords()
df_spark = stopwords.transform(df_spark)

In [59]:
df_pd_desc_final = df_spark.toPandas()

### Note: IDF vector must be trained with large corpus, otherwise lose the advance of IDF

In [60]:
# get the "description" column
joinF= lambda x:" ".join(x)
df_pd_desc_final["description_join"] = df_pd_desc_final["description_no_stop"].apply(joinF)

In [61]:
corpus_list = df_pd_desc_final["description_join"].tolist()

In [62]:
# Step 4. TF-IDF countvector, Transform documents to document-term matrix
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_df=0.95) #ignore terms that have a document frequency strictly higher than the given threshold
dtm = cv.fit_transform(corpus_list)

In [63]:
dtm.shape

(687, 12089)

In [64]:
feature_names = cv.get_feature_names()

In [65]:
feature_names

['_________',
 '__________',
 '___________________________________________',
 '________features____',
 '_hours',
 '_w',
 'aac',
 'aagaz',
 'aaja',
 'aana',
 'aap',
 'aapake',
 'aapko',
 'aarti',
 'aashiq',
 'ab',
 'abacus',
 'abandon',
 'abc',
 'abdul',
 'abdullah',
 'abilities',
 'ability',
 'ablar',
 'able',
 'ablution',
 'abnormal',
 'aboard',
 'aboki',
 'about',
 'aboutads',
 'aboutus',
 'above',
 'abraxas',
 'abreast',
 'abridged',
 'abroad',
 'abs',
 'absolute',
 'absolutely',
 'absorb',
 'absorbed',
 'absorbing',
 'abstract',
 'abstraction',
 'abstracts',
 'abu',
 'abundant',
 'abx',
 'ac',
 'academic',
 'academy',
 'acca',
 'accelerate',
 'acceleration',
 'accelerator',
 'accelerometer',
 'accent',
 'accents',
 'accept',
 'acceptance',
 'accepted',
 'access',
 'access_coarse_location',
 'access_fine_location',
 'accessibility',
 'accessible',
 'accessing',
 'accessories',
 'accident',
 'accidental',
 'accidentally',
 'accidently',
 'acclaim',
 'accolades',
 'accompanied',
 'acc

In [66]:
cv.vocabulary_

{'black': 1097,
 'box': 1272,
 'movie': 6771,
 'listing': 6055,
 'app': 484,
 'collection': 1986,
 'list': 6050,
 'movies': 6772,
 'tv': 10865,
 'shows': 9344,
 'cater': 1630,
 'diverse': 2959,
 'communities': 2062,
 'around': 591,
 'world': 11601,
 'simple': 9386,
 'way': 11375,
 'organize': 7275,
 'make': 6271,
 'wish': 11533,
 'aim': 254,
 'buy': 1459,
 'watch': 11358,
 'later': 5867,
 'pulls': 8175,
 'show': 9335,
 'information': 5139,
 'imdb': 5016,
 'website': 11400,
 'view': 11219,
 'details': 2771,
 'director': 2893,
 'release': 8537,
 'date': 2585,
 'casts': 1614,
 'studio': 9955,
 'etc': 3467,
 'offline': 7156,
 'contains': 2220,
 'news': 6985,
 'media': 6458,
 'section': 9135,
 'informs': 5143,
 'users': 11073,
 'events': 3485,
 'occurring': 7121,
 'postings': 7876,
 'detailed': 2770,
 'newly': 6984,
 'released': 8538,
 'local': 6092,
 'plays': 7750,
 'popular': 7838,
 'general': 4228,
 'population': 7840,
 'purchase': 8193,
 'one': 7188,
 'subscriptions': 9991,
 'gain': 415

In [71]:
# Step 5. TfidfTransformer to compute the IDF
from sklearn.feature_extraction.text import TfidfTransformer
tfidfTransformer = TfidfTransformer(smooth_idf=True, use_idf=True)
tfidfTransformer.fit(dtm)

TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)

In [68]:
# get the document that we want to extract from
doc_test = row1 = df_pd_desc_final['description_join'][0]

# generate tfidf vector for given doc
tf_idf_vector = tfidfTransformer.transform(cv.transform([doc_test]))

In [69]:
# Return a COOrdinate representation of this matrix
coo_matrix = tf_idf_vector.tocoo()
sorted_items = sort_coo(coo_matrix)

# extract keywords, topn
keywords = extract_topn_from_vector(feature_names,sorted_items,10)

# print result
for k in keywords:
    print(k,keywords[k])


_________ 0.315
__________ 0.261
___________________________________________ 0.238
________features____ 0.201
_hours 0.198
_w 0.186
aac 0.167
aagaz 0.165
aaja 0.165
aana 0.159


In [49]:
def sort_coo(coo_matrix):
    tuples = zip(coo_matrix.col,coo_matrix.data)
    return sorted(tuples,key=lambda x:(x[1],x[0]),reverse=True)

def extract_topn_from_vector(feature_names,sorted_items,topn):
    """get the feature names according to the tf-idf score of top-n items"""
    
    #use only top-n itmes
    sorted_items = sorted_items[:topn]
    
    score_vals = []
    features_vals = []
    
    # word index and corresponding tf-idf score
    for index,score in sorted_items:
        
        #feature name and score
        score_vals.append(round(score,3))
        features_vals.append(feature_names[index])
        
    # create a tuples of feature,score
    results = {}
    for index in range(len(features_vals)):
        results[feature_names[index]] = score_vals[index]
        
    return results