In [12]:
import pyspark
from pyspark.sql.functions import col,when,size,min, max,length
from pyspark.sql import SparkSession

In [13]:
# Create (or reuse) a Spark session running locally on all available cores.
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName('topic_modelling')
    .getOrCreate()
)

In [14]:
# Read the snappy-compressed parquet part file into a Spark DataFrame.
parquet_file = "part-00000-a8a717e8-bf83-42e3-a0b7-084971053d99-c000.snappy.parquet"
df_parquet = spark.read.parquet(parquet_file)

In [16]:
# Define the language-detection helper and sanity-check it on a sample sentence.
from langdetect import DetectorFactory, detect
from langdetect.lang_detect_exception import LangDetectException

# langdetect is non-deterministic by default; fixing the seed keeps the
# detected codes reproducible from one run to the next.
DetectorFactory.seed = 0


def getCode(x):
    """Return the language code that langdetect detects for ``x``.

    Parameters
    ----------
    x : str
        Text whose language should be detected.

    Returns
    -------
    str or None
        The detected language code (e.g. ``'en'``), or ``None`` when ``x`` is
        not a string, is blank, or langdetect cannot detect a language in it.
    """
    # Guard against missing/blank text so one bad row cannot abort a whole
    # ``Series.apply`` pass.
    if not isinstance(x, str) or not x.strip():
        return None
    try:
        return detect(x)
    except LangDetectException:
        # Raised by langdetect when the text has no usable features.
        return None


sent = "this is eng"
getCode(sent)

'en'

In [17]:
# Add a `lang_code` column holding the detected language of each description.
#
# The Spark DataFrame is converted to pandas and `getCode` is applied to every
# description. toPandas() collects all rows onto the driver, so this is only
# suitable while the data is small; for larger data, wrap the detector in a
# Spark UDF (pyspark.sql.functions.udf with StringType) and add the column
# with DataFrame.withColumn instead.
df_parquet_pd = df_parquet.toPandas().assign(
    lang_code=lambda frame: frame['description'].apply(getCode)
)
df_parquet_pd

Unnamed: 0,app_name,bundle,description,genres,os_platform,lang_code
0,Black Box - Movie Listing,958957112,Black Box Movie Listing App is a collection li...,"[Entertainment, Magazines & Newspapers]",IOS,en
1,Knife Throwing Max,1491396285,you will have fun in this game\n100+ challengi...,"[Games_Sports, Games_Role Playing]",IOS,en
2,爆料公社,1387765782,全台最大社群「爆料公社」APP 上線啦，我們把社群解放了，不只有勁爆的爆料內容，還貼心的加上...,[Social Networking],IOS,zh-tw
3,Lidow - Photo Editor & Collage,894532288,layout/grid/collage、Square/no crop for instagr...,[Photo & Video],IOS,en
4,Vidstitch Frames for Instagram,712908978,"■ #Vidstitch\n■ Featured on 148apps, Stelapps,...","[Photo & Video, Social Networking]",IOS,en
...,...,...,...,...,...,...
860,Barber Hair Salon & Beard Makeover,com.hmg.haircutgames.beardsalon.barberhaircut,Barber hair makeover salon game is for those w...,"[ENTERTAINMENT, FAMILY_PRETEND]",ANDROID,en
861,DJ Remix Offline 2020,com.mpro.djremixoffline2020,Dj Remix Offline 2020 Mp3 merupakan aplikasi m...,[MUSIC_AND_AUDIO],ANDROID,id
862,Sahih Bukhari (English),com.bangladroid.sahihbukhari,We have made this application from the book of...,[BOOKS_AND_REFERENCE],ANDROID,en
863,Two Minute English,com.astrobix.twominuteenglish,Improve your spoken English rapidly with the h...,[EDUCATION],ANDROID,en


In [18]:
# Keep only the rows whose description was detected as English.
is_english = df_parquet_pd['lang_code'].eq('en')
df_parquet_en = df_parquet_pd[is_english]
df_parquet_en.shape

(687, 6)

In [21]:
# Pick the first English description by position. The filtered frame keeps the
# original row labels, so a label lookup of 0 would raise KeyError whenever the
# first row of the source data is not English.
row1 = df_parquet_en['description'].iloc[0]

In [23]:
# Display the full text of the selected description.
row1

'Black Box Movie Listing App is a collection list of movies and TV shows that cater to diverse communities all around the world.  The app is a simple way to organize your movie collection, make a list of all your own movies, or your wish list movies that you aim to buy or watch later.  The app pulls movie and TV show information from the IMDb website. View details such as director, release date, casts, studio and etc in your offline list.\n\nThe Black Box Movie Listing App contains a \'News and Media\' section that informs app users of diverse events that are occurring around the world.  Some of the news postings are detailed information on any newly released movies, TV shows, and local plays that are popular to the general population.  \n\nUsers can now purchase one of our in-app subscriptions to gain premium access to news content in the News & Media section.  The subscription also removes advertisement from specific areas of the Black Box app.  There are 3 different subscriptions to

In [22]:
import spacy
import pytextrank
import en_core_web_sm

# Document to analyse: the description selected above.
# A generic sample text that can be swapped in for quick experiments:
# text = "Compatibility of systems of linear constraints over the set of natural numbers. Criteria of compatibility of a system of linear Diophantine equations, strict inequations, and nonstrict inequations are considered. Upper bounds for components of a minimal set of solutions and algorithms of construction of minimal generating sets of solutions for all types of systems are given. These criteria and the corresponding algorithms for constructing a minimal supporting set of solutions can be used in solving all the considered types systems and systems of mixed types."
text = row1

# Load the English spaCy model (choose another model for other languages/scales).
nlp = en_core_web_sm.load()

# Register PyTextRank as the last component of the spaCy pipeline.
tr = pytextrank.TextRank()
nlp.add_pipe(tr.PipelineComponent, name="textrank", last=True)

doc = nlp(text)

# Print every ranked phrase as "rank count text", followed by the chunks
# that were grouped under that phrase.
row_template = "{:.4f} {:5d}  {}"
for phrase in doc._.phrases:
    print(row_template.format(phrase.rank, phrase.count, phrase.text))
    print(phrase.chunks)

0.0989     4  movies
[movies, movie, the movies, your movies]
0.0811     2  black box movie listing app
[Black Box Movie Listing App, The Black Box Movie Listing App]
0.0808     1  app users
[app users]
0.0733     1  monthly ad removal subscription
[Monthly Ad Removal Subscription]
0.0635     1  social media platforms
[social media platforms]
0.0630     3  users
[Users, Users, a user]
0.0617     2  tv
[TV, TV]
0.0614     3  app
[app, The app, The app]
0.0607     1  diverse events
[diverse events]
0.0607     1  news content
[news content]
0.0595     1  your movie collection
[your movie collection]
0.0591     1  tv shows
[TV shows]
0.0568     1  detailed information
[detailed information]
0.0566     1  shows
[shows]
0.0564     1  release date
[release date]
0.0564     1  the movie trailer
[the movie trailer]
0.0545     1  more features
[More features]
0.0545     1  privacy policy
[Privacy Policy]
0.0542     2  february 22nd
[February 22nd, February 22nd]
0.0534     1  premium access
[pre