# Topic Modeling using LDA

In [1]:
!pip install stanza

# packages to store and manipulate data
import pandas as pd
import numpy as np

# plotting packages
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
sns.set(style='white', color_codes=True)
sns.set_context(rc={"font.family":"sans","font.size":24,"axes.titlesize":24,"axes.labelsize":24})

# model building package
import sklearn

# package to clean text
import re

from nltk.corpus import stopwords
import nltk 

import warnings
warnings.filterwarnings("ignore")

from preprocessor import TwitterPreprocessor

import stanza 

Collecting stanza
  Downloading stanza-1.0.1-py3-none-any.whl (193 kB)
[K     |████████████████████████████████| 193 kB 2.8 MB/s 
Installing collected packages: stanza
Successfully installed stanza-1.0.1
You should consider upgrading via the '/opt/conda/bin/python3.7 -m pip install --upgrade pip' command.[0m


In [2]:
# Fetch the NLTK resources named 'stopwords' and 'punkt' (no-op when they are
# already up to date). The return value of the last call is what the cell displays.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Location of the tweet export inside the Kaggle input directory.
path = "/kaggle/input/tweets/Hashtags_fullExport_clean.csv"
# Load the export into a DataFrame; later cells read its `text`, `mentions`
# and `lang` columns.
tweets = pd.read_csv(path, encoding='utf-8')

In [4]:
#cleaning master function
def clean_tweet(tweet, bigrams=True):
    """Strip the retweet header and '#'/'@' symbols, then run TwitterPreprocessor.

    Parameters
    ----------
    tweet : str
        Raw tweet text. Any non-string value (e.g. NaN produced by a missing
        CSV field) yields an empty string instead of raising.
    bigrams : bool, optional
        Not used by this function; kept so existing callers keep working.

    Returns
    -------
    str
        The ``text`` attribute of ``TwitterPreprocessor`` after
        ``fully_preprocess()`` has been called on the stripped tweet.
    """
    if not isinstance(tweet, str):
        return ''

    # Match only the retweeted handle (`\w+`). The previous greedy `.+` ran up
    # to the LAST ": " in the tweet, so a text such as
    # "RT @MichaelKors: Spreading the joy: @RVsmtown ..." lost its first clause.
    tweet = re.sub(r"RT @\w+: ", '', tweet)

    # Keep hashtag / mention words but drop the marker characters themselves.
    tweet = tweet.replace('#', '').replace('@', '')

    p = TwitterPreprocessor(tweet)
    p.fully_preprocess()
    return p.text

# Run every raw tweet through clean_tweet and store the result next to the original text.
tweets['clean_text'] = tweets['text'].map(clean_tweet)

We can explore the tweets a bit first.

# EDA

We look at the text of the 10 most retweeted tweets. 

In [5]:
# Count how often each exact tweet text occurs, leaving out rows whose
# `mentions` field contains 'OriginalFunko' (missing mentions are kept).
mentions_funko = tweets['mentions'].str.contains('OriginalFunko', na=False)
tweet_count = tweets.loc[~mentions_funko, 'text'].value_counts()

We removed the tweets mentioning @OriginalFunko because we consider them noise: the account simply launched a backpack giveaway that required mentioning it. These tweets were captured by our algorithm only because they used #fashion. We don't believe they are related in any way to the actual fashion week phenomenon, so we exclude them.

In [6]:
# Show the ten most frequent tweet texts as a one-column table.
tweet_count.head(10).to_frame()

Unnamed: 0,text
RT @MichaelKors: Spreading the joy: @RVsmtown’s Joy stops by our Fall 2020 #MichaelKorsCollection runway show. #AllAccessKors #NYFW #_imyour_joy https://t.co/ChGzBjSjgX,6833
RT @Coach: Picture perfect. Singer #PeckPalitChoke gets the artist treatment backstage at the #CoachFW20 runway show. #CoachNY #NYFW #GQxPECKxCoachFW20 #GQxPECKxCoach https://t.co/cnYot9KqmU,6192
RT @BritishVogue: .@ygofficialblink's Lisa’s @Prada front-row look at #MFW was inspired by her new hairstyle: https://t.co/fBGiheLTpm https://t.co/5YAvLnAjkj,6127
RT @MEENAVOGUEE: BELLA HADID FOR MARC JACOBS. #NYFW https://t.co/ylFeLZQGeB,6115
RT @PopCrave: Watch @NickiMinaj’s hilarious reaction to the models at Marc Jacobs’ #NYFW show getting close to her husband. https://t.co/agjTZYTfwH,5245
"RT @ChinaDaily: #Blackpink member #Lisa made an appearance at the Prada Show in Milan #FashionWeek 2020. It was her first public appearance this year, her golden suit fascinated Chinese fans！#LalisaWearsPrada #LALISAxPRADAFW @ygofficialblink https://t.co/VZvHAUXlLz",4864
RT @globaltimesnews: Blackpink's #Lisa went to Japan from #MilanFashionWeek for a performance on Saturday. Her solo dance on stage during the song 'Good thing & Señorita' won the hearts of many fans. #BlackpinkinFukuoka #INYOURAREA_WORLDTOURFinale #Lalisa @ygofficialblink https://t.co/in2khel2Om,3995
RT @wkorea: 오늘 밀라노로 출국한 #블랙핑크 #리사 의 공항 룩 #BLACKPINK #LISA #MilanFashionWeek #MFW https://t.co/efONE0u6Ra,3750
"RT @dispatchsns: 뷔(BTS), ""2019 공항패션 모음.zip"" [공항] https://t.co/YelVNeR0Ns #BTS #방탄소년단 #뷔 #김태형 #BTS_V #V #공항 #출국 #입국 #공항패션 #패션 #fashion #디스패치 #dispatch https://t.co/KhYJUvW5ST",2828
RT @Coach: Who else is counting down the minutes to the #CoachFW20 show? #Rosé #KikoMizuhara #Jisoo #CoachNY #NYFW https://t.co/1FkgiZjZGQ,2811


##### Once again, the most retweeted tweets were related to K-Pop bands!
The only two that are not related in any way to K-Pop band members concern the appearances of supermodel Bella Hadid and singer Nicki Minaj at the Marc Jacobs event.

Let's now take a look at the <b><font color='blue'>most common Hashtags</font></b> and at <b><font color='blue'>who is being tweeted</font></b>.

In [7]:
# Frequency of each distinct `mentions` value; display the 20 most common.
mention_counts = tweets['mentions'].value_counts()
mention_counts.head(20)

Poshmarkapp                                                                  194372
OriginalFunko; OriginalFunko; Loungefly                                       28606
Coach                                                                         15682
dispatchsns                                                                   12518
MEENAVOGUEE                                                                    8297
globaltimesnews; ygofficialblink                                               7848
mefeater                                                                       7352
MichaelKors; RVsmtown                                                          6770
PopCrave; NICKIMINAJ                                                           6184
BritishVogue; ygofficialblink; Prada                                           6127
MichaelKors                                                                    5869
OriginalFunko; OriginalFunko; Loungefly; hellokitty                         

Let's look at the tweets mentioning <i>@Poshmarkapp</i>:

In [8]:
# Distinct tweet texts whose `mentions` field is exactly 'Poshmarkapp'.
only_poshmark = tweets['mentions'].eq('Poshmarkapp')
tweets.loc[only_poshmark, 'text'].unique()

array(["So good I had to share! Check out all the items I'm loving on @Poshmarkapp #poshmark #fashion #style #shopmycloset #luckybrand #missme #vanheusen: https://t.co/COqLOTK0nD https://t.co/f66dm4K7fQ",
       "So good I had to share! Check out all the items I'm loving on @Poshmarkapp #poshmark #fashion #style #shopmycloset #michaelkors #underarmour #forever21: https://t.co/8B5szDaJQw https://t.co/lLUAGMPqMW",
       "So good I had to share! Check out all the items I'm loving on @Poshmarkapp #poshmark #fashion #style #shopmycloset #madewell #freepeople #zara: https://t.co/Y4h0axAAus https://t.co/4452dSrbj9",
       ...,
       "So good I had to share! Check out all the items I'm loving on @Poshmarkapp #poshmark #fashion #style #shopmycloset #eddiebauer #turtlefur #aeropostale: https://t.co/NtsPX5rjoY https://t.co/8fWepzJaNn",
       "So good I had to share! Check out all the items I'm loving on @Poshmarkapp #poshmark #fashion #style #shopmycloset #nike #loft #mia: https://t.co/qO03pS

They can be considered noise, just like the @OriginalFunko ones. Let's filter both of them out and look again at the mentions. 

In [9]:
# Top-20 `mentions` values once rows mentioning either promotional account
# are left out (rows with missing mentions are kept).
mentions = tweets['mentions']
is_promo = (
    mentions.str.contains('Poshmarkapp', na=False)
    | mentions.str.contains('OriginalFunko', na=False)
)
tweets.loc[~is_promo, 'mentions'].value_counts().head(20)

Coach                                                                        15682
dispatchsns                                                                  12518
MEENAVOGUEE                                                                   8297
globaltimesnews; ygofficialblink                                              7848
mefeater                                                                      7352
MichaelKors; RVsmtown                                                         6770
PopCrave; NICKIMINAJ                                                          6184
BritishVogue; ygofficialblink; Prada                                          6127
MichaelKors                                                                   5869
ChinaDaily; ygofficialblink                                                   4864
globaltimesnews                                                               4574
kcrimsontide; eBay                                                            4007
wkor

## Let's move on to the actual topic modelling

In [10]:
# Drop, for the rest of the notebook, every row whose `mentions` field contains
# 'Poshmarkapp' or 'OriginalFunko'. `tweets` is rebound to the filtered frame.
mentions = tweets['mentions']
mentions_poshmark = mentions.str.contains('Poshmarkapp', na=False)
mentions_funko = mentions.str.contains('OriginalFunko', na=False)
tweets = tweets[~mentions_poshmark & ~mentions_funko]

We already have a column containing clean text without stopwords, hashtags, links etc. from previous preprocessing we've done. 
Now we transform this into a vector. 

In [11]:
# Tweets per language code, ignoring rows tagged 'und'
# (presumably Twitter's "undetermined" language marker — confirm).
lang_counts = tweets.loc[tweets['lang'] != 'und', 'lang'].value_counts()

# Keep only the language codes that occur in more than 1000 tweets.
languages_to_download = lang_counts[lang_counts > 1000].index
languages_to_download

Index(['en', 'ja', 'ko', 'es', 'fr', 'th', 'it', 'de', 'pt', 'in', 'tr', 'pl',
       'tl', 'ht', 'da', 'et', 'nl', 'ca', 'ar'],
      dtype='object')

In [12]:

# Select the language codes at positions 0-4, 6-8 and 10-11 of
# `languages_to_download`. In the index displayed above this skips 'th' and
# 'in' and everything from 'tl' onwards.
# NOTE(review): the selection is positional, so it silently changes if the
# ordering of `languages_to_download` changes.
kept_positions = (slice(0, 5), slice(6, 9), slice(10, 12))
languages_supported = [
    code
    for positions in kept_positions
    for code in languages_to_download[positions]
]

# Download the default stanza package for every selected language.
for lan in languages_supported:
    stanza.download(lan)

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/master/resources_1.0.0.json: 115kB [00:00, 8.17MB/s]                    
2020-06-04 15:06:46 INFO: Downloading default packages for language: en (English)...
Downloading http://nlp.stanford.edu/software/stanza/1.0.0/en/default.zip: 100%|██████████| 402M/402M [01:01<00:00, 6.53MB/s]
2020-06-04 15:07:55 INFO: Finished downloading models and saved to /root/stanza_resources.
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/master/resources_1.0.0.json: 115kB [00:00, 7.83MB/s]                    
2020-06-04 15:07:55 INFO: Downloading default packages for language: ja (Japanese)...
Downloading http://nlp.stanford.edu/software/stanza/1.0.0/ja/default.zip: 100%|██████████| 220M/220M [00:36<00:00, 6.10MB/s]
2020-06-04 15:08:36 INFO: Finished downloading models and saved to /root/stanza_resources.
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/master/resources_1.0.0.jso

In [13]:
# One stanza pipeline per selected language, keyed by language code.
pipelines = {code: stanza.Pipeline(code) for code in languages_supported}
pipelines

2020-06-04 15:15:16 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | ewt       |
| pos       | ewt       |
| lemma     | ewt       |
| depparse  | ewt       |
| ner       | ontonotes |

2020-06-04 15:15:17 INFO: Use device: gpu
2020-06-04 15:15:17 INFO: Loading: tokenize
2020-06-04 15:15:22 INFO: Loading: pos
2020-06-04 15:15:25 INFO: Loading: lemma
2020-06-04 15:15:26 INFO: Loading: depparse
2020-06-04 15:15:28 INFO: Loading: ner
2020-06-04 15:15:31 INFO: Done loading processors!
2020-06-04 15:15:31 INFO: Loading these models for language: ja (Japanese):
| Processor | Package |
-----------------------
| tokenize  | gsd     |
| pos       | gsd     |
| lemma     | gsd     |
| depparse  | gsd     |

2020-06-04 15:15:31 INFO: Use device: gpu
2020-06-04 15:15:31 INFO: Loading: tokenize
2020-06-04 15:15:31 INFO: Loading: pos
2020-06-04 15:15:34 INFO: Loading: lemma
2020-06-04 15:15:34 INFO: Loading: depparse
2020-06-04 

{'en': <stanza.pipeline.core.Pipeline at 0x7f7ae5b16550>,
 'ja': <stanza.pipeline.core.Pipeline at 0x7f7ae84a40d0>,
 'ko': <stanza.pipeline.core.Pipeline at 0x7f7ad91a3710>,
 'es': <stanza.pipeline.core.Pipeline at 0x7f7aed59d290>,
 'fr': <stanza.pipeline.core.Pipeline at 0x7f7abdd06550>,
 'it': <stanza.pipeline.core.Pipeline at 0x7f79e3c882d0>,
 'de': <stanza.pipeline.core.Pipeline at 0x7f7b08f43450>,
 'pt': <stanza.pipeline.core.Pipeline at 0x7f799d612510>,
 'tr': <stanza.pipeline.core.Pipeline at 0x7f798c7589d0>,
 'pl': <stanza.pipeline.core.Pipeline at 0x7f7942bb80d0>}

In [14]:
nlp = None  # module-level placeholder; lemmatize() does not read it


def lemmatize(args):
    """Lemmatize one tweet with the stanza pipeline matching its language.

    Parameters
    ----------
    args : sequence of two elements
        ``args[0]`` holds the tweet text and ``args[1]`` its language code.
        Both must expose ``.numpy()`` returning UTF-8 encoded bytes (the cell
        that calls this function passes string tensors through ``tf.map_fn``).

    Returns
    -------
    str
        Space-separated lemmas of the words tagged NOUN, ADJ, VERB, ADV or
        PROPN. Returns ``''`` when the tweet has two characters or fewer.
        Words without a lemma, and the ``'-PRON-'`` placeholder, are skipped.
    """
    tweet = args[0].numpy().decode('UTF-8')
    lang = args[1].numpy().decode('UTF-8')

    # Very short strings are skipped; the original author noted they could
    # trigger an index-out-of-range error and carry no information anyway.
    if len(tweet) <= 2:
        return ''

    allowed_postags = {'NOUN', 'ADJ', 'VERB', 'ADV', 'PROPN'}
    skipped_lemmas = {'-PRON-'}

    # Languages without a loaded pipeline fall back to the English one.
    pipeline = pipelines[lang] if lang in languages_supported else pipelines['en']
    doc = pipeline(tweet)

    # `word.lemma` can be None/empty; joining it would raise TypeError, and
    # emitting '' for skipped lemmas used to leave stray double spaces.
    lemmas = [
        word.lemma
        for sent in doc.sentences
        for word in sent.words
        if word.upos in allowed_postags
        and word.lemma
        and word.lemma not in skipped_lemmas
    ]
    return " ".join(lemmas)


In [15]:
import tensorflow as tf
import time
start_time = time.time()

# Keep only the final third of the rows (this notebook processes the data in
# thirds). The slice is open-ended: the previous `:-1` bound silently dropped
# the very last tweet. `.copy()` makes the result an independent frame so the
# column assignments below do not write into a view of the original.
# NOTE: this rebinds `tweets`, so re-running the cell would slice it again.
third_start = 2 * int(len(tweets) / 3)
tweets = tweets.iloc[third_start:].copy()
tweets['clean_text'] = tweets['clean_text'].astype(str)

# lemmatize() expects a (text, language) pair of string tensors per element.
args = (tf.convert_to_tensor(tweets['clean_text']),
        tf.convert_to_tensor(tweets['lang']))
lemmatized_tweets = tf.map_fn(lemmatize, args, dtype=tf.string)

# A string tensor holds raw bytes; decode them so the column (and the CSV
# written from it) contains plain text rather than b'...' values.
tweets['lemmatized'] = [raw.decode('utf-8') for raw in lemmatized_tweets.numpy()]

elapsed_time = time.time() - start_time
print(elapsed_time)

22832.313759088516


In [16]:
# Output file in the Kaggle working directory (note: `path` is rebound here,
# it previously held the input CSV location).
path = "/kaggle/working/Hashtags_lemmatized_thirdThird.csv"
# Write the current `tweets` frame, including its index, to CSV.
tweets.to_csv(path)