#### Functions

In [None]:
import pandas as pd
import numpy as np
import random

!pip install ipython-autotime
%load_ext autotime


The autotime extension is already loaded. To reload it, use:
  %reload_ext autotime
time: 2.74 s (started: 2021-05-01 20:09:06 +00:00)


In [None]:
!pip install emoji
import emoji
import nltk
# Fetch the NLTK resources for the tokenizer and the English stop-word list.
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from gensim.parsing.preprocessing import STOPWORDS
# NOTE: the next two lines rebind `stopwords`. It stops being the NLTK corpus
# reader and becomes first a list (NLTK English stop words plus 'does') and
# then the union of gensim's STOPWORDS with that list. `clean_tweets` relies
# on this final collection for its membership test.
stopwords = stopwords.words('english') + ['does']
stopwords = STOPWORDS.union(stopwords)

import string 

import matplotlib.pyplot as plt

from nltk.stem import WordNetLemmatizer
# Resources for the WordNet lemmatizer and the perceptron POS tagger.
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
# Shared lemmatizer instance used by `clean_tweets`.
lema = WordNetLemmatizer()

from nltk.stem import PorterStemmer
porter = PorterStemmer()


from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import CountVectorizer


# NOTE: this import rebinds the name STOPWORDS to wordcloud's collection; the
# gensim STOPWORDS imported earlier has already been merged into `stopwords`.
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import warnings
# Silence every warning for the rest of the session.
warnings.filterwarnings("ignore")

# Display frames without truncating rows, columns, line width or cell text.
# `None` means "no limit" for each option. The former `-1` for max_colwidth is
# deprecated since pandas 1.0 and rejected by newer releases.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
time: 3.75 s (started: 2021-05-01 20:38:57 +00:00)


In [None]:
def clean_tweets(df):
    """Normalise tweet text and tokenise it for clustering.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``tweet_text`` column. The frame passed in is not
        modified; all work happens on a copy.

    Returns
    -------
    pandas.DataFrame
        A copy of the input rows that survive filtering, with two added columns:

        * ``tweetsProc``   - lower-cased text with links, mentions, hashtags,
          ampersand entities, emojis, stop words, non-alphabetic words and
          words of one or two characters removed, and remaining words lemmatised.
        * ``tweets_token`` - the set of tokens of ``tweetsProc``.

        Rows with fewer than three words after link/mention removal are
        dropped, as are rows whose ``tweetsProc`` duplicates an earlier row.
    """
    # Copy first: the caller's frame must not gain columns as a side effect,
    # and later assignments must not target a slice of another frame.
    df = df.copy()

    # Missing text is treated as empty (and then removed by the word-count
    # filter) instead of turning into the literal token "nan" later on.
    text = df['tweet_text'].fillna('').astype(str).str.lower()

    # remove links; regex=True is explicit because pandas >= 2.0 treats the
    # pattern as a literal string by default. The text is already lower-cased,
    # so no case-insensitive flag is needed.
    text = text.str.replace(r'http\S+|www\.\S+', '', regex=True)
    # remove mentions, hashtags, ampersand entities and newlines
    text = text.str.replace(r'@\S+|#\S+|&\S|&amp|\n', ' ', regex=True)
    df['tweetsProc'] = text

    # remove posts with fewer than three words
    df = df[df['tweetsProc'].str.split().str.len() >= 3].copy()
    df['tweetsProc'] = df['tweetsProc'].str.replace(' [removed]', '', regex=False)

    # remove emojis; emoji >= 2.0 dropped get_emoji_regexp(), so prefer
    # replace_emoji() when the installed version provides it.
    if hasattr(emoji, 'replace_emoji'):
        df['tweetsProc'] = df['tweetsProc'].apply(
            lambda tweet: emoji.replace_emoji(tweet, replace=''))
    else:
        df['tweetsProc'] = df['tweetsProc'].str.replace(
            emoji.get_emoji_regexp(), '', regex=True)

    def is_valid_word(word):
        return word not in stopwords and word.isalpha() and len(word) > 2

    def normalise(tweet):
        # Filter, lemmatise, then filter again: a lemma can itself be a stop
        # word or too short even though the original word was not.
        kept = (word for word in tweet.split() if is_valid_word(word))
        lemmas = (lema.lemmatize(word) for word in kept)
        return ' '.join(word for word in lemmas if is_valid_word(word))

    df['tweetsProc'] = df['tweetsProc'].apply(normalise)

    df['tweets_token'] = df['tweetsProc'].apply(lambda x: set(nltk.word_tokenize(x)))
    df = df.drop_duplicates(subset=['tweetsProc'])
    return df

time: 20.8 ms (started: 2021-05-01 20:09:12 +00:00)


#### Code

#### Data Preparation

In [None]:
from google.colab import drive
# Mount Google Drive at the relative path 'drive' (requires the Colab runtime).
drive.mount('drive')
# NOTE: this rebinds `drive` from the google.colab module to the data-directory
# path string. Later cells prepend it to file names; it already ends with '/'.
drive = 'drive/My Drive/Spring 2021/Stringer/Data/'
# drive = '/content/drive/MyDrive/Stringer/Data/'

Drive already mounted at drive; to attempt to forcibly remount, call drive.mount("drive", force_remount=True).
time: 5.08 ms (started: 2021-05-01 20:09:42 +00:00)


In [None]:
# Name of the input file inside the data directory; it is also reused as the
# suffix of the output file names when results are saved.
doc2cluster = '2021-05-01-15-41-07-KE_15000_2021-04-29_search.csv'
data = pd.read_csv(f'{drive}{doc2cluster}')

# Despite its name, `p90` holds the 10th percentile of the follower counts:
# accounts at or below this value are discarded.
follower_counts = data['user_follower_count']
p90 = np.percentile(follower_counts.to_numpy(), 10)
print(f'remove 10% of tweets with users lower than {p90} followers')

has_enough_followers = follower_counts.gt(p90)
data = data.loc[has_enough_followers]
print(f'after remove, there are {len(data)} tweets left')

remove 10% of tweets with users lower than 28.0 followers
after remove, there are 13466 tweets left
time: 1.31 s (started: 2021-05-01 20:09:44 +00:00)


In [None]:
df = clean_tweets(data)
tweets = df[['tweet_id','tweet_text','tweetsProc','tweets_token']]

time: 16.6 s (started: 2021-05-01 20:09:49 +00:00)


In [None]:
import gensim
from gensim import corpora, models
from gensim.matutils import corpus2dense, corpus2csc

# Build the vocabulary from the per-tweet token sets, then prune terms that
# occur in fewer than 15 tweets or in more than 40% of them.
dictionary = corpora.Dictionary(tweets['tweets_token'])
dictionary.filter_extremes(no_below=15, no_above=0.4, keep_n=100000)

num_terms = len(dictionary.keys())
num_docs = dictionary.num_docs

# Bag-of-words representation of every tweet. Each tweet is a *set* of tokens,
# so every term it contains is counted once.
bow_corpus = list(map(dictionary.doc2bow, tweets['tweets_token']))

# Re-weight the counts with TF-IDF.
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

# Dense matrix sized num_terms x num_docs; later cells transpose it so that
# tweets are on the rows.
corpus_tfidf_dense = corpus2dense(corpus_tfidf, num_terms=num_terms, num_docs=num_docs)

time: 760 ms (started: 2021-05-01 20:10:06 +00:00)


#### Model

In [None]:
# https://jyotiyadav99111.medium.com/selecting-optimal-number-of-clusters-in-kmeans-algorithm-silhouette-score-c0d9ebb11308
from collections import Counter

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# KMeans expects one sample per row, so put the tweets on the rows.
samples = corpus_tfidf_dense.T

# Candidate numbers of clusters and the silhouette score obtained for each;
# silhouette[i] is the score of ks[i].
ks = range(2, 10)
silhouette = []
for k in ks:
    candidate = KMeans(n_clusters=k, random_state=42)
    candidate.fit(samples)
    score = silhouette_score(samples, candidate.labels_, metric='euclidean')
    silhouette.append(score)
# plt.plot(ks, silhouette, '-o', color='black')
# plt.xlabel('number of clusters, k')
# plt.ylabel('silhouette score')
# plt.xticks(ks)
# plt.show()

# Choose the k with the largest silhouette score. The position of the best
# score is an index into `ks` (which starts at 2), not k itself, so it has to
# be mapped back through `ks`.
k = ks[silhouette.index(max(silhouette))]

# Refit with the same seed used during the search so the final model is the
# one that was scored and the run is reproducible.
model = KMeans(n_clusters=k, random_state=42)
model.fit(samples)

# Number of tweets assigned to each cluster.
Counter(model.labels_)

Counter({0: 3894, 1: 250, 2: 571, 3: 359, 4: 845, 5: 398, 6: 1455})

time: 1min 59s (started: 2021-05-01 20:10:45 +00:00)


#### Sample results

In [None]:
result = {}
# Mean TF-IDF weight of every term over all tweets (tweets are the rows of the
# transposed matrix).
global_mean = corpus_tfidf_dense.T.mean(axis=0).tolist()
mean_weights = np.asarray(global_mean, dtype=np.float64)
for i, center in enumerate(model.cluster_centers_):
    # Indices of the 10 terms whose centroid weight exceeds the corpus-wide
    # mean by the largest margin, ordered from largest to smallest margin.
    margin = np.asarray(center, dtype=np.float64) - mean_weights
    index = margin.argsort()[-10:][::-1]
    words = [dictionary.get(n) for n in index]
    result['cluster' + str(i + 1)] = words
    # create_wordcloud(words)
top10words = pd.DataFrame(result)
print('Top 10 Words From Each Clusters')
display(top10words)

Top 10 Words From Each Clusters


Unnamed: 0,cluster1,cluster2,cluster3,cluster4,cluster5,cluster6,cluster7
0,jab,blood,case,vaccine,covid,johnson,coronavirus
1,pfizer,clot,new,passport,test,boris,case
2,pandemic,risk,rise,dose,patient,northern,new
3,read,astrazeneca,active,maharashtra,death,ireland,variant
4,business,rare,surge,shortage,day,brexit,reported
5,impact,contraceptive,reported,centre,think,violence,death
6,moderna,pill,number,dos,year,irish,infection
7,april,chance,increase,minister,positive,border,recovery
8,apr,regulator,record,state,dying,peace,england
9,worker,link,rising,modi,hospital,lie,university


time: 49.5 ms (started: 2021-04-30 13:24:22 +00:00)


In [None]:
from sklearn.metrics import pairwise_distances

result = {}

for k_i in range(len(model.cluster_centers_)):
    column = 'cluster' + str(k_i + 1)
    # Row positions (shared by `tweets` and the transposed TF-IDF matrix) of
    # the tweets assigned to this cluster.
    indices = np.flatnonzero(model.labels_ == k_i)
    if indices.size == 0:
        # Nothing to rank; keep the column so the table still lists the cluster.
        result[column] = []
        continue
    cluster = corpus_tfidf_dense.T[indices]
    distance = pairwise_distances(cluster, model.cluster_centers_[k_i].reshape(1, -1))
    # argsort is ascending, so the first five positions are the tweets closest
    # to the centroid, i.e. the most representative ones. Taking the last five
    # would return the cluster's outliers instead.
    nearest = distance.ravel().argsort()[:5]
    tweets_index = indices[nearest].tolist()

    result[column] = tweets.iloc[tweets_index]['tweet_text'].tolist()
# Build from Series so a cluster with fewer than five tweets is padded with NaN
# instead of raising on unequal column lengths.
top5tweets = pd.DataFrame({name: pd.Series(texts, dtype=object) for name, texts in result.items()})
print('Top 5 Tweets From Each Clusters')
display(top5tweets)

Top 5 Tweets From Each Clusters


Unnamed: 0,cluster1,cluster2,cluster3,cluster4,cluster5,cluster6,cluster7
0,"@sedvitae @Pammieoz Telling porkies, as usual. \nhttps://t.co/rp28rQzFIG","My mum is due her 2nd AstraZeneca vaccine, on Sunday. I’m honestly nervous for her.","Not one rational reason left to not unfreeze the import of vaccines, get Pfizer, moderna, Sputnik and johnson and johnson in, let the rich pay more their vaccines, allocate the 3000 crores to ramp up Covishield, let go of central control over vaccines or allot by case load. Now.",“FARRAKHAN WARNS: Do Not Take the Experimental Covid-19 Vaccine!”\n\n(Side note: This thread is not in my own words. Everything i’m typing are citations &amp; sources that can be found on https://t.co/zX5IluZz1P Vaccine (I broke the link up because 🐦 blocks it from being tweeted.,I have nothing against fans being allowed into the Euros but please explain how Covid 19 discriminates against club football.\n\nClubs are in more financial trouble than the SFA are in so why are the authorities prioritising this event ahead of others? @jasonleitch @NicolaSturgeon,"We had a United Kingdom to be proud of, we had a tolerant and welcoming society, we had peace in N Ireland. We had relative security and prosperity within the EU. We had freedom of movement. We abided by the rule of law. Then we had Brexit and Boris Johnson. Now we have chaos.","Both the coronavirus pandemic and arduous work hours are taking their toll. Check out our series about burnout, with analysis of one of the biggest problems facing businesses and their employees, personal stories and tips on how to avoid it https://t.co/BYOmyEcFoO"
1,@Karenlovecheese Surprised they haven't gone on to call it the EU Astra-Zeneca vaccine!!,"If it isn’t him announcing his halting of AstraZeneca and opting for Pfizer, he can keep it. #auspol https://t.co/WuCnzkVS9k","@janetmmcgowan longer than 2 weeks (so called long covid), with definitely people reporting problems for months. So for me the vaccine isn’t just about preventing death. It’s about keeping hospital cases very low, keeping chronic illness low, which also helps economy &amp; business. /end",Why are there vaccine courts set up in the USA?\n\nWhy have billions been paid out in vaccine damages over the years. https://t.co/Cb88ybmWop,"Saw #VakeelSaabOnApril9th at Overseas Censor Board !! Totally SPEECHLESS ! #PawanKalyan Stole the Show all the way. After #Master, this film will surely cross 100 cr in Covid Situation. Best Tollywood Court room drama ever made. #VakeelSaab is BLOCKBUSTER. ⭐⭐⭐⭐ https://t.co/T5o3bq5ebS","Boris Johnson, Jacob Rees-Mogg and the rest of the Cabinet should resign.\nThere lies are catching up with them. https://t.co/ks5SwQ5DYr","This great news was bashed back in April. Several of us tried to get this in front of the “right” individuals, to no avail. We went to social media and brought the news out directly. \nTY Twitter \n\nAntiparasitic drug Ivermectin kills coronavirus in 48 hours https://t.co/2YHN4GExoT"
2,"@vickyallover I was manageably fat before COVID, now it's unmanageably fat...\n\nI was fit and fat before, now I'm just double fat hahaha",@JohnBoweActor How long did it take for this to come out? You have to remember with these companies there has to be a serious issue for them to admit any wrongdoing. They still wont directly say their vaccine has caused these clots btw.\n\nhttps://t.co/4OkG4Vtjhu,"I fear that every time Hancock or Johnson say something to reassure folk about the AZ vaccine, more people start to worry that it isn't safe. If I didn't know better about this very rare case of them being right, I'd probably be thinking the same...","@AndreasQuinntia @mcmann_ryan @Femi_Sorry &gt;Why did you claim the UK didn't discover the Oxford vaccine or manufacture it?&lt;\nSince not the UK, but Oxford University did so.\nThe basic argument was that the EU has neither developed nor manufactured a vaccine.\nAnd if that's true, my statement that the UK also didn't is valid.",West Bengal CM Mamata Banerjee unlikely to attend today's meeting called by PM Narendra Modi over Covid situation. Chief Secretary Alapan Bandyopadhyay to attend the meeting with PM Modi: Sources,"Boris Johnson(Nov 2018) - ""I don't see how you can support a border down the Irish Sea from a democratic point of view... I don't see how Unionists can support it... &amp; I'll be voting against it."" He went on to vote for exactly this. \n\n https://t.co/tFLlZMfu9u","3 things to follow during this critical stage of Coronavirus\n\n1. Stop preaching now to wear mask, Sanitize &amp; take safety, everyone knows\n\n2. When u go out, stop thinking what others are doing, take ur own protocols &amp; safety\n\n3. If ur loved one die, only u will repent, remember"
3,Anthony #Fauci's limitless publicity thirst is undermining the war on #COVID19 https://t.co/46djICZU1z via @nypost,"@jessphillips Risk of clotting, contraceptive pill, 1400:1\nRisk of DVT on long haul flights, 150000:1\nRisk of clotting, Pfizer vaccine, 478261:1\nRisk of clotting, AstraZeneca vaccine, 357143:1\n\nRisk of clotting, COVID ICU patients, between 3:10 &amp; 7:10\n\nDo the effing math!","@CoinMarketCal $MRPH for sure, with real world use cases and major adoption with corps and gov, this one will continue to shine even through the next bear market.\n\n#Supplychain #logistics has become an important sector due to Covid.\nProject has an amazing, experienced team and huge network",Let us then !!!\nWe don’t have passports or a ban on vertical drinking for Flu.\nhttps://t.co/wh9IdRjcK0,@yvonneliverbird Exactly yvonne.. Yet we have precise figures for covid deaths. also the pharmaceutical companies are immune from prosecution under emergency legislation..,"The Prime Minister chose the Irish Sea as a divide between Northern Ireland and the rest of the UK, but instead of being honest about it, he dissembled\n#NI https://t.co/QiYUelb0IV","'Today, the most pressing issue to have arisen is whether a global vaccination programme is needed to end the coronavirus crisis. This question is so important that a debate urgently needs to be conducted to reach a global consensus on three basic points.' https://t.co/LbDydmZtAP"
4,Emplyee Activism “here to stay”. Its how organisations listen (or not) and what they do.. https://t.co/pejZ9K96Ce,"The government continues to arrogantly push forward with their rollout of the AstraZeneca vaccine.\n\nThis is despite confirmed links to blood clotting, which they initially denied, but now claim are minuscule compared to the benefit.\n\nYour thoughts? https://t.co/jylQ1S5r3l",@loicl 133m worldwide cases 2.89m deaths\nNo interest in playing UK/Johnston up or down.Covid played out approx the same most places\nThere was no amazing decision.What accounts for differences?(1st Lockdown?)\nThe World Bank is a lot more concerned about lot of other countries before UK.,Japan starting deliveries of vaccine for elderly. Maybe we’ll all fall into that age group when it’s our turn to get vaccinated... https://t.co/tZHMyLnQF8,"@JohnVPegg1 Hi John. Thanks for your comment. We are due to re-open our reception at Croft Street, Burnley on Monday (12 April) to provide face to face customer support. This had been temporarily closed due to COVID restrictions. We also have Neighbourhood Officers to offer support.","While this is true, &amp; I'm all for pragmatism, this was a predicted outcome of Brexit. I think those who saw it coming deserve a moment of 'I told you so,' as (a) they did and (b), despite his protestations, Johnson FAILED to prevent something he cheerily dismissed as Project Fear https://t.co/5TOOPDJUXR",@GrahamJ18821678 The common cold is a form of Coronavirus so all the measures taken to avoid Covid-19 will also prevent colds. Flu cases have also dropped dramatically. Increased hand hygiene has also led to a reduction in MRSA infections.


time: 267 ms (started: 2021-04-30 13:24:28 +00:00)


#### Save results

In [None]:
# Write the per-cluster keyword table into the data directory's output folder;
# the input file name (including its .csv extension) is appended as a suffix.
keywords_path = f'{drive}output/tweets_cluster_keywords{doc2cluster}'
top10words.to_csv(keywords_path)

time: 15.1 ms (started: 2021-04-30 13:24:40 +00:00)


In [None]:
# Attach the cluster label of every cleaned tweet; `model.labels_` must have
# one entry per row of `df`.
df['clusters'] = model.labels_
# `drive` already ends with '/', so the relative part must not start with one
# (this matches how the keyword table path is built).
df.to_csv(drive + 'output/tweets_cluster_output' + doc2cluster)
# df.to_csv('/content/drive/MyDrive/Stringer/Data/output/tweets_cluster_output.csv')

time: 171 ms (started: 2021-04-29 23:16:50 +00:00)
