 <center><h1>Data for Visualization</h1></center>


This notebook shows the code for preparing some data for visualization.

## Trending Topics

In [72]:
# Libraries used throughout the notebook

import numpy as npy
import pandas as pd
from PIL import Image
from wordcloud import STOPWORDS, WordCloud

# English stop words that the word clouds should leave out
stopwords = set(STOPWORDS)

# The Twitter logo image ("mask.png") is loaded as an array and used as the word-cloud mask
maskArray = npy.array(Image.open("mask.png"))

In [44]:
# Load the two tab-separated input files

topic_df = pd.read_csv("Topic Modelled Data.csv", sep='\t')
sent_df = pd.read_csv("Processed Data.csv", sep='\t')

In [45]:
# Keep only date, time and id from the processed data, then attach the
# topic columns by joining the two frames on their shared 'id' column.
relevant_df = sent_df[['date', 'time', 'id']]
df = relevant_df.merge(topic_df, on='id')

In [46]:
df.sample(n=3)  # Show three randomly chosen rows of the merged frame

Unnamed: 0,date,time,id,processed_text,topic_data,topic_perc_contrib
360804,2020-05-06,18:02:57,1258094919266099200,USER_MENTION USER_MENTION yet updated today co...,"case, people, india, virus, today, government,...",0.983669
420724,2020-05-17,16:04:46,1262051444766789632,eye china india back push probe covid19 origin...,"help, u, india, virus, people, fight, sir, man...",0.996122
531059,2020-06-11,18:53:50,1271153686610358275,person much illness getting corona getting awa...,"one, people, time, please, go, u, need, also, ...",0.990418


In [47]:
# Drop the columns that are irrelevant for the topic counts

df = df.drop(columns=['id', 'topic_perc_contrib', 'time', 'processed_text'])


In [75]:
# Count, for every date, how many rows carry each topic (result is a Series).
# NOTE(review): `adf` is not referenced by any later cell visible in this notebook —
# confirm whether this cell is still needed.
adf=df.groupby('date')['topic_data'].value_counts()

In [110]:
#Group the data with respect to date and count the number of topics

# The count column is named explicitly instead of renaming whatever label
# value_counts() happens to give its result: older pandas reuses 'topic_data',
# newer releases use 'count'. Naming it here guarantees that the column is
# always called 'topic_count', which the phase-splitting cell relies on.
df = (
    df.groupby('date')['topic_data']
    .value_counts()
    .reset_index(name='topic_count')
)
df

Unnamed: 0,date,topic_data,topic_count
0,2020-03-25,"people, india, virus, one, govt, please, day, ...",1694
1,2020-03-25,"u, day, may, india, due, get, people, n, sir, ...",1509
2,2020-03-25,"one, people, time, please, go, u, need, also, ...",1329
3,2020-03-25,"india, people, patient, like, day, sir, delhi,...",1282
4,2020-03-25,"people, u, please, sir, time, government, figh...",1221
...,...,...,...
815,2020-06-14,"people, u, please, sir, time, government, figh...",1201
816,2020-06-14,"one, people, time, please, go, u, need, also, ...",1168
817,2020-06-14,"govt, home, people, one, fight, india, work, c...",1077
818,2020-06-14,"time, day, india, virus, case, world, u, count...",1025


In [243]:
#Split the data into different lockdown phases and save the data

def _topic_totals_for_phase(counts, phase, start, end):
    """Total the per-date topic counts that fall inside one phase.

    Parameters
    ----------
    counts : pandas.DataFrame
        Frame with 'date', 'topic_data' and 'topic_count' columns.
    phase : str
        Label written to the leading "Phase" column.
    start, end : str
        Inclusive bounds, compared against the 'date' column with >= and <=.

    Returns
    -------
    pandas.DataFrame
        Columns Phase, topic_data, topic_count, sorted by topic_count
        in descending order.
    """
    in_phase = counts[(counts['date'] >= start) & (counts['date'] <= end)]
    # Sum only the count column. Summing the whole group would also try to
    # add up the 'date' strings, which recent pandas versions concatenate
    # into the result instead of silently dropping.
    totals = in_phase.groupby('topic_data')[['topic_count']].sum().reset_index()
    totals = totals.sort_values(by='topic_count', ascending=False)
    totals.insert(0, "Phase", phase)
    return totals


lockdown1 = _topic_totals_for_phase(df, "LD1", '2020-03-25', '2020-04-14')
lockdown2 = _topic_totals_for_phase(df, "LD2", '2020-04-15', '2020-05-03')
lockdown3 = _topic_totals_for_phase(df, "LD3", '2020-05-04', '2020-05-17')
lockdown4 = _topic_totals_for_phase(df, "LD4", '2020-05-18', '2020-05-31')
unlock1 = _topic_totals_for_phase(df, "Unlock1", '2020-06-01', '2020-06-14')

# Stack the phases on top of each other and write them out as one file
combined_df = pd.concat([lockdown1,lockdown2,lockdown3,lockdown4,unlock1],ignore_index=True)
combined_df.to_csv("Topic data.csv",index=False)

## Trending Hashtags

In [144]:
# Keep only the date and hashtags columns of the processed data
hash_df = sent_df[['date', 'hashtags']]
hash_df.sample(n=3)

Unnamed: 0,date,hashtags
199578,2020-04-14,#Lockdownextention
635,2020-03-25,#Lockdown21 #testings
31682,2020-03-27,#21daysLockdownIndia


In [203]:
#Split the data into different lockdown phases, find the top 10 hashtags and save the data

def _top_hashtags_for_phase(tagged, phase, start, end, top_n=10):
    """Return the most frequent hashtags among rows dated within one phase.

    Parameters
    ----------
    tagged : pandas.DataFrame
        Frame with 'date' and 'hashtags' columns; each 'hashtags' value is a
        whitespace-separated string of hashtags.
    phase : str
        Label written to the leading "Phase" column.
    start, end : str
        Inclusive bounds, compared against the 'date' column with >= and <=.
    top_n : int
        Number of hashtags to keep (10 by default).

    Returns
    -------
    pandas.DataFrame
        Columns Phase, hashtag, value with at most `top_n` rows, most
        frequent hashtag first.
    """
    in_phase = tagged[(tagged['date'] >= start) & (tagged['date'] <= end)]
    # Split every row on whitespace and stack the pieces into one long
    # Series holding a single hashtag per entry.
    tags = in_phase['hashtags'].str.split(expand=True).stack()
    top = tags.value_counts().head(top_n)
    # Build the output columns explicitly rather than renaming the labels
    # produced by value_counts()/reset_index(): those labels differ between
    # pandas versions (0 vs 'count'), which would leave the CSV header wrong.
    result = pd.DataFrame({"hashtag": top.index, "value": top.values})
    result.insert(0, "Phase", phase)
    return result


lockdown1 = _top_hashtags_for_phase(hash_df, "LD1", '2020-03-25', '2020-04-14')
lockdown2 = _top_hashtags_for_phase(hash_df, "LD2", '2020-04-15', '2020-05-03')
lockdown3 = _top_hashtags_for_phase(hash_df, "LD3", '2020-05-04', '2020-05-17')
lockdown4 = _top_hashtags_for_phase(hash_df, "LD4", '2020-05-18', '2020-05-31')
unlock1 = _top_hashtags_for_phase(hash_df, "Unlock1", '2020-06-01', '2020-06-14')

# Stack the phases on top of each other and write them out as one file
combined_df = pd.concat([lockdown1,lockdown2,lockdown3,lockdown4,unlock1],ignore_index=True)
combined_df.to_csv("Hashtag data.csv",index=False)

## Tone Analyser

In [156]:
# Load the sentiment file and keep only the date plus the seven tone columns

tone_columns = ['date','sadness','confident','joy','analytical','anger','tentative','fear']
tone_df = pd.read_csv("Sentiment Data.csv", sep='\t')[tone_columns]
tone_df.sample(n=3)

Unnamed: 0,date,sadness,confident,joy,analytical,anger,tentative,fear
2032,2020-06-14,1.0,,,1.0,,1.0,
2006,2020-06-11,,,,1.0,,1.0,
1371,2020-05-17,,,,1.0,,,


In [241]:
#Split the data into different lockdown phases and save the data

def _tone_totals_for_phase(tones, phase, start, end):
    """Sum every tone column over the rows dated within one phase.

    Parameters
    ----------
    tones : pandas.DataFrame
        Frame with a 'date' column plus one column per tone.
    phase : str
        Label written to the leading "Phase" column.
    start, end : str
        Inclusive bounds, compared against the 'date' column with >= and <=.

    Returns
    -------
    pandas.DataFrame
        A single row: "Phase" followed by one total per tone column.
    """
    in_phase = tones[(tones['date'] >= start) & (tones['date'] <= end)]
    # Remove 'date' before summing. Summing it would concatenate every date
    # string only for the result to be thrown away, and it would turn the
    # tone totals into object dtype.
    totals = pd.DataFrame(in_phase.drop(columns=['date']).sum()).T
    totals.insert(0, "Phase", phase)
    return totals


lockdown1 = _tone_totals_for_phase(tone_df, "LD1", '2020-03-25', '2020-04-14')
lockdown2 = _tone_totals_for_phase(tone_df, "LD2", '2020-04-15', '2020-05-03')
lockdown3 = _tone_totals_for_phase(tone_df, "LD3", '2020-05-04', '2020-05-17')
lockdown4 = _tone_totals_for_phase(tone_df, "LD4", '2020-05-18', '2020-05-31')
unlock1 = _tone_totals_for_phase(tone_df, "Unlock1", '2020-06-01', '2020-06-14')

combined_df = pd.concat([lockdown1,lockdown2,lockdown3,lockdown4,unlock1],ignore_index=True)
# Reshape to long form: one row per (Phase, Tone) pair with its total in "Value"
combined_df = combined_df.melt("Phase", var_name="Tone", value_name="Value")
combined_df = combined_df.sort_values(by="Phase")
combined_df.to_csv("Tone data.csv",index=False)

## Wordcloud

In [77]:
# Load the processed data and make sure every processed_text value is a str

df = pd.read_csv("Processed Data.csv", sep='\t').astype({'processed_text': str})
df.sample(n=3)

Unnamed: 0,date,time,username,to,replies,retweets,favorites,text,mentions,hashtags,id,permalink,processed_text
331216,2020-05-02,07:20:41,prashantkarkera,EktaWorld,0,0,0,I don't know what you are rejoicing about when...,,#poorcustomercare #ektaparkville #anilkapoor #...,1256483733831639040,https://twitter.com/prashantkarkera/status/125...,dont know rejoicing existing customer given po...
293540,2020-04-26,14:25:33,MannuDas18,,0,1,1,#Help_Them There is a lot of merit in satisfyi...,,#Help_Them,1254416327894278144,https://twitter.com/MannuDas18/status/12544163...,help_them lot merit satisfying hunger hungry d...
41792,2020-03-28,14:02:13,helena_kolesnyk,,1,0,2,Fuck off corona #joke #besafe #CoronavirusPand...,,#joke #besafe #CoronavirusPandemic,1243901209259343872,https://twitter.com/helena_kolesnyk/status/124...,fuck corona joke besafe coronaviruspandemic pu...


In [78]:
# Keep only the two columns the word clouds need

cloud_df = df[['date', 'processed_text']]
cloud_df

Unnamed: 0,date,processed_text
0,2020-03-25,yeah missing freedom life covid19
1,2020-03-25,contribute cm relief fund help delhi govt figh...
2,2020-03-25,bhai assalamualaikum possible please call bhai...
3,2020-03-25,bold adress nation activity banned except esse...
4,2020-03-25,please understand important stay home responsi...
...,...,...
582685,2020-06-14,URL
582686,2020-06-14,covid
582687,2020-06-14,italy face two new coronavirus outbreak
582688,2020-06-14,india become top none modi reign india became ...


In [81]:
#Split the data into different lockdown phases, generate wordcloud and save the data

def _rows_in_phase(frame, start, end):
    """Return the rows of `frame` whose 'date' lies within [start, end] (inclusive)."""
    return frame[(frame['date'] >= start) & (frame['date'] <= end)]


def _render_cloud(texts, out_file):
    """Build a word cloud from a column of texts and save it as an image.

    Parameters
    ----------
    texts : pandas.Series
        One text per row; values are converted to str before joining.
    out_file : str
        Path of the image file to write.

    Returns
    -------
    tuple
        (joined_text, cloud): the space-joined text that was fed to the
        cloud, and the generated WordCloud object.
    """
    # Join the rows directly. The previous version rebuilt every text
    # character by character before joining, which gave the same string
    # but at a much higher cost.
    joined = " ".join(texts.astype(str))
    cloud = WordCloud(background_color = "white",stopwords = stopwords,mask = maskArray)
    cloud.generate(joined)
    cloud.to_file(out_file)
    return joined, cloud


lockdown1 = _rows_in_phase(cloud_df, '2020-03-25', '2020-04-14')
lockdown1text, lockdown1cloud = _render_cloud(lockdown1['processed_text'], "Lockdown1 cloud.png")

lockdown2 = _rows_in_phase(cloud_df, '2020-04-15', '2020-05-03')
lockdown2text, lockdown2cloud = _render_cloud(lockdown2['processed_text'], "Lockdown2 cloud.png")

lockdown3 = _rows_in_phase(cloud_df, '2020-05-04', '2020-05-17')
lockdown3text, lockdown3cloud = _render_cloud(lockdown3['processed_text'], "Lockdown3 cloud.png")

lockdown4 = _rows_in_phase(cloud_df, '2020-05-18', '2020-05-31')
lockdown4text, lockdown4cloud = _render_cloud(lockdown4['processed_text'], "Lockdown4 cloud.png")

unlock1 = _rows_in_phase(cloud_df, '2020-06-01', '2020-06-14')
unlock1text, unlock1cloud = _render_cloud(unlock1['processed_text'], "Unlock1 cloud.png")

#General Data

generaltext, generalcloud = _render_cloud(cloud_df['processed_text'], "General cloud.png")
generalcloud


<wordcloud.wordcloud.WordCloud at 0x1cfb2c172b0>