In [None]:
import pandas as pd
from googletrans import Translator
from pandas import DataFrame
import matplotlib.pyplot as plt
import numpy as np
import ipywidgets as widgets
import math
from IPython.display import clear_output
from datetime import datetime
from time import mktime
import time
import calendar
import seaborn as sn
import matplotlib as mpl
from IPython.display import clear_output
from IPython.display import display

# Notebook-wide pandas settings.
# Silence the chained-assignment warning.
pd.options.mode.chained_assignment = None
# Raise the display limits so large frames are shown without truncation.
for option_name, limit in (('display.max_rows', 100000),
                           ('display.max_columns', 1000000)):
    pd.set_option(option_name, limit)

### Initial setting

In [None]:
# Input dataset location; the two parts are concatenated when the file is
# loaded, so `directory` keeps its trailing slash.
file_name = 'iran_2021_eng.csv'
directory = 'data/'

In [None]:
# Load the dataset: read the CSV in chunks of 10,000 rows, then concatenate
# the chunks into one frame with a fresh 0..n-1 index.
csv_path = directory + file_name
tp = pd.read_csv(csv_path, chunksize=10000, low_memory=False)
df = pd.concat(tp, ignore_index=True)

In [None]:
# Only needed if the file was loaded with retweets: keep the rows that are
# not retweets and whose language is English.
not_retweet = df['is_retweet'] == False
in_english = df['tweet_language'] == 'en'
df = df.loc[not_retweet & in_english]

In [None]:
# Load the keyword list used to select hashtags. The file is read whole and
# split on any whitespace, giving one entry per word.
# The context manager closes the handle as soon as the read is done (or
# fails); it was previously left open.
with open("data/racewords.txt", "r") as txt_file:
    file_content = txt_file.read()
words = file_content.split()

### Hashtag Distribution

Make sure this cell is run only once before continuing

In [None]:
# define the columns to use as the variables
# `hashtags` is the 'hashtags' column of df; the following cells turn each of
# its entries into a list of tags.
hashtags = df['hashtags']

In [None]:
# preprocess step, convert string to list
def create_list(tags, keywords=None):
    """Parse one raw hashtag cell and keep only the keyword-matching tags.

    ``tags`` is expected to be the string form of a list, e.g.
    ``"['TagA', 'tagB']"``. The square brackets and the single quotes around
    each item are stripped, every tag is lower-cased, and a tag is kept when
    at least one keyword occurs in it as a substring.

    Parameters
    ----------
    tags : object
        Raw cell value. Anything that is not a ``str`` (for example NaN, or
        a list produced by an earlier run) and the literal ``'[]'`` yield an
        empty list.
    keywords : sequence of str, optional
        Substrings to look for. Defaults to the notebook-level ``words``
        list, so existing one-argument calls keep working.

    Returns
    -------
    list of str
        Lower-cased matching tags in their original order. A tag is emitted
        once per occurrence in ``tags`` even if several keywords match it
        (it used to be appended once per matching keyword, which inflated
        its frequency).
    """
    if keywords is None:
        keywords = words
    if not isinstance(tags, str) or tags == '[]':
        return []

    raw_items = tags.replace('[', '').replace(']', '').split(', ')
    lowered = (item.strip('\'').lower() for item in raw_items)
    # any() stops at the first matching keyword, so each tag is kept once.
    return [tag for tag in lowered if any(w in tag for w in keywords)]

# Convert every raw hashtag cell into its filtered list of tags.
# Series.apply works on the values themselves and keeps the index, so no
# label/position lookup is involved. The previous loop read `hashtags[idx]`
# by index *label* but wrote `hashtags.iloc[idx]` by *position*; with a
# non-contiguous index that pairs the wrong rows, and the blanket `except`
# turned the resulting KeyErrors into empty lists.
# create_list returns [] for non-string cells, so no error handling is needed
# here and genuine failures are no longer hidden.
# Note: this rebinds `hashtags` to a new Series instead of writing into the
# original one.
hashtags = hashtags.apply(create_list)

In [None]:
# Clean the tags of every row in df: remove '#' characters and lower-case the
# result. A tag that would become empty is left unchanged.
for row_label in df.index:
    row_tags = hashtags[row_label]
    if type(row_tags) is float:  # NaN cell, nothing to clean
        continue
    for position, raw_tag in enumerate(row_tags):
        cleaned = raw_tag.replace('#', '')
        if cleaned:
            row_tags[position] = cleaned.lower()

In [None]:
# frequency dictionary: tag -> number of times it occurs across all rows
tag_freq = {}
for tag_list in hashtags:
    if type(tag_list) is float:  # NaN entry, no tags
        continue
    for t in tag_list:
        if len(t) == 0:  # skip empty tags
            continue
        tag_freq[t] = tag_freq.get(t, 0) + 1

# (tag, count) pairs ordered from most to least frequent
sorted_tag_freq = sorted(tag_freq.items(), key=lambda pair: pair[1], reverse=True)

In [None]:
# configuring what to display for the pie chart

# Number of individual hashtags to show; everything ranked below them is
# merged into a single 'others' slice.
TOP_N = 9
tags_disp = sorted_tag_freq[:TOP_N]

# 'others' must start exactly where the displayed tags stop. It used to start
# at rank 6 while nine tags were displayed, so ranks 6-9 were counted twice.
extra = sorted_tag_freq[TOP_N:]
extra_sum = sum(freq for _, freq in extra)

# comment out the line below to graph the top hashtags without 'others'
tags_disp.append(('others', extra_sum))

labels, count = list(zip(*tags_disp))

# Share of each slice in percent. The total is computed once, and a zero
# total (no matching hashtags at all) yields 0 % instead of dividing by zero.
total = sum(count)
percent = [c * 100. / total if total else 0.0 for c in count]
legend_labels = ['{0} - {1:1.2f} %'.format(i,j) for i,j in zip(labels, percent)]

#### Pie Graph

In [None]:
# Draw the hashtag shares as a pie chart with the legend placed to the right.
mpl.style.use('default')
fig1, ax1 = plt.subplots()

# ten tab10 colours followed by 'brown'
colors = list(plt.cm.tab10(np.arange(10)))
colors.append('brown')

pie = ax1.pie(count, labels=labels, colors=colors, radius=1, startangle=90)
# pie[0] holds the wedges; pair them with the "tag - share %" labels
ax1.legend(pie[0], legend_labels, loc="center", bbox_to_anchor=(2,0))
plt.show()

### Calendar

In [None]:
# 'tweet_time' column of df, kept as its own Series for the date handling in
# the cells below.
tweet_time = df['tweet_time']

In [None]:
# Positions (0..len(df)-1) of rows whose tweet_time is unusable: a float
# (NaN) or a value shorter than two characters.
# The value is read with .iloc because `i` is a position. The previous
# `tweet_time[i]` was a *label* lookup, which raises KeyError or hits a
# different row once the index is no longer 0..n-1.
rows_to_delete = []
for i in range(len(df)):
    stamp = tweet_time.iloc[i]
    if isinstance(stamp, float) or len(stamp) < 2:
        rows_to_delete.append(i)

In [None]:
# Remove the flagged rows from df in place. rows_to_delete holds positions;
# df.index[...] turns them into the labels that drop() expects.
# NOTE(review): `tweet_time` and `hashtags` were taken from df before this
# drop and may still contain the removed rows — confirm before relying on
# them being positionally aligned with df.
df.drop(df.index[rows_to_delete], inplace=True)

In [None]:
# One entry per element of tweet_time: the value with its last six
# characters cut off. Values that cannot be sliced are reported by index
# label and replaced with a placeholder, so time_list keeps the same length
# and order as tweet_time.
time_list = []
for time_idx, stamp in tweet_time.items():
    try:
        day = stamp[:-6]
    except Exception:
        print(time_idx)
        day = "***********"
    time_list.append(day)

In [None]:
# tag and time pair: for every hashtag, the list of time strings (time_list
# entry minus its last three characters) of the rows that use it.
#
# Rows are matched by index label. time_list was built in tweet_time.index
# order, so zipping the two gives label -> time; hashtags is looked up with
# the same label. The previous loop ran over range(len(df)) and used that
# number both as a label (hashtags[idx]) and as a position (time_list[idx]),
# which misaligns or raises KeyError once df's index is not 0..n-1.
time_by_label = dict(zip(tweet_time.index, time_list))

tag_time = {}
for label in df.index:
    row_tags = hashtags[label]
    if type(row_tags) is float:  # NaN entry, no tags
        continue
    month = time_by_label[label][0:-3]
    for t in row_tags:
        if len(t) != 0:
            # setdefault creates the list on first sight of a tag
            tag_time.setdefault(t, []).append(month)

# (tag, [times]) pairs, most frequently used tag first
sorted_tag_time = sorted(tag_time.items(), key=lambda x: len(x[1]), reverse=True)

Start from here

In [None]:
# date1 = min(time_list)
# Calendar start: fixed date instead of the earliest entry of time_list.
date1 = '2019-01-01'
# Calendar end: the largest entry of time_list (compared as strings).
date2 = max(time_list)

In [None]:
# change last date to next month's first day
date2_datetime = datetime.strptime(date2, '%Y-%m-%d') # change last date to datetime object

# Step one month forward. The year only changes when stepping from December
# to January. The earlier `(month + 1) % 12` produced the invalid month 0 for
# November, and the year was incremented unconditionally.
if date2_datetime.month == 12:
    new_year, new_month = date2_datetime.year + 1, 1
else:
    new_year, new_month = date2_datetime.year, date2_datetime.month + 1

# day=1 gives the first day of that month and avoids impossible dates such as
# February 31 when the last tweet falls on the 29th-31st.
date2_datetime = date2_datetime.replace(year=new_year, month=new_month, day=1)

# change datetime object to string again
date3 = datetime.strftime(date2_datetime, '%Y-%m-%d')

In [None]:
# create dataframe for calendar
# Row labels of the calendar: one 'YYYY-MM' string per month between date1
# and date3. The daily range is formatted to months and then de-duplicated
# (order preserved). Without that step every month label was repeated once
# per day, so a label-based assignment hit ~30 rows and monthly sums counted
# each value many times.
# DatetimeIndex.strftime replaces the older `.format(formatter=...)` call.
date_index = pd.date_range(date1, date3).strftime('%Y-%m').unique().tolist()

In [None]:
# Calendar table: one row per entry of date_index, one column per hashtag,
# all counts starting at zero.
zeros = np.zeros([len(date_index), len(tag_time)], dtype=int)
cal = pd.DataFrame(zeros, index=date_index, columns=list(tag_time.keys()))

# Hashtags ordered from most to least used (order of sorted_tag_time).
# Only the names are extracted. The earlier `tags, time = zip(...)` bound the
# second tuple to `time`, replacing the `time` module imported at the top of
# the notebook, and failed to unpack when there were no hashtags at all.
tags = tuple(tag for tag, _ in sorted_tag_time)

In [None]:
# dates outside the range (one KeyError per label that has no row in cal)
old_date = []

for tag in tag_time: # for each hashtag
    # number of occurrences of the hashtag per date label
    counts = pd.Series(tag_time[tag]).value_counts()
    for date, n in counts.items():
        if date in cal.index:
            # A single .loc[row, column] call writes into cal itself. The
            # chained form `cal.loc[date][tag] = ...` may assign to a
            # temporary object and silently leave cal unchanged.
            cal.loc[date, tag] = n
        else:
            # Checked explicitly: .loc assignment with an unknown label would
            # add a new row rather than raise.
            old_date.append(KeyError(date))

WARNING: Do not re-run this cell after running the next cell; instead, go back and re-run from the cell marked "Start from here".

In [None]:
# Tick labels for the x axis: the first day of every month from 2019-01-01
# up to date3, rendered as 'YYYY-MM' strings.
datetime_axis = pd.date_range(start='2019-01-01', end=date3, freq='MS')
x_axis = [datetime.strftime(month_start, '%Y-%m') for month_start in datetime_axis]

In [None]:
import matplotlib.dates as mdates

# Range slider selecting a slice [start, end] of the ten most used hashtags.
slider = widgets.IntRangeSlider(
    description='Range:',
    min=0,
    max=10,
    step=1,
    value=[0, 10],
    orientation='horizontal',
    readout=True,
    readout_format='d',
    continuous_update=False,
    disabled=False,
)

# Multi-select list of the same ten hashtags; the first one is preselected.
box = widgets.SelectMultiple(
    description='Keywords',
    options=tags[0:10],
    value=tags[0:1],
    rows=10,
    disabled=False,
)

def on_value_change(change):
    """Redraw the line graph when the slider or the keyword box changes.

    ``change['new']`` is either a pair of ints (slider), read as a slice of
    ``tags``, or a sequence of tag names (select box). An empty selection
    draws nothing. In every case the output is then cleared and the two
    widgets are displayed again.
    """
    selection = change['new']
    if len(selection) != 0:
        first = selection[0]
        if type(first) == int:
            start, stop = selection[0], selection[1]
            draw_graph(list(tags[start:stop]))
        elif type(first) == str:
            draw_graph(list(selection))
    clear_output(wait=True)
    display(slider, box)

def draw_graph(labl):
    """Plot the calendar columns named in ``labl`` as lines on a wide axes.

    Reads the notebook-level ``cal`` table and ``x_axis`` tick list. The x
    axis uses a 'YYYY-MM' date formatter and the legend lists ``labl``.
    """
    fig = plt.figure()
    # axes twice as wide as the figure
    ax = fig.add_axes([0, 0, 2, 1])
    ax.plot(cal[labl])
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
    ax.set_xticks(x_axis)
    ax.legend(labl)
    
# Redraw whenever either control's value changes, then show both controls.
for control in (slider, box):
    control.observe(on_value_change, 'value')
display(slider, box)

### Table and Heatmap

In [None]:
def draw_table(labl):
    """Show monthly counts for the given hashtags and their correlation map.

    Parameters
    ----------
    labl : list of str
        Column names of the notebook-level ``cal`` table to include. An
        empty list draws nothing.

    Side effects
    ------------
    Converts ``cal.index`` to a ``DatetimeIndex`` (once), displays the
    monthly table and shows a seaborn heatmap of the column correlations
    with the diagonal set to 0.
    """
    if len(labl) == 0:
        # nothing selected: an empty frame has no correlations to plot
        return

    # Convert the calendar labels to timestamps only if that has not happened
    # yet. No explicit format is passed: the labels are 'YYYY-MM' strings,
    # which the previously used '%Y-%m-%d' format does not describe.
    if not isinstance(cal.index, pd.DatetimeIndex):
        cal.index = pd.to_datetime(cal.index)

    # total per (year, month)
    cal_month = cal.groupby([(cal.index.year), (cal.index.month)]).sum()
    # same rows, labelled (year, 'Jan'/'Feb'/...) for display
    index = pd.MultiIndex.from_tuples(
        [(year, calendar.month_abbr[month]) for year, month in cal_month.index]
    )

    cal_month = cal_month[labl]
    cal_month_top = cal_month.set_index(index)
    display(cal_month_top)

    corrMatrix = cal_month.corr()
    # Zero the diagonal on an explicit copy of the values; writing through
    # `corrMatrix.values` only works when that array happens to be a writable
    # view of the frame.
    corr_values = corrMatrix.to_numpy(copy=True)
    np.fill_diagonal(corr_values, 0)
    corrMatrix = pd.DataFrame(corr_values, index=corrMatrix.index, columns=corrMatrix.columns)

    sn.heatmap(corrMatrix, annot=True, cmap="Greys")
    plt.show()

def on_value_change_table(change):
    """Redraw the table and heatmap when a table control changes.

    The output is cleared and both table widgets are displayed again first.
    ``change['new']`` is then read either as a pair of ints (slider) giving
    a slice of ``tags``, or as a sequence of tag names (select box). An
    empty selection draws nothing.
    """
    clear_output(wait=True)
    display(slider_table, box_table)

    selection = change['new']
    if len(selection) == 0:
        return

    first = selection[0]
    if type(first) == int:
        start, stop = selection[0], selection[1]
        draw_table(list(tags[start:stop]))
    elif type(first) == str:
        draw_table(list(selection))
    
# Range slider selecting a slice [start, end] of the fifty most used hashtags.
slider_table = widgets.IntRangeSlider(
    description='Range:',
    min=0,
    max=50,
    step=1,
    value=[0, 50],
    orientation='horizontal',
    readout=True,
    readout_format='d',
    continuous_update=False,
    disabled=False,
)

# Multi-select list of the same fifty hashtags; the first one is preselected.
box_table = widgets.SelectMultiple(
    description='Keywords',
    options=tags[0:50],
    value=[tags[0]],
    rows=10,
    disabled=False,
)

# Redraw whenever either control's value changes, then show both controls.
for control in (slider_table, box_table):
    control.observe(on_value_change_table, 'value')
display(slider_table, box_table)