In [1]:
import ast

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

# Main categorical palette: colour name -> hex code. The insertion order of
# this dict is the order of the default seaborn palette set further down.
_30k_main_palette_dict = {
    "blue": "#202F66",
    "orange": "#FF7048",
    "purple": "#8B87EA",
    "pink": "#D869AB",
    "cyan": "#54C9B9",
    "yellow": "#F3D36E",
}

# Two-colour palette keyed by "danger" / "safe".
_30k_danger_safe_palette_dict = {
    "danger": "#FF4F72",
    "safe": "#58EDB9",
}

# Hex codes for text shades and a cream background colour.
_30k_text_palette_dict = {
    "text_black": "#32363A",
    "text_lighter1": "#6F7273",
    "text_lighter2": "#8B8E8F",
    "bg_cream": "#FAF7F4"
}

# Font family names. Not referenced in the visible cells — presumably used by
# plotting code elsewhere; TODO confirm or remove.
_font_mono = "Inconsolata"
_font_serif = "Canela"
_font_sans = "Gill Sans Nova"

# Load the project's matplotlib style sheet (path is relative to the
# notebook's working directory).
plt.style.use("./utils/minus_thirty_k.mplstyle")

# Use the main palette's hex codes, in dict order, as seaborn's default palette.
_30k = list(_30k_main_palette_dict.values())
sns.set_palette(_30k)

import warnings

# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by pandas/NumPy;
# consider narrowing the filter to specific categories or modules.
warnings.filterwarnings('ignore')

# Wildcard import of the project's tweet helpers. The cells below call
# `get_mentioned_accounts` and `get_rt_account`, which are not defined in this
# notebook, so they presumably come from here.
# NOTE(review): an explicit import would make that dependency visible.
from utils.tweets_utils import *

In [2]:
# Load the tweets CSV (path is relative to the notebook's working directory).
df = pd.read_csv("./data/tweets_2019-2020.csv")

In [3]:
# NOTE(review): this reads the same CSV file that `df` is loaded from, and
# `user` is not referenced by any of the visible cells — confirm whether a
# separate user-level file was intended, or drop this load.
user = pd.read_csv("./data/tweets_2019-2020.csv")

# Extract mentioned users lookup table

In [4]:
# Each `user_mentions` cell is parsed from its text form into a Python object
# (the lookup-building cell later takes len() of it and indexes into it).
# ast.literal_eval accepts only Python literals, so unlike eval() it cannot
# execute arbitrary code that happens to be embedded in the data file.
df['user_mentions'] = df['user_mentions'].apply(ast.literal_eval)

In [5]:
# Print the column names, non-null counts and dtypes of the loaded tweets.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21364 entries, 0 to 21363
Data columns (total 30 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   tweetid                   21364 non-null  int64  
 1   userid                    21364 non-null  object 
 2   user_display_name         21364 non-null  object 
 3   user_screen_name          21364 non-null  object 
 4   user_reported_location    1386 non-null   object 
 5   user_profile_description  7781 non-null   object 
 6   user_profile_url          0 non-null      float64
 7   follower_count            21364 non-null  int64  
 8   following_count           21364 non-null  int64  
 9   account_creation_date     21364 non-null  object 
 10  account_language          21364 non-null  object 
 11  tweet_language            21364 non-null  object 
 12  tweet_text                21364 non-null  object 
 13  tweet_time                21364 non-null  object 
 14  tweet_

Should we also map the numeric Twitter ids in `user_mentions` to the corresponding Twitter `@user_id` handles?

In [6]:
# Build a (numeric_id, user_id) lookup table by pairing every id listed in a
# tweet's `user_mentions` with the @handle found at the same position in the
# tweet text.
# NOTE(review): the pairing is purely positional. If the text contains a
# handle that is missing from `user_mentions`, later ids can be matched to the
# wrong handle — verify against the behaviour of get_mentioned_accounts.
user_ids = []
numeric_ids = []

for tweet_text, tweet_numeric_ids in zip(df['tweet_text'], df['user_mentions']):
    tweet_mentioned_accounts = get_mentioned_accounts(tweet_text)

    # Some @handles in the text are not listed in `user_mentions`, so the
    # lookup is driven by the `user_mentions` column of this data set.
    # zip() stops at the shorter of the two sequences, so a tweet whose text
    # yields fewer handles than ids no longer raises an IndexError.
    for numeric_id, user_id in zip(tweet_numeric_ids, tweet_mentioned_accounts):
        numeric_ids.append(numeric_id)
        user_ids.append(user_id)

user_lookup_df = pd.DataFrame({
    'numeric_id': numeric_ids,
    'user_id': user_ids
})

## How many times users are mentioned in takedown tweets

In [7]:
# Display the lookup table: one row per (numeric id, @handle) pair collected.
user_lookup_df

Unnamed: 0,numeric_id,user_id
0,10228272,@YouTube
1,1082182785501020160,@MaisonWanvipa
2,998815271551827968,@Pachachoncyber
3,20586159,@posttoday
4,4013102233,@pakornwut
...,...,...
20019,229313308,@armypr_news
20020,229313308,@armypr_news
20021,583857653,@shutup2557
20022,2923428150,@jomjai_konjing


# Count mention frequency by user_id

A **mention** is counted whenever `@user_id` appears in a tweet, regardless of whether it is a plain mention or a retweet.

In [8]:
# Count lookup rows per handle; the counted `numeric_id` column becomes the
# mention frequency.
mention_totals = (
    user_lookup_df
    .groupby(by='user_id')
    .count()
    .rename(columns={'numeric_id': 'mention_freq'})
)

# Distinct (numeric_id, user_id) pairs, indexed by handle, so the numeric id
# can be attached back onto the per-handle counts.
handle_to_numeric_id = user_lookup_df.drop_duplicates().set_index('user_id')

mentioned_count = mention_totals.join(handle_to_numeric_id, how='left')

In [9]:
# Turn the `user_id` index back into a regular column ...
mentioned_count = mentioned_count.reset_index()
# ... and rank handles from most to least mentioned, renumbering rows 0..n-1.
mentioned_count = mentioned_count.sort_values(
    by='mention_freq',
    ascending=False,
    ignore_index=True,
)
mentioned_count

Unnamed: 0,user_id,mention_freq,numeric_id
0,@army2pr,1830,1214742430601187328
1,@armypr_news,1559,229313308
2,@WassanaNanuam,921,267243786
3,@1st_Army_Area,782,358491915
4,@weloverta,437,337604843
...,...,...,...
1433,@amirah_mara,1,4203230773
1434,@amm0186,1,1052156140862156800
1435,@andrewbiggs,1,35689802
1436,@angtigre1,1,1185475916689854464


# Count Retweet by user_id

Count only tweets that start with `RT @user_id:`.

In [10]:
# For every tweet, ask get_rt_account for the retweeted account(s) and keep the
# first one; tweets for which it returns nothing are skipped.
RT_user_ids = [
    rt_accounts[0]
    for rt_accounts in map(get_rt_account, df['tweet_text'])
    if len(rt_accounts) > 0
]

In [11]:
# One row per retweet, so the size of each `user_id` group is the number of
# times that handle was retweeted. The result is indexed by `user_id` and
# sorted from most to least retweeted.
RT_count = (
    pd.DataFrame({'user_id': RT_user_ids})
    .groupby('user_id')
    .size()
    .to_frame('RT_freq')
    .sort_values('RT_freq', ascending=False)
)
RT_count

Unnamed: 0_level_0,RT_freq
user_id,Unnamed: 1_level_1
@army2pr,1293
@armypr_news,958
@WassanaNanuam,705
@1st_Army_Area,476
@political_drama,279
...,...
@YouCastNoShadow,1
@YounaChann,1
@ZvRUm7WNz8CDKY3GQvE3X7FXwMHAv34qJ353Tf3aBA=,1
@_bbaekmylight,1


# Merge retweet count and mention count

In [12]:
# Attach retweet counts. `RT_count` is indexed by handle, so it is joined on
# the `user_id` column; the left join keeps handles that were never retweeted.
mentioned_count = mentioned_count.join(RT_count, on='user_id', how='left')
# Handles without a match get NaN from the join — count them as 0 retweets.
# The builtin `int` is used because the `np.int` alias was deprecated in
# NumPy 1.20 and removed in 1.24, where it raises AttributeError.
mentioned_count['RT_freq'] = mentioned_count['RT_freq'].replace(np.nan, 0).astype(int)

In [13]:
# Put the two identifier columns first, followed by the frequency columns.
column_order = ['numeric_id', 'user_id', 'mention_freq', 'RT_freq']
mentioned_count = mentioned_count[column_order]

In [14]:
# Check dtypes and non-null counts after adding the retweet counts.
mentioned_count.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1438 entries, 0 to 1437
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   numeric_id    1438 non-null   object
 1   user_id       1438 non-null   object
 2   mention_freq  1438 non-null   int64 
 3   RT_freq       1438 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 45.1+ KB


In [15]:
# Show the first ten rows of the table.
mentioned_count.head(10)

Unnamed: 0,numeric_id,user_id,mention_freq,RT_freq
0,1214742430601187328,@army2pr,1830,1293
1,229313308,@armypr_news,1559,958
2,267243786,@WassanaNanuam,921,705
3,358491915,@1st_Army_Area,782,476
4,337604843,@weloverta,437,255
5,37950212,@ThaiPBS,436,253
6,1040252793339240448,@prayutofficial,347,211
7,199992361,@political_drama,346,279
8,1206467004338556929,@JuahuaHeadline,324,166
9,87732915,@SpringNews_TV,292,142


# Count Reply

In [16]:
# Tally how often each value of `in_reply_to_userid` occurs; rows where the
# column is missing are dropped before counting.
replied_to_ids = df['in_reply_to_userid'].dropna()
reply_id, count_reply = np.unique(replied_to_ids, return_counts=True)

reply_df = pd.DataFrame({
    'numeric_id': reply_id,
    'reply_freq': count_reply,
})

In [17]:
# Display the per-account reply counts.
reply_df

Unnamed: 0,numeric_id,reply_freq
0,1000364309279522821,2
1,1000440616457715712,1
2,1004609778922881025,1
3,1005965704539934726,1
4,1006375442,3
...,...,...
816,mFKs1iHJWsMz2PfS57HBK3e7VY9nF3SjuaF7aL3oE=,2
817,p880kYlATKJMLLnU5zDev8LGFMv6iyAJ2tPp6rP6FBI=,1
818,qqgooAUqGfHHVLW1OEUTzuN2Yi9cCxXpEAtaDgZ0Tk=,1
819,r0m+1f+BAf8vzeHWPfu9BPAw6GsSvdvZUwcnq9oPA=,1


In [18]:
# Attach reply counts by numeric id. The key is spelled out so the merge does
# not silently depend on which column names the two frames happen to share.
mentioned_count = mentioned_count.merge(reply_df, on='numeric_id', how='left')

# Accounts that were never replied to get NaN from the left merge — count them
# as 0. The builtin `int` is used because the `np.int` alias was deprecated in
# NumPy 1.20 and removed in 1.24, where it raises AttributeError.
mentioned_count['reply_freq'] = mentioned_count['reply_freq'].replace(np.nan, 0).astype(int)

In [19]:
# Check dtypes and non-null counts after adding the reply counts.
mentioned_count.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1438 entries, 0 to 1437
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   numeric_id    1438 non-null   object
 1   user_id       1438 non-null   object
 2   mention_freq  1438 non-null   int64 
 3   RT_freq       1438 non-null   int64 
 4   reply_freq    1438 non-null   int64 
dtypes: int64(3), object(2)
memory usage: 67.4+ KB


In [20]:
# Show the first ten rows of the table, now including reply counts.
mentioned_count.head(10)

Unnamed: 0,numeric_id,user_id,mention_freq,RT_freq,reply_freq
0,1214742430601187328,@army2pr,1830,1293,399
1,229313308,@armypr_news,1559,958,591
2,267243786,@WassanaNanuam,921,705,202
3,358491915,@1st_Army_Area,782,476,302
4,337604843,@weloverta,437,255,182
5,37950212,@ThaiPBS,436,253,179
6,1040252793339240448,@prayutofficial,347,211,129
7,199992361,@political_drama,346,279,40
8,1206467004338556929,@JuahuaHeadline,324,166,157
9,87732915,@SpringNews_TV,292,142,150


# Save user lookup table

In [21]:
# Write the table to CSV; index=False leaves the row index out of the file.
mentioned_count.to_csv("./data/users_lookup.csv", index=False)