In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sn

# Apply seaborn's default theme to the plots drawn in this notebook.
sn.set_theme()

In [2]:
%%time
# Load the tweets table from the HDF5 file below (path is relative to the
# notebook's working directory). The enclosing %%time magic reports how long
# the read takes.
datasetTweetsCleanedPath = "../dataset/tweets_no_outliers.h5"
df_tweets = pd.read_hdf(datasetTweetsCleanedPath)

CPU times: user 7.72 s, sys: 7.75 s, total: 15.5 s
Wall time: 15.5 s


In [3]:
%%time
# Load the users table from the HDF5 file below (path is relative to the
# notebook's working directory). Its `bot` column is used later as the
# per-user label.
datasetUsersPath = "../dataset/users_clean_with_indicators_no_outliers.h5"
df_users = pd.read_hdf(datasetUsersPath)

CPU times: user 46 ms, sys: 5.01 ms, total: 51 ms
Wall time: 49.9 ms




## Merge users and tweets

In [4]:
# Single-column frame holding only the `bot` label, carrying the same index
# as df_users so it can be joined onto the tweets by user id.
df_bot_col = df_users[["bot"]]
df_bot_col

Unnamed: 0,bot
2353593986,1
2358850842,0
137959629,1
466124818,1
2571493866,0
...,...
2911861962,0
1378532629,0
126984069,0
2383025796,1


In [5]:
# Non-null value count of every column, per distinct user_id.
df_tweets.groupby("user_id").count()

Unnamed: 0_level_0,retweet_count,reply_count,favorite_count,num_hashtags,num_urls,num_mentions,created_at,text
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
-1.000000e+00,213713,213713,213713,213713,213713,213713,213713,213713
0.000000e+00,6,6,6,6,6,6,6,6
2.000000e+00,2,2,2,2,2,2,2,2
3.000000e+00,3,3,3,3,3,3,3,3
5.000000e+00,2,2,2,2,2,2,2,2
...,...,...,...,...,...,...,...,...
4.800000e+64,1,1,1,1,1,1,1,1
3.300000e+66,1,1,1,1,1,1,1,1
4.000000e+95,1,1,1,1,1,1,1,1
7.000000e+161,1,1,1,1,1,1,1,1


In [6]:
# Attach the `bot` label to every tweet: tweets.user_id is matched against the
# index of df_bot_col. The outer join keeps unmatched rows from both sides, so
# tweets without a matching user end up with bot = NaN.
df_merge = df_tweets.merge(
    df_bot_col,
    left_on="user_id",
    right_index=True,
    how="outer",
)
df_merge

  df_merge = pd.merge(df_tweets, df_bots, how="outer", left_on="user_id", right_index=True)


Unnamed: 0,user_id,retweet_count,reply_count,favorite_count,num_hashtags,num_urls,num_mentions,created_at,text,bot
0,327746321.0,0,0,0,0,0,0,2019-09-11 14:53:55,"if man is a little lower than angels, then ang...",1.0
9050,327746321.0,0,0,0,0,1,0,2019-10-21 17:42:10,read the biography of don henley http://t.co/...,1.0
12889,327746321.0,0,0,0,0,0,0,2019-11-02 15:11:22,don't tell me where your priorities are. show ...,1.0
17348,327746321.0,0,0,0,0,1,0,2019-11-07 22:07:20,learn about the great music of bill justis ht...,1.0
19810,327746321.0,0,0,0,0,1,0,2019-10-12 04:46:17,do you love james bond? â check out these cool...,1.0
...,...,...,...,...,...,...,...,...,...,...
11586308,932.0,0,0,0,0,0,0,2019-09-13 01:09:50,,
11597852,334249560.0,0,0,0,0,0,1,2017-12-09 19:29:36,@maricar85 yo toy currando :-( si te sirve d ...,1.0
11642251,8994.0,198,0,0,0,1,2,2020-04-23 04:34:03,"rt @annecurtissmith: and of course, thank you ...",
11650446,541.0,0,0,0,0,0,0,2019-06-12 19:13:54,whole life just went pause for a moment.,


In [7]:
# Rows with a `bot` label come from users present in df_users; the rest are
# tweets whose user_id found no match in the merge.
has_bot_label = df_merge.bot.notna()
print("NaN users:", df_merge.loc[~has_bot_label, "user_id"].nunique(dropna=False))
valid_users = df_merge[has_bot_label]
print("Valid users:", valid_users["user_id"].nunique(dropna=False))

NaN users: 634
Valid users: 11508


## Keep only the tweets posted in 2019

In [8]:
# Keep only the tweets whose timestamp falls inside calendar year 2019.
# A half-open interval [2019-01-01, 2020-01-01) is used so that a tweet posted
# exactly at 2019-01-01 00:00:00, or during the final second of the year, is
# kept; strict comparisons against 00:00:00 / 23:59:59 would drop them.
start_of_2019 = pd.Timestamp("2019-01-01")
start_of_2020 = pd.Timestamp("2020-01-01")
posted_in_2019 = (valid_users.created_at >= start_of_2019) & (valid_users.created_at < start_of_2020)
# .copy() so that later column assignments do not write into a view of valid_users.
df_tweets_2019 = valid_users[posted_in_2019].copy()

In [9]:
# Sanity check: first and last timestamp left after the 2019 filter.
created_at_2019 = df_tweets_2019["created_at"]
print(f"Min created_at: {created_at_2019.min()}")
print(f"Max created_at: {created_at_2019.max()}")

Min created_at: 2019-01-01 00:21:18
Max created_at: 2019-11-15 22:11:22


In [11]:
# How many distinct users have at least one tweet in the 2019 subset.
n_users_2019 = df_tweets_2019["user_id"].nunique(dropna=False)
print("Number of unique users that posted in 2019:", n_users_2019)

Number of unique users that posted in 2019: 6787


In [12]:
# Summary statistics of the 2019 tweets, to eyeball value ranges before aggregating.
df_tweets_2019.describe()

Unnamed: 0,user_id,retweet_count,reply_count,favorite_count,num_hashtags,num_urls,num_mentions,bot
count,4562614.0,4562614.0,4562614.0,4562614.0,4562614.0,4562614.0,4562614.0,4562614.0
mean,730830500.0,204.632,0.2154298,1.096931,0.124211,0.2000099,0.2296942,0.7160943
std,705194700.0,16413.58,119.5273,467.0575,0.5495074,0.4063234,0.5609365,0.4508917
min,722623.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,329904600.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,467185500.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,615136300.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
max,2722021000.0,3350111.0,200000.0,922462.0,28.0,40.0,40.0,1.0


In [13]:
# (number of rows, number of columns) of the 2019 subset.
df_tweets_2019.shape

(4562614, 10)

In [14]:
# Per column: True if it contains at least one missing value.
df_tweets_2019.isna().any()

user_id           False
retweet_count     False
reply_count       False
favorite_count    False
num_hashtags      False
num_urls          False
num_mentions      False
created_at        False
text              False
bot               False
dtype: bool

#### Only keep year, month and day in the created_at feature

In [15]:
# Truncate every timestamp to midnight so that only year, month and day remain.
# `.dt.normalize()` stays in the datetime64 dtype, which avoids building one
# Python `date` object per row and avoids the unit-less `astype('datetime64')`
# cast, which pandas >= 2.0 rejects.
df_tweets_2019['created_at'] = df_tweets_2019['created_at'].dt.normalize()
df_tweets_2019

Unnamed: 0,user_id,retweet_count,reply_count,favorite_count,num_hashtags,num_urls,num_mentions,created_at,text,bot
0,3.277463e+08,0,0,0,0,0,0,2019-09-11,"if man is a little lower than angels, then ang...",1.0
9050,3.277463e+08,0,0,0,0,1,0,2019-10-21,read the biography of don henley http://t.co/...,1.0
12889,3.277463e+08,0,0,0,0,0,0,2019-11-02,don't tell me where your priorities are. show ...,1.0
17348,3.277463e+08,0,0,0,0,1,0,2019-11-07,learn about the great music of bill justis ht...,1.0
19810,3.277463e+08,0,0,0,0,1,0,2019-10-12,do you love james bond? â check out these cool...,1.0
...,...,...,...,...,...,...,...,...,...,...
9403947,2.236789e+09,13,0,0,0,1,2,2019-03-13,rt @dallasnews: mt @sportsdaydfw: jerry jones:...,0.0
9541859,2.236789e+09,0,0,0,0,0,1,2019-03-14,@demarcusware im so sad your leaving! guess ro...,0.0
9834595,2.236789e+09,0,0,0,0,0,1,2019-03-15,@matrix31 a fancy metal one,0.0
7984056,1.200829e+08,0,0,0,0,0,1,2019-06-08,@donlbe take care,0.0


## Create timeseries

In [16]:
# Collapse the tweets into one row per (user_id, day).
# The engagement and entity counters are additive, so they are summed.
# `bot` is a per-user label repeated on every tweet row (it was merged in from
# df_users): summing it would turn the 0/1 flag into "number of tweets posted
# that day" for bot accounts, so its first value within the group is kept.
# `text` is not listed, so it is left out explicitly instead of being dropped
# silently as a non-numeric column.
daily_sum_columns = [
    "retweet_count", "reply_count", "favorite_count",
    "num_hashtags", "num_urls", "num_mentions",
]
daily_aggregations = {column: "sum" for column in daily_sum_columns}
daily_aggregations["bot"] = "first"
df_tweets_2019 = df_tweets_2019.groupby(["user_id", "created_at"]).agg(daily_aggregations)
df_tweets_2019

  df_tweets_2019 = df_tweets_2019.groupby(['user_id', 'created_at']).sum()


Unnamed: 0_level_0,Unnamed: 1_level_0,retweet_count,reply_count,favorite_count,num_hashtags,num_urls,num_mentions,bot
user_id,created_at,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
7.226230e+05,2019-01-03,7,0,4,1,0,6,0.0
7.226230e+05,2019-01-04,9919,0,2,1,0,5,0.0
7.226230e+05,2019-01-05,0,0,2,0,0,1,0.0
7.226230e+05,2019-01-06,1,0,0,1,0,2,0.0
7.226230e+05,2019-01-07,148,0,3,5,0,8,0.0
...,...,...,...,...,...,...,...,...
2.722021e+09,2019-07-28,0,0,5,0,0,1,0.0
2.722021e+09,2019-07-29,0,0,4,0,0,1,0.0
2.722021e+09,2019-07-30,0,0,6,0,0,4,0.0
2.722021e+09,2019-07-31,2948,0,4,0,0,1,0.0


In [18]:
# success_score = engagement received / (entities used + 0.1), per user and day.
# The 0.1 offset keeps the denominator away from zero on days with no
# hashtags, URLs or mentions.
engagement_received = (
    df_tweets_2019["retweet_count"]
    + df_tweets_2019["reply_count"]
    + df_tweets_2019["favorite_count"]
)
entities_used = (
    df_tweets_2019["num_hashtags"]
    + df_tweets_2019["num_urls"]
    + df_tweets_2019["num_mentions"]
    + 0.1
)
df_tweets_2019['success_score'] = engagement_received / entities_used
df_tweets_2019

Unnamed: 0_level_0,Unnamed: 1_level_0,retweet_count,reply_count,favorite_count,num_hashtags,num_urls,num_mentions,bot,success_score
user_id,created_at,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
7.226230e+05,2019-01-03,7,0,4,1,0,6,0.0,1.549296
7.226230e+05,2019-01-04,9919,0,2,1,0,5,0.0,1626.393443
7.226230e+05,2019-01-05,0,0,2,0,0,1,0.0,1.818182
7.226230e+05,2019-01-06,1,0,0,1,0,2,0.0,0.322581
7.226230e+05,2019-01-07,148,0,3,5,0,8,0.0,11.526718
...,...,...,...,...,...,...,...,...,...
2.722021e+09,2019-07-28,0,0,5,0,0,1,0.0,4.545455
2.722021e+09,2019-07-29,0,0,4,0,0,1,0.0,3.636364
2.722021e+09,2019-07-30,0,0,6,0,0,4,0.0,1.463415
2.722021e+09,2019-07-31,2948,0,4,0,0,1,0.0,2683.636364


In [19]:
# keep only success_score and bot
# The two columns are selected by name rather than dropping everything else:
# passing `axis` positionally to DataFrame.drop is deprecated and raises a
# TypeError from pandas 2.0 on. The order matches what the drop produced.
df_tweets_2019 = df_tweets_2019[["bot", "success_score"]]
df_tweets_2019

  df_tweets_2019.drop(df_tweets_2019.columns.difference(['success_score', 'bot']), 1, inplace=True)


Unnamed: 0_level_0,Unnamed: 1_level_0,bot,success_score
user_id,created_at,Unnamed: 2_level_1,Unnamed: 3_level_1
7.226230e+05,2019-01-03,0.0,1.549296
7.226230e+05,2019-01-04,0.0,1626.393443
7.226230e+05,2019-01-05,0.0,1.818182
7.226230e+05,2019-01-06,0.0,0.322581
7.226230e+05,2019-01-07,0.0,11.526718
...,...,...,...
2.722021e+09,2019-07-28,0.0,4.545455
2.722021e+09,2019-07-29,0.0,3.636364
2.722021e+09,2019-07-30,0.0,1.463415
2.722021e+09,2019-07-31,0.0,2683.636364


In [20]:
# Reshape to wide form: one row per day (created_at becomes the index), one
# column per user_id, each cell holding that user's success_score for the day.
df_tweets_2019 = df_tweets_2019.pivot_table(
    values='success_score',
    index='created_at',
    columns='user_id',
)
df_tweets_2019

user_id,7.226230e+05,7.557460e+05,8.069750e+05,8.872810e+05,1.382561e+06,3.888491e+06,5.812422e+06,5.820222e+06,6.296742e+06,6.775342e+06,...,2.654027e+09,2.658727e+09,2.662897e+09,2.675561e+09,2.680793e+09,2.682782e+09,2.688868e+09,2.711227e+09,2.718000e+09,2.722021e+09
created_at,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-01-01,,,,,,,,,,,...,,,,,,,,,,
2019-01-02,,,,,,,,,,,...,,,,,,,,,,
2019-01-03,1.549296,0.0,,0.0,60.094007,0.0,0.10989,68.395062,,,...,,,,,,,,,,
2019-01-04,1626.393443,0.0,,,0.448065,,4.83871,575.742574,,,...,,,,,,,,,,
2019-01-05,1.818182,0.0,,,0.370370,,0.00000,0.000000,,0.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019-11-11,,,,,,,,,,,...,,,,,,,,,,
2019-11-12,,,,,,,,,,,...,,,,,,,,,,
2019-11-13,,,,,,,,,,,...,,,,,,,,,,
2019-11-14,,,,,,,,,,,...,,,,,,,,,,


In [21]:
# Empty frame whose index lists every calendar day of 2019; it is used as the
# skeleton the per-user series are aligned to.
# ISO-8601 strings are unambiguous ('31-12-2019' can only be parsed day-first
# and makes pandas emit a date-parsing warning), and `name=` labels the index
# directly without going through an intermediate Series.
all_year_dates = pd.DataFrame(index=pd.date_range("2019-01-01", "2019-12-31", name="created_at"))
all_year_dates

  all_year_dates = pd.DataFrame(index = pd.Series(pd.date_range('01-01-2019','31-12-2019')).rename("created_at"))


2019-01-01
2019-01-02
2019-01-03
2019-01-04
2019-01-05
...
2019-12-27
2019-12-28
2019-12-29
2019-12-30
2019-12-31


In [23]:
# Outer-join the per-user series onto the full 2019 calendar so that days on
# which nobody posted still appear as rows (filled with NaN).
df_tweets_2019 = pd.merge(
    all_year_dates,
    df_tweets_2019,
    on="created_at",
    how="outer",
)
df_tweets_2019

Unnamed: 0_level_0,7.226230e+05,7.557460e+05,8.069750e+05,8.872810e+05,1.382561e+06,3.888491e+06,5.812422e+06,5.820222e+06,6.296742e+06,6.775342e+06,...,2.654027e+09,2.658727e+09,2.662897e+09,2.675561e+09,2.680793e+09,2.682782e+09,2.688868e+09,2.711227e+09,2.718000e+09,2.722021e+09
created_at,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-01-01,,,,,,,,,,,...,,,,,,,,,,
2019-01-02,,,,,,,,,,,...,,,,,,,,,,
2019-01-03,1.549296,0.0,,0.0,60.094007,0.0,0.10989,68.395062,,,...,,,,,,,,,,
2019-01-04,1626.393443,0.0,,,0.448065,,4.83871,575.742574,,,...,,,,,,,,,,
2019-01-05,1.818182,0.0,,,0.370370,,0.00000,0.000000,,0.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019-12-27,,,,,,,,,,,...,,,,,,,,,,
2019-12-28,,,,,,,,,,,...,,,,,,,,,,
2019-12-29,,,,,,,,,,,...,,,,,,,,,,
2019-12-30,,,,,,,,,,,...,,,,,,,,,,


In [24]:
# Mark every (day, user) cell with no value as -1.
df_tweets_2019 = df_tweets_2019.fillna(-1)
df_tweets_2019

Unnamed: 0_level_0,7.226230e+05,7.557460e+05,8.069750e+05,8.872810e+05,1.382561e+06,3.888491e+06,5.812422e+06,5.820222e+06,6.296742e+06,6.775342e+06,...,2.654027e+09,2.658727e+09,2.662897e+09,2.675561e+09,2.680793e+09,2.682782e+09,2.688868e+09,2.711227e+09,2.718000e+09,2.722021e+09
created_at,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-01-01,-1.000000,-1.0,-1.0,-1.0,-1.000000,-1.0,-1.00000,-1.000000,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
2019-01-02,-1.000000,-1.0,-1.0,-1.0,-1.000000,-1.0,-1.00000,-1.000000,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
2019-01-03,1.549296,0.0,-1.0,0.0,60.094007,0.0,0.10989,68.395062,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
2019-01-04,1626.393443,0.0,-1.0,-1.0,0.448065,-1.0,4.83871,575.742574,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
2019-01-05,1.818182,0.0,-1.0,-1.0,0.370370,-1.0,0.00000,0.000000,-1.0,0.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019-12-27,-1.000000,-1.0,-1.0,-1.0,-1.000000,-1.0,-1.00000,-1.000000,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
2019-12-28,-1.000000,-1.0,-1.0,-1.0,-1.000000,-1.0,-1.00000,-1.000000,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
2019-12-29,-1.000000,-1.0,-1.0,-1.0,-1.000000,-1.0,-1.00000,-1.000000,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
2019-12-30,-1.000000,-1.0,-1.0,-1.0,-1.000000,-1.0,-1.00000,-1.000000,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
