In [None]:
!pip install pandas

In [3]:
import pandas as pd

# load dataframe previously stored
df = pd.read_pickle(path='./tweets_database.pickle')
print(df.info)

<bound method DataFrame.info of                    tweet_id                   inserted  \
0       1234075973857009664 2020-03-01 17:46:23.134555   
1       1234076112017403904 2020-03-01 17:46:23.141314   
2       1234082785683329025 2020-03-01 17:46:23.146878   
3       1234082826485628933 2020-03-01 17:46:23.151518   
4       1234073877694816259 2020-03-01 17:46:23.155700   
...                     ...                        ...   
178030  1231794957444427776 2020-03-01 18:06:59.212246   
178031  1231798791881330689 2020-03-01 18:06:59.217530   
178032  1231800562032635904 2020-03-01 18:06:59.222633   
178033  1231804397086269440 2020-03-01 18:06:59.238009   
178034  1231804315024556032 2020-03-01 18:06:59.242992   

                            created_at          user_id_str user_screen_name  \
0       Sun Mar 01 11:20:14 +0000 2020   977745564875067392   louise123_dahl   
1       Sun Mar 01 11:20:47 +0000 2020  1027087573406167041          dori_ww   
2       Sun Mar 01 11:47:18 +00

In [4]:
# create a DF of nodes of the network
columns = ['user_id', 'username']
nodelist = pd.DataFrame(columns=columns)

# insert all tweet.users
nodelist['user_id'] = df['user_id_str']
nodelist['username'] = df['user_screen_name']
nodelist.set_index('user_id')

def get_users(user_id, username):
    """Get users from mentions, replies, retweets"""
    reduce = df.dropna(subset=[user_id, username])
    insert_list = []
    for index, row in reduce.iterrows():
        user = {'user_id': row[user_id], 'username': row[username]}
        insert_list.append(user)
    return pd.DataFrame(insert_list, columns=columns)

# insert all users of tweet.user_mentions
nodelist = nodelist.append(get_users('user_mention_id_str', 'user_mention_screen_name'), sort=False)

# insert all users of tweet.replies
nodelist = nodelist.append(get_users('in_reply_to_user_id_str', 'in_reply_to_screen_name'), sort=False)

# insert all users of tweet.retweets
nodelist = nodelist.append(get_users('retweeted_user_id_str', 'retweeted_user_screen_name'), sort=False)

# drop duplicates
nodelist.drop_duplicates(subset=columns, inplace=True)
print(nodelist.info)

# rename columns for Gephi
gephi_nodes = nodelist.rename(columns={'user_id': 'Id', 'username': 'Label'})
gephi_nodes.to_csv('nodelist.csv', index=False)

<bound method DataFrame.info of                     user_id         username
0        977745564875067392   louise123_dahl
1       1027087573406167041          dori_ww
2                 543389122          cpcxxps
3       1213476249944723456  PIxODnSW3uIVKgr
4                2937975091     BreasleyAdam
...                     ...              ...
144958            199206990       UniSalerno
145040            140043541        lescienze
145189            192899133    fam_cristiana
1682             2943709047       burgestine
2114             2176147594  DagoWthAttitude

[71534 rows x 2 columns]>


In [5]:
# Create a DF of directed edges connecting the nodes of the network
columns = ['source_id', 'target_id']
edgelist = pd.DataFrame(columns=columns)

def get_interactions(source_id, target_id):
    """Get interactions from mentions, replies, retweets"""
    reduce = df.dropna(subset=[source_id, target_id])
    insert_list = []
    for index, row in reduce.iterrows():
        interaction = {'source_id': row[source_id], 'target_id': row[target_id]}
        insert_list.append(interaction)
    return pd.DataFrame(insert_list, columns=columns)

# insert all user mentions, source=original, target=user mentioning another user
edgelist = edgelist.append(get_interactions(source_id='user_mention_id_str', target_id='user_id_str'))

# insert all replies, source=original, target=user replying to another user
edgelist = edgelist.append(get_interactions(source_id='in_reply_to_user_id_str', target_id='user_id_str'))

# insert all retweets, source=original, target=user retweeting another user
edgelist = edgelist.append(get_interactions(source_id='retweeted_user_id_str', target_id='user_id_str'))

# drop duplicates
edgelist.drop_duplicates(subset=['source_id', 'target_id'], inplace=True)

#TODO: remove self-references

print(edgelist.info)

# rename columns for Gephi
gephi_edges = edgelist.rename(columns={'source_id': 'Source', 'target_id': 'Target'})
gephi_edges.to_csv('edgelist.csv', index=False)

<bound method DataFrame.info of                   source_id            target_id
0                  74440023   977745564875067392
1                3393740861  1027087573406167041
2        939629034962653184            543389122
3       1013349325555011584  1213476249944723456
4       1233735020151496706           2937975091
...                     ...                  ...
123786  1071767979271077888  1140688563346071552
123842  1071767979271077888            304507855
123862  1071767979271077888   820761573262290944
124004  1071767979271077888            112286587
125725  1071767979271077888           3033217867

[101246 rows x 2 columns]>


In [None]:
# TODO: drop users that do not have any retweets, replies, or mentions to keep the graph lighter