In [None]:
!pip install pandas

In [1]:
import pandas as pd

def read_df(region):
    return pd.read_pickle(path='./tweets_database_{}.pickle'.format(region))

In [2]:
def create_nodes(df):

    # create a DF of nodes of the network
    columns = ['user_id', 'username']
    nodelist = pd.DataFrame(columns=columns)

    # insert all tweet.users
    nodelist['user_id'] = df['user_id_str']
    nodelist['username'] = df['user_screen_name']
    nodelist.set_index('user_id')

    def get_users(user_id, username):
        """Get users from mentions, replies, retweets"""
        reduce = df.dropna(subset=[user_id, username])
        insert_list = []
        for index, row in reduce.iterrows():
            user = {'user_id': row[user_id], 'username': row[username]}
            insert_list.append(user)
        return pd.DataFrame(insert_list, columns=columns)

    # insert all users of tweet.user_mentions
    nodelist = nodelist.append(get_users('user_mention_id_str', 'user_mention_screen_name'), sort=False)

    # insert all users of tweet.replies
    nodelist = nodelist.append(get_users('in_reply_to_user_id_str', 'in_reply_to_screen_name'), sort=False)

    # insert all users of tweet.retweets
    nodelist = nodelist.append(get_users('retweeted_user_id_str', 'retweeted_user_screen_name'), sort=False)

    # drop duplicates
    nodelist.drop_duplicates(subset=columns, inplace=True)
    print(nodelist.info)
    return nodelist

In [3]:
def create_edges(df):

    # Create a DF of directed edges connecting the nodes of the network
    columns = ['source_id', 'target_id']
    edgelist = pd.DataFrame(columns=columns)

    def get_interactions(source_id, target_id):
        """Get interactions from mentions, replies, retweets"""
        reduce = df.dropna(subset=[source_id, target_id])
        insert_list = []
        for index, row in reduce.iterrows():
            interaction = {'source_id': row[source_id], 'target_id': row[target_id]}
            insert_list.append(interaction)
        return pd.DataFrame(insert_list, columns=columns)

    # insert all user mentions, source=original, target=user mentioning another user
    edgelist = edgelist.append(get_interactions(source_id='user_mention_id_str', target_id='user_id_str'))

    # insert all replies, source=original, target=user replying to another user
    edgelist = edgelist.append(get_interactions(source_id='in_reply_to_user_id_str', target_id='user_id_str'))

    # insert all retweets, source=original, target=user retweeting another user
    edgelist = edgelist.append(get_interactions(source_id='retweeted_user_id_str', target_id='user_id_str'))

    # drop duplicates
    edgelist.drop_duplicates(subset=['source_id', 'target_id'], inplace=True)

    # remove self-references (e.g. replies to your own account)
    edgelist = edgelist[edgelist.source_id != edgelist.target_id]

    print(edgelist.info)
    return edgelist

In [4]:
def drop_isolates(nodelist, edgelist):
    # drop users that do not have any retweets, replies, or mentions to keep the graph lighter
    user_edges = edgelist['source_id']
    user_edges = user_edges.append(edgelist['target_id'])
    nodelist_filtered = nodelist[nodelist.user_id.isin(list(user_edges))]
    return nodelist_filtered
    

In [5]:
def to_gephi(region, nodelist, edgelist):
    gephi_nodes = nodelist.rename(columns={'user_id': 'Id', 'username': 'Label'})
    gephi_nodes.to_csv('nodelist_{}.csv'.format(region), index=False)
    gephi_edges = edgelist.rename(columns={'source_id': 'Source', 'target_id': 'Target'})
    gephi_edges.to_csv('edgelist_{}.csv'.format(region), index=False)

In [None]:
def run_for_country(region):
    print("Region: {}".format(region))
    df = read_df(region=region)
    nodelist = create_nodes(df)
    edgelist = create_edges(df)
    
    nodelist = drop_isolates(nodelist, edgelist)
    
    to_gephi(region, nodelist, edgelist)
    

In [None]:
   
run_for_country(region='netherlands')
run_for_country(region='germany')

In [8]:
def nx_stats(region):
    # lists for networkx stats
    df = read_df(region=region)
    nodelist = create_nodes(df)
    edgelist = create_edges(df)

    to_gephi(region, nodelist, edgelist)

nx_stats('netherlands')
nx_stats('germany')

<bound method DataFrame.info of                    user_id         username
0                150347332   mijnkunst_enzo
1                557237664            2_bie
2      1207546377645023233    kunstnakanker
3                728482182    shift4cashtag
4      1169703972237729792        khkkoleji
...                    ...              ...
40396              1358811   PetradeBoevere
40398            110969839      goosmdeboer
40416            170653983   RickBrink_mvgz
40432             10156602            issuu
40444  1199743127495618560  MichaelVrijhoef

[24514 rows x 2 columns]>
<bound method DataFrame.info of                  source_id            target_id
0                118084079            150347332
1      1185577801501556742            557237664
2               1011249745  1207546377645023233
3      1185577801501556742            728482182
4      1140896863434022912  1169703972237729792
...                    ...                  ...
2545    841781584579682304           13404086