In [60]:
!pip install pandas



You are using pip version 9.0.1, however version 20.0.2 is available.
You should consider upgrading via the 'python -m pip install --upgrade pip' command.


In [61]:
import pandas as pd

def read_df(region):
    return pd.read_pickle(path='./tweets_database_{}.pickle'.format(region))

In [62]:
def create_nodes(df):

    # create a DF of nodes of the network
    columns = ['user_id', 'username']
    nodelist = pd.DataFrame(columns=columns)

    # insert all tweet.users
    nodelist['user_id'] = df['user_id_str']
    nodelist['username'] = df['user_screen_name']
    nodelist.set_index('user_id')

    def get_users(user_id, username):
        """Get users from mentions, replies, retweets"""
        reduce = df.dropna(subset=[user_id, username])
        insert_list = []
        for index, row in reduce.iterrows():
            user = {'user_id': row[user_id], 'username': row[username]}
            insert_list.append(user)
        return pd.DataFrame(insert_list, columns=columns)
    
    def get_users_mentions(user_mentions):
        reduce = df.dropna(subset=[user_mentions])
        insert_list = []
        for index, row in reduce.iterrows():
            for item in row[user_mentions]:
                user = {'user_id': item['id'], 'username': item['screen_name']}
                insert_list.append(user)
        return pd.DataFrame(insert_list, columns=columns)
    # insert all users of tweet.user_mentions
    nodelist = nodelist.append(get_users_mentions('user_mentions'), sort=False)

    # insert all users of tweet.replies
    nodelist = nodelist.append(get_users('in_reply_to_user_id_str', 'in_reply_to_screen_name'), sort=False)

    # insert all users of tweet.retweets
    nodelist = nodelist.append(get_users('retweeted_user_id_str', 'retweeted_user_screen_name'), sort=False)

    # drop duplicates
    nodelist.drop_duplicates(subset=columns, inplace=True)
    print(nodelist.info)
    return nodelist

In [63]:
def create_edges(df):

    # Create a DF of directed edges connecting the nodes of the network
    columns = ['source_id', 'target_id']
    edgelist = pd.DataFrame(columns=columns)

    def get_interactions(source_id, target_id):
        """Get interactions from mentions, replies, retweets"""
        reduce = df.dropna(subset=[source_id, target_id])
        insert_list = []
        for index, row in reduce.iterrows():
            interaction = {'source_id': row[source_id], 'target_id': row[target_id]}
            insert_list.append(interaction)
        return pd.DataFrame(insert_list, columns=columns)
    
    def get_users_mentions_interactions(target_id,user_mentions):
        reduce = df.dropna(subset=[target_id, user_mentions])
        insert_list = []
        for index, row in reduce.iterrows():
            for item in row[user_mentions]:
                interaction = {'source_id': item['id'], 'target_id': row[target_id]}
                insert_list.append(interaction)
        return pd.DataFrame(insert_list, columns=columns)
    # insert all user mentions, source=original, target=user mentioning another user
    edgelist = edgelist.append(get_users_mentions_interactions(target_id='user_id_str', user_mentions='user_mentions'))

    # insert all replies, source=original, target=user replying to another user
    edgelist = edgelist.append(get_interactions(source_id='in_reply_to_user_id_str', target_id='user_id_str'))

    # insert all retweets, source=original, target=user retweeting another user
    edgelist = edgelist.append(get_interactions(source_id='retweeted_user_id_str', target_id='user_id_str'))

    # drop duplicates
    edgelist.drop_duplicates(subset=['source_id', 'target_id'], inplace=True)

    # remove self-references (e.g. replies to your own account)
    edgelist = edgelist[edgelist.source_id != edgelist.target_id]

    print(edgelist.info)
    return edgelist

In [64]:
def drop_isolates(nodelist, edgelist):
    # drop users that do not have any retweets, replies, or mentions to keep the graph lighter
    user_edges = edgelist['source_id']
    user_edges = user_edges.append(edgelist['target_id'])
    print(user_edges.shape)
    print(user_edges.drop_duplicates().shape)
    nodelist_filtered = nodelist[nodelist.user_id.isin(list(user_edges))]
    return nodelist_filtered
    

In [65]:
def to_gephi(region, nodelist, edgelist):
    gephi_nodes = nodelist.rename(columns={'user_id': 'Id', 'username': 'Label'})
    gephi_nodes.to_csv('nodelist_{}.csv'.format(region), index=False)
    gephi_edges = edgelist.rename(columns={'source_id': 'Source', 'target_id': 'Target'})
    gephi_edges.to_csv('edgelist_{}.csv'.format(region), index=False)

In [66]:
def run_for_country(region):
    print("Region: {}".format(region))
    df = read_df(region=region)
    nodelist = create_nodes(df)
    edgelist = create_edges(df)
    
    nodelist = drop_isolates(nodelist, edgelist)
    
    to_gephi(region, nodelist, edgelist)
    

In [67]:
if __name__ == '__main__':
    
    run_for_country(region='italy')
    run_for_country(region='germany')

Region: italy
<bound method DataFrame.info of                     user_id         username
0                 342818725    maggioantonio
1                 521546777          NoveseM
2       1035166905424863233    Malvy89611841
3       1096491048862449664  giovannaconfal4
4                 304695298    WolfgangDanso
5        885567061576908800          yxnggao
6                 427424760         byluke69
8                1251089251  salidaparallela
9                2584588979     AuroreItalie
10                 36039763   Brigante_Paolo
11       785129133009494021     thewaterflea
12      1077319297511374849   Simona53313767
13      1126426788635324417    Romav86737841
14                210749911        biche1780
15                 71242539        WheresMAE
16                336724931          ZanRoby
17       912630324206690305        Damare284
18                235918702    diodeglizilla
19      1222548747130982400    Felix43153007
20                209084129  FRANCESC0PAGANO
21       

<bound method DataFrame.info of                  source_id            target_id
0                 74440023   977745564875067392
1                  8720562   977745564875067392
2               3393740861  1027087573406167041
3       939629034962653184            543389122
4      1013349325555011584  1213476249944723456
5      1233735020151496706           2937975091
6      1233735020151496706             19119318
7      1233735020151496706  1156593585455869952
8       974523568703684609           1047236436
10              2603392298              8720562
11                17775446            707696955
12     1223735721254694912  1233714887647539201
13                30417430            304067927
14               293021893            304067927
15      888399491023466497           2435355202
16      939629034962653184           3291882408
17      859101343356772353           3033081447
18                22751175           3033081447
19                15865339  1170345103963152385
20      

In [49]:
df = read_df('italy')

In [50]:
df.shape

(139154, 17)

In [5]:
df.columns

Index(['tweet_id', 'inserted', 'location_query', 'location_id', 'created_at',
       'user_id_str', 'user_screen_name', 'followers_count',
       'user_mention_screen_name', 'user_mention_id_str',
       'in_reply_to_status_id', 'in_reply_to_status_id_str',
       'in_reply_to_user_id_str', 'in_reply_to_screen_name',
       'retweeted_user_id_str', 'retweeted_user_screen_name', 'result_type',
       'text'],
      dtype='object')

In [12]:
df.user_mention_screen_name.head(1)[0]


[{'id': 274093178,
  'id_str': '274093178',
  'indices': [3, 18],
  'name': 'Peter Gomez',
  'screen_name': 'petergomezblog'}]

In [15]:
i = [i for i in df.user_mention if len(i) >1]

In [18]:
print(i[0])

[{'id': 232908749, 'name': 'Andrea F.', 'id_str': '232908749', 'indices': [3, 16], 'screen_name': 'AndFranchini'}, {'id': 395218906, 'name': 'Corriere della Sera', 'id_str': '395218906', 'indices': [95, 104], 'screen_name': 'Corriere'}]


{'id': 232908749, 'name': 'Andrea F.', 'id_str': '232908749', 'indices': [3, 16], 'screen_name': 'AndFranchini'}
{'id': 395218906, 'name': 'Corriere della Sera', 'id_str': '395218906', 'indices': [95, 104], 'screen_name': 'Corriere'}
