In [1]:
import pandas as pd
import numpy as np
import pickle as pkl
import os

In [2]:
user_list = pkl.load(open('../data_processed/4557_user_list.pkl', 'rb'))

In [3]:
user_list

['1103',
 '8273',
 '9269',
 '10571',
 '12496',
 '33493',
 '61633',
 '69153',
 '75533',
 '297173',
 '589903',
 '632403',
 '649873',
 '696053',
 '705883',
 '722533',
 '725093',
 '728403',
 '755768',
 '784290',
 '790135',
 '793059',
 '804992',
 '811087',
 '817524',
 '849201',
 '981381',
 '1067421',
 '1081281',
 '1174481',
 '1239931',
 '1330151',
 '1520501',
 '1534061',
 '1653771',
 '1683911',
 '1797291',
 '1961791',
 '2058511',
 '2316461',
 '2633711',
 '2642921',
 '2690951',
 '2700371',
 '4387001',
 '4899581',
 '5340042',
 '5374252',
 '5450132',
 '5468172',
 '5510042',
 '5522792',
 '5579952',
 '5730592',
 '5750262',
 '5779142',
 '5920812',
 '6125082',
 '6153722',
 '6292792',
 '6334632',
 '6409922',
 '6411892',
 '6426432',
 '6514082',
 '6690382',
 '6698022',
 '6758372',
 '7057692',
 '7059882',
 '7062302',
 '7074962',
 '7118512',
 '7138852',
 '7255642',
 '7517462',
 '7546872',
 '7563542',
 '7725042',
 '7740422',
 '7898902',
 '8069672',
 '8084442',
 '8106562',
 '8251772',
 '8292882',
 '83377

In [4]:
def load_files(folder_name, lines_keep=None):
    """return dict of dataframes with filename being the key, filter each user #records to line_retained"""
    info_dict = {}
    for file_name in os.listdir(folder_name):
        file_path = folder_name + '/' + file_name
        with open(file_path, 'rb') as f:
            df = pd.read_csv(f, dtype={'id': str})
        
        seed_user = file_name.split('.')[0]
        df['main_id'] = seed_user
        
        if lines_keep is not None and df.shape[0] > lines_keep:
            print("{} records kept".format(lines_keep))
            df = df.iloc[: lines_keep]

        info_dict[seed_user] = df
    return info_dict


def filter_users(df, user_list):
    follower_retain_mask = df.isin({'main_id': user_list}).main_id
    df = df.loc[follower_retain_mask]
    print("number of follow retained: ", df.groupby('main_id').count().shape[0])
    print("df_follow: ", df.shape)
    return df

## Get 2 csv for full followers/followings respectively

In [5]:
follower_dict = load_files('../data_crawled/followers')
following_dict = load_files('../data_crawled/followings')

In [6]:
follower_dict.get('1103')

Unnamed: 0,id,name,username,bio,location,url,join_date,join_time,tweets,following,followers,likes,media,private,verified,avatar,main_id
0,991748163584307201,ARA,Arablocks,Any piece of digital content in the world on a...,"New York, USA",http://Ara.one/app,2 May 2018,11:36 AM,345,5002,2663,613,158,False,False,https://pbs.twimg.com/profile_images/100149916...,1103
1,1849185284,Fí,AnnieAronburg,hæ,,https://pastebin.com/raw/jrZ2c3yU,9 Sep 2013,12:23 PM,99,1161,154,332,6,False,False,https://pbs.twimg.com/profile_images/107408334...,1103
2,843648271058849792,T,TollyPowell,,,,19 Mar 2017,7:19 PM,1,609,21,17,0,False,False,https://pbs.twimg.com/profile_images/882826110...,1103
3,1038180333110345729,Christine Deloitte,ChristineDeloi1,This is Christine ;),,,7 Sep 2018,2:40 PM,393,2882,1281,2866,0,False,False,https://pbs.twimg.com/profile_images/105887773...,1103
4,979888252449644544,Felipe 🌺,crochi,Computer Science undergraduate @ ICMC - Univer...,,http://github.com/felipecustodio,30 Mar 2018,6:08 PM,369,298,44,1529,84,False,False,https://pbs.twimg.com/profile_images/980253499...,1103
5,960200274856980480,Shivani,Shibsonsecurity,,,,4 Feb 2018,9:15 AM,4,63,2,16,0,False,False,https://pbs.twimg.com/profile_images/104433377...,1103
6,1054673164637294592,Carrie Thompson,CarrieRThompson,Sales Development Representative at Neustar (U...,,http://www.security.neustar,23 Oct 2018,2:57 AM,14,58,13,5,0,False,False,https://pbs.twimg.com/profile_images/105482653...,1103
7,540998745,Alignable,Alignable,The #smallbusiness network,,http://www.alignable.com,30 Mar 2012,8:37 AM,7618,12625,13831,3275,867,False,False,https://pbs.twimg.com/profile_images/831617793...,1103
8,1041001859366961153,GΞø §†IG(\/)Λ (▀̿Ĺ̯▀̿ ̿),ge0st1gma,"Greyhat, Pentester, RE, Malwr Researcher, Dark...",The World,,15 Sep 2018,9:32 AM,51,600,69,183,0,False,False,https://pbs.twimg.com/profile_images/104624283...,1103
9,11986272,マサ@SORAMAME5,delphinz,ひたすら楽しく生きていきたい。 美味い魚とビールがあれば満足この上なし。 迷ったらやる！それ...,逗子海岸,https://minih.wasforum.jp,8 Jan 2008,6:25 AM,6430,1741,663,1179,722,False,False,https://pbs.twimg.com/profile_images/141739960...,1103


In [7]:
df_followers = pd.concat(follower_dict.values(), axis=0).astype({'id': str, 'main_id': str})
df_followings = pd.concat(following_dict.values(), axis=0).astype({'id': str, 'main_id': str})

In [8]:
df_followers = filter_users(df_followers, user_list)
df_followings = filter_users(df_followings, user_list)

number of follow retained:  4557
df_followers:  (6063719, 17)
number of follow retained:  4557
df_followers:  (3655212, 17)


In [9]:
# df_followers.to_csv('../data_processed/df_followers_all.csv', index=False)
# df_followings.to_csv('../data_processed/df_followings_all.csv', index=False)

In [10]:
df_edges_all = pd.concat((df_followers[['id', 'main_id']], df_followings[['id', 'main_id']]), axis=0)

In [11]:
df_edges_all.shape

(9718931, 2)

In [12]:
df_edges_all.drop_duplicates(inplace=True)
df_edges_all.shape

(8026379, 2)

In [13]:
df_edges_all.head()

Unnamed: 0,id,main_id
0,1062779994395627521,259973352
1,746626802580361216,259973352
2,960790173041741824,259973352
3,759881206745923584,259973352
4,702415767187427328,259973352


In [14]:
df_edges_all.rename({'id': 'follow', 'main_id': 'seed'}, axis=1, inplace=True)
# df_edges_all.to_csv('../data_processed/df_edges_all_unweight.csv', index=False)

In [16]:
df_edges_all.head()

Unnamed: 0,follow,seed
0,1062779994395627521,259973352
1,746626802580361216,259973352
2,960790173041741824,259973352
3,759881206745923584,259973352
4,702415767187427328,259973352


## filtering edges

In [17]:
df_follow_edges = df_edges_all
filter_degree=10

In [30]:
print("filtering edges with less than {} connections to seeds...".format(filter_degree))
# count the number followed/following seed users
df_follow_ctseed = df_follow_edges.groupby('follow').count()
df_follow_ctseed.rename({'seed': 'count'}, axis='columns', inplace=True)
df_follow_ctseed['follow'] = df_follow_ctseed.index
df_follow_ctseed.sort_values(by='count', ascending=False, inplace=True)

# get the follow ids that will be retained
mask = df_follow_ctseed['count'] > filter_degree
id_list = df_follow_ctseed.loc[mask].index
print(id_list)
print(type(id_list))
# if the follow id is also Main id, keep it 
id_list = np.concatenate((id_list, user_list))

# filtering
mask_retained_follow = df_follow_edges.isin({'follow': id_list}).follow
df_follow_edges_retained = df_follow_edges.loc[mask_retained_follow]
print("number of retained edges filtered by 10 connections: {}".format(df_follow_edges_retained.shape))

# however some seed nodes are lost during filtering. 
original_uids = set(list(df_follow_edges.seed))
remain_uids = set(list(df_follow_edges_retained.seed))
lost_uids = original_uids.difference(remain_uids)
print("{} seed nodes are lost during filtering".format(len(lost_uids)))

# get the edges connected to these nodes
mask_edge_lost_seed = df_follow_edges.isin({'seed': list(lost_uids)}).seed
df_edges_lost_seed = df_follow_edges.loc[mask_edge_lost_seed]
print('{} edges are connected to seed users'.format(df_edges_lost_seed.shape[0]))


filtering edges with less than 10 connections to seeds...
Index(['1536791610', '44196397', '19397785', '822215679726100480', '16303106',
       '409486555', '19725644', '6017542', '14824849', '62513246',
       ...
       '2573006412', '747220393', '115992980', '22813321', '1605428244',
       '366690534', '25677631', '1570359672', '591948867', '2277912144'],
      dtype='object', name='follow', length=29505)
<class 'pandas.core.indexes.base.Index'>
number of retained edges: (584301, 2)
29 seed nodes are lost during filtering
23892 edges are connected to seed users


In [31]:
# from the edges for lost seed users, restore those with follow user with 1 more connections to seed
back_follow_id = df_edges_lost_seed['follow'].unique()
mask_back_follow = df_follow_ctseed.isin({'follow': list(back_follow_id)}).follow
df_back_follow_degree = df_follow_ctseed.loc[mask_back_follow]

df_follow_connected = df_back_follow_degree[df_back_follow_degree['count']>1]
follow_connected = df_follow_connected.follow
print("{} follower/followings connected to lost seed users have more than one connections to seed users".format(len(follow_connected)))

750 follower/followings connected to lost seed users have more than one connections to seed users


In [32]:
# Get the edges connected to the 'bringing-back' follow
mask_edges_back = df_follow_edges.isin({'follow': list(follow_connected)}).follow
df_edges_back = df_follow_edges.loc[mask_edges_back]
print("bringing back {} edges connected to the {} above follower/followings".format(df_edges_back.shape[0], len(follow_connected)))

bringing back 2002 edges connected to the 750 above follower/followings


In [33]:
# concat retained edges and retrieved back edges
df_follow_edges_retained = pd.concat([df_follow_edges_retained, df_edges_back], axis=0)
print("total number of edges in the filtered graph {}".format(df_follow_edges_retained.shape[0]))

total number of edges in the filtered graph 586303


In [34]:
df_follow_edges_retained.to_csv('../data_processed/df_filtered_edges_unweighted.csv', index=False)