# Generating a Network Analysis of Twitter Followers
## Collect Followers Data From Twitter API

In [None]:
import glob
import os

import matplotlib.pyplot as plt
import networkx as nx
import pandas as pd
import requests_cache
import tweepy

requests_cache.install_cache('nw_cache')

In [None]:
# ---------- Read In Data
# Only the 'id' column of each CSV is needed. The paths are handed straight to
# pandas so that it opens and closes the files itself; the former
# open(..., 'rU') left handles open and the 'U' mode no longer exists in
# Python 3.11+.

# load in genuine accounts data
genuine = pd.read_csv('./User dataset/genuine account.csv',
                      encoding='utf-8', usecols=['id'])

# load in fake accounts data: every CSV found in the fake-data folder
path = r'./User dataset/fake data/'
# sorted() makes the concatenation order independent of the file system
allFiles = sorted(glob.glob(os.path.join(path, '*.csv')))
list_ = [pd.read_csv(file_, encoding='utf-8', usecols=['id'])
         for file_ in allFiles]
# pd.concat raises on an empty list, so fall back to an empty frame that
# still carries the 'id' column when no fake-account file is present
fake = (pd.concat(list_, ignore_index=True) if list_
        else pd.DataFrame(columns=['id']))

# merge all the data (ignore_index gives every row a unique label)
all_ids = pd.concat([genuine, fake], ignore_index=True)

# test set: 10 random accounts, or all of them when fewer are available
test_ids = all_ids.sample(n=min(10, len(all_ids)))

In [None]:
# ---------- Twitter API setup
# Credentials are read from environment variables so that no secret is stored
# in the notebook or in version control.
# NOTE(review): the keys that used to be hard-coded here have been exposed and
# should be revoked and regenerated in the Twitter developer console.
_CREDENTIAL_VARS = ('TWITTER_CONSUMER_KEY', 'TWITTER_CONSUMER_SECRET',
                    'TWITTER_ACCESS_TOKEN', 'TWITTER_ACCESS_TOKEN_SECRET')
_missing_vars = [name for name in _CREDENTIAL_VARS if not os.environ.get(name)]
if _missing_vars:
    # Fail early with a clear message instead of an opaque auth error later
    raise RuntimeError('Missing Twitter API credentials; set the environment '
                       'variable(s): ' + ', '.join(_missing_vars))

CONSUMER_KEY = os.environ['TWITTER_CONSUMER_KEY']
CONSUMER_SECRET = os.environ['TWITTER_CONSUMER_SECRET']
ACCESS_TOKEN = os.environ['TWITTER_ACCESS_TOKEN']
ACCESS_TOKEN_SECRET = os.environ['TWITTER_ACCESS_TOKEN_SECRET']

auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)

# wait_on_rate_limit=True: when Twitter answers with its rate-limit error
# (HTTP 429), wait the needed time and resume the task instead of raising
api = tweepy.API(auth, wait_on_rate_limit=True)

In [None]:
# ---------- Set up Edges of Network
def network_edges(G, user_id, filename):
    """Add one account's follower/following edges to ``G`` and save the graph.

    Edges point from the follower to the followed account: ``follower ->
    user_id`` for every id returned by ``api.followers_ids`` and
    ``user_id -> following`` for every id returned by ``api.friends_ids``.
    Each edge's ``weight`` is the ``followers_count`` of the account at the
    other end of the edge from ``user_id``.

    Args:
        G: graph to extend; it is modified in place.
        user_id: Twitter id of the account whose connections are fetched.
        filename: path of the GEXF file; it is rewritten on every call.
    """
    def fetch_ids(endpoint):
        # The id lookup itself fails when the root account is protected,
        # suspended or deleted; treat that as "no visible connections" so a
        # single bad account does not abort the whole crawl.
        try:
            return endpoint(id=user_id)
        except tweepy.TweepError:
            return []

    counts = {}

    def followers_count(account_id):
        # An account can be both follower and following; remember the answer
        # so the rate-limited user lookup is issued once per account.
        if account_id not in counts:
            try:
                counts[account_id] = api.get_user(id=account_id).followers_count
            except tweepy.TweepError:  # the account may be suspended or deleted
                counts[account_id] = None
        return counts[account_id]

    followers = fetch_ids(api.followers_ids)
    followings = fetch_ids(api.friends_ids)

    # the speed of both loops is limited by the Twitter API
    for follower in followers:
        weight = followers_count(follower)
        if weight is not None:
            G.add_edge(follower, user_id, weight=weight)

    for following in followings:
        weight = followers_count(following)
        if weight is not None:
            G.add_edge(user_id, following, weight=weight)

    nx.write_gexf(G, filename, encoding='utf-8', prettyprint=True)

In [None]:
# ---------- Build Networks of Relationships
# Directed graph: an edge u -> v means "u follows v".
twitter_sphere = nx.DiGraph()

# The loop variable is deliberately not called `id`: at module level that
# would rebind the builtin id() for the rest of the session.
for account_id in test_ids['id']:
    # each call also rewrites 'test.gexf' with the graph built so far
    network_edges(twitter_sphere, account_id, 'test.gexf')

### Local Clustering Coefficient    
The local clustering coefficient of a vertex in a graph quantifies how close its neighbours are to being a complete graph. It is the number of links between the vertices within its neighbourhood divided by the number of links that could possibly exist between them.    
For a graph $G=(V,E)$ consisting of a set of vertices $V$ and a set of edges $E$, the neighbourhood $N_i$ for a vertex $v_i$ is defined as its immediately connected neighbours as follows:
$$N_i=\{v_j:e_{ij}\in E \vee e_{ji}\in E\}$$
The local clustering coefficient for directed graphs is given as
$$C_i=\frac{|\{e_{jk}:v_j, v_k\in N_i,\ e_{jk}\in E\}|}{k_i(k_i-1)}$$
where $k_i$ is the number of neighbours of a vertex $v_i$.

In [None]:
# ---------- Local Clustering Coefficients
def local_clustering_coef(G, node):
    """Return the local clustering coefficient of ``node`` in ``G``.

    The neighbourhood of ``node`` is every vertex linked to it in either
    direction. The coefficient is the number of edges between distinct
    neighbours divided by ``k * (k - 1)``, where ``k`` is the neighbourhood
    size.

    Args:
        G: graph to inspect (directed or undirected).
        node: vertex whose coefficient is computed.

    Returns:
        A float in ``[0, 1]``; ``0.0`` when the vertex has fewer than two
        neighbours, because no link between neighbours is possible then.
    """
    if G.is_directed():
        # In a directed graph the neighbourhood covers both incoming and
        # outgoing links, not only the successors
        nbs = set(G.predecessors(node)) | set(G.successors(node))
    else:
        nbs = set(G.neighbors(node))
    nbs.discard(node)  # a self-loop does not make a vertex its own neighbour

    k = len(nbs)
    if k < 2:
        return 0.0

    # Ordered pairs are counted, which matches the k * (k - 1) denominator
    links = sum(1 for v in nbs for u in nbs if v != u and G.has_edge(v, u))
    return links / float(k * (k - 1))

In [None]:
# ---------- Bi-directional links ratio
def bi_dir_links_r(G, node):
    """Return the share of ``node``'s outgoing links that are reciprocated.

    Args:
        G: graph to inspect; for a networkx ``DiGraph``, ``G.neighbors(node)``
            yields the successors, i.e. the accounts ``node`` follows.
        node: vertex whose ratio is computed.

    Returns:
        ``mutual links / outgoing links`` as a float, or ``0.0`` when the
        vertex has no outgoing link (the ratio is undefined in that case).
    """
    followings = list(G.neighbors(node))
    if not followings:
        return 0.0
    # A link is bi-directional when the followed account follows back
    mutual = sum(1 for nb in followings if G.has_edge(nb, node))
    return mutual / float(len(followings))

In [None]:
# ---------- Analysis
# Compute the per-account metrics for the sampled ids and gather them in one
# table keyed by 'id'.

# Only ids that actually became nodes can be measured: an account for which
# no edge could be added never enters the graph, and indexing the degree
# views with it would raise KeyError.
sampled_ids = [account_id for account_id in test_ids['id'].tolist()
               if account_id in twitter_sphere]

in_degree_ = twitter_sphere.in_degree()
in_degree = {account_id: in_degree_[account_id] for account_id in sampled_ids}

out_degree_ = twitter_sphere.out_degree()
out_degree = {account_id: out_degree_[account_id] for account_id in sampled_ids}

local_c = {account_id: local_clustering_coef(twitter_sphere, account_id)
           for account_id in sampled_ids}

bi_r = {account_id: bi_dir_links_r(twitter_sphere, account_id)
        for account_id in sampled_ids}


def _metric_frame(values, column):
    # Turn an {id: value} dict into a two-column frame ('id', column) so the
    # merges below have a real 'id' column to join on.
    return pd.Series(values, name=column).rename_axis('id').reset_index()


df1 = _metric_frame(in_degree, 'in_degree')
df2 = _metric_frame(out_degree, 'out_degree')
df3 = _metric_frame(local_c, 'local_clustering')
df4 = _metric_frame(bi_r, 'bi_dir_ratio')

# Inner joins: sampled accounts that are not in the graph are left out
final = (df1.merge(df2, on='id')
            .merge(df3, on='id')
            .merge(df4, on='id')
            .merge(test_ids, on='id'))