#### **This notebook tests the evaluation code**

In [1]:
import pandas as pd

import coordinationz.cohashtag_helper as cohp

import importlib

  from .autonotebook import tqdm as notebook_tqdm


#### **Load IO data**

In [2]:
# Re-import the helper module so that edits made to it on disk are picked up
# without restarting the kernel.
importlib.reload(cohp)

# Locations of the IO and control tweet pickles; both are handed to
# cohp.load_file further down.
# NOTE(review): absolute cluster paths — the notebook only runs where
# /N/project/INCAS is mounted.
io_path = '/N/project/INCAS/new_parse/io/cuba_082020_tweets.pkl.gz'
control_path = '/N/project/INCAS/new_parse/control/cuba_082020_tweets_control.pkl.gz'

In [3]:
# Re-import the helper module again before using it.
importlib.reload(cohp)

# Load the IO and control tweets. The result is used below as a dataframe
# with (among others) 'userid', 'label' and 'tweetid' columns.
# NOTE(review): the exact schema is decided inside cohp.load_file, which is
# not visible in this notebook.
df = cohp.load_file(io_path, control_path)

Total control users : 30099
Total io users : 446
Total control data:  1353088
Total IO data:  250367


In [4]:
df.columns

Index(['tweetid', 'tweet_text', 'in_reply_to_tweetid', 'in_reply_to_userid',
       'created_at', 'tweet_client_name', 'tweet_language', 'hashtags',
       'mentions', 'urls', 'is_retweet', 'retweet_tweetid', 'retweet_userid',
       'userid', 'user_profile_image_url', 'user_screen_name',
       'account_creation_date', 'user_verified', 'user_protected',
       'user_profile_description', 'user_profile_entites', 'user_profile_url',
       'follower_count', 'following_count', 'user_profile_status_count',
       'user_profile_listed_count', 'user_profile_favourites_count',
       'user_reported_geo', 'user_reported_coordinates',
       'user_reported_location', 'label', 'user_display_name',
       'account_language', 'tweet_time', 'quoted_tweet_tweetid', 'latitude',
       'longitude', 'quote_count', 'reply_count', 'like_count',
       'retweet_count', 'user_mentions'],
      dtype='object')

In [5]:
# Pickled edge table of the network; later cells read its 'source' and
# 'target' columns.
filename='./data/cuba_network_all_attributes_with_node_label.pkl.gz'
# NOTE(review): unpickling can execute arbitrary code — only load files from
# a trusted source.
df_attr = pd.read_pickle(filename)

#### **Get label data**

In [6]:
# Number of distinct tweets per (userid, label) pair, flattened into a frame
# with the columns 'userid', 'label' and 'count'.
df_grp = (
    df.groupby(['userid', 'label'])['tweetid']
      .nunique()
      .reset_index(name='count')
)

In [14]:
df_attr.columns

Index(['weight', 'pvalue', 'zscore', 'source', 'target', 'userid_x', 'label_x',
       'userid_y', 'label_y'],
      dtype='object')

In [16]:
len(df_attr)

45454

In [15]:
# Every node id that occurs at either end of an edge.
total_x = set(df_attr['source']) | set(df_attr['target'])
print(len(total_x))

2266


In [21]:
def add_labels(graph_attr_filename, io_path, control_path):
    '''
    Attach the user label to both endpoints of every edge of a network file.

    Args:
        graph_attr_filename: path of the pickled edge table. It must contain
            the columns 'weight', 'pvalue', 'zscore', 'source' and 'target';
            any other column (including stale label columns) is ignored.
        io_path: path of the IO tweet file, forwarded to cohp.load_file.
        control_path: path of the control tweet file, forwarded to
            cohp.load_file.

    Returns:
        DataFrame with the columns 'weight', 'pvalue', 'zscore', 'source',
        'target', 'source_label' and 'target_label'. Labels are joined with a
        left merge, so an endpoint that has no tweets in the loaded data gets
        NaN. A userid that occurs with more than one label duplicates its
        edges, once per label.
    '''
    df = cohp.load_file(io_path, control_path)
    # Read the file that was passed in, not the notebook-level `filename`.
    # NOTE(review): unpickling can execute arbitrary code — only load trusted
    # files.
    df_attr = pd.read_pickle(graph_attr_filename)

    # Distinct (userid, label) pairs. Rows with a missing userid or label are
    # left out, which is what grouping on those two columns did as well.
    user_labels = (df[['userid', 'label']]
                   .dropna()
                   .drop_duplicates()
                  )

    # Start from the plain edge attributes so that label columns already
    # stored in the pickle cannot clash with the ones added below.
    df_need = df_attr[['weight', 'pvalue',
                       'zscore', 'source',
                       'target'
                      ]]

    # Name the joined column explicitly per endpoint instead of relying on
    # the implicit '_x' / '_y' suffixes of two consecutive merges.
    for endpoint in ('source', 'target'):
        lookup = user_labels.rename(columns={
            'userid': endpoint,
            'label': endpoint + '_label'
        })
        df_need = df_need.merge(lookup, on=endpoint, how='left')

    return df_need

graph_attr_filename='./data/cuba_network_all_attributes_with_node_label.pkl.gz'

# add_labels takes the IO and control *paths* (it loads the tweets itself),
# not the already-loaded dataframe.
df_need = add_labels(graph_attr_filename,
                     io_path,
                     control_path
                    )

In [25]:
# Label values present on the source side, then on the target side.
print(df_need['source_label'].unique(),
      df_need['target_label'].unique(),
      sep='\n')

[0 1]
[0 1]


In [29]:
# Node ids that carry label 1 as the source / as the target of an edge.
sources = set(df_need.loc[df_need['source_label'] == 1, 'source'])
targets = set(df_need.loc[df_need['target_label'] == 1, 'target'])

print(sources, targets, sep='\n')

{'1114684484698497029', '1120764379610456070', '1117864200712413184', '1136304899761815553'}
{'1120764379610456070', '1117864200712413184', '1144701211238645760', '1204492919673180160'}


In [None]:
import pandas as pd
import numpy as np

from sklearn.metrics import f1_score,precision_score,recall_score
from sklearn.model_selection import StratifiedKFold

def evaluation_metrics(labels_map,n_splits=10):
    """
    Cross-validated evaluation of an eigen-centrality threshold classifier.

    For every stratified fold, the cutoff is chosen among the odd percentiles
    (1, 3, ..., 99) of the training centralities as the one that maximises F1
    on the training split. Rows of the test split whose centrality is >= that
    cutoff are predicted as 1, all others as 0.

    Args:
        labels_map: dataframe with at least the columns 'label' (0 or 1) and
            'eigen_centrality'. Other columns, e.g. 'userid', are not used.
        n_splits: number of stratified folds (default 10).
    Returns:
        dict with the keys 'f1_score', 'recall' and 'precision', each averaged
        over the n_splits folds.
    """

    skf = StratifiedKFold(n_splits=n_splits)
    centrality = labels_map['eigen_centrality']
    y = labels_map['label']

    percentiles = list(range(1, 100, 2))
    metrics = {"f1_score":0,"recall":0,"precision":0}

    # The splitter only needs the number of rows and the labels, so the frame
    # itself is passed as X.
    for train_index, test_index in skf.split(labels_map, y):
        cent_train, cent_test = centrality.iloc[train_index], centrality.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        centrality_ranges = np.percentile(cent_train.values, percentiles)
        max_centrality_range = -1
        max_f1_score = -1

        for cent in centrality_ranges:
            # Tune on the training split only: predictions come from the
            # training centralities and are scored against the training labels.
            y_train_preds = (cent_train >= cent).astype(int)
            train_f1 = f1_score(y_train, y_train_preds)
            # Strict comparison: on ties the lowest percentile wins.
            if max_f1_score < train_f1:
                max_f1_score = train_f1
                max_centrality_range = cent

        y_test_preds = (cent_test >= max_centrality_range).astype(int)

        metrics['f1_score']+=f1_score(y_test,y_test_preds)
        metrics['recall']+=recall_score(y_test,y_test_preds)
        metrics['precision']+=precision_score(y_test,y_test_preds)

    for name in metrics:
        metrics[name]/=float(n_splits)

    return metrics