#### **This notebook checks for the coordination between two users**

In [3]:
import pandas as pd
import numpy as np
import os
import importlib

#### **Load the files**

In [1]:
io_path = '/N/project/INCAS/new_parse/io/cuba_082020_tweets.pkl.gz'
control_path = '/N/project/INCAS/new_parse/control/cuba_082020_tweets_control.pkl.gz'

In [2]:
import coordinationz.cohashtag_helper as cohp

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
importlib.reload(cohp)

df = cohp.load_file(io_path, control_path)

Total control users : 30099
Total io users : 446
Total control data:  1353088
Total IO data:  250367


#### **Check overlap**

In [5]:
df_1 = df.loc[df['label'] == 1]
df_0 = df.loc[df['label'] == 0]

In [6]:
df.columns

Index(['tweetid', 'tweet_text', 'in_reply_to_tweetid', 'in_reply_to_userid',
       'created_at', 'tweet_client_name', 'tweet_language', 'hashtags',
       'mentions', 'urls', 'is_retweet', 'retweet_tweetid', 'retweet_userid',
       'userid', 'user_profile_image_url', 'user_screen_name',
       'account_creation_date', 'user_verified', 'user_protected',
       'user_profile_description', 'user_profile_entites', 'user_profile_url',
       'follower_count', 'following_count', 'user_profile_status_count',
       'user_profile_listed_count', 'user_profile_favourites_count',
       'user_reported_geo', 'user_reported_coordinates',
       'user_reported_location', 'label', 'user_display_name',
       'account_language', 'tweet_time', 'quoted_tweet_tweetid', 'latitude',
       'longitude', 'quote_count', 'reply_count', 'like_count',
       'retweet_count', 'user_mentions'],
      dtype='object')

In [5]:
df_io['userid'].unique()[:5]

array(['1124052498661150720', '1198810114091814912',
       '1221180950580731910', '3415413674', '1189569550540558336'],
      dtype=object)

In [11]:
df['retweet_userid'].nunique()

187460

In [10]:
len(set(df_1['retweet_userid']).intersection(
    set(df_0['retweet_userid']))
   )

2096

In [16]:
df_io.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 250367 entries, 8433 to 4802241
Data columns (total 31 columns):
 #   Column                    Non-Null Count   Dtype         
---  ------                    --------------   -----         
 0   tweetid                   250367 non-null  object        
 1   userid                    250367 non-null  object        
 2   user_display_name         250367 non-null  object        
 3   user_screen_name          250367 non-null  object        
 4   user_reported_location    160704 non-null  object        
 5   user_profile_description  239648 non-null  object        
 6   user_profile_url          25189 non-null   object        
 7   follower_count            250367 non-null  int64         
 8   following_count           250367 non-null  int64         
 9   account_creation_date     250367 non-null  object        
 10  account_language          250367 non-null  object        
 11  tweet_language            250367 non-null  object        
 12

#### **Functions**

In [31]:
def get_hashtag(df):
    '''
    Gets the hashtag
    '''
    import ast

    df_hashtag = df.loc[
        (~df['hashtags'].isnull()) & (df['hashtags'] != '[]')
    ]

    df_hashtag['list_hashtag'] = df_hashtag['hashtags'].apply(
        lambda x: ast.literal_eval(x)
    )
    
    return df_hashtag


def check_activity_correlation(df_user1, 
                               df_user2,
                               sample
                              ):
    df_user1['tweet_time_year'] = df_user1['tweet_time'].map(
        lambda x: x.strftime('%Y-%m-%d')
    )
    df_user2['tweet_time_year'] = df_user2['tweet_time'].map(
        lambda x: x.strftime('%Y-%m-%d')
    )

    df_user1_grp = (df_user1
                    .groupby(['tweet_time_year'])['tweetid']
                    .nunique()
                    .to_frame('count')
                    .reset_index()
                   )
    df_user2_grp = (df_user2
                    .groupby(['tweet_time_year'])['tweetid']
                    .nunique()
                    .to_frame('count')
                    .reset_index()
                   )
    
    print('*** Correlation of activity *** \n')
    from scipy import stats
    
    x = np.array(df_user1_grp['count'].tolist())
    y = np.array(df_user2_grp['count'].tolist())
    total_x = len(x)
    total_y = len(y)

    if total_x > total_y:
        #y is lower than x
        min_val = total_y
        min_x = x
        min_y = y
    else:
        #x is lower than y
        min_val = total_x
        min_x = y
        min_y = x
    
    all_corr = []
    for i in range(sample):
        x_sample = np.random.choice(min_x,
                                    min_val, 
                                    replace=False
                                   )
        res = stats.spearmanr(x_sample, min_y)
        all_corr.append(res.statistic)
    
    print('Mean Spearman correlation of daily activity:', np.mean(np.array(all_corr)))

    
def check_hashtag_overlap(df_user1, df_user2):
    print('*** Hashtag overlap *** \n')
    
    df_user1 = get_hashtag(df_user1) 
    df_user2 = get_hashtag(df_user2)
    
    df_explode_1 = df_user1.explode('list_hashtag')
    df_explode_2 = df_user2.explode('list_hashtag')
    
    overlap = set(df_explode_1['list_hashtag']).intersection(
        df_explode_2['list_hashtag']
    )
    total = set(df_explode_1['list_hashtag']).union(
        df_explode_2['list_hashtag']
    )
    print('Total overlap :', len(overlap), ' out of total ', len(total))
    print('Jaccard of hashtags: ', round(len(overlap)/len(total), 2))
    
    
def check_retweeted_user_overlap(df_user1, df_user2):
    print('*** Retweeted Userid overlap *** \n')
    
    overlap = set(df_user1['retweet_userid']).intersection(
        df_user2['retweet_userid']
    )
    total = set(df_user1['retweet_userid']).union(
        df_user2['retweet_userid']
    )
    user1 = set(df_user1['retweet_userid'])
    user2 = set(df_user2['retweet_userid'])
    userid1 = df_user1['userid'].unique()[0]
    userid2 = df_user2['userid'].unique()[0]
    
    print(f'User {userid1} has total ', len(user1), ' retweeted userid.')
    print(f'User {userid2} has total ', len(user2), ' retweeted userid.')
    
    print('Total overlap :', len(overlap), ' out of total ', len(total))
    print('Jaccard of retweeted userid: ', round(len(overlap)/len(total), 2))
    
def check_two_users(userid_1, 
                    userid_2, 
                    df, 
                    indicator=None,
                    sample=10
                   ):
    if userid_1 not in df['userid'].tolist():
        print(f'User {userid_1} Not found')
        
        return
        
    if userid_2 not in df['userid'].tolist():
        print(f'User {userid_2} Not found')
        
        return
    
    df['tweet_time'] = pd.to_datetime(df['tweet_time'])
    df_user1 = df.loc[df['userid'] == userid_1]
    df_user2 = df.loc[df['userid'] == userid_2]
    
    check_activity_correlation(df_user1, df_user2, sample=sample)
    check_hashtag_overlap(df_user1, df_user2)
    check_retweeted_user_overlap(df_user1, df_user2)
    
    
    
check_two_users('1124052498661150720', 
                '1198810114091814912', 
                df_io, 
                indicator=None
               )

*** Correlation of activity *** 

Mean Spearman correlation of daily activity: -0.0160888395658024
*** Hashtag overlap *** 

Total overlap : 64  out of total  778
Jaccard of hashtags:  0.08
*** Retweeted Userid overlap *** 

User 1124052498661150720 has total  725  retweeted userid.
User 1198810114091814912 has total  132  retweeted userid.
Total overlap : 55  out of total  802
Jaccard of retweeted userid:  0.07


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_user1['tweet_time_year'] = df_user1['tweet_time'].map(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_user2['tweet_time_year'] = df_user2['tweet_time'].map(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_hashtag['list_hashtag'] = df_hashtag['hashtags'].apply(
A value is trying to be set on