# Group study - Retweet classification

## Import

In [1]:
import pandas as pd
import numpy as np
import plotly.express as px

import importlib
import sys
sys.path.insert(0, '../5.0_Group_study(todo)')
import group_util as ut
sys.path.insert(0, '..')
import general_utils as gen_ut

## Dataset and general info preparation

In [2]:
cols_study = ['user_screen_name','user_mentions','rt_user_screen_name','in_reply_to_screen_name', 'user_url_cred',
             'rt_created_at','quoted_status_id','in_reply_to_user_id','in_reply_to_screen_name','rt_user_screen_name']
base_path = '../'

In [3]:
df = pd.read_csv('../tweets.csv',low_memory=False,
                 usecols=cols_study)

In [4]:
data = pd.read_csv('../1_Dataset_preparation/listControlledUsers.csv')  
novaxUsers = data[data['malicious']]
provaxUsers = data[data['benevolent']]

dfNovax = df[np.in1d(df['user_screen_name'], novaxUsers)].copy()
dfProvax = df[np.in1d(df['user_screen_name'], provaxUsers)].copy()

In [5]:
numTotalUsers = len(df.groupby('user_screen_name').count())
numTotalPosts = len(df)

In [6]:
tweets,retweet,reply,quotes = gen_ut.separate_post_type(df)  

numTotalRealTweet = len(tweets)

## Dataframe about information populate

In [7]:
dfGeneralInfo = pd.DataFrame()
dfOriginalTweetInfo = pd.DataFrame()
dfRetweetInfo = pd.DataFrame()
dfReplyInfo = pd.DataFrame()
dfQuotesInfo = pd.DataFrame()

In [8]:
def dfInfoPopulate(index,df):
    tweets,retweet,reply,quotes = gen_ut.separate_post_type(df)  
    
    dfGeneralInfo.loc[index,'user'] = len(df.groupby('user_screen_name').count())
    dfGeneralInfo.loc[index,'user %'] = (dfGeneralInfo.loc[index,'user'] / numTotalUsers) * 100

    dfGeneralInfo.loc[index,'post'] = len(df)
    dfGeneralInfo.loc[index,'post %'] = (dfGeneralInfo.loc[index,'post'] / numTotalPosts) * 100
    
    #Original tweet
    dfOriginalTweetInfo.loc[index,'original_tweets'] = len(tweets.groupby('user_screen_name').count())
    dfOriginalTweetInfo.loc[index,'original_tweets %'] = (dfOriginalTweetInfo.loc[index,'original_tweets']
                                                          / numTotalRealTweet) * 100

    #Retweet
    dfRetweetInfo.loc[index,'retweets'] = len(retweet)
    rt_novax = len(retweet[np.in1d(retweet['rt_user_screen_name'], novaxUsers)])
    rt_high = len(retweet[np.in1d(retweet['rt_user_screen_name'], provaxUsers)])
    #rt_other = len(retweet[retweet['rt_user_screen_name'].isin(otherUser)])
    
    dfRetweetInfo.loc[index,'retweets (novax)'] = rt_novax
    dfRetweetInfo.loc[index,'retweets %(novax)'] = (rt_novax / dfRetweetInfo.loc[index,'retweets']) * 100
    
    dfRetweetInfo.loc[index,'retweets (high credibility)'] = rt_high
    dfRetweetInfo.loc[index,'retweets %(high credibility)'] = (rt_high / dfRetweetInfo.loc[index,'retweets']) * 100
    
    #dfRetweetInfo.loc[index,'retweets (other users)'] = rt_other
    #dfRetweetInfo.loc[index,'retweets %(other users)'] = (rt_other / dfRetweetInfo.loc[index,'retweets']) * 100
    
    
    #Reply
    dfReplyInfo.loc[index,'reply'] = len(reply)
    rp_novax = len(reply[np.in1d(reply['in_reply_to_screen_name'], novaxUsers)])
    rp_high = len(reply[np.in1d(reply['in_reply_to_screen_name'], provaxUsers)])
    #rp_other = len(reply[reply['in_reply_to_screen_name'].isin(otherUser)])
    
    dfReplyInfo.loc[index,'reply (novax)'] = rp_novax
    dfReplyInfo.loc[index,'reply %(novax)'] = (rp_novax / dfReplyInfo.loc[index,'reply']) * 100
    
    dfReplyInfo.loc[index,'reply (high credibility)'] = rp_high
    dfReplyInfo.loc[index,'reply %(high credibility)'] = (rp_high / dfReplyInfo.loc[index,'reply']) * 100
    
    #dfReplyInfo.loc[index,'reply (other users)'] = rp_other
    #dfReplyInfo.loc[index,'reply %(other users)'] = (rp_other / dfReplyInfo.loc[index,'reply']) * 100
    
    
    #Quotes
    dfQuotesInfo.loc[index,'quotes'] = len(quotes)
    qt_novax = len(quotes[np.in1d(quotes['user_mentions'], novaxUsers)])
    qt_provax = len(quotes[np.in1d(quotes['user_mentions'], provaxUsers)])
    #rp_other = len(reply[reply['in_reply_to_screen_name'].isin(otherUser)])
    
    dfQuotesInfo.loc[index,'quotes (novax)'] = qt_novax
    dfQuotesInfo.loc[index,'quotes %(novax)'] = (qt_novax / dfQuotesInfo.loc[index,'quotes']) * 100
    
    dfQuotesInfo.loc[index,'quotes (high credibility)'] = qt_provax
    dfQuotesInfo.loc[index,'quotes %(high credibility)'] = (qt_provax / dfQuotesInfo.loc[index,'quotes']) * 100
    
    #dfQuotesInfo.loc[index,'quotes (other users)'] = rp_other
    #dfQuotesInfo.loc[index,'quotes %(other users)'] = (rp_other / dfQuotesInfo.loc[index,'quotes']) * 100

In [9]:
dfInfoPopulate('benevolent',dfProvax)
dfInfoPopulate('malicious',dfNovax)
#dfInfoPopulate('other_user',dfOther)

## Information printing

In [10]:
dfGeneralInfo

Unnamed: 0,user,user %,post,post %
benevolent,24.0,0.007862,3400.0,0.083884
malicious,55.0,0.018017,103552.0,2.55482


In [11]:
dfOriginalTweetInfo

Unnamed: 0,original_tweets,original_tweets %
benevolent,24.0,0.003278
malicious,52.0,0.007103


In [12]:
dfRetweetInfo

Unnamed: 0,retweets,retweets (novax),retweets %(novax),retweets (high credibility),retweets %(high credibility)
benevolent,882.0,0.0,0.0,537.0,60.884354
malicious,75806.0,19151.0,25.263172,17.0,0.022426


In [13]:
dfReplyInfo

Unnamed: 0,reply,reply (novax),reply %(novax),reply (high credibility),reply %(high credibility)
benevolent,841.0,8.0,0.951249,199.0,23.662307
malicious,7437.0,1235.0,16.606158,94.0,1.263951


In [14]:
dfQuotesInfo

Unnamed: 0,quotes,quotes (novax),quotes %(novax),quotes (high credibility),quotes %(high credibility)
benevolent,463.0,0.0,0.0,0.0,0.0
malicious,5287.0,0.0,0.0,0.0,0.0
