In [1]:
import pandas as pd
from tqdm import tqdm
tqdm.pandas()

In [2]:
def translate(s):
    """Return ``str(s)`` with every non-alphanumeric character removed.

    The argument is coerced with ``str()`` first, so non-string values are
    accepted. Characters are kept when ``str.isalnum`` is true for them, which
    means punctuation, whitespace, '@' and '_' are all dropped.
    """
    kept = filter(str.isalnum, str(s))
    return ''.join(kept)

In [5]:
# Load the annotated tweets. index_col=False tells pandas not to use the first
# column of the file as the index.
db_tweets = pd.read_csv('../data/db_annotated_tweets.csv', index_col=False)
# Replace every missing value with '' so the string handling below never sees NaN.
db_tweets = db_tweets.fillna('')

# Optional second source (annotated user tweets), currently disabled:
# db_user_tweets = pd.read_csv('../data/db_annotated_user_tweets.csv', index_col=False)
# db_user_tweets = db_user_tweets.fillna('')

# db_tweets = pd.concat([db_tweets, db_user_tweets])

# Normalise the handle columns. translate() keeps only alphanumeric characters,
# so '@' and spaces are already removed by it, and it str()-coerces its input,
# so a non-string cell no longer raises AttributeError on `.replace`.
db_tweets['UserName'] = db_tweets['UserName'].apply(translate)
db_tweets['retweet'] = db_tweets['retweet'].apply(translate)

In [6]:
# Each `mentions` cell is split on ';', and from every piece only the text
# before the first ',' is kept. The result is a list per tweet.
db_tweets['MentionedUser'] = db_tweets['mentions'].apply(
    lambda raw: [piece.split(',')[0] for piece in raw.split(';')]
)

# One row per (author, mentioned handle); the handle is normalised the same way
# as the author names.
mentions = db_tweets.explode('MentionedUser')[['UserName', 'MentionedUser']]
mentions['MentionedUser'] = mentions['MentionedUser'].apply(
    lambda handle: translate(handle.replace('@', '').replace(' ', ''))
)

# Drop rows with an empty author, an empty target, or a self-mention.
mentions = mentions[
    (mentions['UserName'] != '')
    & (mentions['MentionedUser'] != '')
    & (mentions['MentionedUser'] != mentions['UserName'])
]

# Number of rows per (UserName, MentionedUser) pair.
mention_counts = (
    mentions
    .groupby(['UserName', 'MentionedUser'])
    .size()
    .reset_index(name='count')
)
mention_counts

Unnamed: 0,UserName,MentionedUser,count
0,000Dillon000,CBedfordDC,1
1,001Prometheus,GunsVelez,1
2,001Prometheus,NoNutNoSlut,1
3,001Prometheus,PBnJbot,1
4,001Prometheus,freekyleusa,2
...,...,...,...
229940,zzzdogman,eveforamerica,1
229941,zzzdogman,rezazia,1
229942,zzzjamieg,RadioFreeTom,1
229943,zzzuperior,cityafreaks,1


In [7]:
# Spot check: all mention pairs whose author is '001Prometheus'.
mention_counts.loc[mention_counts['UserName'].eq('001Prometheus')]

Unnamed: 0,UserName,MentionedUser,count
1,001Prometheus,GunsVelez,1
2,001Prometheus,NoNutNoSlut,1
3,001Prometheus,PBnJbot,1
4,001Prometheus,freekyleusa,2


In [12]:
# Normalise each author's mention counts so their outgoing weights sum to 1.
# transform('sum') broadcasts the per-author total back onto the original row
# index, so the division aligns row-by-row. groupby(...).apply(...) prepends
# the group keys to the result index in pandas 2.x, which breaks the column
# assignment.
mention_counts['weights'] = (
    mention_counts['count']
    / mention_counts.groupby('UserName')['count'].transform('sum')
)
# Fallback of 1.0 for a missing weight, restricted to the `weights` column so
# gaps in any other column are not silently overwritten with 1.0.
mention_counts = mention_counts.fillna({'weights': 1.0})
mention_counts.sort_values(by='weights', ascending=False)

Unnamed: 0,UserName,MentionedUser,count,weights
0,000Dillon000,CBedfordDC,1,1.000000
116518,SkolMitzel,JackPosobiec,1,1.000000
116545,Skuner83,mattgaetz,1,1.000000
116544,Skullandarkness,cathyrusson,1,1.000000
116536,Skule65,SydneyLWatson,1,1.000000
...,...,...,...,...
39827,DysonCarl,lebangmoloi,1,0.000821
39828,DysonCarl,lkknodelduren,1,0.000821
39830,DysonCarl,moniquedhooghe,1,0.000821
39819,DysonCarl,TheDickKnightV2,1,0.000821


In [14]:
# Keep only rows whose normalised `retweet` field is non-empty.
# NOTE(review): unlike the mentions cell, rows with an empty UserName and
# self-retweets are not filtered out here. Confirm that is intended.
retweets = db_tweets.loc[db_tweets['retweet'].ne('')]

# Number of rows per (UserName, retweet) pair.
retweet_counts = (
    retweets
    .groupby(['UserName', 'retweet'])
    .size()
    .reset_index(name='count')
)
retweet_counts

Unnamed: 0,UserName,retweet,count
0,000Dillon000,CBedfordDC,1
1,0045James,imUrB00gieman,1
2,0071Holly,fakebiden,1
3,008moonside,DougCameron51,1
4,00LilacSky00,WalterDavis4L,1
...,...,...,...
112538,zzzdas,FAsintomatico,1
112539,zzzdogman,McKaylaRoseJ,1
112540,zzzdogman,eveforamerica,1
112541,zzzdogman,rezazia,1


In [15]:
# Distribution of pair counts: how many (UserName, retweet) pairs occur once, twice, ...
retweet_counts['count'].value_counts()

1     108969
2       2907
3        418
4        134
5         51
6         23
7         13
8         10
9          7
12         4
11         2
67         1
14         1
16         1
19         1
21         1
Name: count, dtype: int64

In [16]:
# Normalise each author's retweet counts so their outgoing weights sum to 1.
# transform('sum') broadcasts the per-author total back onto the original row
# index, so the division aligns row-by-row. groupby(...).apply(...) prepends
# the group keys to the result index in pandas 2.x, which breaks the column
# assignment.
retweet_counts['weights'] = (
    retweet_counts['count']
    / retweet_counts.groupby('UserName')['count'].transform('sum')
)
retweet_counts

Unnamed: 0,UserName,retweet,count,weights
0,000Dillon000,CBedfordDC,1,1.000000
1,0045James,imUrB00gieman,1,1.000000
2,0071Holly,fakebiden,1,1.000000
3,008moonside,DougCameron51,1,1.000000
4,00LilacSky00,WalterDavis4L,1,1.000000
...,...,...,...,...
112538,zzzdas,FAsintomatico,1,1.000000
112539,zzzdogman,McKaylaRoseJ,1,0.333333
112540,zzzdogman,eveforamerica,1,0.333333
112541,zzzdogman,rezazia,1,0.333333


In [18]:
# Fallback of 1.0 for a missing weight. The result is assigned back so the
# fill actually reaches the merge below, and it is limited to the `weights`
# column so no other column is touched.
retweet_counts = retweet_counts.fillna({'weights': 1.0})
retweet_counts

Unnamed: 0,UserName,retweet,count,weights
0,000Dillon000,CBedfordDC,1,1.000000
1,0045James,imUrB00gieman,1,1.000000
2,0071Holly,fakebiden,1,1.000000
3,008moonside,DougCameron51,1,1.000000
4,00LilacSky00,WalterDavis4L,1,1.000000
...,...,...,...,...
112538,zzzdas,FAsintomatico,1,1.000000
112539,zzzdogman,McKaylaRoseJ,1,0.333333
112540,zzzdogman,eveforamerica,1,0.333333
112541,zzzdogman,rezazia,1,0.333333


In [19]:
# Full outer join of mention pairs and retweet pairs on (author, target), so a
# pair present on only one side is still kept. Overlapping column names
# (`count`, `weights`) receive the `_mention` / `_rt` suffixes.
# NOTE(review): the frame-wide fillna(0) also writes 0 into `MentionedUser` /
# `retweet` for pairs seen on one side only. Confirm that downstream readers
# of the CSV expect 0 there rather than an empty string.
interactions = mention_counts.merge(
    retweet_counts,
    how='outer',
    left_on=['UserName', 'MentionedUser'],
    right_on=['UserName', 'retweet'],
    suffixes=('_mention', '_rt'),
).fillna(0)

In [20]:
# Persist the merged interaction table. The row index is written as the first
# (unnamed) column, which is the pandas default and is spelled out here.
interactions.to_csv(path_or_buf='../data/interactions.csv', index=True)