In [85]:
import pandas as pd
import numpy as np
import csv
import glob
from collections import defaultdict
import random
import copy
import json
import os
from collections import Counter
import pickle
from datetime import datetime

# 1. Read dataset

In [86]:
# Nominal number of trace lines for this run.
# NOTE(review): in the cells shown here `lines` is only used as a suffix in the
# names of the output pickle files; it is never passed to pd.read_csv, so the
# whole file is read regardless of this value -- confirm that this is intended.
# lines = 224667
lines = 500000
# Dataset key: selects the parsing branch ('covid' or 'russian') and the
# sub-folder of ./Extracted/ the results are written to.
dataset = 'russian'

### 1.1. Import file path

Base path: Datasets folder
- Russian: "russian_rtid.txt" (twid ts uid rtid)


In [87]:
# Folder holding the raw traces, and the file expected for the selected dataset.
base_path = "./Datasets/"
filename = "{}.txt".format(dataset)
# NOTE(review): only the 'covid' branch of the loading cell reads this path;
# the 'russian' branch opens './Datasets/russian_rtid.txt' directly.
path_to_file = os.path.join(base_path, filename)

### 1.2. Add headers 

### 1.3 Read file with headers and print its first 5 rows

In [88]:
# Load the raw trace into `initial_file` with columns twid / ts / uid / rtid.
# The column order and the separator differ per dataset, so each branch only
# does the dataset-specific parsing; the shared clean-up happens once below.
if dataset == 'covid':
    header_list = ["twid", "uid", "ts", "rtid"]

    # reading data (tab-separated, column names supplied explicitly)
    initial_file = pd.read_csv(path_to_file, sep="\t", names=header_list)

    # in case of covid.txt: cast ts to datetime64[ns]
    # (presumably stored as text in the file -- TODO confirm)
    initial_file["ts"] = initial_file["ts"].astype('datetime64[ns]')
    initial_file["ts"] = initial_file.ts.dt.to_pydatetime()
elif dataset == 'russian':
    header_list = ["twid", "ts", "uid", "rtid"]

    # reading data; raw string so that \s is a regex class, not an invalid
    # Python escape sequence (which warns on recent interpreters)
    initial_file = pd.read_csv('./Datasets/russian_rtid.txt', sep=r"\s+", names=header_list)
else:
    # Fail here with a clear message instead of a NameError on `initial_file`
    # a few cells further down.
    raise ValueError("Unknown dataset %r: expected 'covid' or 'russian'" % (dataset,))

# Shared clean-up: order the trace by time, then keep only the FIRST row of
# every twid (later rows with the same twid are dropped). No inplace mutation,
# so re-running the cell always starts from the freshly read frame.
initial_file = (
    initial_file
    .sort_values("ts")
    .drop_duplicates(subset="twid", keep='first')
)

In [89]:
# Alias, not a copy: input_file and initial_file refer to the same DataFrame object.
input_file = initial_file

In [90]:
input_file

Unnamed: 0,twid,ts,uid,rtid
0,955515379299778561,1516647594,201677046,-1
1,955515380449054721,1516647594,917409759225139200,955496328808853504
2,955515420177510400,1516647603,4363056442,-1
3,955515458458914816,1516647613,1590944965,-1
4,955515460996366337,1516647613,980583553,955512752373673984
...,...,...,...,...
1945361,976042785601216513,1521541709,850236221456437248,976030607116591104
1945359,976042785731162112,1521541709,2974210367,976016190647697408
1945362,976042787450867712,1521541709,1110976454,-1
1945363,976042788956594177,1521541710,3306364731,-1


In [91]:
len(input_file)

1943830

### 1.3 Filter data:
- Keep an original tweet (`rtid == -1`) only if it was retweeted at least once, i.e. its `twid` appears in the `rtid` column.


In [92]:
# Original tweets (rtid == -1) whose twid is referenced by at least one rtid.
is_original = input_file['rtid'] == -1
has_retweets = input_file['twid'].isin(input_file['rtid'])
data1 = input_file[has_retweets & is_original]

- Keep a retweet (`rtid != -1`) only if the tweet it points to exists in the `twid` column, so that every episode has a known source.

In [93]:
# Retweets (rtid != -1) whose target tweet is present in the twid column.
target_is_known = input_file['rtid'].isin(input_file['twid'])
is_retweet = input_file['rtid'] != -1
# NOTE(review): this one rtid is excluded by hand; the reason is not recorded
# in the notebook -- TODO document why it has to be dropped.
not_excluded = input_file['rtid'] != 974014568073695232
data2 = input_file[target_is_known & is_retweet & not_excluded]

In [94]:
# Retweeted originals first, then their retweets. ignore_index is not set, so
# every row keeps the index label it had in input_file.
data = pd.concat([data1, data2])

In [95]:
len(data)

827268

In [96]:
data

Unnamed: 0,twid,ts,uid,rtid
206,956216939172499456,1516814859,28859692,-1
346,956218141184577536,1516815145,4086524967,-1
401,956218574967902208,1516815249,264912416,-1
420,956218666508521472,1516815270,2613120188,-1
479,956219412377407488,1516815448,386829998,-1
...,...,...,...,...
1945353,976042746426413057,1521541699,782976485133000704,975614873118871552
1945355,976042758371790848,1521541702,1863574340,976042274625900544
1945357,976042761286799360,1521541703,731574742344466432,976041960426352640
1945361,976042785601216513,1521541709,850236221456437248,976030607116591104


### 1.4 Count number of original tweets in the whole trace

In [97]:
# original tweets are the tweets with rtid = -1
originals_mask = data['rtid'] == -1
# de-duplicate the twids of the original tweets through a set
o_ids = list(set(data.loc[originals_mask, 'twid']))

In [98]:
print('Number of original tweets in the trace:', len(o_ids))

Number of original tweets in the trace: 60426


### 1.5 Create set of users

In [99]:
# Distinct user ids appearing in the filtered trace (authors and retweeters alike).
U = data['uid'].unique()

In [100]:
print('Number of unique users in the trace:', len(U))

Number of unique users in the trace: 93892


In [101]:
# Persist the user set U for later stages.
# Best-effort write: a failure is reported but does not stop the notebook.
try:
    # The context manager closes (and flushes) the file even if dump() fails.
    with open("./Extracted/"+dataset+"/U" + str(lines) + ".p", "wb") as fh:
        pickle.dump(U, fh)
except Exception as err:
    # `Exception` instead of a bare except: Ctrl-C still interrupts the cell,
    # and the actual reason for the failure is shown.
    print("Unable to write to file", err)

In [102]:
data.loc[data['uid']==358147382]

Unnamed: 0,twid,ts,uid,rtid
706,956221240225746944,1516815884,358147382,-1
162688,958170756088782848,1517280685,358147382,-1
427677,961264449742168064,1518018279,358147382,-1
443396,961514646195851264,1518077930,358147382,-1
556169,963078985608519680,1518450898,358147382,-1
588905,963495796778946560,1518550273,358147382,-1
684767,964772264565788672,1518854607,358147382,-1
797364,966705872335462400,1519315615,358147382,-1
797378,966706049796407296,1519315657,358147382,-1
797419,966706389870678016,1519315739,358147382,-1


# 2. Create episodes

- For each original tweet t that we detect in T, we construct the set of its retweets, which we call episode and denote by $E_{t}$.
- The whole set of episodes is denoted by $E$ and contains `len(o_ids)` episodes in total, i.e. one per original tweet.
- We count $M_{ij}$: the number of episodes, out of the `len(o_ids)` in total, in which $(i,j)$ appears as an ordered pair.

### 2.1 Create tweets dictionaries

- We name it E and it includes the u_ids of the users that retweeted each t 
- The source $s_{t}$ (the user who posted the original tweet $t$) is stored in the dictionary `s_t`

In [103]:
# group dataframe by rtid id
gb = data.groupby(['rtid'])
E = gb['uid'].unique()
# result = result.reset_index(inplace=False) #lastly, reset the index

In [104]:
# Series.to_dict only returns a defaultdict when it is handed an already
# initialised instance; dd supplies the `list` default factory.
dd = defaultdict(list)
# E now maps rtid -> unique retweeter uids
E = E.to_dict(dd)

In [105]:
# rtid == -1 groups the original tweets themselves; it is not an episode.
del E[-1]

In [106]:
len(E)

60426

For example, for tweet t = 976039941472956416 the retweeters are: 


In [107]:
# list(E[976039941472956416])

### 2.2 Create $s_t$ dictionary with the original tweets

In [108]:
# twid -> [uid of the user who posted that original tweet]
originals = data.loc[data['rtid'] == -1, ['twid', 'uid']]
s_t = originals.set_index('twid').T.to_dict('list')

For example, for tweet t = 976039941472956416 the original tweeter is:

In [109]:
# s_t[976039941472956416]

The number of original tweets / episodes:

In [110]:
len(s_t)

60426

Save S dictionary

In [112]:
# Persist the source dictionary s_t.
# Best-effort write: a failure is reported but does not stop the notebook.
try:
    # The context manager closes (and flushes) the file even if dump() fails.
    with open("./Extracted/"+dataset+"/S"+ str(lines) + ".p", "wb") as fh:
        pickle.dump(s_t, fh)
except Exception as err:
    # `Exception` instead of a bare except: Ctrl-C still interrupts the cell,
    # and the actual reason for the failure is shown.
    print("Unable to write to file", err)

Insert the user who originally tweeted each tweet in the first position of each episode

In [113]:
# Put the source of every episode in front of its retweeters.
# Only values are replaced, so iterating over items() while assigning is safe.
for ep, retweeters in E.items():
    E[ep] = np.insert(retweeters, 0, s_t[ep], axis=0)

In [114]:
# for ep in E:
#     a_set = set(E[ep])
#     contains_duplicates = len(E[ep]) != len(a_set)
#     if contains_duplicates == True: print(ep, E[ep])

Delete the duplicates from each tweet and remove tweets with no retweets

In [115]:
# Order-preserving de-duplication of every episode; an episode that shrinks to
# a single user (the source only) has no retweets and is dropped.
# Iterate over a snapshot of the keys because entries are deleted on the way.
for ep in list(E):
    unique_users = list(dict.fromkeys(E[ep]))
    if len(unique_users) == 1:
        del E[ep]
    else:
        E[ep] = unique_users

In [116]:
# Persist the episode dictionary E.
# Best-effort write: a failure is reported but does not stop the notebook.
try:
    # The context manager closes (and flushes) the file even if dump() fails.
    with open("./Extracted/"+dataset+"/E"+ str(lines) + ".p", "wb") as fh:
        pickle.dump(E, fh)
except Exception as err:
    # `Exception` instead of a bare except: Ctrl-C still interrupts the cell,
    # and the actual reason for the failure is shown.
    print("Unable to write to file", err)

### 2.3 Count $M_{ij}$ out of the o_ids total episodes where $(i,j)$ appears as an ordered pair.

Create a dictionary `d_f` whose keys are users and whose values are the users that appear after them in an episode, and then create a dictionary $M$ that counts, for each user, how many times every other user appears after them.

In [117]:
# d maps a user to a list with one entry per episode the user takes part in:
# the users that come after him in that episode (the last user of an episode
# has nobody after him and contributes nothing).
d = defaultdict()
for s in E:
    rw_list = E[s]
    # enumerate() gives the position directly; the earlier rw_list.index(u)
    # re-scanned the list for every user, which is quadratic per episode.
    # Episodes were de-duplicated before, so the position is the same.
    for position, u in enumerate(rw_list):
        u_after = rw_list[position + 1:]
        if len(u_after) == 0:
            continue
        if u in d:
            d[u].append(u_after)
        else:
            d[u] = [u_after]

In [118]:
# Flatten the per-episode lists: d_f[user] is one flat list of every user that
# appeared after `user`, with repetitions across episodes kept.
d_f = defaultdict()
for user, followers_per_episode in d.items():
    flat = []
    for later_users in followers_per_episode:
        flat.extend(later_users)
    d_f[user] = flat

In [119]:
# M[i][j] = number of times j appears after i over all episodes.
M = defaultdict()
M.update((user, Counter(later_users)) for user, later_users in d_f.items())

In [120]:
# l1 = list(data['rtid'].loc[data['uid']==358147382])

In [121]:
# l2 = list(data['rtid'].loc[data['uid']==842298422505926656])

In [122]:
# data.loc[data['twid']==964903486700900352]

In [123]:
# for i in M:
#     for j in M[i]:
#         if i==358147382 or j==358147382: print(i,j,M[i][j])

### 2.4 Save M in files

In [124]:
# Persist the ordered-pair counts M.
# Best-effort write: a failure is reported but does not stop the notebook.
try:
    # The context manager closes (and flushes) the file even if dump() fails.
    with open("./Extracted/"+dataset+"/M"+ str(lines) + ".p", "wb") as fh:
        pickle.dump(M, fh)
except Exception as err:
    # `Exception` instead of a bare except: Ctrl-C still interrupts the cell,
    # and the actual reason for the failure is shown.
    print("Unable to write to file", err)

# 3. Create D for Saito

Create a $D$ dictionary, which contains the users that retweeted each tweet, in each timestamp:
- key1: episode $s$
- key2: timestamps $ts$ of episode $s$ (the special key `0` holds the user who posted the original tweet)
- values: list of users that retweeted $s$ at $ts$

In [125]:
# Build D[s] = {0: [source], ts1: [users who retweeted s at ts1], ...}.
#
# The author lookup table and the single pass over `data` replace two full
# boolean scans of `data` per episode plus an iterrows() loop; the result is
# the same dictionary. Ids and timestamps are emitted as plain Python scalars
# (Series iteration), which compare and hash equal to their NumPy counterparts.

# twid -> uid of the author; keep='first' mirrors "take the first matching row".
first_rows = data.drop_duplicates(subset='twid', keep='first')
source_uid_of = dict(zip(first_rows['twid'], first_rows['uid']))

D = dict()
seen_users = dict()  # episode -> users already recorded for it
for s in E:
    # Key 0 is reserved for the source (original tweeter) of the episode.
    D[s] = {0: [source_uid_of[s]]}
    seen_users[s] = set()

# Rows are visited in `data` order, so within an episode the timestamps are
# inserted in the same order as before.
for s, uid, ts in zip(data['rtid'], data['uid'], data['ts']):
    if s not in D:
        # original tweets (rtid == -1) and episodes that were pruned from E
        continue
    if uid == D[s][0][0] or uid in seen_users[s]:
        # remove cases where user retweets himself / where user retweets again
        continue
    seen_users[s].add(uid)
    D[s].setdefault(ts, []).append(uid)
                
# removes tweets with no retweets
# for s in list(D):
#     if len(D[s])==1:
#         D.pop(s)


In [126]:
# Persist the per-timestamp episode dictionary D.
# Best-effort write: a failure is reported but does not stop the notebook.
try:
    # The context manager closes (and flushes) the file even if dump() fails.
    with open("./Extracted/"+dataset+"/D"+ str(lines) + ".p", "wb") as fh:
        pickle.dump(D, fh)
except Exception as err:
    # `Exception` instead of a bare except: Ctrl-C still interrupts the cell,
    # and the actual reason for the failure is shown.
    print("Unable to write to file", err)

### 3.1 Count $M_{ij}$ out of the o_ids total episodes where $(i,j)$ appears as an ordered pair according to D. (difference that users at the same timestamp do not count as an ordered pair) 

Create a dictionary `d_f` whose keys are users and whose values are the users that appear at a strictly later timestamp in an episode, and then create a dictionary $M$ that counts, for each user, how many times every other user appears after them.

In [127]:
# d maps a user to a list with one entry per episode: the groups of users
# (one group per timestamp) that acted at a strictly later timestamp.
# Users sharing a timestamp therefore never form an ordered pair.
d = defaultdict()
for s in D:
    # Materialise the timestamp groups once per episode, in insertion order.
    groups = list(D[s].values())
    # enumerate() yields the position directly; looking it up with
    # .index(users_list) compared whole lists and rebuilt `groups` per user.
    for index_now, users_list in enumerate(groups):
        u_after = groups[index_now + 1:]
        if len(u_after) == 0:
            # last timestamp of the episode: nobody comes later
            continue
        for u in users_list:
            if u in d:
                d[u].append(u_after)
            else:
                d[u] = [u_after]

In [128]:
# Flatten two levels (episode -> timestamp group -> user) into one flat list
# of later users per key, keeping repetitions.
d_f = defaultdict()
for user, tails in d.items():
    flat = []
    for tail in tails:            # one tail per episode
        for group in tail:        # one group per later timestamp
            flat.extend(group)
    d_f[user] = flat

In [129]:
# M[i][j] = number of times j appears at a later timestamp than i.
M = defaultdict()
M.update((user, Counter(later_users)) for user, later_users in d_f.items())

In [130]:
# D[956234331961389056]

In [131]:
# Persist the timestamp-aware ordered-pair counts M.
# Best-effort write: a failure is reported but does not stop the notebook.
try:
    # The context manager closes (and flushes) the file even if dump() fails.
    with open("./Extracted/"+dataset+"/M_d" + str(lines) + ".p", "wb") as fh:
        pickle.dump(M, fh)
except Exception as err:
    # `Exception` instead of a bare except: Ctrl-C still interrupts the cell,
    # and the actual reason for the failure is shown.
    print("Unable to write to file", err)

# 4. Create E_newman for Newman

In [132]:
# For every retweet in the unfiltered trace, record whose tweet was retweeted:
# user_id_to_retweet[retweeter uid] -> list of author uids, one per retweet.
#
# A twid -> author table is built once, so each retweet is resolved by a
# dictionary lookup instead of a full scan of input_file per row.
# keep=False leaves out any twid that occurs more than once, matching the
# previous behaviour of skipping a retweet whose target was ambiguous.
unambiguous_tweets = input_file.drop_duplicates(subset='twid', keep=False)
author_of_tweet = dict(zip(unambiguous_tweets['twid'], unambiguous_tweets['uid']))

user_id_to_retweet = defaultdict(list)
total_retweets = 0
for user_id, retweet_id in zip(input_file['uid'], input_file['rtid']):
    if retweet_id == -1:
        # original tweet, nothing was retweeted
        continue
    retweeted_by_id = author_of_tweet.get(retweet_id)
    if retweeted_by_id is None:
        # the retweeted tweet is not part of the trace: skip, as before
        continue
    total_retweets += 1
    user_id_to_retweet[user_id].append(retweeted_by_id)

In [133]:
# Eij_newman[i][j] = number of times user i retweeted a tweet written by j.
Eij_newman = defaultdict(dict)

for user, retweeted_authors in user_id_to_retweet.items():
    counts = Eij_newman[user]
    for retweet in retweeted_authors:
        counts[retweet] = counts.get(retweet, 0) + 1
                

In [134]:
# Persist the Newman retweet counts.
# Best-effort write: a failure is reported but does not stop the notebook.
try:
    # The context manager closes (and flushes) the file even if dump() fails.
    with open("./Extracted/"+dataset+"/E_newman"+ str(lines) + ".p", "wb") as fh:
        pickle.dump(Eij_newman, fh)
except Exception as err:
    # `Exception` instead of a bare except: Ctrl-C still interrupts the cell,
    # and the actual reason for the failure is shown.
    print("Unable to write to file", err)