In [None]:
import pandas as pd
import numpy as np
import csv
from collections import defaultdict
import os
from collections import Counter
import pickle

# 1. Read dataset

In [None]:
# Number of rows of the time-sorted, de-duplicated trace that the analysis keeps.
# The same value is appended to the name of every file written under ./extracted/.
lines = 500_000

### 1.1. Import file path

Base path: datasets folder
- Russian: "russian_rtid.txt" (twid ts uid rtid)


In [None]:
# Folder and file name of the trace, relative to the notebook's working directory.
base_path, filename = "../datasets/", "russian_rtid.txt"
# Full path handed to the reader below.
path_to_file = os.path.join(base_path, filename)

### 1.2. Add headers 

In [None]:
# Column names in file order (twid ts uid rtid); the file itself carries no header.
header_list = "twid ts uid rtid".split()

### 1.3 Read file with headers and print its first 5 rows

In [None]:
# Read the whitespace-separated trace, naming the columns from `header_list`.
# The separator is a raw string: in a plain string "\s" is an invalid escape
# sequence, which recent Python versions warn about.
# The steps are chained (no inplace mutation) so re-running the cell always
# rebuilds `initial_file` from the file.
initial_file = (
    pd.read_csv(path_to_file, sep=r"\s+", names=header_list)
    # order the trace by timestamp
    .sort_values("ts")
    # keep the first (earliest) row of every tweet id and drop its later repeats
    .drop_duplicates(subset="twid", keep="first")
)

In [None]:
# preview the first five rows of the sorted, de-duplicated trace
initial_file.head(n=5)

### 1.4 The data we are going to work with is the `input_file` dataframe, which contains the number of lines set in the `lines` variable at the top of the notebook

In [None]:
# keep only the first `lines` rows (by position) of the sorted trace
input_file = initial_file.iloc[:lines]

### 1.5 Filter data:
- We want every original tweet (twid) to have been retweeted by at least one user


In [None]:
# rows that are original tweets (rtid == -1) ...
is_original = input_file['rtid'] == -1
# ... and whose id shows up at least once in the rtid column
was_retweeted = input_file['twid'].isin(input_file['rtid'])
data1 = input_file[was_retweeted & is_original]

- We want every rtid (!= -1) to exist in the twid column for completeness

In [None]:
# rows that are retweets (rtid != -1) ...
is_retweet = input_file['rtid'] != -1
# ... whose retweeted id is itself present in the twid column
source_in_trace = input_file['rtid'].isin(input_file['twid'])
# NOTE(review): one retweeted id is excluded by hand; the reason is not visible here — confirm.
not_excluded = input_file['rtid'] != 974014568073695232
data2 = input_file[source_in_trace & is_retweet & not_excluded]

- Merge both sets of data into data

In [None]:
# stack the retweeted originals on top of the retweets, row-wise
data = pd.concat((data1, data2), axis=0)

In [None]:
# report how many rows survived the filtering
n_rows = len(data)
print('Our final data includes', n_rows, 'lines')

### 1.6 Save original tweets in the whole trace
Original tweets are the tweets with rtid = -1


In [None]:
# distinct ids of the original tweets (rows with rtid == -1)
original_mask = data['rtid'] == -1
o_ids = list(set(data.loc[original_mask, 'twid']))

### 1.7 Create set of users

In [None]:
# array of the distinct user ids appearing in the filtered trace
U = data['uid'].unique()

In [None]:
# report the size of the user set
n_users = len(U)
print('Number of unique users in the trace:', n_users)

Save results

In [None]:
# Persist the user array; the file name carries the number of trace lines used.
try:
    # the context manager closes the handle even when pickling fails
    with open("./extracted/U" + str(lines) + ".p", "wb") as out_file:
        pickle.dump(U, out_file)
except (OSError, pickle.PickleError) as err:
    # only I/O and pickling failures are expected here; show the cause
    # instead of hiding every error behind a bare except
    print("Unable to write to file", err)

# 2. Create episodes

- For each original tweet t that we detect in $P$, we construct the set of its retweets, which we call episode and denote by $E_{s}$.
- The whole set of episodes is denoted by $E$ and includes `len(o_ids)` episodes in total, which is the number of original tweets.
- We count $M_{ij}$ out of the o_ids total episodes where $(i,j)$ appears as an ordered pair.

### 2.1 Create tweets dictionaries

Dictionary E includes the u_ids of the users that retweeted each tweet s.

In [None]:
# one group per retweeted id; the group keyed -1 gathers the original tweets
gb = data.groupby(by=['rtid'])
# per group, the distinct user ids found in its rows
E = gb['uid'].unique()

In [None]:
# Turn the Series into a defaultdict keyed by rtid.
# pandas needs an already-initialised defaultdict here, and the mapping is
# passed through the `into=` keyword: that form works on every pandas version,
# whereas positional use is deprecated in recent releases.
E = E.to_dict(into=defaultdict(list))

In [None]:
# Remove the pseudo-episode keyed -1 (the group of original tweets).
# pop() with a default does nothing when the key is absent, so the cell no
# longer raises KeyError when it is run twice or when no row has rtid == -1.
E.pop(-1, None)

### 2.2 Create $s_t$ dictionary with the original tweets

Each source $r_{s}$ will be included in dictionary S.

In [None]:
# Original tweets only (rtid == -1), reduced to tweet id and author.
originals = data.loc[data['rtid'] == -1, ['twid', 'uid']]
# Map every original tweet id to a one-element list holding its author.
# The dict is built straight from the two columns: transposing the frame first
# would create one column per tweet, and `S` is now never bound to a DataFrame.
S = {
    twid: [uid]
    for twid, uid in zip(originals['twid'].tolist(), originals['uid'].tolist())
}

Save S dictionary

In [None]:
# Persist the S dictionary; the file name carries the number of trace lines used.
# The object dumped is `S`: the previous name `s_t` is not defined anywhere, so
# the dump always failed and the bare except turned that into a misleading message.
try:
    # the context manager closes the handle even when pickling fails
    with open("./extracted/S" + str(lines) + ".p", "wb") as out_file:
        pickle.dump(S, out_file)
except (OSError, pickle.PickleError) as err:
    # only I/O and pickling failures are expected here; show the cause
    print("Unable to write to file", err)

Insert the user who originally tweeted each tweet in the first position of each episode

In [None]:
# Put the author of the original tweet in front of the retweeters of every episode.
# NOTE(review): assumes every key of E is also a key of S — confirm.
for ep, retweeters in E.items():
    E[ep] = np.insert(retweeters, 0, S[ep], axis=0)

Delete the duplicates from each tweet and remove tweets with no retweets

In [None]:
# Iterate over a snapshot of the keys because episodes are removed along the way.
for ep in list(E):
    # dict.fromkeys keeps the first occurrence of every user, in order
    distinct_users = list(dict.fromkeys(E[ep]))
    if len(distinct_users) == 1:
        # only one distinct user left: the tweet has no retweets from anyone else
        del E[ep]
    else:
        E[ep] = distinct_users

In [None]:
# Persist the episodes dictionary; the file name carries the number of trace lines used.
try:
    # the context manager closes the handle even when pickling fails
    with open("./extracted/E" + str(lines) + ".p", "wb") as out_file:
        pickle.dump(E, out_file)
except (OSError, pickle.PickleError) as err:
    # only I/O and pickling failures are expected here; show the cause
    # instead of hiding every error behind a bare except
    print("Unable to write to file", err)

# 3. Create D 

Create a $D$ dictionary, which contains the users that retweeted each tweet, in each timestamp:
- key1: episode $s$
- key2: timestamps $ts$ of episode $s$
- values: list of users that retweeted $s$ at $ts$

In [None]:
# D[s] maps a timestamp to the list of users that retweeted episode s at that
# timestamp; key 0 holds the author of the original tweet.
# The trace is walked once instead of being filtered again for every episode,
# and columns are read with zip() so each keeps its own dtype (iterrows would
# cast every row to a single common dtype).

# author of every tweet id; the first row found for an id wins
author_of = {}
for twid, uid in zip(data['twid'].tolist(), data['uid'].tolist()):
    author_of.setdefault(twid, uid)

D = {s: {0: [author_of[s]]} for s in E}
# users already recorded per episode, kept as sets for constant-time lookups
seen_by_episode = {s: set() for s in E}

for s, uid, ts in zip(data['rtid'].tolist(), data['uid'].tolist(), data['ts'].tolist()):
    if s not in D:
        # original tweets (rtid == -1) and retweets of episodes that were dropped
        continue
    # skip authors retweeting themselves and users retweeting the same tweet again
    if uid == D[s][0][0] or uid in seen_by_episode[s]:
        continue
    seen_by_episode[s].add(uid)
    # rows are visited in the order of `data`, so timestamps enter D[s] in that order
    D[s].setdefault(ts, []).append(uid)

In [None]:
# Persist the D dictionary; the file name carries the number of trace lines used.
try:
    # the context manager closes the handle even when pickling fails
    with open("./extracted/D" + str(lines) + ".p", "wb") as out_file:
        pickle.dump(D, out_file)
except (OSError, pickle.PickleError) as err:
    # only I/O and pickling failures are expected here; show the cause
    # instead of hiding every error behind a bare except
    print("Unable to write to file", err)

### 3.1 Count $M_{ij}$ out of the `len(o_ids)` total episodes where $(i,j)$ appears as an ordered pair according to D (the difference being that users sharing the same timestamp do not count as an ordered pair)

Create a dictionary `d_f` whose keys are users and whose values are the users that appear after them, then create a dictionary $M$ with the total number of times each $(i,j)$ pair appears.

In [None]:
# d[u] collects, for every episode in which u is followed by someone, the user
# lists stored after u's own list in D[s].
# NOTE(review): "after" means later in D[s]'s insertion order — presumably chronological; confirm.
d = defaultdict()
for s in D:
    # materialise the user lists once per episode instead of once per user
    groups = list(D[s].values())
    # enumerate gives each list's position directly; list.index() would compare
    # lists by value and return the first equal one
    for position, users_list in enumerate(groups):
        u_after = groups[position + 1:]
        if not u_after:
            # last list of the episode: nobody comes after these users
            continue
        for u in users_list:
            # users sharing a list share the same followers, and do not follow each other
            d.setdefault(u, []).append(u_after)

In [None]:
# Flatten d: for every user, one flat list of all users that came after them,
# repeated once per episode in which that happened.
d_f = defaultdict()
for leader, per_episode in d.items():
    followers = []
    for later_groups in per_episode:
        for group in later_groups:
            followers.extend(group)
    d_f[leader] = followers

In [None]:
# M[i][j]: number of times user j appears in user i's flat follower list
M = defaultdict()
M.update((leader, Counter(followers)) for leader, followers in d_f.items())

In [None]:
# Persist the M dictionary; the file name carries the number of trace lines used.
try:
    # the context manager closes the handle even when pickling fails
    with open("./extracted/M" + str(lines) + ".p", "wb") as out_file:
        pickle.dump(M, out_file)
except (OSError, pickle.PickleError) as err:
    # only I/O and pickling failures are expected here; show the cause
    # instead of hiding every error behind a bare except
    print("Unable to write to file", err)

# 4. Create E_newman to evaluate and compare our method with Newman

In [None]:
# For every retweet in `data`, record the author of the retweeted tweet under
# the retweeting user, and count the retweets that could be resolved.
#
# Authors are looked up in a dict built once from `input_file`. The previous
# int(<Series>) conversion is deprecated in pandas, and once it raises, the bare
# `except: continue` would have skipped every row without any visible error.
# As before, a retweet is skipped when its rtid matches no row, more than one
# row, or a row without a user id.
unique_tweets = (
    input_file
    .drop_duplicates(subset='twid', keep=False)
    .dropna(subset=['uid'])
)
author_of_tweet = dict(zip(unique_tweets['twid'].tolist(), unique_tweets['uid'].tolist()))

user_id_to_retweet = defaultdict(list)
total_retweets = 0
for user_id, retweet_id in zip(data['uid'].tolist(), data['rtid'].tolist()):
    if retweet_id == -1:
        # original tweet, not a retweet
        continue
    if retweet_id not in author_of_tweet:
        # the retweeted tweet cannot be resolved to exactly one author
        continue
    retweeted_by_id = int(author_of_tweet[retweet_id])
    total_retweets += 1
    user_id_to_retweet[user_id].append(retweeted_by_id)

In [None]:
# Eij_newman[user][author]: how many entries equal to `author` the list
# user_id_to_retweet[user] holds
Eij_newman = defaultdict(dict)
for user, retweeted_authors in user_id_to_retweet.items():
    for author in retweeted_authors:
        # the inner dict is created on first access by the defaultdict
        Eij_newman[user][author] = Eij_newman[user].get(author, 0) + 1
                

In [None]:
# Persist the Eij_newman dictionary; the file name carries the number of trace lines used.
try:
    # the context manager closes the handle even when pickling fails
    with open("./extracted/E_newman" + str(lines) + ".p", "wb") as out_file:
        pickle.dump(Eij_newman, out_file)
except (OSError, pickle.PickleError) as err:
    # only I/O and pickling failures are expected here; show the cause
    # instead of hiding every error behind a bare except
    print("Unable to write to file", err)