# Load Data

In [1]:
# Imports
# from settings import *
# import analyze_cascade
import datetime
import pandas as pd
import numpy as np
from itertools import groupby
from collections import Counter
from random import shuffle
from random import choices
import math
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
metadata_file = 'metadata_anon.txt'

In [3]:
# Read meta data 
fin = open(metadata_file,'r')
lines = fin.readlines()
fin.close()
cascade_id2metadata={}
for line in lines:
    line = line.replace('\n','')
    item = eval(line)
    cascade_id2metadata[item[0]] = item[1]

## Descriptives of dynamic measures

### Static measures

In [133]:
# Get static measures
cid = []
veracity = []
virality = []
depth = []
breadth = []
size = []
verified = []
nfollowers = []
nfollowees = []
engagement = []
category = []
for cascade,metadata in cascade_id2metadata.items():
    if metadata['virality'] is not None: 
        cid.append(cascade)
        veracity.append(metadata['veracity'])
        virality.append(metadata['virality'])
        depth.append(metadata['depth'])
        breadth.append(metadata['max_breadth'])
        size.append(metadata['size'])
        verified.append(metadata['verified_list'][0])
        nfollowers.append(metadata['num_followers_list'][0])
        nfollowees.append(metadata['num_followees_list'][0])
        engagement.append(metadata['engagement_list'][0])
        category.append(metadata['rumor_category'])

# Convert to data frame
df = pd.DataFrame({'cid': cid,
                   'veracity': veracity,
                   'virality': virality,
                   'depth': depth,
                   'breadth': breadth,
                   'size': size,
                   'verified': verified,
                   'nfollowers': nfollowers,
                   'nfollowees': nfollowees,
                   'engangement': engagement,
                   'category': category})

# Inspect
df.head(5)

Unnamed: 0,breadth,category,cid,depth,engangement,nfollowees,nfollowers,size,veracity,verified,virality
0,10703,Viral Photos/Stories/Urban Legends,106998,11,25.799399,186.0,672.0,23228,MIXED,False,4.003857
1,11783,Science/Nature/Tech/Food/Health,106999,9,10.811974,313.0,380.0,14827,MIXED,False,2.535338
2,6504,Viral Photos/Stories/Urban Legends,107000,13,15.395237,518.0,504.0,14129,MIXED,False,4.019705
3,5772,Viral Photos/Stories/Urban Legends,107001,8,3.140842,189.0,228.0,9972,MIXED,False,3.271008
4,6041,Viral Photos/Stories/Urban Legends,107002,8,5.160261,174.0,110.0,9526,MIXED,False,3.115942


### Dynamic measures

In [4]:
len_depth2time = []
len_num_followees_list = []
len_depth2uu = []
len_uu2time = []
len_depth2breadth = []
for cascade,metadata in cascade_id2metadata.items():
    if metadata['virality'] is not None: 
        len_depth2time.append(len(metadata['depth2time'].keys()))
        len_num_followees_list.append(len(metadata['num_followees_list']))
        len_depth2uu.append(len(metadata['depth2uu'].keys()))
        len_uu2time.append(len(metadata['uu2time'].keys()))
        len_depth2breadth.append(len(metadata['depth2breadth'].keys()))
    
# Convert to data frame
df_len = pd.DataFrame({'depth2time ': len_depth2time, 
                       'num_followees_list': len_num_followees_list, 
                       'depth2uu': len_depth2uu, 
                       'uu2time': len_uu2time, 
                       'depth2breadth': len_depth2breadth})

# # Get summary
df_len.describe(percentiles = [0.25, 0.5, 0.75, 1])
    

Unnamed: 0,depth2breadth,depth2time,depth2uu,num_followees_list,uu2time
count,42081.0,42081.0,42081.0,42081.0,42081.0
mean,1.707707,1.707707,1.707707,93.878829,93.878829
std,1.319555,1.319555,1.319555,950.694376,950.694376
min,1.0,1.0,1.0,2.0,2.0
25%,1.0,1.0,1.0,2.0,2.0
50%,1.0,1.0,1.0,4.0,4.0
75%,2.0,2.0,2.0,9.0,9.0
100%,24.0,24.0,24.0,46895.0,46895.0
max,24.0,24.0,24.0,46895.0,46895.0


## Create LSTM data

### Get dynamic data

In [5]:
# Function to get expression of each item in a dictionary entry
def get_expression_list(entry):
    expression = []
    for i in entry.keys():
        expression.append(float(entry[i]))
    return expression

# Convert y to classification
def veracity_to_categorical(v):
    if v == 'FALSE':
        vbin = [1,0,0]
    elif v == 'MIXED':
        vbin = [0,1,0]
    elif v == 'TRUE':
        vbin = [0,0,1]
    return vbin

# Get data in list format
data = []
for cascade,metadata in cascade_id2metadata.items():
    if metadata['virality'] is not None:       
        # Get depth
        depth2time = get_expression_list(metadata['depth2time'])
        depth2uu = get_expression_list(metadata['depth2uu'])
        depth2breadth = get_expression_list(metadata['depth2breadth']) 
        veracity = veracity_to_categorical(metadata['veracity'])
        data_id = []
        for time, uu, breadth in zip(depth2time, depth2uu, depth2breadth):
            data_t = [cascade, 
                      veracity,
                      time, uu, breadth]
            data_id.append(data_t)
        data.extend([data_id])

### Preprocessing

In [6]:
# Function: Create training and test set
def split_list(lst, train_size): # train_size is a proportion
    split = len(lst) * train_size
    if split.is_integer():
        split = int(split)
        return lst[:split], lst[split:]
    else:
        split = math.floor(split) + 1
        return lst[:split], lst[split:]
    
# Function: Padding for groups of equal batches
def padding(lst, bsize):
    if len(lst) % bsize != 0:
        psize = bsize - (len(lst) % 5)
        samples = choices(lst, k=psize)
        lst.extend(samples)
    return lst

# Get sublist
def get_sublist(list_in_list, start, stop):
    x = []
    for lst in list_in_list:
        x_id = []
        for sublist in lst:
            if stop is None:
                x_id.append(sublist[start:])
            elif start is None:
                x_id.append(sublist[:stop])
            else:
                x_id.append(sublist[start:stop])
        x.extend([x_id])
    return x

# Separate id, x and y
def separate(list_in_list):
    cid = []
    y = []
    for lst in list_in_list:
        cid.append(lst[0][0]) # only one id is needed
        # The following code would assume target replication in the model
#         veracity_id = []
#         for sublist in lst:
#             veracity_id.extend([sublist[1]])
#         veracity.append(veracity_id)
        y.append(lst[0][1])
    x = get_sublist(list_in_list,2,None)
    return cid, y, x

# # Group by sequence length and append to have batches of 5 for both training and test
data.sort(key=len)   # Randomly reshuffle before? random.shuffle(...)
x_train = []
x_test = []
y_train = []
y_test = []
cid_train = []
cid_test = []
for k, g in groupby(data, len):
    group = list(g)
    if len(group) > 2: # This omits too small groups
        shuffle(group)
        # Create train and test bucket
        train_group, test_group = split_list(group, 0.5)
        # Padd for equal batch size
        train_group_padded = padding(train_group, 5)
        test_group_padded = padding(test_group, 5)
        # Separate list
        cid_train_group, y_train_group, x_train_group = separate(train_group)
        cid_test_group, y_test_group, x_test_group = separate(test_group)
        # Append:  convert y and x into numpy arrays for nn models
        x_train.append(np.array(x_train_group))
        x_test.append(np.array(x_test_group))
        y_train.append(np.array(y_train_group))
        y_test.append(np.array(y_test_group))
        cid_train.append(cid_train_group)
        cid_test.append(cid_test_group)

### Data standardization

In [7]:
# Function to standardize the list
def standardization(lst, index, mean, std):
    for array3d in lst:
        for array2d in array3d:
            for vector in array2d:
                vector[index] = (vector[index] - mean) / std
    return lst

# Function to compute mean and std of variable and then standardizes this variable in list
def standardize_data(a_list, b_list, index):
    var = []
    # Compute mean and std from train data variable
    for array3d in a_list:
        for array2d in array3d:
            for vector in array2d:
                var.append(vector[index])
    var = np.array(var)
    var_mean = var.mean()
    var_std = var.std()
    # Standardize a
    a_list_std = standardization(a_list, index, var_mean, var_std)
    b_list_std = standardization(b_list, index, var_mean, var_std)
    return a_list_std, b_list_std

# Standardize all variables
def standardize_all(a_list, b_list):
    length = len(a_list[0][0][0])
    indices = list(range(length))
    for i in indices:
        std_a, std_b = standardize_data(a_list, b_list, i)
    return std_a, std_b

x_train, x_test = standardize_all(x_train, x_test)

## LSTM train data descriptives

In [20]:
# Group size and sequence length
i = 1
for g in x_train:
    print('Group: ', i, ' ', 'Observations: ', len(g), ' ' 'Sequence length', len(g[0]))
    i += 1

Group:  1   Observations:  12960  Sequence length 1
Group:  2   Observations:  4790  Sequence length 2
Group:  3   Observations:  1790  Sequence length 3
Group:  4   Observations:  760  Sequence length 4
Group:  5   Observations:  320  Sequence length 5
Group:  6   Observations:  170  Sequence length 6
Group:  7   Observations:  95  Sequence length 7
Group:  8   Observations:  65  Sequence length 8
Group:  9   Observations:  35  Sequence length 9
Group:  10   Observations:  25  Sequence length 10
Group:  11   Observations:  20  Sequence length 11
Group:  12   Observations:  15  Sequence length 12
Group:  13   Observations:  10  Sequence length 13
Group:  14   Observations:  10  Sequence length 14
Group:  15   Observations:  5  Sequence length 15
Group:  16   Observations:  5  Sequence length 16
Group:  17   Observations:  5  Sequence length 17
Group:  18   Observations:  5  Sequence length 19


In [32]:
# Convert y to classification
def reverse_veracity_to_categorical(vbin):
    if vbin[0] == 1:
        v = 'FALSE'
    elif vbin[1] == 1:
        v = 'MIXED'
    elif vbin[2] == 1:
        v = 'TRUE'
    return v

# Outcome distribution
i = 1
for g in y_train:
    ver = []
    for y in g:
        ver.append(reverse_veracity_to_categorical(y))
    print('Group: ', i, Counter(ver))
    i += 1

Group:  1 Counter({'FALSE': 9665, 'TRUE': 2010, 'MIXED': 1285})
Group:  2 Counter({'FALSE': 3669, 'TRUE': 675, 'MIXED': 446})
Group:  3 Counter({'FALSE': 1375, 'TRUE': 248, 'MIXED': 167})
Group:  4 Counter({'FALSE': 608, 'TRUE': 90, 'MIXED': 62})
Group:  5 Counter({'FALSE': 245, 'TRUE': 50, 'MIXED': 25})
Group:  6 Counter({'FALSE': 127, 'TRUE': 24, 'MIXED': 19})
Group:  7 Counter({'FALSE': 81, 'MIXED': 7, 'TRUE': 7})
Group:  8 Counter({'FALSE': 52, 'MIXED': 8, 'TRUE': 5})
Group:  9 Counter({'FALSE': 30, 'TRUE': 4, 'MIXED': 1})
Group:  10 Counter({'FALSE': 21, 'TRUE': 2, 'MIXED': 2})
Group:  11 Counter({'FALSE': 20})
Group:  12 Counter({'FALSE': 14, 'TRUE': 1})
Group:  13 Counter({'FALSE': 7, 'MIXED': 3})
Group:  14 Counter({'FALSE': 9, 'MIXED': 1})
Group:  15 Counter({'FALSE': 5})
Group:  16 Counter({'FALSE': 5})
Group:  17 Counter({'FALSE': 5})
Group:  18 Counter({'FALSE': 3, 'MIXED': 2})


## LSTMs

### LSTM for depth

In [8]:
# Create LSTM model
model = Sequential()
model.add(LSTM(5, input_shape = (None, 3),  return_sequences = False))
model.add(Dense(3, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])


In [9]:
# Fit model and get train predictions
train_pred = []
test_pred = []
for X,Y,Z in zip(x_train, y_train, x_test):
    hist = model.fit(X, Y, epochs=2, batch_size=5)
    pred1 = model.predict(X, batch_size=5)
    pred2 = model.predict(Z, batch_size=5)
    train_pred.append(pred1)
    test_pred.append(pred2)

Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2


In [125]:
# Convert predictions to data frame with ID
def pred_to_df(ids, pred, var_name, train):
    # Create data frame of predictions
    n = len(pred[0][0])
    cols = ['cid']
    cols.extend([var_name + str(i) for i in range(n)])
    init = 0
    for id_gr,p_gr in zip(ids,pred):
        for i,p in zip(id_gr, p_gr):
            if init == 0:
                matrix = [np.append([i],p)]
                init = 1
            else:
                matrix = np.concatenate((matrix, [np.append([i],p)]), axis=0)
    df = pd.DataFrame(matrix, columns=cols)
    # Make id column integer
    df.cid = df.cid.astype(int)
    # Drop duplicates
    df = df.drop_duplicates('cid')
    # Combine train and test predictions and input ID
    df.insert(1, 'train', train)
    return df

# Get train and test predictions
ydepth = pred_to_df(cid_train, train_pred, 'ydepth', True)
y_test_test = pred_to_df(cid_test, test_pred, 'ydepth', False)
# Combine
ydepth.append(y_test_test)

Unnamed: 0,cid,train,ydepth0,ydepth1,ydepth2
0,66469,True,0.734211,0.112140,0.153649
1,67449,True,0.734211,0.112140,0.153649
2,64041,True,0.735100,0.111755,0.153145
3,66573,True,0.734211,0.112140,0.153649
4,68528,True,0.733321,0.112525,0.154154
5,86372,True,0.735544,0.111563,0.152893
6,54685,True,0.735544,0.111563,0.152893
7,64647,True,0.734656,0.111948,0.153397
8,86903,True,0.735544,0.111563,0.152893
9,66568,True,0.734211,0.112140,0.153649


### LSTM for users

#### Get data

#### Run model

#### Get predictions

## Combined Model

### Merge data

In [138]:
df_comb = pd.merge(df, ydepth, how='left', on='cid')

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,cid,train,ydepth0,ydepth1,ydepth2
0,66469,True,0.734211,0.112140,0.153649
1,67449,True,0.734211,0.112140,0.153649
2,64041,True,0.735100,0.111755,0.153145
3,66573,True,0.734211,0.112140,0.153649
4,68528,True,0.733321,0.112525,0.154154
5,86372,True,0.735544,0.111563,0.152893
6,54685,True,0.735544,0.111563,0.152893
7,64647,True,0.734656,0.111948,0.153397
8,86903,True,0.735544,0.111563,0.152893
9,66568,True,0.734211,0.112140,0.153649


In [129]:
print(df)

       breadth                            category     cid  depth  \
0        10703  Viral Photos/Stories/Urban Legends  106998     11   
1        11783     Science/Nature/Tech/Food/Health  106999      9   
2         6504  Viral Photos/Stories/Urban Legends  107000     13   
3         5772  Viral Photos/Stories/Urban Legends  107001      8   
4         6041  Viral Photos/Stories/Urban Legends  107002      8   
5         6160                       Entertainment  107003      8   
6         2110  Viral Photos/Stories/Urban Legends  107004     14   
7         3092  Viral Photos/Stories/Urban Legends  107005     10   
8         4971                            Politics  107006      3   
9         3374  Viral Photos/Stories/Urban Legends  107007      5   
10        3906                            Politics  107008      4   
11        1484  Viral Photos/Stories/Urban Legends  107009     13   
12        3451                            Politics  107010      4   
13        1477     Science/Nature/