# Load Data

In [1]:
# Imports
# from settings import *
# import analyze_cascade
import datetime
import pandas as pd
import numpy as np
from itertools import groupby
from collections import Counter
from random import shuffle
from random import choices
import math

from keras.utils import to_categorical
from keras.models import Sequential, Model
from keras.layers import LSTM, Input, Dense
from keras.layers import concatenate as kerasconc

import matplotlib.pyplot as plt
from itertools import cycle

from sklearn import svm, datasets
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
from scipy import interp

Using TensorFlow backend.


In [2]:
# Machine-specific absolute location of the anonymised cascade metadata dump.
# The literal is split across lines for readability only; the resulting value is unchanged.
metadata_file = (
    '/Users/jaspermeijering/Google Drive/a Study/'
    'EPA Study Abroad - Carnegie Mellon University/Courses/'
    'CMU - 95845 - Applied Analytics The Machine Learning Pipeline/'
    'Machine Learning Pipeline Final Project/Data/'
    'FalseNews_Code_Data/data/metadata_anon.txt'
)

In [3]:
# Read meta data
# Each non-empty line of the file is evaluated as a Python expression that yields an
# indexable pair: item[0] is used as the cascade id, item[1] as its metadata.
cascade_id2metadata = {}
with open(metadata_file, 'r') as fin:  # context manager closes the file even if parsing fails
    for line in fin:
        line = line.strip()
        if not line:
            continue  # tolerate blank lines (e.g. a trailing newline at end of file)
        # SECURITY: eval() executes whatever code the file contains -- only run this on a
        # trusted data file.
        # NOTE(review): eval is kept because the records may contain non-literal
        # expressions that ast.literal_eval would reject -- confirm before switching.
        item = eval(line)
        cascade_id2metadata[item[0]] = item[1]

## Descriptives of dynamic measures

### Static measures

In [4]:
# Get static measures
# Only cascades whose 'virality' is not None are kept. For the list-valued fields
# (verified, followers, followees, engagement) the first list element is used.
_kept = [(cascade, metadata)
         for cascade, metadata in cascade_id2metadata.items()
         if metadata['virality'] is not None]

cid = [cascade for cascade, _meta in _kept]
veracity = [_meta['veracity'] for _cascade, _meta in _kept]
virality = [_meta['virality'] for _cascade, _meta in _kept]
depth = [_meta['depth'] for _cascade, _meta in _kept]
breadth = [_meta['max_breadth'] for _cascade, _meta in _kept]
size = [_meta['size'] for _cascade, _meta in _kept]
verified = [_meta['verified_list'][0] for _cascade, _meta in _kept]
nfollowers = [_meta['num_followers_list'][0] for _cascade, _meta in _kept]
nfollowees = [_meta['num_followees_list'][0] for _cascade, _meta in _kept]
engagement = [_meta['engagement_list'][0] for _cascade, _meta in _kept]
category = [_meta['rumor_category'] for _cascade, _meta in _kept]

# Convert to data frame (one row per kept cascade).
# The 'engangement' spelling is the actual column label and is relied on further down.
static = pd.DataFrame({'cid': cid,
                       'veracity': veracity,
                       'virality': virality,
                       'depth': depth,
                       'breadth': breadth,
                       'size': size,
                       'verified': verified,
                       'nfollowers': nfollowers,
                       'nfollowees': nfollowees,
                       'engangement': engagement,
                       'category': category})

# Inspect
static.head(5)

Unnamed: 0,breadth,category,cid,depth,engangement,nfollowees,nfollowers,size,veracity,verified,virality
0,10703,Viral Photos/Stories/Urban Legends,106998,11,25.799399,186.0,672.0,23228,MIXED,False,4.003857
1,11783,Science/Nature/Tech/Food/Health,106999,9,10.811974,313.0,380.0,14827,MIXED,False,2.535338
2,6504,Viral Photos/Stories/Urban Legends,107000,13,15.395237,518.0,504.0,14129,MIXED,False,4.019705
3,5772,Viral Photos/Stories/Urban Legends,107001,8,3.140842,189.0,228.0,9972,MIXED,False,3.271008
4,6041,Viral Photos/Stories/Urban Legends,107002,8,5.160261,174.0,110.0,9526,MIXED,False,3.115942


# Handle NAs - MICE

In [5]:
# Inspect: missing-value counts for the three user-level columns.
# Only the last expression of the cell is displayed; the others are evaluated and discarded.
missing_verified = static['verified'].isnull()
static['nfollowers'].isnull().values.sum()
static['nfollowees'].isnull().values.sum()
missing_verified.values.sum()

# 136 have nan for nfollowers, nfollowees and verified, 135 in 'War/Terrorism/Shootings' and 1 in Politics
rows_missing_verified = static[missing_verified]
rows_missing_verified['category'].unique()
sum(rows_missing_verified['category'] == 'War/Terrorism/Shootings')
rows_missing_verified



Unnamed: 0,breadth,category,cid,depth,engangement,nfollowees,nfollowers,size,veracity,verified,virality
8302,2,War/Terrorism/Shootings,54959,1,3.668825,,,2,FALSE,,1.000000
9013,2,War/Terrorism/Shootings,55670,1,5.714839,,,2,FALSE,,1.000000
9740,2,War/Terrorism/Shootings,56397,1,121.611547,,,2,FALSE,,1.000000
15985,3,War/Terrorism/Shootings,62642,1,7.591549,,,3,FALSE,,1.333333
28168,10,War/Terrorism/Shootings,74825,2,48.698715,,,11,FALSE,,1.963636
35759,1503,War/Terrorism/Shootings,82416,8,13.583922,,,3764,FALSE,,4.329435
35864,2783,Politics,82521,13,20.989461,,,7620,FALSE,,5.078605
38208,2,War/Terrorism/Shootings,84865,2,9.067479,,,3,TRUE,,1.333333
38298,2,War/Terrorism/Shootings,84955,2,2.618711,,,3,TRUE,,1.333333
38338,2,War/Terrorism/Shootings,84995,2,0.092596,,,3,TRUE,,1.333333


In [39]:
# TODO: find out where Nic's code drops the NA rows -> keep those rows and run MICE on them instead.


In [30]:
# Show the column labels of the static-measures frame (used to pick the imputation inputs below).
static.columns

Index(['breadth', 'category', 'cid', 'depth', 'engangement', 'nfollowees',
       'nfollowers', 'size', 'veracity', 'verified', 'virality'],
      dtype='object')

In [37]:
# Numeric columns of `static` as a plain 2-D NumPy array, used as input for the imputer.
# `.values` replaces `.as_matrix()`, which was deprecated in pandas 0.23 and removed in 1.0.
# NOTE(review): 'cid' looks like an identifier rather than a measurement, yet it is part
# of the matrix the imputer sees -- confirm that this is intended.
staticx = static.loc[:, ['breadth', 'cid', 'depth', 'engangement', 'nfollowees',
                         'nfollowers', 'size', 'virality']].values
staticx

array([[  1.07030000e+04,   1.06998000e+05,   1.10000000e+01, ...,
          6.72000000e+02,   2.32280000e+04,   4.00385720e+00],
       [  1.17830000e+04,   1.06999000e+05,   9.00000000e+00, ...,
          3.80000000e+02,   1.48270000e+04,   2.53533792e+00],
       [  6.50400000e+03,   1.07000000e+05,   1.30000000e+01, ...,
          5.04000000e+02,   1.41290000e+04,   4.01970546e+00],
       ..., 
       [  2.00000000e+00,   8.87350000e+04,   1.00000000e+00, ...,
          4.40000000e+01,   2.00000000e+00,   1.00000000e+00],
       [  2.00000000e+00,   8.87360000e+04,   1.00000000e+00, ...,
          9.46000000e+02,   2.00000000e+00,   1.00000000e+00],
       [  2.00000000e+00,   8.87370000e+04,   1.00000000e+00, ...,
          9.46000000e+02,   2.00000000e+00,   1.00000000e+00]])

In [38]:
# Third-party dependency imported locally in this cell.
import fancyimpute  # install with: pip install fancyimpute
# Complete the missing entries of `staticx` with fancyimpute's MICE imputer.
# NOTE(review): `MICE` appears to be missing from newer fancyimpute releases --
# confirm which version this notebook is pinned to.
mddd = fancyimpute.MICE().complete(staticx)
# Display the completed matrix.
mddd

[MICE] Completing matrix with shape (42081, 8)
[MICE] Starting imputation round 1/110, elapsed time 0.003
[MICE] Starting imputation round 2/110, elapsed time 0.020
[MICE] Starting imputation round 3/110, elapsed time 0.029
[MICE] Starting imputation round 4/110, elapsed time 0.038
[MICE] Starting imputation round 5/110, elapsed time 0.046
[MICE] Starting imputation round 6/110, elapsed time 0.055
[MICE] Starting imputation round 7/110, elapsed time 0.064
[MICE] Starting imputation round 8/110, elapsed time 0.074
[MICE] Starting imputation round 9/110, elapsed time 0.083
[MICE] Starting imputation round 10/110, elapsed time 0.091
[MICE] Starting imputation round 11/110, elapsed time 0.100
[MICE] Starting imputation round 12/110, elapsed time 0.108
[MICE] Starting imputation round 13/110, elapsed time 0.117
[MICE] Starting imputation round 14/110, elapsed time 0.126
[MICE] Starting imputation round 15/110, elapsed time 0.135
[MICE] Starting imputation round 16/110, elapsed time 0.144
[M

array([[  1.07030000e+04,   1.06998000e+05,   1.10000000e+01, ...,
          6.72000000e+02,   2.32280000e+04,   4.00385720e+00],
       [  1.17830000e+04,   1.06999000e+05,   9.00000000e+00, ...,
          3.80000000e+02,   1.48270000e+04,   2.53533792e+00],
       [  6.50400000e+03,   1.07000000e+05,   1.30000000e+01, ...,
          5.04000000e+02,   1.41290000e+04,   4.01970546e+00],
       ..., 
       [  2.00000000e+00,   8.87350000e+04,   1.00000000e+00, ...,
          4.40000000e+01,   2.00000000e+00,   1.00000000e+00],
       [  2.00000000e+00,   8.87360000e+04,   1.00000000e+00, ...,
          9.46000000e+02,   2.00000000e+00,   1.00000000e+00],
       [  2.00000000e+00,   8.87370000e+04,   1.00000000e+00, ...,
          9.46000000e+02,   2.00000000e+00,   1.00000000e+00]])

# STOP HERE - Nothing to see here anymore

### Dynamic measures

In [None]:
# Number of entries in each per-cascade dynamic structure.
# Cascades whose 'virality' is None are skipped, as in the static measures.
len_depth2time = []
len_num_followees_list = []
len_depth2uu = []
len_uu2time = []
len_depth2breadth = []
for cascade, metadata in cascade_id2metadata.items():
    if metadata['virality'] is None:
        continue
    len_depth2time.append(len(metadata['depth2time']))
    len_num_followees_list.append(len(metadata['num_followees_list']))
    len_depth2uu.append(len(metadata['depth2uu']))
    len_uu2time.append(len(metadata['uu2time']))
    len_depth2breadth.append(len(metadata['depth2breadth']))

# Convert to data frame
# Fix: the first label used to be 'depth2time ' (trailing space), which made the column
# awkward to select and inconsistent with the other labels.
dynamic_len = pd.DataFrame({'depth2time': len_depth2time,
                            'num_followees_list': len_num_followees_list,
                            'depth2uu': len_depth2uu,
                            'uu2time': len_uu2time,
                            'depth2breadth': len_depth2breadth})

# Get summary
dynamic_len.describe(percentiles=[0.25, 0.5, 0.75, 1])
    

## Create LSTM data

### Get dynamic data

In [None]:
# Function to get expression of each item in a dictionary entry
def get_expression_list(entry):
    """Return the values of `entry` converted to float.

    The values are looked up key by key, so the result follows the iteration
    order of `entry.keys()`.
    """
    return [float(entry[key]) for key in entry.keys()]

# Convert y to classification
def veracity_to_categorical(v):
    """One-hot encode a veracity label.

    'FALSE' -> [1, 0, 0], 'MIXED' -> [0, 1, 0], 'TRUE' -> [0, 0, 1].
    Any other value yields None. A fresh list is returned on every call.
    """
    encodings = (
        ('FALSE', [1, 0, 0]),
        ('MIXED', [0, 1, 0]),
        ('TRUE', [0, 0, 1]),
    )
    for label, onehot in encodings:
        if v == label:
            return onehot
    return None

# Get data in list format
# `data` holds one entry per cascade with a non-None virality. Each entry is a list of
# rows [cascade, veracity, time, uu, breadth], built by zipping the depth2time, depth2uu
# and depth2breadth values (zip stops at the shortest of the three).
data = []
for cascade, metadata in cascade_id2metadata.items():
    if metadata['virality'] is None:
        continue
    veracity = veracity_to_categorical(metadata['veracity'])
    per_depth = zip(get_expression_list(metadata['depth2time']),
                    get_expression_list(metadata['depth2uu']),
                    get_expression_list(metadata['depth2breadth']))
    # Every row of a cascade carries the same id and the same one-hot target object.
    data.append([[cascade, veracity, time, uu, breadth]
                 for time, uu, breadth in per_depth])

### Preprocessing

In [None]:
# Function: Create training and test set
def split_list(lst, train_size): # train_size is a proportion
    split = len(lst) * train_size
    if split.is_integer():
        split = int(split)
        return lst[:split], lst[split:]
    else:
        split = math.floor(split) + 1
        return lst[:split], lst[split:]
    
# Function: Padding for groups of equal batches
def padding(lst, bsize):
    """Extend `lst` in place until its length is a multiple of `bsize`.

    The filler items are drawn with replacement from `lst` itself. The same list
    object is returned, and callers may rely on the in-place extension.

    Fix: the shortfall is now computed with `% bsize`. It used to be hard-coded
    as `% 5`, which padded to the wrong length for any other batch size.
    """
    remainder = len(lst) % bsize
    if remainder != 0:
        lst.extend(choices(lst, k=bsize - remainder))
    return lst

# Get sublist
def get_sublist(list_in_list, start, stop):
    """Slice every innermost row of a two-level nested list.

    Each row becomes row[start:stop]; passing None for `start` or `stop` leaves
    that side of the slice open. The nesting of the outer levels is preserved.
    """
    return [[row[start:stop] for row in group] for group in list_in_list]

# Separate id, x and y
def separate(list_in_list):
    """Split nested [id, y, features...] rows into ids, targets and features.

    For every sequence the id and target are read from its first row only
    (a target-replication variant would collect row[1] from every row instead).
    The features are all columns from index 2 onwards, for every row.

    Returns:
        (cid, y, x): list of ids, list of targets, nested list of feature rows.
    """
    first_rows = [sequence[0] for sequence in list_in_list]
    cid = [row[0] for row in first_rows]  # only one id is needed
    y = [row[1] for row in first_rows]
    x = get_sublist(list_in_list, 2, None)
    return cid, y, x

# Group by sequence length and append to have batches of 5 for both training and test
data.sort(key=len)   # Randomly reshuffle before? random.shuffle(...)
x_train, x_test = [], []
y_train, y_test = [], []
cid_train, cid_test = [], []
for k, g in groupby(data, len):
    group = list(g)
    if len(group) <= 2:  # This omits too small groups
        continue
    shuffle(group)
    # Create train and test bucket
    train_group, test_group = split_list(group, 0.5)
    # Pad for equal batch size; padding() extends its argument in place and returns
    # that same list, so the padded names refer to the same objects.
    train_group_padded = padding(train_group, 5)
    test_group_padded = padding(test_group, 5)
    # Separate list
    cid_train_group, y_train_group, x_train_group = separate(train_group_padded)
    cid_test_group, y_test_group, x_test_group = separate(test_group_padded)
    # Append: convert y and x into numpy arrays for nn models
    x_train.append(np.array(x_train_group))
    y_train.append(np.array(y_train_group))
    cid_train.append(cid_train_group)
    x_test.append(np.array(x_test_group))
    y_test.append(np.array(y_test_group))
    cid_test.append(cid_test_group)

### Data standardization

In [None]:
# Function to standardize the list
def standardization(lst, index, mean, std):
    """Standardise position `index` of every innermost vector of `lst`, in place.

    `lst` is walked three levels deep and each innermost vector has its entry at
    `index` replaced by (value - mean) / std. The same `lst` object is returned.
    """
    vectors = (vector
               for array3d in lst
               for array2d in array3d
               for vector in array2d)
    for vector in vectors:
        vector[index] = (vector[index] - mean) / std
    return lst

# Function to compute mean and std of variable and then standardizes this variable in list
def standardize_data(a_list, b_list, index):
    """Standardise feature `index` in both lists using statistics from `a_list` only.

    The mean and standard deviation are computed over every innermost vector of
    `a_list` and then applied to `a_list` and `b_list` through standardization(),
    which modifies both in place.

    Returns:
        (a_list, b_list) after standardisation.
    """
    # Collect the feature from the train data only
    var = np.array([vector[index]
                    for array3d in a_list
                    for array2d in array3d
                    for vector in array2d])
    var_mean, var_std = var.mean(), var.std()
    a_list_std = standardization(a_list, index, var_mean, var_std)
    b_list_std = standardization(b_list, index, var_mean, var_std)
    return a_list_std, b_list_std

# Standardize all variables
def standardize_all(a_list, b_list):
    """Standardise every feature position of `a_list` and `b_list`.

    The number of features is read from the first innermost vector of `a_list`.
    Each position is then processed by standardize_data(), whose statistics come
    from `a_list`. Returns the pair produced by the last call.
    """
    n_features = len(a_list[0][0][0])
    for feature in range(n_features):
        std_a, std_b = standardize_data(a_list, b_list, feature)
    return std_a, std_b

# Standardise every feature of the train and test buckets with train-set statistics.
# NOTE: the arrays are modified in place, so re-running this cell standardises the
# already-standardised values a second time.
x_train, x_test = standardize_all(x_train, x_test)

## LSTM train data descriptives

In [None]:
# Group size and sequence length: one line per training bucket, numbered from 1
for i, g in enumerate(x_train, start=1):
    print('Group: ', i, ' ', 'Observations: ', len(g), ' Sequence length', len(g[0]))

In [None]:
# Convert one-hot y back to its veracity label
def reverse_veracity_to_categorical(vbin):
    """Map a one-hot veracity vector back to its label.

    Position 0 -> 'FALSE', 1 -> 'MIXED', 2 -> 'TRUE', checked in that order; the
    first position equal to 1 wins.

    Raises:
        ValueError: if none of the three positions equals 1. This used to
            surface as an opaque UnboundLocalError on the return statement.
    """
    for position, label in enumerate(('FALSE', 'MIXED', 'TRUE')):
        if vbin[position] == 1:
            return label
    raise ValueError('not a one-hot veracity vector: {!r}'.format(vbin))

# Outcome distribution: label counts per training bucket, numbered from 1
for i, g in enumerate(y_train, start=1):
    ver = [reverse_veracity_to_categorical(y) for y in g]
    print('Group: ', i, Counter(ver))

In [None]:
# Outcome distribution test overall: label counts pooled over all test buckets
ver = [reverse_veracity_to_categorical(y) for g in y_test for y in g]
print('Group: ', Counter(ver))

## LSTMs

### LSTM for depth

In [None]:
# Create LSTM model
# A 5-unit LSTM over variable-length sequences of 3 features (only the final state is
# returned), followed by a 3-way softmax.
model = Sequential([
    LSTM(5, input_shape=(None, 3), return_sequences=False),
    Dense(3, activation='softmax'),
])
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])


In [None]:
# Fit model, then get train and test predictions
# Training still visits the buckets one after another (each bucket has its own sequence
# length). Fix: predictions are now taken only after the model has seen every bucket.
# Before, bucket k was predicted with a model that had not yet trained on the later
# buckets, so the stored predictions came from differently trained models.
histories = []
for X, Y in zip(x_train, y_train):
    hist = model.fit(X, Y, epochs=2, batch_size=5)
    histories.append(hist)  # keep every bucket's history, not only the last one

# One prediction array per bucket, in the same order as x_train / x_test
train_pred = [model.predict(X, batch_size=5) for X in x_train]
test_pred = [model.predict(Z, batch_size=5) for Z in x_test]

In [None]:
# Convert predictions to data frame with ID
def pred_to_df(ids, pred, var_name, train):
    """Build a frame with one row per cascade id and one column per predicted class.

    Args:
        ids: list of id lists, one inner list per bucket.
        pred: list of 2-D prediction arrays, aligned with `ids`.
        var_name: prefix of the prediction columns (`var_name0`, `var_name1`, ...).
        train: value written to the 'train' column of every row.

    Returns:
        DataFrame with columns ['cid', 'train', var_name0, ...]. Ids that occur more
        than once (e.g. after padding with repeated samples) keep their first row.

    The rows are collected in a list and stacked once. The earlier version called
    np.concatenate per row (quadratic) and failed with an unbound `matrix` when there
    were no rows at all.
    """
    n = len(pred[0][0])
    cols = ['cid'] + [var_name + str(i) for i in range(n)]
    rows = [np.append([i], p)
            for id_gr, p_gr in zip(ids, pred)
            for i, p in zip(id_gr, p_gr)]
    # reshape keeps the column count even when there are no rows
    matrix = np.array(rows).reshape(-1, n + 1)
    df = pd.DataFrame(matrix, columns=cols)
    # Make id column integer (np.append promoted it to the predictions' float dtype)
    df['cid'] = df['cid'].astype(int)
    # Drop duplicates
    df = df.drop_duplicates('cid')
    # Flag whether the rows come from the train or the test side
    df.insert(1, 'train', train)
    return df

# Get train and test predictions
ydepth = pred_to_df(cid_train, train_pred, 'ydepth', True)
y_test_test = pred_to_df(cid_test, test_pred, 'ydepth', False)
# Combine: train rows first, then test rows, with a fresh 0..n-1 index.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and removed in 2.0.
ydepth = pd.concat([ydepth, y_test_test], ignore_index=True)

### LSTM for users

#### Get data

#### Run model

#### Get predictions

## Combined Model

### Merge data

In [None]:
# Left-join the LSTM predictions onto the static measures by cascade id; cascades
# without a prediction keep their row and get NaN in the prediction columns.
df_comb = static.merge(ydepth, how='left', on='cid')

### Preprocessing

In [None]:
# Make x variables categorical with integer values
def x_in_df_to_int(df, x):
    """Replace column `x` of `df` by integer category codes, in place.

    Missing values are first mapped to the extra category 'Unknown'. The distinct
    values are then numbered 0..k-1 in pandas' category order.

    Fix: the result of fillna() used to be thrown away, so missing values survived
    and the final integer cast failed on them.

    Returns:
        The same `df` object, with column `x` converted.
    """
    filled = df[x].fillna('Unknown')
    # With no missing values left, the category codes are exactly 0..k-1
    df[x] = filled.astype('category').cat.codes.astype('int')
    return df

# Replace the two categorical columns by integer codes (the helper modifies df_comb in place).
df_comb = x_in_df_to_int(df_comb, 'verified')
df_comb = x_in_df_to_int(df_comb, 'category')

In [None]:
# Count the missing 'virality' values in the merged frame.
df_comb['virality'].isnull().values.sum()

In [None]:
# Drop NAs: Recall a few buckets were too small for LSTM --> obs removed --> resulting in NAs with merge --> drop
# dropna() defaults to axis=0, how='any': a row is removed if any of its columns is missing.
# The index is not reset, so the surviving rows keep their original labels.
df_comb = df_comb.dropna()

In [None]:
# Change format of y
def var_to_binary_in_df(df, var, fct):
    """Replace every value of column `var` by `fct(value)`, in place.

    Args:
        df: frame to modify.
        var: label of the column to convert.
        fct: function applied to each value (here it returns a one-hot list).

    Returns:
        The same `df` object; column `var` then holds one `fct` result per row.

    Fix: the values are now taken in row order. The earlier `df[var][0]` was a
    label lookup, which raises KeyError (or picks the wrong row) when the index
    does not start at 0, e.g. after dropna() without a reset. The per-row
    np.concatenate is gone as well.
    """
    df[var] = [fct(value) for value in df[var]]
    return df

# One-hot encode the veracity labels of the merged frame.
# NOTE: not idempotent -- on a second run the column already holds lists, which
# veracity_to_categorical does not recognise (it returns None for them).
df_comb = var_to_binary_in_df(df_comb, 'veracity', veracity_to_categorical)

In [None]:
# Split into training and test
all_vars = list(df_comb.columns.values)
cat_vars = ['category', 'verified']
# Continuous features = everything that is neither categorical, id, target nor split flag.
# The frame's own column order is kept; list(set(...)) gave an arbitrary feature order
# that could change between kernel sessions.
non_continuous = set(cat_vars) | {'cid', 'veracity', 'train'}
cont_vars = [name for name in all_vars if name not in non_continuous]

# Cast the flag to a real boolean mask first: after the left merge the column may be
# object dtype, and `~` on object dtype is a bitwise NOT per element, not a logical one.
is_train = df_comb["train"].astype(bool)

# Fix: the test frames used to be selected with the *train* mask, which made every
# test set an exact copy of its training set.
# NOTE(review): the one-hot helper used further down infers its width from the largest
# code present, so train and test must both contain the highest category code.
X_train_cat = df_comb.loc[is_train, cat_vars]
X_test_cat = df_comb.loc[~is_train, cat_vars]
# .copy() so the later in-place standardisation works on independent frames
X_train_con = df_comb.loc[is_train, cont_vars].copy()
X_test_con = df_comb.loc[~is_train, cont_vars].copy()
Y_train = df_comb.loc[is_train, ['veracity']]
Y_test = df_comb.loc[~is_train, ['veracity']]

In [None]:
# Make categorical X to binary matrix
def x_cat_binmatrix(X):
    """One-hot encode every column of `X` and join the encodings side by side.

    Args:
        X: DataFrame whose columns hold integer category codes.

    Returns:
        2-D array: the to_categorical() encoding of each column, in column order,
        concatenated along axis 1.

    Fix: the columns after the first used to be selected with `X.iloc[:, ~0]`. `~0`
    is -1, i.e. only the *last* column. That happened to work for exactly two columns,
    but encoded a single column twice and dropped the middle columns of wider frames.
    """
    encoded = [to_categorical(X[column]) for column in X]
    return np.concatenate(encoded, axis=1)

# One-hot encode the categorical train and test frames; both names are rebound from
# DataFrames to the arrays returned by the helper.
X_train_cat = x_cat_binmatrix(X_train_cat)
X_test_cat = x_cat_binmatrix(X_test_cat)

In [None]:
# Standardize data by train mean and standard dev.
def stdx(x, mean, sd):
    """Return `x` shifted by `mean` and scaled by `sd`, i.e. (x - mean) / sd."""
    centred = x - mean
    return centred / sd

def std_train_test(df_train, df_test):
    """Standardise the columns of both frames with the training statistics.

    The column labels of the two frames are paired by position. Each pair is
    standardised with the mean and standard deviation of the *training* column.
    Both frames are modified in place and returned.
    """
    for train_col, test_col in zip(df_train, df_test):
        train_values = df_train[train_col]
        mean, sd = train_values.mean(), train_values.std()
        df_train[train_col] = stdx(train_values, mean, sd)
        df_test[test_col] = stdx(df_test[test_col], mean, sd)
    return df_train, df_test

# Standardise the continuous columns of both frames with the training-set mean and
# standard deviation (the frames are modified in place).
X_train_con, X_test_con = std_train_test(X_train_con, X_test_con)

In [None]:
# Change data structure df -> matrix (already done for categorical)
# `.values` replaces `.as_matrix()`, which was deprecated in pandas 0.23 and removed in 1.0.
X_train_con = X_train_con.values
X_test_con = X_test_con.values
Y_train = Y_train.values
Y_test = Y_test.values

In [None]:
# Convert lists in Y to arrays
def lists_in_array_to_arrays(narray):
    """Stack the list stored in the first cell of every row into one 2-D array.

    Args:
        narray: sequence of rows; element 0 of each row is a list (here the
            one-hot target of that observation).

    Returns:
        Array with one row per input row.

    Raises:
        ValueError: from np.stack if `narray` is empty or the lists differ in length.

    All rows are stacked in a single call instead of one np.concatenate per row.
    """
    return np.stack([np.asarray(row[0]) for row in narray], axis=0)

# Stack the per-row label lists of the train and test targets into 2-D arrays.
Y_train = lists_in_array_to_arrays(Y_train)
Y_test = lists_in_array_to_arrays(Y_test)

### Modelling

In [None]:
# Two-input feed-forward classifier: the one-hot categorical block and the continuous
# block are concatenated and passed through three 64-unit ReLU layers and a 3-way softmax.
cat_input = Input(shape=(X_train_cat.shape[1],), name='Categorial_input')
con_input = Input(shape=(X_train_con.shape[1],), name='Continous_input')

x = kerasconc([cat_input, con_input])
for layer_no in range(3):
    x = Dense(64, activation='relu')(x)

main_output = Dense(3, activation='softmax', name='main_output')(x)
model_total = Model(inputs=[cat_input, con_input], outputs=[main_output])
model_total.compile(loss='categorical_crossentropy',
                    optimizer='rmsprop',
                    metrics=['mse', 'accuracy'])

train_inputs = [X_train_cat, X_train_con]
model_total.fit(train_inputs, Y_train, epochs=10, batch_size=24)

# Last expression of the cell: class probabilities for the test inputs are displayed
test_inputs = [X_test_cat, X_test_con]
model_total.predict(test_inputs, batch_size=24)

### Evaluation

In [None]:
# Prediction
# NOTE(review): the predictions are made on the *training* inputs, so any metric
# computed from `ypred_con` describes in-sample fit -- confirm whether the test
# arrays were intended here.
ypred_con = model_total.predict([X_train_cat,X_train_con],batch_size = 42)

In [None]:
# Compute ROC curve and ROC area for each class
# Column i of Y_train / ypred_con is treated as a one-vs-rest problem for class i.
n_classes = 3
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(Y_train[:, i], ypred_con[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Compute micro-average ROC curve and ROC area (all class columns pooled into one problem)
fpr["micro"], tpr["micro"], _ = roc_curve(Y_train.ravel(), ypred_con.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

# Compute macro-average ROC curve and ROC area
# First aggregate all false positive rates
all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))

# Then interpolate all ROC curves at these points.
# np.interp replaces scipy.interp, a deprecated alias that newer SciPy releases no longer
# provide; the call signature is the same.
mean_tpr = np.zeros_like(all_fpr)
for i in range(n_classes):
    mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])

# Finally average it and compute AUC
mean_tpr /= n_classes

fpr["macro"] = all_fpr
tpr["macro"] = mean_tpr
roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

# Plot parameters
lw = 1

# Plot all ROC curves
plt.figure(figsize=(15,10))
macro_label = 'macro-average ROC curve (area = {0:0.2f})'.format(roc_auc["macro"])
plt.plot(fpr["macro"], tpr["macro"], label=macro_label,
         color='navy', linestyle=':', linewidth=4)

# (class index, line colour, legend template), drawn in this order.
# The indices follow veracity_to_categorical: 0 = FALSE, 1 = MIXED, 2 = TRUE.
class_curves = [
    (2, 'aqua', 'ROC curve of class True (area = {1:0.2f})'),
    (0, 'darkorange', 'ROC curve of class False (area = {1:0.2f})'),
    (1, 'cornflowerblue', 'ROC curve of class Mixed (area = {1:0.2f})'),
]
for class_idx, colour, template in class_curves:
    plt.plot(fpr[class_idx], tpr[class_idx], color=colour, lw=lw,
             label=template.format(class_idx, roc_auc[class_idx]))

# Chance diagonal, axes and legend
plt.plot([0, 1], [0, 1], 'k--', lw=lw)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve True, False and Mixed news based on tweet cascades')
plt.legend(loc="lower right")
plt.show()