In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
#import util
# pd.set_option('display.float_format', lambda x: '%.3f' % x)

import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from collections import OrderedDict
from time import time
import os

from scipy.optimize import fmin_powell
from scipy import integrate

import theano as thno
import theano.tensor as T
import arviz as az

az.style.use('arviz-darkgrid')

In [2]:
from sklearn.base import BaseEstimator, TransformerMixin

class AdditionalAttributes(BaseEstimator, TransformerMixin):
    """Pipeline step that optionally narrows the input to the base feature set.

    With ``include_additional_attr=True`` the input is passed through as-is;
    otherwise only the columns listed in ``guille_feature_names`` are kept.
    """

    def __init__(self, include_additional_attr = True): # no *args or **kargs
        # Per-user attribute suffixes; each one exists once for the 'dest'
        # side and once for the 'src' side, after the single shared 'H' column.
        per_user_suffixes = ['A_1', 'A_2', 'A_3', 'A_4', 'A_5', 'A_6',
                             'I', 'dTR', 'hK', 'hM', 'mR']
        self.guille_feature_names = ['H'] + [
            f'{side}_{suffix}'
            for side in ('dest', 'src')
            for suffix in per_user_suffixes
        ]
        self.include_additional_attr = include_additional_attr

    def fit(self, X, y=None):
        """Learn nothing; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` itself, or ``X`` indexed by the base feature names.

        The result keeps whatever container type ``X`` has; when the base
        subset is requested, ``X`` must support label-based column selection
        with a list of names.
        """
        if not self.include_additional_attr:
            return X[self.guille_feature_names]
        return X

In [3]:
from joblib import dump

def save_models(saveas, rnd_estimator, lr_estimator, save_to='../models/'):
    """Persist the random-forest and Bayesian models under a new directory.

    Creates ``<save_to>/<saveas>/rfc`` and ``<save_to>/<saveas>/bayes/``,
    dumps ``rnd_estimator`` to ``rfc/model.rnd`` with joblib's ``dump`` and
    calls ``lr_estimator.save`` with the ``bayes`` directory.

    Parameters
    ----------
    saveas : str
        Name of the run directory to create inside ``save_to``.
    rnd_estimator : object
        Estimator to serialise with ``dump``.
    lr_estimator : object
        Model exposing a ``save(directory)`` method.
    save_to : str, optional
        Root directory that receives the run directory. Defaults to
        ``'../models/'``, the location that used to be hard-coded.

    Raises
    ------
    FileExistsError
        If ``<save_to>/<saveas>`` already exists; nothing is written then.
    """
    saveas = os.path.join(save_to, saveas)

    # Refuse to overwrite a previous run: the caller must pick a fresh name.
    if os.path.exists(saveas):
        raise FileExistsError(f'The path "{os.path.abspath(saveas)}" already exists, '
                              'please choose a new name.')

    rfc_dir = os.path.join(saveas, 'rfc')
    bayes_dir = os.path.join(saveas, 'bayes/')

    # makedirs also creates the run directory (and any missing parents);
    # after that the sibling 'bayes' directory only needs a plain mkdir.
    os.makedirs(rfc_dir)
    os.mkdir(bayes_dir)

    # save random forest estimator
    dump(rnd_estimator, os.path.join(rfc_dir, 'model.rnd'))

    # save bayesian logistic regression model
    lr_estimator.save(bayes_dir)

Set Project Parameters

In [10]:
# When False, the preprocessing pipeline keeps only the base feature columns
# (AdditionalAttributes.guille_feature_names); when True every column is used.
include_additional_attr = False
# HDF5 file holding the processed dataset, and the key of the table inside it.
dataset_filepath = "../data/processed/small-network-5/dataset.h5"
dataset_key = 'TVNWiGXGlrOnAwm'

# Fail fast, reporting the absolute path so a wrong working directory is easy to spot.
if not os.path.exists(dataset_filepath):
    raise FileNotFoundError(f'{os.path.abspath(dataset_filepath)} does not exist.')

# Explore Dataset

In [11]:
# dataset = pd.read_csv(dataset_filepath)
# Read the table stored under `dataset_key` and drop the two user-id columns
# in one chained expression, then preview the first rows.
dataset = (
    pd.read_hdf(dataset_filepath, key=dataset_key)
    .drop(columns=['src_user_id', 'dest_user_id'])
)
dataset.head()

Unnamed: 0,H,dest_A_1,dest_A_2,dest_A_3,dest_A_4,dest_A_5,dest_A_6,dest_I,dest_avg_negative_sentiment_of_tweets,dest_avg_number_followers,...,src_ratio_of_retweets_to_tweets,src_ratio_of_tweet_per_time_period_1,src_ratio_of_tweet_per_time_period_2,src_ratio_of_tweet_per_time_period_3,src_ratio_of_tweet_per_time_period_4,src_ratio_of_tweets_that_got_retweeted_per_time_period_1,src_ratio_of_tweets_that_got_retweeted_per_time_period_2,src_ratio_of_tweets_that_got_retweeted_per_time_period_3,src_ratio_of_tweets_that_got_retweeted_per_time_period_4,y
0,0.003559,0.188573,0.01561,0.071495,0.276616,0.280674,0.167031,1.0,1.0,1.0,...,0.781638,0.289392,0.274504,0.246898,0.189206,0.012717,0.003412,0.022022,0.013648,0
1,0.0,0.296296,0.037037,0.0,0.222222,0.148148,0.296296,0.037007,1.0,0.026874,...,0.781638,0.289392,0.274504,0.246898,0.189206,0.012717,0.003412,0.022022,0.013648,0
2,0.010782,0.22332,0.166008,0.037549,0.033597,0.221344,0.318182,0.693531,1.0,0.322489,...,0.996601,0.21508,0.048208,0.468789,0.267923,0.01267,0.00309,0.026267,0.021323,0
3,0.000758,0.195604,0.065934,0.0,0.105495,0.393407,0.23956,0.623629,1.0,0.30976,...,0.817104,0.105805,0.019039,0.493758,0.381398,0.048689,0.008739,0.176966,0.129838,1
4,0.0,0.056924,0.01503,0.010233,0.45379,0.285577,0.178446,1.0,1.0,1.0,...,0.954813,0.233055,0.007119,0.2606,0.499226,0.21727,0.005881,0.231198,0.422779,1


In [12]:
# Column dtypes and non-null counts, to spot missing values before imputation.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5 entries, 0 to 4
Data columns (total 84 columns):
H                                                            5 non-null float64
dest_A_1                                                     5 non-null float64
dest_A_2                                                     5 non-null float64
dest_A_3                                                     5 non-null float64
dest_A_4                                                     5 non-null float64
dest_A_5                                                     5 non-null float64
dest_A_6                                                     5 non-null float64
dest_I                                                       5 non-null float64
dest_avg_negative_sentiment_of_tweets                        5 non-null float64
dest_avg_number_followers                                    5 non-null float64
dest_avg_number_friends                                      5 non-null float64
dest_avg_number_of_

In [13]:
# Per-column summary statistics (count, mean, std, min, quartiles, max).
dataset.describe()

Unnamed: 0,H,dest_A_1,dest_A_2,dest_A_3,dest_A_4,dest_A_5,dest_A_6,dest_I,dest_avg_negative_sentiment_of_tweets,dest_avg_number_followers,...,src_ratio_of_retweets_to_tweets,src_ratio_of_tweet_per_time_period_1,src_ratio_of_tweet_per_time_period_2,src_ratio_of_tweet_per_time_period_3,src_ratio_of_tweet_per_time_period_4,src_ratio_of_tweets_that_got_retweeted_per_time_period_1,src_ratio_of_tweets_that_got_retweeted_per_time_period_2,src_ratio_of_tweets_that_got_retweeted_per_time_period_3,src_ratio_of_tweets_that_got_retweeted_per_time_period_4,y
count,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,...,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0
mean,0.00302,0.192144,0.059924,0.023856,0.218344,0.26583,0.239903,0.670833,1.0,0.531825,...,0.866359,0.226545,0.124674,0.343389,0.305392,0.060813,0.004907,0.095695,0.120247,0.4
std,0.00458,0.086788,0.062835,0.030744,0.162567,0.090435,0.067814,0.394075,0.0,0.44342,...,0.101942,0.075252,0.137589,0.126304,0.133932,0.08884,0.002419,0.100799,0.176161,0.547723
min,0.0,0.056924,0.01503,0.0,0.033597,0.148148,0.167031,0.037007,1.0,0.026874,...,0.781638,0.105805,0.007119,0.246898,0.189206,0.01267,0.00309,0.022022,0.013648,0.0
25%,0.0,0.188573,0.01561,0.0,0.105495,0.221344,0.178446,0.623629,1.0,0.30976,...,0.781638,0.21508,0.019039,0.246898,0.189206,0.012717,0.003412,0.022022,0.013648,0.0
50%,0.000758,0.195604,0.037037,0.010233,0.222222,0.280674,0.23956,0.693531,1.0,0.322489,...,0.817104,0.233055,0.048208,0.2606,0.267923,0.012717,0.003412,0.026267,0.021323,0.0
75%,0.003559,0.22332,0.065934,0.037549,0.276616,0.285577,0.296296,1.0,1.0,1.0,...,0.954813,0.289392,0.274504,0.468789,0.381398,0.048689,0.005881,0.176966,0.129838,1.0
max,0.010782,0.296296,0.166008,0.071495,0.45379,0.393407,0.318182,1.0,1.0,1.0,...,0.996601,0.289392,0.274504,0.493758,0.499226,0.21727,0.008739,0.231198,0.422779,1.0


In [None]:
# Jittered strip plot of column H, one strip per value of the label y.
sns.stripplot(x="y", y="H", data=dataset, jitter=True)

In [14]:
# Separate the label column from the feature columns.
y = dataset["y"].copy()
X = dataset.drop(columns="y")

# Keep the column labels: later steps hand back unlabelled arrays.
feature_names = X.columns.values

# Class balance of the raw labels.
y.value_counts()

0    3
1    2
Name: y, dtype: int64

In [15]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# Preprocessing: optional column selection, then mean-imputation of missing values.
pipeline = Pipeline([
    ('attribs_adder', AdditionalAttributes(include_additional_attr = include_additional_attr)),
    ('imputer', SimpleImputer(strategy="mean")),
])

# Standard scaling is currently disabled (StandardScaler is imported but unused):
# ('std_scaler', StandardScaler()),

# NOTE(review): X is rebound to the pipeline output, so re-running this cell
# without first rebuilding X from `dataset` feeds it already-transformed data.
# NOTE(review): the imputer is fit on all rows, before any train/test split.
X = pipeline.fit_transform(X)

KeyError: "['dest_A', 'src_A'] not in index"

In [None]:
# Balance the classes by oversampling with SMOTE (seeded for repeatability).
# NOTE(review): resampling runs before the train/test split made in a later
# cell, so synthetic training rows may be derived from rows that end up in the
# test set — consider resampling the training split only.
from imblearn.over_sampling import SMOTE

X_resampled, y_resampled = SMOTE(random_state=42).fit_resample(X, y)

In [None]:
# Choose the column labels matching what the pipeline emitted: the full set
# when additional attributes are kept, otherwise the base list stored on the
# 'attribs_adder' step.
guille_feature_names = pipeline.named_steps['attribs_adder'].guille_feature_names

feature_names = feature_names if include_additional_attr else guille_feature_names

In [None]:
# Re-wrap the resampled data as a labelled DataFrame / Series.
# assumes feature_names has one entry per column of X_resampled, in the same order — TODO confirm
X_resampled = pd.DataFrame(X_resampled, columns=feature_names)
y_resampled = pd.Series(y_resampled)

In [None]:
# Class counts after resampling.
y_resampled.value_counts()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the resampled rows for testing; the fixed random_state makes
# the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

In [None]:
# Report the dimensions of every split.
for _caption, _part in (
    ('x_train shape: ', X_train),
    ('y_train shape: ', y_train),
    ('x_test shape: ', X_test),
    ('y_test shape: ', y_test),
):
    print(_caption, _part.shape)

## Exploring the data

In [None]:
#g = seaborn.pairplot(data)

In [None]:
# Pairwise correlation between the training features.
corr = X_train.corr()

# Boolean mask hiding the upper triangle (and diagonal), since the matrix is
# symmetric. The builtin `bool` is used because the `np.bool` alias was
# deprecated in NumPy 1.20 and removed in 1.24.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)

# Draw the masked heatmap; the colour scale is capped at 0.3 (vmax).
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3,
            linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)



# Get most important features using Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

rfc = RandomForestClassifier(random_state=42)

# Hyper-parameter grid. For max_features a float is a fraction of the columns,
# an int is an absolute count, and 'sqrt' is the square root of the column
# count. 'sqrt' replaces 'auto', which meant the same thing for classifiers
# but was deprecated in scikit-learn 1.1 and removed in 1.3.
params = {
    'max_features': [.4, 'sqrt', 4],
    'n_estimators': [15, 100, 200],
    'min_samples_leaf': [.1, 1],
}

# Exhaustive search over the grid on the training split, using all cores and
# GridSearchCV's default cross-validation and scoring.
cv = GridSearchCV(rfc, params, n_jobs=-1).fit(X_train, y_train)

# Keep the best estimator found and display its parameters.
clf = cv.best_estimator_
cv.best_params_

In [None]:
from sklearn.model_selection import cross_val_score, cross_validate

# Conduct 10-fold cross-validation once and score every fold with both
# metrics, instead of running the whole procedure separately per metric.
# The folds and the fitted models are the same, so the scores are unchanged,
# but each fold is fitted only once.
cv_scores = cross_validate(clf,
                           X_train, # Feature matrix
                           y_train, # Target vector
                           cv=10, # Cross-validation technique
                           scoring=("accuracy", "f1"),
                           n_jobs=-1) # Use all CPU cores

# Keep the per-fold score arrays under their original names.
acc_cv_results = cv_scores["test_accuracy"]
f1_cv_results = cv_scores["test_f1"]

print(f'accuracy: {acc_cv_results.mean()}\nf1_score: {f1_cv_results.mean()}')

In [None]:
# Class labels passed to the Yellowbrick visualizers in the following cells.
classes = [0, 1]

In [None]:
from yellowbrick.classifier import ConfusionMatrix
#mapping = {1:'negative', 0:'positive'}
# Confusion matrix for the random forest: fitted on the training split,
# scored on the held-out test split.
fig, ax = plt.subplots(figsize=(7, 7))
cm_viz = ConfusionMatrix(clf, classes=classes, fontsize=15)
cm_viz.fit(X_train, y_train)
cm_viz.score(X_test, y_test)
# NOTE(review): newer Yellowbrick releases name this method show() — confirm
# against the installed version before changing it.
cm_viz.poof()

In [None]:
from yellowbrick.classifier import ClassificationReport
# Classification report for the random forest: fitted on the training split,
# scored on the held-out test split.
fig, ax = plt.subplots(figsize=(10, 7))
viz = ClassificationReport(clf, classes=classes, fontsize=15)
viz.fit(X_train, y_train)
viz.score(X_test, y_test)
# NOTE(review): newer Yellowbrick releases name this method show() — confirm
# against the installed version before changing it.
viz.poof()

In [None]:
# Fit the selected forest on the training split before reading its feature importances.
clf.fit(X_train, y_train)

In [None]:
# Importance score of each input column, as reported by the fitted forest.
importances = clf.feature_importances_

# Label the scores with the column names and keep the 15 highest.
# assumes feature_names is in the same order as X_train's columns — TODO confirm
feat_importances = pd.Series(importances, index=feature_names)
n_features = feat_importances.nlargest(15)
n_features

In [None]:
# Names of the 15 highest-ranked columns, used to subset the data for the Bayesian model.
features = n_features.index

## Build Bayesian Logistic Regression Model

In [None]:
# Restrict the resampled data to the columns selected by feature importance.
# NOTE(review): X and y are rebound here, replacing the objects of the same
# name used in the earlier preprocessing cells.
X = X_resampled[features]
y = y_resampled.copy()

In [None]:
# Class counts of the labels that go into the Bayesian model.
y.value_counts()

In [None]:
# 80/20 split with a fixed random_state, now on the reduced feature set.
# NOTE(review): this rebinds X_train / X_test / y_train / y_test, so re-running
# the random-forest cells after this point would use the reduced columns.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Report the dimensions of every split of the reduced feature set.
for _caption, _part in (
    ('x_train shape: ', X_train),
    ('y_train shape: ', y_train),
    ('x_test shape: ', X_test),
    ('y_test shape: ', y_test),
):
    print(_caption, _part.shape)

In [None]:
#g = seaborn.pairplot(data)

In [None]:
# Pairwise correlation between the selected training features.
corr = X_train.corr()

# Boolean mask hiding the upper triangle (and diagonal), since the matrix is
# symmetric. The builtin `bool` is used because the `np.bool` alias was
# deprecated in NumPy 1.20 and removed in 1.24.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)

# Draw the masked heatmap; the colour scale is capped at 0.3 (vmax).
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3,
            linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)



In [None]:
from sklearn.preprocessing import MinMaxScaler

# Learn the per-column min/max on the training split only, then apply that
# same mapping to the test split. Refitting on the test split would scale the
# two sets differently and leak test-set statistics into the evaluation.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
from pymc3_models.models.LogisticRegression import LogisticRegression

# Fit the Bayesian logistic regression with NUTS inference, requesting 2000 draws.
# NOTE(review): no random seed is passed here, so the posterior samples (and
# the metrics computed from them below) may differ between runs — confirm
# whether inference_args accepts a seed.
LR = LogisticRegression()
LR.fit(X_train, y_train, inference_type='nuts', inference_args={'draws': 2000})

In [None]:
# Trace plots of the fitted model's trace, for a visual convergence check.
az.plot_trace(LR.trace)

In [None]:
# Display the model's `summary` attribute.
# NOTE(review): accessed without parentheses — confirm it is an attribute, not a method.
LR.summary

In [None]:
# Predictions for the held-out split; presumably hard class labels, since they
# are fed to confusion_matrix below — verify against pymc3_models.
y_pred = LR.predict(X_test)

In [None]:
# Scores for the held-out split, used for the ROC/AUC cells below.
# assumes a 1-D array of positive-class probabilities — TODO confirm
y_score = LR.predict_proba(X_test)

### Performance

In [None]:
# Model score on the held-out split (presumably accuracy — confirm against pymc3_models).
LR.score(X_test, y_test)

In [None]:
from sklearn.metrics import confusion_matrix

# Rows are true labels and columns are predicted labels, both in sorted label order.
cm = confusion_matrix(y_test, y_pred)
cm

In [None]:
ax = plt.subplot()
# annot=True writes the count in each cell; fmt='d' prints it as a plain
# integer instead of the default '.2g' float format.
sns.heatmap(cm, annot=True, fmt='d', ax=ax)

# labels, title and ticks
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix')
# confusion_matrix orders rows and columns by sorted label, and the heatmap
# draws row 0 at the top, so both axes are labelled '0' then '1'.
# Labelling the rows '1', '0' would swap the true classes.
ax.xaxis.set_ticklabels(['0', '1'])
ax.yaxis.set_ticklabels(['0', '1']);

In [None]:
from sklearn.metrics import roc_auc_score

# Area under the ROC curve of the predicted scores on the held-out split.
roc_auc_score(y_test, y_score)

In [None]:
from sklearn.metrics import roc_curve, auc

# False/true positive rates over all score thresholds, and the area under
# the resulting curve.
fpr, tpr, threshold = roc_curve(y_test, y_score)
roc_auc = auc(fpr, tpr)

plt.title('Receiver Operating Characteristic')
# The legend reuses the area computed above, so this cell no longer depends on
# roc_auc_score having been imported by another cell.
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
# Diagonal reference line: a classifier that guesses at random.
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

In [None]:
# Persist both fitted models under ../models/big-network-full-attr
# (save_models raises FileExistsError if that directory already exists).
# NOTE(review): the name says "big-network-full-attr", but this notebook loads
# small-network-5 with include_additional_attr = False — confirm the label.
save_models(saveas='big-network-full-attr', rnd_estimator=clf, lr_estimator=LR)