In [None]:
import os
from pathlib import Path
import sys

In [None]:
# Project layout and data locations.
# Every project path is derived from the parent of the current working directory.
project_name = 'clpsych'
project_path = Path(os.getcwd()).parent
src_path = Path(project_path, 'src')
model_path = Path(project_path, 'model')
utils_path = str(Path(project_path, 'utils'))

if sys.platform == "win32":
    # Raw strings: the backslashes are path separators, not escape sequences.
    data_path = r'D:\Dataset\{0}\dataset'.format(project_name)
    # Was `.format(embedding)`, an undefined name that raised NameError on Windows.
    embedding_path = r'D:\Dataset\{0}\embedding'.format(project_name)

elif sys.platform == 'darwin':
    data_path = '/Volumes/Dataset/{0}/dataset'.format(project_name)
    embedding_path = '/Volumes/Dataset/{0}/embedding'.format(project_name)

else:
    data_path = Path(project_path, 'dataset')
    # NOTE(review): this branch uses 'models' while the default above is 'model' -- confirm which is intended.
    model_path = Path(project_path, 'models')
    embedding_path = Path(project_path, 'embedding')

# Make the project, utils and src folders importable.
# Each entry is checked individually against the list itself (not against a
# concatenated string), so re-running the cell never adds duplicates and a
# path that merely contains utils_path as a substring is not mistaken for it.
for extra_path in (str(project_path), utils_path, str(src_path)):
    if extra_path not in sys.path:
        sys.path.append(extra_path)

print('project path = {0}'.format(project_path))
print('data path = {0}'.format(data_path))
print('model path = {0}'.format(model_path))
print('embedding path = {0}'.format(embedding_path))
print('sys.path = {0}'.format(sys.path))

# Time-series feature extraction

In [None]:
import random
import pandas as pd
import numpy as np
from itertools import islice
from tsfresh import extract_features
from tsfresh.feature_selection.significance_tests import target_real_feature_real_test

# Seed every random number generator imported by this cell so that a fresh
# "Restart & Run All" is reproducible (the stdlib `random` module was imported
# but previously left unseeded).
random_state = 7
random.seed(random_state)
np.random.seed(random_state)

In [None]:
# List the files in the configured data folder. The original `%ls ../dataset`
# depended on the shell and always looked at ../dataset, which is not where
# data_path points on Windows/macOS.
sorted(entry.name for entry in Path(data_path).iterdir())

In [None]:
# Name of the input CSV; it is resolved against data_path when the data is loaded.
# The commented-out lines below are alternative input files kept for manual switching.
filename = 'sentiment_per_post.csv'
# filename = 'sentiment_per_user_micro.csv'
# filename = 'sentiment_per_user_macro.csv'

In [None]:
# Read the selected CSV from the data folder and preview its last rows.
data = pd.read_csv(Path(data_path) / filename)
data.tail(5)

In [None]:
# Number of worker processes handed to tsfresh below.
# os.cpu_count() returns None when the count cannot be determined; fall back to 1.
cpu_number = os.cpu_count() or 1

In [None]:
# Time-series input: the id column, the sort column and the five sentiment columns.
dataset = data.loc[
    :, ['user_id', 'timestamp', 'sent_1', 'sent_2', 'sent_3', 'sent_4', 'sent_5']
].copy()
# Row-level risk labels, kept as a one-column DataFrame.
y = data.loc[:, ['risk_label']]

In [None]:
# Run tsfresh over the sentiment columns: one series per user_id, ordered by
# timestamp, using cpu_number worker processes.
extracted_features_user_post = extract_features(
    dataset,
    column_id="user_id",
    column_sort="timestamp",
    n_jobs=cpu_number,
)

In [None]:
# Univariate relevance of each extracted feature against the per-user risk label.
#
# target_real_feature_real_test(x, y) takes ONE feature Series and ONE target
# Series sharing the same index and returns a p-value. It does not accept
# column_id / column_sort / n_jobs, so the previous call raised a TypeError,
# and its result must not overwrite the feature matrix used by the next cells.
user_labels = (
    data.drop_duplicates(subset='user_id', keep='first')
        .set_index('user_id')['risk_label']
)
# The test needs a numeric target: encode the labels by their sorted order.
# NOTE(review): assumes the sorted label order is a meaningful risk ordering -- confirm.
user_target = pd.Series(
    pd.factorize(user_labels, sort=True)[0],
    index=user_labels.index,
    dtype=float,
).reindex(extracted_features_user_post.index)

# The test rejects NaN values, so only fully observed feature columns are scored.
complete_features = extracted_features_user_post.dropna(axis=1)
feature_p_values = complete_features.apply(
    lambda feature: target_real_feature_real_test(feature, user_target)
).sort_values()
feature_p_values.head()

In [None]:
# Sanity check: input shape, number of distinct users, and feature-matrix shape.
distinct_users = dataset['user_id'].unique()
for summary_value in (dataset.shape, distinct_users.size, extracted_features_user_post.shape):
    print(summary_value)

In [None]:
# Name the index 'user_id' and move it into a regular column so it can be
# used as a merge key, then preview the result.
extracted_features_user_post = (
    extracted_features_user_post
    .rename_axis(index='user_id')
    .reset_index(level=0)
)
extracted_features_user_post.head()

In [None]:
# One (user_id, risk_label) row per distinct pair, first occurrence kept.
data_risk = data.loc[:, ['user_id', 'risk_label']].drop_duplicates()
# Attach the risk label to each user's extracted features and persist the table.
n_data = extracted_features_user_post.merge(data_risk, on='user_id', how='inner')
n_data.to_csv(Path(data_path) / 'extracted_features_sentiment_per_user.csv', index=False)

# Feature Selection

In [None]:
import pandas as pd
import numpy as np
import re
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn import metrics
from sklearn import preprocessing
from sklearn.metrics import f1_score
from keras import backend as K
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import precision_recall_fscore_support
from numpy import interp 
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_curve, auc

from keras.models import Sequential
from keras.layers import Dense
from keras.utils import multi_gpu_model
import tensorflow as tf

# Seed NumPy's global RNG; random_state is also passed to the gradient-boosting model below.
random_state = 7
np.random.seed(random_state)

# Plotting setup: render figures inline and use a wide 12x4 default figure size.
import matplotlib.pylab as plt
%matplotlib inline
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 12, 4

In [None]:
# Environment check: this cell stops with an AssertionError unless both
# TensorFlow and Keras report at least one GPU.

# confirm TensorFlow sees the GPU
from tensorflow.python.client import device_lib
# print(device_lib.list_local_devices())
assert 'GPU' in str(device_lib.list_local_devices())
print('Tensorflow recognizes GPUs')

# confirm Keras sees the GPU
# NOTE(review): _get_available_gpus is a private helper (leading underscore) of
# the Keras TensorFlow backend and may not exist in other Keras versions.
from keras import backend
available_gpu = backend.tensorflow_backend._get_available_gpus()
assert len(available_gpu) > 0
# GPU count; passed to multi_gpu_model in the cross-validation loop.
available_gpus = len(available_gpu)
print('number of available GPUs = {0}'.format(available_gpus))
print('list of GPUs = {0}\n'.format(available_gpu))

In [None]:
def class_report(y_true, y_pred, y_score=None, average='macro', labels=None):
    """Build a per-class classification report, optionally with ROC AUC.

    Parameters
    ----------
    y_true, y_pred : array-like
        True and predicted class labels; both must have the same shape.
    y_score : array-like of shape (n_samples, n_classes), optional
        Per-class scores. Column ``i`` must belong to ``labels[i]``. When
        given, an ``AUC`` column is added to the report.
    average : str
        Averaging mode passed to ``precision_recall_fscore_support`` for the
        ``avg / total`` row. An averaged AUC is only computed for ``'micro'``
        and ``'macro'``.
    labels : array-like, optional
        Ordered list of all classes. Defaults to the sorted union of the
        labels found in ``y_true`` and ``y_pred``. Pass it explicitly when a
        class may be absent from both, so ``y_score`` columns stay aligned.

    Returns
    -------
    pandas.DataFrame or None
        One row per class plus an ``avg / total`` row, with columns
        precision, recall, f1-score, support, pred-cnt (and AUC). ``None`` is
        returned, after printing a message, when the shapes differ.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    if y_true.shape != y_pred.shape:
        print("Error! y_true {0} is not the same shape as y_pred {1}".format(
              y_true.shape,
              y_pred.shape)
        )
        return

    # Use every class seen in y_true OR y_pred. Deriving the classes from
    # y_pred alone dropped never-predicted classes from the report and shifted
    # the y_score column that each remaining class was scored against.
    if labels is None:
        labels = np.unique(np.concatenate([y_true.ravel(), y_pred.ravel()]))
    else:
        labels = np.asarray(labels)
    n_classes = len(labels)

    # Binarizer over the full class list (only needed for the micro-average ROC).
    lb = LabelBinarizer()
    lb.fit(labels)

    # Number of predictions per class; classes never predicted count as 0.
    predicted, cnt = np.unique(y_pred, return_counts=True)
    pred_cnt = pd.Series(cnt, index=predicted).reindex(labels, fill_value=0)

    metrics_summary = precision_recall_fscore_support(
            y_true=y_true,
            y_pred=y_pred,
            labels=labels)

    avg = list(precision_recall_fscore_support(
            y_true=y_true,
            y_pred=y_pred,
            labels=labels,
            average=average))

    metrics_sum_index = ['precision', 'recall', 'f1-score', 'support']
    class_report_df = pd.DataFrame(
        list(metrics_summary),
        index=metrics_sum_index,
        columns=labels)

    support = class_report_df.loc['support']
    total = support.sum()
    # The averaged call reports no support (None); use the total instead.
    class_report_df['avg / total'] = avg[:-1] + [total]

    class_report_df = class_report_df.T
    class_report_df['pred-cnt'] = pred_cnt
    # Single .loc write: the former chained `df[col].iloc[-1] = ...` may act on a copy.
    class_report_df.loc['avg / total', 'pred-cnt'] = total

    if y_score is not None:
        y_score = np.asarray(y_score)

        fpr = dict()      # false positive rate per class
        tpr = dict()      # true positive rate per class
        roc_auc = dict()
        for label_ix, label in enumerate(labels):
            # One-vs-rest ROC curve for this class.
            fpr[label], tpr[label], _ = roc_curve(
                (y_true == label).astype(int),
                y_score[:, label_ix])

            roc_auc[label] = auc(fpr[label], tpr[label])

        if average == 'micro':
            if n_classes <= 2:
                # Binary case: the binarizer yields one column, scored
                # against the second score column.
                micro_fpr, micro_tpr, _ = roc_curve(
                    lb.transform(y_true).ravel(),
                    y_score[:, 1].ravel())
            else:
                micro_fpr, micro_tpr, _ = roc_curve(
                    lb.transform(y_true).ravel(),
                    y_score.ravel())

            roc_auc["avg / total"] = auc(micro_fpr, micro_tpr)

        elif average == 'macro':
            # First aggregate all false positive rates
            all_fpr = np.unique(np.concatenate([fpr[i] for i in labels]))

            # Then interpolate all ROC curves at these points
            mean_tpr = np.zeros_like(all_fpr)
            for i in labels:
                mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])

            # Finally average it and compute AUC
            mean_tpr /= n_classes

            roc_auc["avg / total"] = auc(all_fpr, mean_tpr)

        class_report_df['AUC'] = pd.Series(roc_auc)

    return class_report_df

In [None]:
def _f1_per_class(y_true, y_pred):
    """Return the F1 score of each output column as a backend tensor.

    Both tensors are cast to the Keras float type before any arithmetic, so
    integer targets do not break the products below. Sums run over axis 0.
    """
    y_true = K.cast(y_true, K.floatx())
    y_pred = K.cast(y_pred, K.floatx())

    tp = K.sum(y_true * y_pred, axis=0)
    fp = K.sum((1 - y_true) * y_pred, axis=0)
    fn = K.sum(y_true * (1 - y_pred), axis=0)

    # epsilon keeps the divisions defined when a class has no positives
    p = tp / (tp + fp + K.epsilon())
    r = tp / (tp + fn + K.epsilon())
    score = 2 * p * r / (p + r + K.epsilon())

    # Replace any NaN (e.g. from NaN inputs) with 0. `tf.is_nan` is used when
    # the installed TensorFlow provides it, otherwise `tf.math.is_nan`.
    is_nan = getattr(tf, 'is_nan', None) or tf.math.is_nan
    return tf.where(is_nan(score), tf.zeros_like(score), score)


def f1(y_true, y_pred):
    """Keras metric: mean per-column F1 computed on rounded predictions."""
    return K.mean(_f1_per_class(y_true, K.round(y_pred)))


def f1_loss(y_true, y_pred):
    """Keras loss: 1 - mean per-column F1 on the raw (unrounded) predictions.

    Predictions are left unrounded so the value changes smoothly with them.
    """
    return 1 - K.mean(_f1_per_class(y_true, y_pred))

In [None]:
def modelfit(alg, dtrain, predictors, performCV=True, printFeatureImportance=True, cv_folds=5):
    """Fit ``alg`` on ``dtrain`` and print a short training report.

    Parameters
    ----------
    alg : estimator
        Object with ``fit`` / ``predict`` (and ``feature_importances_`` when
        ``printFeatureImportance`` is true).
    dtrain : pandas.DataFrame
        Training data; must contain the ``predictors`` columns and a
        ``'risk_label'`` target column.
    predictors : list
        Names of the feature columns.
    performCV : bool
        When true, also report a ``cv_folds``-fold cross-validated macro F1.
        (This flag and ``cv_folds`` were previously accepted but ignored.)
    printFeatureImportance : bool
        When true, draw a bar chart of the fitted feature importances.
    cv_folds : int
        Number of cross-validation folds.
    """
    features = dtrain[predictors]
    labels = dtrain['risk_label']

    # Fit the algorithm on the data
    alg.fit(features, labels)

    # Predict training set
    dtrain_predictions = alg.predict(features)

    # cross_val_score works on clones, so `alg` stays fitted on the full data
    if performCV:
        cv_score = cross_val_score(alg, features, labels, cv=cv_folds, scoring='f1_macro')

    # Print model report
    print("\nModel Report")
    print("Accuracy : %.4g" % metrics.accuracy_score(labels.values, dtrain_predictions))
    # Use scikit-learn's f1_score: the Keras `f1` metric in this notebook
    # returns a backend tensor, which cannot be formatted with %f.
    print("F1-Score (Train): %f" % f1_score(labels, dtrain_predictions, average='macro'))

    if performCV:
        print("CV Score : Mean - %.7g | Std - %.7g | Min - %.7g | Max - %.7g" % (
            np.mean(cv_score), np.std(cv_score), np.min(cv_score), np.max(cv_score)))

    # Print Feature Importance
    if printFeatureImportance:
        feat_imp = pd.Series(alg.feature_importances_, predictors).sort_values(ascending=False)
        feat_imp.plot(kind='bar', title='Feature Importances')
        plt.ylabel('Feature Importance Score')

In [None]:
# Load the per-user feature table written by the extraction section.
dataset = pd.read_csv(Path(data_path, 'extracted_features_sentiment_per_user.csv'))

target = 'risk_label'
IDcol = 'user_id'

# Replace missing values with the sentinel -1.
dataset = dataset.fillna(-1)

# Encode the risk labels as integers 0..K-1.
# Assign the column directly: `dataset.loc[:, col] = ...` writes into the
# existing column on recent pandas and can leave it with object dtype, which
# scikit-learn then refuses as a classification target.
le = preprocessing.LabelEncoder()
dataset[target] = le.fit_transform(dataset[target])

print(dataset.shape)
dataset.head()

In [None]:
# define 5-fold cross validation test harness
k_fold = pd.read_csv(Path(data_path, 'clpsych19_public_crossvalidation_splits.csv'), header=None,
                     names=['fold', 'train_text', 'user_id'])

# keep non-control user ids
k_fold = k_fold[k_fold['user_id'] > 0]

# number of top-ranked features kept (and plotted) per fold
n_top_features = 30
# risk labels are encoded as 0..K-1, so the distinct count is the class count
n_classes = dataset['risk_label'].nunique()

# per-fold list of the selected feature names (sent_N__ prefix removed)
top_100_list = list()

# one report per fold; concatenated once after the loop
fold_reports = []

print('5 fold CV starting')
for fold_ix in range(1, 6):

    print('\nFold = {0}'.format(fold_ix))

    in_fold = k_fold['fold'] == fold_ix
    train_ix = k_fold[in_fold & (k_fold['train_text'] == 'training')]['user_id']
    test_ix = k_fold[in_fold & (k_fold['train_text'] == 'test')]['user_id']

    train_rows = dataset[dataset['user_id'].isin(train_ix)]
    test_rows = dataset[dataset['user_id'].isin(test_ix)]

    y_train = train_rows['risk_label']
    x_train = train_rows.drop(columns=['risk_label', 'user_id'])

    y_test = test_rows['risk_label']
    x_test = test_rows.drop(columns=['risk_label', 'user_id'])

    # Gradient boosting is used only to rank the features.
    gbm = GradientBoostingClassifier(random_state=random_state)
    gbm.fit(x_train, y_train)

    # Predict training set
    dtrain_predictions = gbm.predict(x_train)

    # Print model report
    print("\nModel Report")
    print("Accuracy : %.4g" % metrics.accuracy_score(y_train, dtrain_predictions))
    print("F1-Score (Train): %f" % f1_score(y_train, dtrain_predictions, average='macro'))

    # Rank ALL features by importance, then keep the best ones. The previous
    # code referenced an undefined `predictors` and sliced the first 30
    # columns before sorting, which is not a top-30 selection.
    feat_imp = (
        pd.Series(gbm.feature_importances_, index=x_train.columns)
          .sort_values(ascending=False)
          .head(n_top_features)
    )
    feat_imp.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')
    plt.show()

    # remove the sent_N__ prefix of the features and drop duplicates, keeping rank order
    features_imps = [re.sub(r'sent_\d__', '', x) for x in feat_imp.keys()]
    features_imps = list(dict.fromkeys(features_imps))[:n_top_features]

    top_100_list.append(features_imps)

    # keep every sentiment column whose feature name was selected
    selected_names = set(features_imps)
    features = [x for x in x_train.columns if re.sub(r'sent_\d__', '', x) in selected_names]

    x_train = x_train[features].copy()
    x_test = x_test[features].copy()

    # The F1 loss/metric compare targets and outputs column by column, so the
    # integer labels must be one-hot encoded to match the network's output.
    y_train_onehot = np.eye(n_classes)[y_train.values.astype(int)]

    # basic neural network
    model = Sequential()
    model.add(Dense(25, input_dim=np.shape(x_train)[1], activation='relu'))
    model.add(Dense(8, activation='relu'))
    model.add(Dense(n_classes, activation='sigmoid'))

    # multi_gpu_model is meant for two or more GPUs; keep the plain model otherwise
    if available_gpus > 1:
        model = multi_gpu_model(model, gpus=available_gpus)

    # Compile model
    model.compile(optimizer='adam', loss=f1_loss, metrics=[f1])
    model.fit(x_train, y_train_onehot, epochs=1, batch_size=100, verbose=1)

    y_pred_train = np.argmax(model.predict(x_train), axis=1)
    # score the test set once and reuse the scores for labels and AUC
    y_score_test = model.predict(x_test)
    y_pred_test = np.argmax(y_score_test, axis=1)

    report_with_auc = class_report(
        y_true=y_test,
        y_pred=y_pred_test,
        y_score=y_score_test,
        average='macro')

    # label only the first row of each fold with the fold number
    cv_column = [fold_ix] + [''] * (report_with_auc.index.shape[0] - 1)
    report_with_auc['Fold'] = cv_column
    report_with_auc['Risk-Factor'] = report_with_auc.index
    fold_reports.append(report_with_auc.set_index(['Fold', 'Risk-Factor']))

# DataFrame.append was removed in pandas 2.0; concatenate all folds at once.
report_with_auc_df = pd.concat(fold_reports)

In [None]:
# Display the report accumulated over the cross-validation folds.
report_with_auc_df