In [None]:
import os
from pathlib import Path
import sys

In [None]:
# Project layout and data locations used by the rest of the notebook.
project_name = 'clpsych'
# The project root is taken to be the parent of the current working directory
# (assumes the notebook is launched from a sub-folder of the project -- TODO confirm).
project_path = Path(os.getcwd()).parent
src_path = Path(project_path, 'src')
model_path = Path(project_path, 'model')
utils_path = Path(project_path, 'utils')

# Dataset and embedding folders live on a platform-specific volume.
if sys.platform == "win32":
    # Raw strings keep the Windows backslashes literal instead of relying on
    # invalid escape sequences such as '\D' and '\{'.
    data_path = r'D:\Dataset\{0}\dataset'.format(project_name)
    # Formats with project_name (the previous code referenced an undefined name
    # here), matching the macOS branch below.
    embedding_path = r'D:\Dataset\{0}\embedding'.format(project_name)

elif sys.platform == 'darwin':
    data_path = '/Volumes/Dataset/{0}/dataset'.format(project_name)
    embedding_path = '/Volumes/Dataset/{0}/embedding'.format(project_name)

else:
    # Any other platform: everything is kept inside the project folder.
    data_path = Path(project_path, 'dataset')
    # NOTE(review): this branch points model_path at 'models' while the default
    # above is 'model' -- confirm which folder name is intended.
    model_path = Path(project_path, 'models')
    embedding_path = Path(project_path, 'embedding')

# Make the project, utils and src folders importable. Each entry is added only
# if it is not already present, so re-running the cell does not grow sys.path.
for extra_path in (project_path, utils_path, src_path):
    if str(extra_path) not in sys.path:
        sys.path.append(str(extra_path))

print('project path = {0}'.format(project_path))
print('data path = {0}'.format(data_path))
print('model path = {0}'.format(model_path))
print('embedding path = {0}'.format(embedding_path))
print('sys.path = {0}'.format(sys.path))

In [None]:
import matplotlib.pyplot as plt
import random
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from xgboost import XGBClassifier
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from utils.datapath import data_path_scripts
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import precision_recall_fscore_support, classification_report
from numpy import interp 
from sklearn.metrics import confusion_matrix
from collections import Counter
from keras.layers import Input, Dense, Embedding, Conv2D, MaxPool2D
from keras.layers import Reshape, Flatten, Dropout, Concatenate
from keras.callbacks import ModelCheckpoint
from keras.optimizers import Adam
from keras.models import Model
from keras.layers import Dropout
from keras.optimizers import SGD, RMSprop, Adagrad, Adadelta, Adamax, Nadam, Adam
from keras.regularizers import l1_l2
from keras.models import Sequential
from keras.layers import Dense
from keras import backend as K
import tensorflow as tf
from keras.utils import multi_gpu_model
from sklearn.impute import SimpleImputer

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Default size applied to every figure created after this point.
plt.rcParams['figure.figsize'] = [20, 13]

# seed for numpy and sklearn
# NOTE(review): only NumPy's global generator is seeded here; Python's `random`
# module and TensorFlow/Keras are not seeded in this cell.
random_state = 7
np.random.seed(random_state)

In [None]:
# Sanity check 1: TensorFlow must list at least one GPU among its local devices.
from tensorflow.python.client import device_lib
local_devices = str(device_lib.list_local_devices())
assert 'GPU' in local_devices
print('Tensorflow recognizes GPUs')

# Sanity check 2: the Keras TensorFlow backend must report at least one GPU.
# `available_gpus` (the count) is reused later when building the multi-GPU model.
from keras import backend
available_gpu = backend.tensorflow_backend._get_available_gpus()
assert len(available_gpu) > 0
available_gpus = len(available_gpu)
print('number of available GPUs = {0}'.format(available_gpus))
print('list of GPUs = {0}\n'.format(available_gpu))

In [None]:
def class_report(y_true, y_pred, y_score=None, average='macro', labels=None):
    """Build a per-class classification report, optionally with ROC AUC.

    Parameters
    ----------
    y_true, y_pred : array-like
        True and predicted class labels. Both must have the same shape;
        1-D label vectors are expected.
    y_score : array-like of shape (n_samples, n_classes), optional
        Per-class scores. Column ``i`` must hold the score of ``labels[i]``.
        When given, an ``AUC`` column is added to the report.
    average : str, default 'macro'
        Averaging mode passed to ``precision_recall_fscore_support`` for the
        'avg / total' row. An averaged AUC is only computed for 'micro' and
        'macro'.
    labels : array-like, optional
        Class labels in the column order of ``y_score``. Defaults to the sorted
        union of the labels found in ``y_true`` and ``y_pred``, so a class that
        is never predicted still gets a row and does not shift the ``y_score``
        columns of the classes after it.

    Returns
    -------
    pandas.DataFrame or None
        One row per class plus an 'avg / total' row, with columns precision,
        recall, f1-score, support, pred-cnt and (optionally) AUC. If the shapes
        of ``y_true`` and ``y_pred`` differ, a message is printed and None is
        returned.

    Raises
    ------
    ValueError
        If ``y_score`` is not 2-D with exactly one column per label.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    if y_true.shape != y_pred.shape:
        print("Error! y_true {0} is not the same shape as y_pred {1}".format(
              y_true.shape,
              y_pred.shape)
        )
        return None

    if labels is None:
        # Union of observed labels: the same set sklearn uses for the averaged
        # metrics when no labels are passed.
        labels = np.unique(np.concatenate([y_true.ravel(), y_pred.ravel()]))
    else:
        labels = np.asarray(labels)
    n_classes = len(labels)

    # Value counts of predictions; classes that were never predicted count 0.
    predicted_labels, predicted_counts = np.unique(y_pred, return_counts=True)
    pred_cnt = pd.Series(predicted_counts, index=predicted_labels).reindex(
        labels, fill_value=0)

    metrics_summary = precision_recall_fscore_support(
            y_true=y_true,
            y_pred=y_pred,
            labels=labels)

    avg = list(precision_recall_fscore_support(
            y_true=y_true,
            y_pred=y_pred,
            labels=labels,
            average=average))

    metrics_sum_index = ['precision', 'recall', 'f1-score', 'support']
    class_report_df = pd.DataFrame(
        list(metrics_summary),
        index=metrics_sum_index,
        columns=labels)

    total = class_report_df.loc['support'].sum()
    # The averaged call reports no support, so the last slot gets the total.
    class_report_df['avg / total'] = avg[:-1] + [total]

    class_report_df = class_report_df.T
    class_report_df['pred-cnt'] = pred_cnt
    # Single .loc assignment instead of chained indexing, which is not
    # guaranteed to write through to the frame.
    class_report_df.loc['avg / total', 'pred-cnt'] = total

    if y_score is not None:
        y_score = np.asarray(y_score)
        if y_score.ndim != 2 or y_score.shape[1] != n_classes:
            raise ValueError(
                "y_score must have shape (n_samples, {0}) to match labels {1}; "
                "got {2}. Pass labels= in the column order of y_score.".format(
                    n_classes, list(labels), y_score.shape))

        # false positive rate
        fpr = dict()
        # true positive rate
        tpr = dict()
        roc_auc = dict()
        # One-vs-rest ROC curve per class.
        for label_ix, label in enumerate(labels):
            fpr[label], tpr[label], _ = roc_curve(
                (y_true == label).astype(int),
                y_score[:, label_ix])

            roc_auc[label] = auc(fpr[label], tpr[label])

        if average == 'micro':
            # Binarize against the full label set so the flattened indicator
            # matrix lines up with the flattened score matrix.
            lb = LabelBinarizer()
            lb.fit(labels)
            if n_classes <= 2:
                fpr["avg / total"], tpr["avg / total"], _ = roc_curve(
                    lb.transform(y_true).ravel(),
                    y_score[:, 1].ravel())
            else:
                fpr["avg / total"], tpr["avg / total"], _ = roc_curve(
                        lb.transform(y_true).ravel(),
                        y_score.ravel())

            roc_auc["avg / total"] = auc(
                fpr["avg / total"],
                tpr["avg / total"])

        elif average == 'macro':
            # First aggregate all false positive rates
            all_fpr = np.unique(np.concatenate([
                fpr[i] for i in labels]
            ))

            # Then interpolate all ROC curves at these points
            mean_tpr = np.zeros_like(all_fpr)
            for i in labels:
                mean_tpr += interp(all_fpr, fpr[i], tpr[i])

            # Finally average it and compute AUC
            mean_tpr /= n_classes

            fpr["macro"] = all_fpr
            tpr["macro"] = mean_tpr

            roc_auc["avg / total"] = auc(fpr["macro"], tpr["macro"])

        # Aligned on the row index (class labels plus 'avg / total').
        class_report_df['AUC'] = pd.Series(roc_auc)

    return class_report_df

In [None]:
def _f1_per_output(y_true, y_pred):
    """Return the F1 score of each output column, computed over axis 0.

    True positives, false positives and false negatives are summed along
    axis 0 (presumably the batch axis -- confirm against the model output).
    K.epsilon() is added to each denominator to avoid division by zero, and
    any remaining NaN entries are replaced by 0.
    """
    tp = K.sum(K.cast(y_true*y_pred, 'float'), axis=0)
    fp = K.sum(K.cast((1-y_true)*y_pred, 'float'), axis=0)
    fn = K.sum(K.cast(y_true*(1-y_pred), 'float'), axis=0)

    p = tp / (tp + fp + K.epsilon())
    r = tp / (tp + fn + K.epsilon())

    scores = 2*p*r / (p+r+K.epsilon())
    return tf.where(tf.is_nan(scores), tf.zeros_like(scores), scores)


def f1(y_true, y_pred):
    """Keras metric: mean F1 over outputs, with predictions rounded to 0/1."""
    return K.mean(_f1_per_output(y_true, K.round(y_pred)))


def f1_loss(y_true, y_pred):
    """Keras loss: 1 - mean F1 over outputs, using the unrounded predictions.

    The predictions are not rounded here, unlike in `f1`.
    """
    return 1 - K.mean(_f1_per_output(y_true, y_pred))

In [None]:
# Read the per-post sentiment table from the data folder.
filename = 'sentiment_per_post.csv'
dataset = pd.read_csv(Path(data_path) / filename)

# Swap the 'risk_label' values for the integer codes of a LabelEncoder fitted
# on that same column; the fitted encoder stays available as `le`.
le = LabelEncoder().fit(dataset['risk_label'])
dataset.loc[:, 'risk_label'] = le.transform(dataset['risk_label'])

# Preview the first rows.
dataset.head()

In [None]:
# Load the pickled static-feature frame; this rebinds `dataset`.
# NOTE(review): unpickling executes code from the file -- only load pickles
# from a trusted source.
filename = Path(data_path) / 'static_features_pandas_v2.pkl'
dataset = pd.read_pickle(filename)
# Preview the first rows.
dataset.head()

<pre>
Feature Set   Description  
A             static_derived_features  
B             post_count_by_subreddit
C             lexicon_count  
D             sentiments_macro  
E             sentiments_micro  
F             empathy
G             readability
H             social_context
I             srl
J             ctakes
</pre>

In [None]:
# Feature sets to keep. A column belongs to a set when the text before its
# first '__' equals the set name.
# Available single sets:
#   'SETAAA', 'SETDDD', 'SETEEE'
#   'SETBBB', 'SETCCC', 'SETFFF', 'SETGGG', 'SETHHH', 'SETIII', 'SETJJJ'  (contain NaN)
# Another combination tried before:
#   ['SETAAA', 'SETDDD', 'SETEEE', 'SETFFF', 'SETGGG']
features_interest = ['SETAAA', 'SETBBB']

# Selected feature columns, followed by the label column.
features = [
    column
    for column in dataset.columns
    if column.partition('__')[0] in features_interest
] + ['target']

In [None]:
# Keep only the selected feature columns plus the label column.
# NOTE(review): the next cell rebinds `data` to the full dataset; when the
# notebook is run top to bottom this selection is discarded.
data = dataset[features].copy()

In [None]:
# Use every column of `dataset`.
# NOTE(review): this rebinds `data`, replacing the feature-subset frame built
# in the previous cell -- confirm which of the two assignments is intended.
data = dataset.copy()

In [None]:
# Which model to train in every fold: 'logit' or 'nn'.
classifier = 'logit'

# define 5-fold cross validation test harness
k_fold = pd.read_csv(Path(data_path, 'clpsych19_public_crossvalidation_splits.csv'), header=None,
                    names=['fold', 'train_text', 'user_id'])

# keep non-control user ids
k_fold = k_fold[k_fold['user_id'] > 0]

# One report per fold, concatenated once after the loop.
fold_reports = []

print('5 fold CV starting')
for fold_ix in range(1, 6):

    print('\nFold = {0}'.format(fold_ix))

    in_fold = k_fold['fold'] == fold_ix
    train_ix = k_fold[in_fold & (k_fold['train_text'] == 'training')]['user_id']
    test_ix = k_fold[in_fold & (k_fold['train_text'] == 'test')]['user_id']

    # Rows are matched through the index of `data`
    # (presumably user ids -- verify against how `dataset` was built).
    x_train = data[data.index.isin(train_ix)].copy()
    x_test = data[data.index.isin(test_ix)].copy()

    y_train = x_train['target']
    x_train = x_train.drop(columns=['target'])

    y_test = x_test['target']
    x_test = x_test.drop(columns=['target'])

    # Impute missing values with the training-fold column means. The same
    # fitted imputer is applied to the test fold so that no test statistics
    # are used and both matrices keep the same columns.
    imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
    x_train = imp_mean.fit_transform(x_train)
    x_test = imp_mean.transform(x_test)

    # Mean-centre and range-scale each feature with training-fold statistics.
    # The imputer returns plain arrays, so axis=0 is required to get
    # per-feature values rather than a single global scalar.
    feature_mean = x_train.mean(axis=0)
    feature_range = x_train.max(axis=0) - x_train.min(axis=0)
    # Constant features would divide by zero; leave them centred at 0.
    feature_range[feature_range == 0] = 1.0
    x_train = (x_train - feature_mean) / feature_range
    x_test = (x_test - feature_mean) / feature_range

    if classifier == 'nn':
        # basic neural network
        model = Sequential()
        model.add(Dense(5000, input_dim=np.shape(x_train)[1], activation='relu'))
        model.add(Dense(2500, activation='relu'))
        model.add(Dense(2000, activation='relu'))
        model.add(Dense(1000, activation='relu'))
        model.add(Dense(500, activation='relu'))
        model.add(Dense(250, activation='relu'))
        model.add(Dense(100, activation='relu'))
        model.add(Dense(50, activation='relu'))
        model.add(Dense(len(y_train.unique()), activation='sigmoid'))

        model = multi_gpu_model(model, gpus=available_gpus)

        # Classes 2 and 3 are weighted 10x relative to classes 1 and 4.
        # NOTE(review): the keys must match the values stored in 'target' -- confirm.
        class_weight = {1: 1, 2: 10, 3:10, 4: 1}

        # Compile model
        model.compile(optimizer='adam', loss=f1_loss, metrics=[f1])
        model.fit(x_train, y_train, epochs=1, batch_size=100, verbose=1,
                  class_weight=class_weight)

        y_pred_train = np.argmax(model.predict(x_train), axis=1)
        # Predict the test fold once and derive the labels from the scores.
        # NOTE(review): argmax yields column positions 0..k-1; confirm these
        # match the values stored in 'target'.
        y_score = model.predict(x_test)
        y_pred_test = np.argmax(y_score, axis=1)

    elif classifier == 'logit':
        model = LogisticRegression(class_weight='balanced', n_jobs=-1,
                          multi_class='auto', solver='lbfgs',
                          tol=0.00001, C=10, max_iter=1000, verbose=True,
                          random_state=random_state)
        model.fit(x_train, y_train)
        y_pred_test = model.predict(x_test)
        y_score = model.predict_proba(x_test)

    else:
        # Fail loudly instead of reporting on stale or undefined predictions.
        raise ValueError('unknown classifier: {0}'.format(classifier))

    report_with_auc = class_report(
        y_true=y_test,
        y_pred=y_pred_test,
        y_score=y_score,
        average='macro')

    # Label only the first row of each fold with the fold number.
    cv_column = [fold_ix]
    cv_column.extend([''] * (report_with_auc.index.shape[0] - 1))
    report_with_auc['Fold'] = cv_column
    report_with_auc['Risk-Factor'] = report_with_auc.index
    report_with_auc = report_with_auc.set_index(['Fold', 'Risk-Factor'])

    fold_reports.append(report_with_auc)

# DataFrame.append was removed in pandas 2.0; concatenate all folds once.
report_with_auc_df = pd.concat(fold_reports)

In [None]:
# Display the combined cross-validation report (indexed by Fold and Risk-Factor).
report_with_auc_df