In [None]:
import os
from pathlib import Path
import sys

In [None]:
# Project layout. The project root is taken to be the parent of the current
# working directory.
project_name = 'clpsych'
project_path = Path(os.getcwd()).parent
src_path = Path(project_path, 'src')
model_path = Path(project_path, 'model')
utils_path = Path(project_path, 'utils')

# Platform-specific locations of the dataset and embedding folders.
if sys.platform == "win32":
    # Raw strings: the backslashes are path separators, not escape sequences.
    data_path = r'D:\Dataset\{0}\dataset'.format(project_name)
    embedding_path = r'D:\Dataset\{0}\embedding'.format(project_name)

elif sys.platform == 'darwin':
    data_path = '/Volumes/Dataset/{0}/dataset'.format(project_name)
    embedding_path = '/Volumes/Dataset/{0}/embedding'.format(project_name)

else:
    # Everything lives inside the project folder; note that on this branch the
    # model folder is 'models' (plural), overriding the default set above.
    data_path = Path(project_path, 'dataset')
    model_path = Path(project_path, 'models')
    embedding_path = Path(project_path, 'embedding')

# including the project folder and the utils folder
# Each folder is added only when it is not already an entry of sys.path, so
# re-running the cell does not pile up duplicates.
missing_paths = [
    str(folder)
    for folder in (project_path, utils_path, src_path)
    if str(folder) not in sys.path
]
sys.path.extend(missing_paths)

print('project path = {0}'.format(project_path))
print('data path = {0}'.format(data_path))
print('model path = {0}'.format(model_path))
print('embedding path = {0}'.format(embedding_path))
print('sys.path = {0}'.format(sys.path))

In [None]:
import matplotlib.pyplot as plt
import random
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from xgboost import XGBClassifier
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from utils.datapath import data_path_scripts
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import precision_recall_fscore_support, classification_report
from numpy import interp 
from sklearn.metrics import confusion_matrix
from sklearn.utils.multiclass import unique_labels
from collections import Counter
from keras.layers import Input, Dense, Embedding, Conv2D, MaxPool2D
from keras.layers import Reshape, Flatten, Dropout, Concatenate
from keras.callbacks import ModelCheckpoint
from keras.optimizers import Adam
from keras.models import Model
from keras.layers import Dropout
from keras.optimizers import SGD, RMSprop, Adagrad, Adadelta, Adamax, Nadam, Adam
from keras.regularizers import l1_l2
from keras.models import Sequential
from keras.layers import Dense
from keras import backend as K
import tensorflow as tf
from keras.utils import multi_gpu_model
from sklearn.impute import SimpleImputer

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Default size (width, height) applied to every figure created afterwards.
plt.rcParams['figure.figsize'] = [20, 13]

# Single seed shared by the notebook. It is also passed explicitly as
# random_state to the scikit-learn estimators further down.
random_state = 7
# seed Python's built-in generator as well (the module is imported above)
random.seed(random_state)
# seed for numpy and sklearn
np.random.seed(random_state)

In [None]:
# --- TensorFlow: stop here unless a GPU shows up among the local devices ---
from tensorflow.python.client import device_lib
local_devices_text = str(device_lib.list_local_devices())
assert 'GPU' in local_devices_text
print('Tensorflow recognizes GPUs')

# --- Keras: stop here unless its TensorFlow backend lists at least one GPU ---
from keras import backend
available_gpu = backend.tensorflow_backend._get_available_gpus()
assert len(available_gpu) != 0
# number of GPUs, kept separately from the list of GPU names above
available_gpus = len(available_gpu)
print('number of available GPUs = {0}'.format(available_gpus))
print('list of GPUs = {0}\n'.format(available_gpu))

In [None]:
from sklearn.preprocessing import LabelBinarizer

def class_report(y_true, y_pred, y_score=None, average='macro'):
    """Build a per-class precision / recall / F1 / support table.

    Parameters
    ----------
    y_true : array-like with a ``shape`` attribute
        Ground-truth labels.
    y_pred : array-like with a ``shape`` attribute
        Predicted labels; must have the same shape as ``y_true``.
    y_score : 2-D array, optional
        Per-class scores. Column ``i`` is read as the score of the ``i``-th
        entry of the sorted labels found in ``y_true`` and ``y_pred``.
        NOTE(review): this assumes the caller's columns follow that order
        and that every scored class occurs in the data -- confirm.
    average : str
        Forwarded to ``precision_recall_fscore_support`` for the summary row;
        'micro' or 'macro' additionally selects how the summary AUC is built.

    Returns
    -------
    pandas.DataFrame or None
        One row per label plus an 'avg / total' row, with the columns
        precision, recall, f1-score, support, pred-cnt and, when ``y_score``
        is given, AUC. ``None`` (after printing a message) when the shapes of
        ``y_true`` and ``y_pred`` differ.
    """
    if y_true.shape != y_pred.shape:
        print("Error! y_true {0} is not the same shape as y_pred {1}".format(
              y_true.shape,
              y_pred.shape)
        )
        return

    # Report every class that occurs in either vector. Taking the labels from
    # y_pred alone would silently drop classes the model never predicts and
    # shift the y_score column that each remaining label is paired with.
    labels = np.unique(np.concatenate([y_true, y_pred], axis=None))
    n_classes = len(labels)

    lb = LabelBinarizer()

    # check if multi label ?
    if len(y_true.shape) == 1:
        lb.fit(labels)

    # Value counts of predictions (0 for classes that were never predicted)
    pred_labels, cnt = np.unique(y_pred, return_counts=True)
    pred_cnt = pd.Series(cnt, index=pred_labels).reindex(labels, fill_value=0)

    # precision
    #     ratio tp / (tp + fp)
    #     ability of the classifier not to label as positive a sample that is negative
    # recall
    #     tp / (tp + fn)
    #     ability of the classifier to find all the positive samples
    # f1-score
    #     2 * (precision * recall)/(precision + recall)
    #     weighted harmonic mean of the precision and recall
    #     best value at 1 and worst score at 0
    # support
    #     number of occurrences of each class in y_true
    metrics_summary = precision_recall_fscore_support(
            y_true=y_true,
            y_pred=y_pred,
            labels=labels)

    avg = list(precision_recall_fscore_support(
            y_true=y_true,
            y_pred=y_pred,
            average=average))

    metrics_sum_index = ['precision', 'recall', 'f1-score', 'support']
    class_report_df = pd.DataFrame(
        list(metrics_summary),
        index=metrics_sum_index,
        columns=labels)

    support = class_report_df.loc['support']
    total = support.sum()
    # the averaged call returns no support value, so the total is used instead
    class_report_df['avg / total'] = avg[:-1] + [total]

    class_report_df = class_report_df.T
    class_report_df['pred-cnt'] = pred_cnt
    # single .loc write instead of chained indexing, which may act on a copy
    class_report_df.loc['avg / total', 'pred-cnt'] = total

    if y_score is not None:
        # false positive rate
        fpr = dict()
        # true positive rate
        tpr = dict()
        roc_auc = dict()
        # one-vs-rest ROC curve per label
        for label_ix, label in enumerate(labels):
            fpr[label], tpr[label], _ = roc_curve(
                (y_true == label).astype(int),
                y_score[:, label_ix])

            roc_auc[label] = auc(fpr[label], tpr[label])

        if average == 'micro':
            if n_classes <= 2:
                fpr["avg / total"], tpr["avg / total"], _ = roc_curve(
                    lb.transform(y_true).ravel(),
                    y_score[:, 1].ravel())
            else:
                fpr["avg / total"], tpr["avg / total"], _ = roc_curve(
                        lb.transform(y_true).ravel(),
                        y_score.ravel())

            roc_auc["avg / total"] = auc(
                fpr["avg / total"],
                tpr["avg / total"])

        elif average == 'macro':
            # First aggregate all false positive rates
            all_fpr = np.unique(np.concatenate([
                fpr[i] for i in labels]
            ))

            # Then interpolate all ROC curves at these points
            mean_tpr = np.zeros_like(all_fpr)
            for i in labels:
                mean_tpr += interp(all_fpr, fpr[i], tpr[i])

            # Finally average it and compute AUC
            mean_tpr /= n_classes

            fpr["macro"] = all_fpr
            tpr["macro"] = mean_tpr

            roc_auc["avg / total"] = auc(fpr["macro"], tpr["macro"])

        # aligned on the row index; rows without an AUC entry become NaN
        class_report_df['AUC'] = pd.Series(roc_auc)

    return class_report_df

In [None]:
def _f1_per_output(y_true, y_pred):
    """Return the F1 score of every output column as a backend tensor.

    The counts are summed over axis 0, so one score is produced per column of
    the inputs. ``K.epsilon()`` is added to each denominator, and any NaN that
    still appears is replaced by 0.
    """
    tp = K.sum(K.cast(y_true*y_pred, 'float'), axis=0)
    fp = K.sum(K.cast((1-y_true)*y_pred, 'float'), axis=0)
    fn = K.sum(K.cast(y_true*(1-y_pred), 'float'), axis=0)

    p = tp / (tp + fp + K.epsilon())
    r = tp / (tp + fn + K.epsilon())

    score = 2*p*r / (p+r+K.epsilon())
    return tf.where(tf.is_nan(score), tf.zeros_like(score), score)


def f1(y_true, y_pred):
    """Metric: mean per-column F1 computed on rounded predictions."""
    return K.mean(_f1_per_output(y_true, K.round(y_pred)))


def f1_loss(y_true, y_pred):
    """Loss: 1 minus the mean per-column F1 computed on the raw predictions.

    Unlike ``f1`` the predictions are not rounded.
    """
    return 1 - K.mean(_f1_per_output(y_true, y_pred))

In [None]:
def plot_confusion_matrix(cm,
                          target_names,
                          title='Confusion matrix',
                          cmap=None,
                          normalize=True):
    """
    given a sklearn confusion matrix (cm), make a nice plot

    Arguments
    ---------
    cm:           confusion matrix from sklearn.metrics.confusion_matrix

    target_names: given classification classes such as [0, 1, 2]
                  the class names, for example: ['high', 'medium', 'low']
                  pass None to leave the default tick labels

    title:        the text to display at the top of the matrix

    cmap:         the gradient of the values displayed from matplotlib.pyplot.cm
                  see http://matplotlib.org/examples/color/colormaps_reference.html
                  plt.get_cmap('jet') or plt.cm.Blues
                  defaults to 'Blues' when None

    normalize:    If False, plot the raw numbers
                  If True, plot the proportions (each row divided by its sum;
                  a row without any samples is shown as zeros)

    Returns
    -------
    None. The figure is drawn and shown with plt.show().

    Usage
    -----
    plot_confusion_matrix(cm           = cm,                  # confusion matrix created by
                                                              # sklearn.metrics.confusion_matrix
                          normalize    = True,                # show proportions
                          target_names = y_labels_vals,       # list of names of the classes
                          title        = best_estimator_name) # title of graph

    Citation
    --------
    http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html

    """
    import matplotlib.pyplot as plt
    import numpy as np
    import itertools

    # accuracy is always computed from the raw counts
    accuracy = np.trace(cm) / float(np.sum(cm))
    misclass = 1 - accuracy

    if cmap is None:
        cmap = plt.get_cmap('Blues')

    if normalize:
        # Normalise BEFORE drawing so that the colours, the colour bar, the
        # printed numbers and the contrast threshold all describe one matrix.
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # avoid 0/0 for classes with no true samples; such rows remain zero
        row_totals[row_totals == 0] = 1
        cm = cm.astype('float') / row_totals

    plt.figure(figsize=(8, 6))
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    if target_names is not None:
        tick_marks = np.arange(len(target_names))
        plt.xticks(tick_marks, target_names, rotation=45)
        plt.yticks(tick_marks, target_names)

    # cells darker than the threshold get white text, the others black
    thresh = cm.max() / 1.5 if normalize else cm.max() / 2
    cell_format = "{:0.4f}" if normalize else "{:,}"
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cell_format.format(cm[i, j]),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label\naccuracy={:0.4f}; misclass={:0.4f}'.format(accuracy, misclass))
    # called after the labels are set so that they are part of the layout
    plt.tight_layout()
    plt.show()

In [None]:
# Load 'sentiment_per_post.csv' from the data folder.
filename = 'sentiment_per_post.csv'
dataset = pd.read_csv(Path(data_path, filename))

# name of the column holding the class to predict
label = 'risk_label'

# Replace the label column by its integer encoding. Plain column assignment
# swaps in the new column; writing through `.loc[:, col]` sets the values into
# the existing column and may keep its original dtype.
le = LabelEncoder()
dataset[label] = le.fit_transform(dataset[label])

dataset.head()

In [None]:
# Load the pickled feature table; this rebinds `dataset` and `label`.
# Pickle files can execute code while loading -- only open trusted files.
label = 'target'
filename = Path(data_path) / 'static_features_pandas_v2.pkl'
dataset = pd.read_pickle(filename)
dataset.head()

<pre>
Feature Set   Description  
A             static_derived_features  
B             post_count_by_subreddit
C             lexicon_count  
D             sentiments_macro  
E             sentiments_micro  
F             empathy
G             readability
H             social_context
I             srl
J             ctakes
</pre>

In [None]:
# Feature sets to keep. A column belongs to a set when the text before its
# first '__' equals the set name.
# Single-set alternatives (sets marked * were noted as containing NaN):
#   SETAAA, SETBBB*, SETCCC*, SETDDD, SETEEE,
#   SETFFF*, SETGGG*, SETHHH*, SETIII*, SETJJJ*
features_interest = ['SETAAA', 'SETDDD', 'SETEEE', 'SETFFF']

features = [x for x in dataset.columns if x.split('__')[0] in features_interest]
# including label (the column named by `label`, set where the dataset is loaded)
features.append(label)

In [None]:
# Independent copy restricted to the selected feature columns (label included).
data = dataset.loc[:, features].copy()

In [None]:
# NOTE(review): this rebinds `data` to a copy of the complete `dataset`, which
# replaces the frame built from `features` in the preceding cell -- confirm
# which of the two is meant to feed the cross-validation loop.
data = dataset.copy()
data.head()

In [None]:
# Cross-validation harness: read the predefined fold assignments. The file has
# no header row, so the column names are supplied here.
split_columns = ['fold', 'train_text', 'user_id']
splits_file = Path(data_path) / 'clpsych19_public_crossvalidation_splits.csv'
k_fold = pd.read_csv(splits_file, names=split_columns, header=None)

# keep non-control user ids (rows with a positive user_id)
k_fold = k_fold.loc[k_fold['user_id'] > 0]

In [None]:
# Cross-validated training and evaluation over the five predefined folds.
#
# `classifier` selects the model family:
#   'nn'    -- dense Keras network
#   'logit' -- two class-balanced logistic regressions, one fitted on the
#              training rows with label < 3 and one on the rows with
#              label > 2; their probability columns are concatenated to score
#              every class, and the highest column gives the prediction.
#
# The per-fold reports are collected in `report_with_auc_df`. Fold-level
# variables (model1, model2, x_train, y_train, x_test, y_test, ...) keep the
# values of the LAST fold once the loop has finished.
# No imputation or scaling is applied to the features in this cell.
classifier = 'logit'


def _fit_balanced_logit(x_part, y_part):
    """Fit the class-balanced logistic regression used for both label groups."""
    estimator = LogisticRegression(class_weight='balanced', n_jobs=-1,
                                   multi_class='auto', solver='lbfgs',
                                   tol=0.0001, C=120, max_iter=100, verbose=True,
                                   random_state=random_state, penalty='l2')
    estimator.fit(x_part, y_part)
    return estimator


fold_reports = []

print('5 fold CV starting')
for fold_ix in range(1, 6):

    print('\nFold = {0}'.format(fold_ix))

    fold_rows = k_fold[k_fold['fold'] == fold_ix]
    train_ix = fold_rows[fold_rows['train_text'] == 'training']['user_id']
    test_ix = fold_rows[fold_rows['train_text'] == 'test']['user_id']

    # rows are matched through the index of `data`
    # (presumably the user id -- verify against the dataset)
    x_train_df = data[data.index.isin(train_ix)].copy()
    x_test_df = data[data.index.isin(test_ix)].copy()

    y_train = x_train_df[label]
    x_train = x_train_df.drop([label], axis=1).values

    y_test = x_test_df[label]
    x_test = x_test_df.drop([label], axis=1).values

    if classifier == 'nn':
        # basic neural network
        model = Sequential()
        model.add(Dense(5000, input_dim=np.shape(x_train)[1], activation='relu'))
        model.add(Dense(2500, activation='relu'))
        model.add(Dense(2000, activation='relu'))
        model.add(Dense(1000, activation='relu'))
        model.add(Dense(500, activation='relu'))
        model.add(Dense(250, activation='relu'))
        model.add(Dense(100, activation='relu'))
        model.add(Dense(50, activation='relu'))
        model.add(Dense(len(y_train.unique()), activation='sigmoid'))

        model = multi_gpu_model(model, gpus=available_gpus)

        # classes 2 and 3 are weighted 10x relative to classes 1 and 4
        class_weight = {1: 1, 2: 10, 3:10, 4: 1}

        # Compile model
        model.compile(optimizer='adam', loss=f1_loss, metrics=[f1])
        model.fit(x_train, y_train, epochs=1, batch_size=100, verbose=1,
                  class_weight=class_weight)

        # NOTE(review): argmax yields 0-based column positions, whereas the
        # class_weight keys and the 'logit' branch suggest labels starting at
        # 1 -- confirm the label encoding before trusting this branch.
        y_pred_train = np.argmax(model.predict(x_train), axis=1)
        y_score = model.predict(x_test)
        y_pred_test = np.argmax(y_score, axis=1)

    elif classifier == 'logit':

        # boolean masks over the training rows for the two label groups
        ab_indices = y_train < 3
        cd_indices = y_train > 2

        model1 = _fit_balanced_logit(x_train[ab_indices], y_train[ab_indices])
        model2 = _fit_balanced_logit(x_train[cd_indices], y_train[cd_indices])

        # training-set fit of each model on its own label group
        y_pred_train1 = model1.predict(x_train[ab_indices])
        y_pred_train2 = model2.predict(x_train[cd_indices])

        cm = confusion_matrix(y_true=y_train[ab_indices], y_pred=y_pred_train1)
        plot_confusion_matrix(cm, target_names=['a','b'])

        cm = confusion_matrix(y_true=y_train[cd_indices], y_pred=y_pred_train2)
        plot_confusion_matrix(cm, target_names=['c','d'])

        # Score every test row with both models and put the probability
        # columns side by side; the column order follows the models' classes_.
        y_score_test1 = model1.predict_proba(x_test)
        y_score_test2 = model2.predict_proba(x_test)
        score_labels = np.concatenate((model1.classes_, model2.classes_))
        y_score = np.concatenate((y_score_test1, y_score_test2), axis=1)
        # rescale so that each row sums to 1 across all classes
        y_score = y_score / y_score.sum(axis=1, keepdims=True)
        y_pred_test = score_labels[np.argmax(y_score, axis=1)]

        cm = confusion_matrix(y_true=y_test, y_pred=y_pred_test, labels=score_labels)
        plot_confusion_matrix(cm, target_names=['a','b','c','d'])

    else:
        raise ValueError('unknown classifier: {0}'.format(classifier))

    report_with_auc = class_report(
        y_true=y_test,
        y_pred=y_pred_test,
        y_score=y_score,
        average='macro')

    # the fold number is shown on the first row of each fold only
    cv_column = [fold_ix] + [''] * (report_with_auc.index.shape[0] - 1)
    report_with_auc['Fold'] = cv_column
    report_with_auc['Risk-Factor'] = report_with_auc.index
    fold_reports.append(report_with_auc.set_index(['Fold', 'Risk-Factor']))

# DataFrame.append no longer exists in pandas 2, so the per-fold tables are
# collected in a list and concatenated once.
report_with_auc_df = pd.concat(fold_reports)

In [None]:
# Training rows of each label group, selected with the masks from the CV cell.
x_train_ab, y_train_ab = x_train[ab_indices], y_train[ab_indices]
x_train_cd, y_train_cd = x_train[cd_indices], y_train[cd_indices]

# model1: labels and probabilities on its training rows and on the test rows
y_pred_train1, y_score_train1 = model1.predict(x_train_ab), model1.predict_proba(x_train_ab)
y_pred_test1, y_score_test1 = model1.predict(x_test), model1.predict_proba(x_test)

# model2: same four outputs
y_pred_train2, y_score_train2 = model2.predict(x_train_cd), model2.predict_proba(x_train_cd)
y_pred_test2, y_score_test2 = model2.predict(x_test), model2.predict_proba(x_test)

# one training confusion matrix per model
for group_true, group_pred, group_names in (
        (y_train_ab, y_pred_train1, ['a','b']),
        (y_train_cd, y_pred_train2, ['c','d'])):
    cm = confusion_matrix(y_true=group_true, y_pred=group_pred)
    plot_confusion_matrix(cm, target_names=group_names)

In [None]:
# Print, per test row, the true label next to the scores of both models.
score_rows = zip(y_test, y_score_test1, y_score_test2)
for true_label, scores_model1, scores_model2 in score_rows:
    print((true_label, scores_model1, scores_model2))

In [None]:
# (rows, columns) of model2's test probabilities
y_score_test2.shape

In [None]:
# Put the class probabilities of both models side by side (one column per
# class label) and rescale each row so that it sums to 1.
all_pred = pd.DataFrame(np.concatenate((y_score_test1, y_score_test2), axis=1), columns=[1,2,3,4])
all_pred = all_pred.div(all_pred.sum(axis=1), axis=0)
# predicted class = label of the highest-scoring column
final_pred = all_pred.idxmax(axis=1)
all_pred['true'] = y_test.values
all_pred['pred'] = final_pred
# Only the last expression of a cell is rendered automatically, so the first
# table is shown explicitly with display() (provided by the notebook kernel).
display(all_pred[(all_pred['true'] == 4) & (all_pred['pred'] == 1)])
all_pred[(all_pred['true'] == 3)]

In [None]:
# Rule-based re-labelling of every row from its four class scores.
# Only the score columns are read, so the cell works whether or not the
# 'n_pred' column has already been added by a later cell.
score_columns = [1, 2, 3, 4]
new_pred = list()
for _, ele in all_pred[score_columns].iterrows():
    if ele[3] > ele[2] and ele[3] > ele[4] and ele[1] < .5:
        new_pred.append(3)
    elif ele[2] > ele[4]:
        new_pred.append(2)
    else:
        # label of the highest score (Series.idxmax takes no axis=1)
        new_pred.append(ele.idxmax())

In [None]:
# Peek at the first five rows of the prediction table.
all_pred.iloc[:5]

In [None]:
# Attach the rule-based predictions as the 'n_pred' column and show the table.
all_pred = all_pred.assign(n_pred=new_pred)
all_pred

In [None]:
# Evaluate the rule-based predictions against the test labels.
rule_based_pred = all_pred['n_pred']

cm = confusion_matrix(y_true=y_test, y_pred=rule_based_pred)
plot_confusion_matrix(cm, target_names=['a','b','c','d'])

class_report(y_true=y_test, y_pred=rule_based_pred, average='macro')

In [None]:
# Baseline: stack both models' test probabilities (no rescaling) and take the
# highest-scoring column as the prediction. This rebinds `all_pred`.
y_score_test1, y_score_test2 = (
    estimator.predict_proba(x_test) for estimator in (model1, model2)
)
stacked_scores = np.concatenate((y_score_test1, y_score_test2), axis=1)
all_pred = pd.DataFrame(stacked_scores, columns=[1,2,3,4])
final_pred = all_pred.idxmax(axis=1)

cm = confusion_matrix(y_true=y_test, y_pred=final_pred)
plot_confusion_matrix(cm, target_names=['a','b','c','d'])

class_report(y_true=y_test, y_pred=final_pred, y_score=all_pred.values, average='macro')

In [None]:
# Classification report (with AUC) for the training split.
# NOTE(review): `y_score_train` is not assigned by any cell visible in this
# notebook, and `y_pred_train` is only assigned in the 'nn' branch of the CV
# cell -- confirm where these come from, otherwise this raises NameError on a
# fresh kernel.
class_report(
    y_true=y_train, 
    y_pred=y_pred_train, 
    y_score=y_score_train,
    average='macro')

In [None]:
# Classification report (with AUC) for the test split.
# NOTE(review): `y_score_test` is not assigned by any cell visible in this
# notebook -- confirm where it comes from, otherwise this raises NameError on
# a fresh kernel.
class_report(
    y_true=y_test, 
    y_pred=y_pred_test, 
    y_score=y_score_test,
    average='macro')

In [None]:
# Confusion matrix of the test predictions, plotted with four class names.
# NOTE(review): relies on `y_pred_test` already existing in the kernel --
# confirm the CV cell sets it for the selected classifier.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred_test)
plot_confusion_matrix(cm, target_names=['a','b','c','d'])

In [None]:
# Display the test predictions.
# NOTE(review): relies on `y_pred_test` already existing in the kernel.
y_pred_test

In [None]:
# Print, per test row, the true label, the predicted label and the scores.
# NOTE(review): `y_score_test` is not assigned by any cell visible in this
# notebook -- confirm where it comes from.
for ele in zip(y_test,y_pred_test, y_score_test):
    print(ele)