In [None]:
import os
from pathlib import Path
import sys

In [None]:
project_name = 'clpsych'
project_path = Path(os.getcwd()).parent

# Per-platform locations of the dataset, model and source folders.
# Windows and macOS read from a dedicated "Dataset" volume; any other platform
# keeps everything under the parent of the current working directory.
if sys.platform == "win32":
    # Raw strings: "\D", "\{", "\d" and "\m" are invalid escape sequences in a
    # regular literal and raise warnings on recent Python versions.
    data_path = r'D:\Dataset\{0}\dataset'.format(project_name)
    model_path = r'D:\Dataset\{0}\models'.format(project_name)
    # Previously pointed at the macOS "/Volumes/..." location, which does not
    # exist on Windows.
    src_path = r'D:\Dataset\{0}\src'.format(project_name)

elif sys.platform == 'darwin':
    data_path = '/Volumes/Dataset/{0}/dataset'.format(project_name)
    model_path = '/Volumes/Dataset/{0}/models'.format(project_name)
    src_path = '/Volumes/Dataset/{0}/src'.format(project_name)

else:
    data_path = Path(project_path, 'dataset')
    model_path = Path(project_path, 'models')
    src_path = Path(project_path, 'src')

utils_path = str(Path(project_path, 'utils'))
# Make the project folder, the utils folder and the src folder importable.
# Each entry is tested with real list membership (not a substring search over
# the concatenated sys.path), so re-running the cell never adds duplicates and
# a partially populated sys.path is completed instead of being skipped.
for extra_path in (str(project_path), utils_path, str(src_path)):
    if extra_path not in sys.path:
        sys.path.append(extra_path)

print('project path = {0}'.format(project_path))
print('data path = {0}'.format(data_path))
print('model path = {0}'.format(model_path))
print('sys.path = {0}'.format(sys.path))

In [None]:
# import matplotlib.pyplot as plt
import random
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from utils.datapath import data_path_scripts
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import precision_recall_fscore_support, classification_report
from numpy import interp 
from sklearn.metrics import confusion_matrix
import re
# from keras.layers import Input, Dense, Embedding, Conv2D, MaxPool2D
# from keras.layers import Reshape, Flatten, Dropout, Concatenate
# from keras.callbacks import ModelCheckpoint
# from keras.optimizers import Adam
# from keras.models import Model
# from keras.layers import Dropout
# from keras.optimizers import SGD, RMSprop, Adagrad, Adadelta, Adamax, Nadam, Adam
# from keras.regularizers import l1_l2
# from keras.models import Sequential
# from keras.layers import Dense
# from keras import backend
# from keras import metrics
# import tensorflow as tf
import _pickle as pickle 
import stanfordnlp
from data_helpers import load_data_and_labels

# %matplotlib inline
# plt.rcParams['figure.figsize'] = [20, 13]

# seed for Python's `random`, numpy and sklearn
# (`random` is imported above but was never seeded, so anything drawing from it
# changed from run to run)
random_state = 7
random.seed(random_state)
np.random.seed(random_state)

In [None]:
# Install gensim from the conda-forge channel via IPython's %conda magic.
# NOTE(review): gensim is not imported in any cell visible here, the version is
# not pinned, and this cell sits after the main import cell — confirm it is
# still needed and, if so, move it to the top of the notebook.
%conda install -c conda-forge gensim

In [None]:
# One-time setup: uncomment to download the English StanfordNLP models.
# stanfordnlp.download('en')

In [None]:
# --- TensorFlow: at least one listed local device must mention "GPU" ---
from tensorflow.python.client import device_lib
local_devices = device_lib.list_local_devices()
assert 'GPU' in str(local_devices)
print('Tensorflow recognizes GPUs')

# --- Keras: the TensorFlow backend must report at least one GPU as well ---
from keras import backend
available_gpu = backend.tensorflow_backend._get_available_gpus()
gpu_count = len(available_gpu)
assert gpu_count > 0
print('number of available GPUs = {0}'.format(gpu_count))
print('list of GPUs = {0}\n'.format(available_gpu))

In [None]:
def resulting_confusion_matrix(y, y_pred, file_name, train_test, fig_number):
    """
    Print binary-classification metrics and draw a three-panel summary figure.

    Prints the class balance of `y`, the ROC AUC (computed both with
    `roc_curve` + `auc` and with `roc_auc_score`) and sklearn's classification
    report, then draws on figure `fig_number`: the raw confusion matrix
    (subplot 221), the row-normalized confusion matrix (subplot 222) and the
    ROC curve (subplot 223).

    Arguments:
        y           -- ground-truth labels (array-like); entries equal to 0 and 1
                       are counted as the two classes
        y_pred      -- predictions, handed unchanged to `roc_curve`,
                       `roc_auc_score`, `classification_report` and
                       `confusion_matrix`
        file_name   -- unused; kept so existing callers keep working
        train_test  -- label used in the "<label> results" heading
        fig_number  -- matplotlib figure identifier to draw on

    Returns None. Raises ValueError when `y` is empty.

    Side effect: sets numpy's global print precision to 2.
    """
    # Imported locally because the notebook-level matplotlib import is
    # commented out, which left `plt` undefined in this function.
    import matplotlib.pyplot as plt

    # Accept plain lists as well as arrays / Series for the boolean masks below.
    y = np.asarray(y)
    n_total = y.shape[0]
    if n_total == 0:
        raise ValueError('y is empty: nothing to evaluate')

    print()
    print("\n{0} results\n".format(train_test))

    n_true = len(y[y == 0])
    n_false = len(y[y == 1])

    # The shares are fractions, so format them as real percentages instead of
    # printing e.g. "(% 0.75)".
    print('total data points = {0}'.format(n_total))
    print('true data points = {0} ({1:.1%})'.format(n_true, n_true / n_total))
    print('false data points = {0} ({1:.1%})'.format(n_false, n_false / n_total))

    fpr, tpr, _ = roc_curve(y, y_pred)
    roc_score = auc(fpr, tpr)
    r_auc_score = roc_auc_score(y, y_pred)

    print('roc score = {0}'.format(roc_score))
    print('roc auc score = {0}'.format(r_auc_score))

    print()
    print("confusion matrix")

    # One list serves both the text report and the plots.
    class_names = ['Show', 'No-Show']
    print(classification_report(y, y_pred, target_names=class_names))

    # Compute confusion matrix
    cnf_matrix = confusion_matrix(y, y_pred)
    np.set_printoptions(precision=2)

    # Select the target figure once; every subplot below is added to it.
    plt.figure(fig_number)

    # Plot non-normalized confusion matrix
    plt.subplot(221)
    plot_confusion_matrix(cnf_matrix, classes=class_names, title='Confusion matrix, without normalization')
    print()

    # Plot normalized confusion matrix
    plt.subplot(222)
    plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=True,
                          title='Normalized confusion matrix')
    print()

    # Plot the ROC curve against the chance diagonal
    plt.subplot(223)
    lw = 2
    plt.plot(fpr, tpr, color='darkorange',
             lw=lw, label='ROC curve (area = %0.2f)' % roc_score)
    plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")

    print()

In [None]:
def plot_confusion_matrix(cm, classes, normalize=False, title='Confusion matrix', cmap=None):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.

    Arguments:
        cm         -- square confusion matrix (array-like), rows = true labels
        classes    -- tick labels, one per row/column of `cm`
        normalize  -- when True, divide every row by its sum before plotting;
                      rows that sum to zero are shown as all zeros
        title      -- title of the plot
        cmap       -- matplotlib colormap; None (default) selects `plt.cm.Blues`

    Draws on the current matplotlib axes and returns the `matplotlib.pyplot`
    module.
    """
    # Local imports: neither `itertools` nor `matplotlib.pyplot` is imported at
    # notebook level. The former default `cmap=plt.cm.Blues` was evaluated at
    # definition time and raised NameError before the function even existed.
    import itertools
    import matplotlib.pyplot as plt

    if cmap is None:
        cmap = plt.cm.Blues

    cm = np.asarray(cm)
    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # Divide only where the row has samples; a class that never occurs
        # would otherwise produce NaN cells and a runtime warning.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Annotate every cell; use white text on cells darker than the midpoint.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

    return plt

In [None]:
def auc_metric(y_true, y_pred):
    """
    AUC metric built on TensorFlow's `tf.metrics.auc`.

    Arguments:
        y_true  -- ground-truth tensor, passed to `tf.metrics.auc`
        y_pred  -- prediction tensor, passed to `tf.metrics.auc`

    Returns element [1] of the value returned by `tf.metrics.auc`.

    NOTE(review): `tf.metrics.auc`, `tf.local_variables_initializer` and
    `backend.get_session` are TensorFlow 1.x / standalone-Keras APIs — confirm
    the installed versions still provide them.
    """
    # Local imports: the notebook-level `import tensorflow as tf` is commented
    # out, and `backend` is otherwise only bound if the GPU-check cell has run.
    import tensorflow as tf
    from keras import backend

    # Named `auc_op` so it does not shadow sklearn's `auc` imported at the top.
    auc_op = tf.metrics.auc(y_true, y_pred)[1]
    # Initialize local variables in the Keras session before the op is used.
    backend.get_session().run(tf.local_variables_initializer())
    return auc_op

In [None]:
# List the contents of the configured dataset directory. Uses `data_path`
# instead of the hard-coded "../dataset", which only matched the fallback
# (non-Windows, non-macOS) layout, and avoids relying on a shell `ls`.
sorted(entry.name for entry in Path(data_path).iterdir())

In [None]:
# Select which loader produces `data`.
person = 'victor'
if person == 'victor':
    # SECURITY NOTE: unpickling can execute arbitrary code — only load corpus
    # files that come from a trusted source.
    # The `with` block closes the file handle, which was previously leaked.
    with Path(data_path, 'preprocessed_corpus.pkl').open('rb') as corpus_file:
        data = pickle.load(corpus_file)
elif person == 'wei':
    data = load_data_and_labels(file_path=data_path)
else:
    # Fail loudly instead of leaving `data` undefined for the cells below.
    raise ValueError('unknown person: {0!r}'.format(person))

In [None]:
# Quick look at the first rows of the loaded data.
# NOTE(review): assumes `data` exposes a pandas-style `.head()` for both loaders
# in the previous cell — confirm for the 'wei' branch.
data.head()

In [None]:
# Build the default StanfordNLP neural pipeline (English) and run it on a
# two-sentence sample text.
stan_nlp = stanfordnlp.Pipeline()
doc = stan_nlp("Barack Obama was born in Hawaii.  He was elected president in 2008.")
# print_dependencies() writes the dependency parse of the first sentence to
# stdout itself and returns None, so wrapping it in print() only added a stray
# "None" line to the output.
doc.sentences[0].print_dependencies()

# Request sentence-level sentiment for four short sentences, asking for JSON
# output with a timeout value of 1000.
# NOTE(review): `annotate(text, properties={...})` with 'annotators' /
# 'outputFormat' keys looks like the CoreNLP server-client API; confirm that the
# `stanfordnlp.Pipeline` object created above actually provides this method.
res = stan_nlp.annotate("I love you. I hate him. You are nice. He is dumb",
                   properties={
                       'annotators': 'sentiment',
                       'outputFormat': 'json',
                       'timeout': 1000,
                   })
# One line per sentence: index, space-joined token words, sentiment value and
# sentiment label, all read from the JSON-style result.
for s in res["sentences"]:
    print("%d: '%s': %s %s" % (
        s["index"],
        " ".join([t["word"] for t in s["tokens"]]),
        s["sentimentValue"], s["sentiment"]))