# Imports

In [None]:
import pandas as pd
from pandas import Timestamp
import numpy as np
from operator import itemgetter
from datetime import datetime
import re
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from collections import Counter
from ast import literal_eval
import joblib

from tqdm import tqdm
tqdm.pandas()
from scipy import sparse
from imblearn.over_sampling import SMOTENC

import matplotlib.pyplot as plt
import seaborn as sns

import nltk
from nltk import word_tokenize, sent_tokenize, pos_tag
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_selection import chi2, SelectKBest
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, BaggingClassifier

from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import mean_squared_error

import plaidml.keras
import os
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras import backend as K
from keras.utils import multi_gpu_model 

In [None]:
# Hold out 20% of the rows as a test set. `stratify = y` keeps the class
# proportions the same in both splits and `random_state=1` makes the split
# reproducible.
# NOTE(review): X and y are not defined in this part of the notebook —
# presumably they are built in an earlier cell; confirm before running.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, test_size= 0.2, random_state=1)

In [None]:
# Sanity check: show the shape of every piece of the train/test split,
# in the order X_train, X_test, y_train, y_test.
for split_part in (X_train, X_test, y_train, y_test):
    print(split_part.shape)

# Training

In [None]:
'''(Put into a class later)'''
def tfidf(X_data_train, X_data_test, stopwords):
    '''Fit a TF-IDF vectoriser on the training text and apply it to both splits.

    The vocabulary and IDF weights are learned from ``X_data_train`` only;
    ``X_data_test`` is merely transformed, so no test information leaks into
    the features.

    Parameters
    ----------
    X_data_train, X_data_test : iterable of str
        Raw documents of the training and test split.
    stopwords : list of str
        Tokens to drop before n-grams are built.

    Returns
    -------
    tvec_mat_train, tvec_mat_test : sparse matrix
        TF-IDF matrices (documents x features) for the two splits.
    tfidf_feature_names : list of str
        The n-gram belonging to each column of the matrices.
    '''
    tfidf_obj = TfidfVectorizer(stop_words = stopwords,
                                sublinear_tf = True,          # use 1 + log(tf)
                                analyzer = 'word',
                                token_pattern = r'\w{2,}',    # tokens of 2+ word characters
                                ngram_range = (1, 3),         # uni-, bi- and tri-grams
                                max_features = 25000
                               )
    tvec_mat_train = tfidf_obj.fit_transform(X_data_train)
    tvec_mat_test = tfidf_obj.transform(X_data_test)

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; prefer its replacement and fall back only on old installations.
    if hasattr(tfidf_obj, 'get_feature_names_out'):
        tfidf_feature_names = list(tfidf_obj.get_feature_names_out())
    else:
        tfidf_feature_names = tfidf_obj.get_feature_names()

    return tvec_mat_train, tvec_mat_test, tfidf_feature_names
    
def chi_squared(tvec_mat_train, tvec_mat_test, y_train, y_test, tfidf_feature_names,
                k=15000, top_n=20):
    '''Plot the highest chi-squared features and keep only the best ``k`` columns.

    The selector is fitted on the training split only and then applied
    unchanged to the test split, so both returned matrices contain the same
    columns and no test labels are used for feature selection.

    Parameters
    ----------
    tvec_mat_train, tvec_mat_test : sparse matrix
        Non-negative feature matrices (e.g. TF-IDF) of the two splits.
    y_train : array-like
        Training labels used to score the features.
    y_test : array-like
        Unused; kept so existing callers keep working.
    tfidf_feature_names : sequence of str
        Name of each column of the matrices.
    k : int, default 15000
        Number of features to keep (capped at the number of columns).
    top_n : int, default 20
        Number of highest-scoring features shown in the bar chart.

    Returns
    -------
    tvec_mat_train_chi, tvec_mat_test_chi : sparse matrix
        The input matrices restricted to the selected columns.
    '''
    # Asking for more features than exist is an error in SelectKBest, so cap k.
    n_features = tvec_mat_train.shape[1]
    kbest = SelectKBest(score_func = chi2, k = min(k, n_features))
    tvec_mat_train_chi = kbest.fit_transform(tvec_mat_train, y_train)
    # transform (not fit_transform): reuse the columns chosen on the training data.
    tvec_mat_test_chi = kbest.transform(tvec_mat_test)

    # The fitted selector already holds the chi2 statistic of every column.
    chi2score = kbest.scores_
    ranked = sorted(zip(tfidf_feature_names, chi2score), key=lambda pair: pair[1])
    top = ranked[-top_n:]
    labels = [name for name, _ in top]
    scores = [score for _, score in top]
    positions = range(len(scores))

    plt.figure(figsize=(12,8))
    plt.barh(positions, scores, align='center', alpha=0.2)
    plt.plot(scores, positions, '-o', markersize=5, alpha=0.8)
    plt.yticks(positions, labels)
    # Raw string: '\c' is not a valid escape sequence in a normal string.
    plt.xlabel(r'$\chi^2$')

    return tvec_mat_train_chi, tvec_mat_test_chi

In [None]:
'''No stop words.'''
# NOTE(review): despite the title above, NLTK's standard English stop-word
# list IS passed to the vectoriser here; "no stop words" presumably means
# "no extra, domain-specific stop words" (those are added in the next section).
# NOTE(review): this rebinds the name `stopwords`, which the import cell also
# binds to the nltk.corpus.stopwords reader; from here on it is a plain list.
stopwords = nltk.corpus.stopwords.words('english')

# TF-IDF matrices for both splits plus the n-gram behind every column.
tvec_mat_train, tvec_mat_test, tfidf_feature_names = tfidf(X_train['text'],
                                                           X_test['text'], stopwords)

# Plot the top chi-squared n-grams and reduce both matrices to the selected columns.
tvec_mat_train_chi, tvec_mat_test_chi = chi_squared(tvec_mat_train, tvec_mat_test,
                                                    y_train, y_test,
                                                    tfidf_feature_names)

# Chi2

In [None]:
'''Add stop words and return top 15000 words as features.'''
# Extra tokens removed on top of NLTK's English list. Many of them are
# condition-related terms; the list is appended in exactly this order.
extra_stopwords = [
    'wa', 'ha', 'depression', 'depressive', 'depressed', 'anxiety', 'anxious',
    'panic', 'attack', 'suicide', 'bipolar', 'manic', 'mania', 'hypomania',
    'attacks', 'lithium', 'mg', 'die', 'kill', 'disorder', 'episode', 'episodes',
    'polar', 'bi',
]
stopwords = nltk.corpus.stopwords.words('english')
stopwords += extra_stopwords

# Re-vectorise both splits with the extended stop-word list.
tvec_mat_train, tvec_mat_test, tfidf_feature_names = tfidf(
    X_train['text'],
    X_test['text'],
    stopwords,
)

# Plot the strongest n-grams and keep only the selected columns.
tvec_mat_train_chi, tvec_mat_test_chi = chi_squared(
    tvec_mat_train,
    tvec_mat_test,
    y_train,
    y_test,
    tfidf_feature_names,
)

In [None]:
# Confirm the reduced matrices still line up with their label vectors.
for checked in (tvec_mat_train_chi, tvec_mat_test_chi, y_train, y_test):
    print(checked.shape)

# Standard Scaler

In [None]:
'''Scale the columns which need to be scaled.'''
scaler = StandardScaler()

# Every column except the raw text is treated as a numeric feature. The column
# list is taken from the training frame and reused by name for the test frame,
# so the scaler always sees the same columns in the same order. (The original
# built the test mask from X_train.columns, which only works while both frames
# happen to share an identical column layout.)
num_cols = [col for col in X_train.columns if col != 'text']

# Fit mean/variance on the training split only, then apply them to the test split.
X_train_std = scaler.fit_transform(X_train[num_cols])
X_test_std = scaler.transform(X_test[num_cols])

In [None]:
'''Remove columns with low chi2 score from sparse matrix and add them to other variables.'''
# Append the non-text columns of X_train to the chi2-reduced TF-IDF matrix,
# giving one combined feature matrix for the scikit-learn models.
# NOTE(review): the raw (unscaled) column values are stacked here, not the
# X_train_std array produced by the StandardScaler cell — confirm that this is
# intended, since the linear and distance-based models are scale-sensitive.
num_feats = X_train.loc[:, ~X_train.columns.isin(['text'])].values
X_train_tfidf = sparse.hstack((tvec_mat_train_chi, num_feats))

In [None]:
# Same stacking for the test split: chi2-reduced TF-IDF columns followed by
# the non-text columns of X_test. Note that `num_feats` is overwritten here.
# NOTE(review): as for the training split, these are the unscaled values.
num_feats = X_test.loc[:, ~X_test.columns.isin(['text'])].values
X_test_tfidf = sparse.hstack((tvec_mat_test_chi, num_feats))

In [None]:
# Final shape check before model fitting: combined feature matrices and labels.
for checked in (X_train_tfidf, X_test_tfidf, y_train, y_test):
    print(checked.shape)

# Grid Search

In [None]:
def docm(y_true, y_pred, labels=None):
    '''Return the confusion matrix of the predictions as a labelled DataFrame.

    Rows are the true classes, columns the predicted classes (prefixed ``p_``).

    Parameters
    ----------
    y_true, y_pred : array-like
        True and predicted class labels.
    labels : sequence, optional
        Class labels, in the order they should appear. When omitted, the
        ``classes_`` of the notebook-level grid-search object ``gs`` are used
        if it exists; otherwise rows and columns are numbered 0..n-1.

    Returns
    -------
    pandas.DataFrame
    '''
    if labels is None:
        # Look the global up defensively so the function also works before
        # any grid search has been run.
        labels = getattr(globals().get('gs'), 'classes_', None)

    if labels is None:
        cm = confusion_matrix(y_true, y_pred)
        labels = list(range(len(cm)))
    else:
        labels = list(labels)
        # Passing the labels keeps the matrix shape aligned with them even if
        # a class is missing from both y_true and y_pred.
        cm = confusion_matrix(y_true, y_pred, labels=labels)

    # f-string instead of 'p_' + c so that non-string class labels work too.
    cols = [f'p_{c}' for c in labels]
    return pd.DataFrame(cm, index=labels, columns=cols)

In [None]:
'''Models and their grid-search parameters.'''
########################################################################
# One-vs-rest logistic regression; everything not fixed here is tuned below.
LR = LogisticRegression(multi_class = 'ovr',
                        random_state = 1,
                        n_jobs=16
                       )
# The grid is split per solver because 'lbfgs' only supports the l2 penalty:
# a single dict crossing both solvers with ['l1', 'l2'] makes every
# lbfgs + l1 candidate fail to fit. GridSearchCV accepts a list of grids.
_LR_shared = {'C': np.logspace(-5, 5, 5),
              'max_iter': [500, 1000]
             }
LR_params = [{**_LR_shared, 'solver': ['lbfgs'], 'penalty': ['l2']},
             {**_LR_shared, 'solver': ['saga'], 'penalty': ['l1', 'l2']}
            ]

########################################################################

# Linear classifier trained with stochastic gradient descent. tol=None turns
# off the tolerance-based stopping criterion; the constructor's penalty is
# only a default, because the grid below searches over it.
SVM_SGD = SGDClassifier(
    n_jobs=16,
    tol=None,
    learning_rate='optimal',
    random_state=1,
    penalty='l2',
)
SVM_SGD_params = dict(
    loss=['hinge', 'squared_hinge'],
    alpha=np.linspace(1e-3, 0.5, 5),
    max_iter=[500, 1000],
    penalty=['l1', 'l2', 'elasticnet'],
)

########################################################################

# k-nearest-neighbours classifier.
KNC = KNeighborsClassifier(n_jobs=16
                          )
# 'p' is deliberately not searched: it is the exponent of the Minkowski metric
# and has no effect once the metric is fixed to 'euclidean' or 'manhattan', so
# including it only doubled the number of (identical) fits.
KNC_params = {'algorithm': ['auto'],
              'n_neighbors': [2, 10, 20],
              'weights': ['uniform', 'distance'],
              'metric': ['euclidean', 'manhattan']
             }

########################################################################

def _tree_grid():
    '''Return a fresh copy of the search grid shared by the tree-based models.'''
    return {
        'criterion': ['gini', 'entropy'],
        'max_depth': [2, 5],
        'ccp_alpha': np.linspace(0., 0.5, 3),
    }


# Random forest of 100 trees.
RFC = RandomForestClassifier(n_estimators=100, n_jobs=16, random_state=1)
RFC_params = _tree_grid()

########################################################################

# Extremely randomised trees, also 100 estimators.
ETC = ExtraTreesClassifier(n_estimators=100, n_jobs=16, random_state=1)
ETC_params = _tree_grid()

########################################################################

# Single decision tree.
DTC = DecisionTreeClassifier(random_state=1)
DTC_params = _tree_grid()

########################################################################

In [None]:
'''Grid Search'''

# Estimators to tune, keyed by the name used in the reports and file names.
models = dict(
    LogisticRegression=LR,
    SGDClassifier=SVM_SGD,
    KNeighborsClassifier=KNC,
    DecisionTreeClassifier=DTC,
    RandomForestClassifier=RFC,
    ExtraTreeClassifier=ETC,
)

# Parameter grid of each estimator, under the same keys as `models`.
params = dict(
    LogisticRegression=LR_params,
    SGDClassifier=SVM_SGD_params,
    KNeighborsClassifier=KNC_params,
    DecisionTreeClassifier=DTC_params,
    RandomForestClassifier=RFC_params,
    ExtraTreeClassifier=ETC_params,
)

score_list = []
for name in models:
    model = models[name]
    accuracy = 0

    # 5-fold cross-validated search over this model's grid.
    gs = GridSearchCV(
        estimator=model,
        param_grid=params[name],
        cv=5,
        verbose=1,
        n_jobs=16,
    )
    gs.fit(X_train_tfidf, y_train)

    # Keep the headline attributes of the finished search in notebook globals.
    best_est, best_score, best_params = (
        gs.best_estimator_,
        gs.best_score_,
        gs.best_params_,
    )

    # Score the search object on both splits.
    train_score = gs.score(X_train_tfidf, y_train)
    test_score = gs.score(X_test_tfidf, y_test)
    test_predictions = gs.predict(X_test_tfidf)

    # One summary row per model.
    score_list.append([name, train_score, test_score, best_score])

    # Persist the fitted search and its full cross-validation table.
    gs_results = pd.DataFrame(gs.cv_results_)
    joblib.dump(gs, f'{name}_model.jlib')
    gs_results.to_csv(f'{name}_results', encoding='utf-8', index=False)

    # Report: name, best CV score, per-class metrics and confusion matrix,
    # each followed by a blank line, then a separator rule.
    print(name, best_score, sep='\n\n', end='\n\n')
    print(classification_report(y_test, test_predictions, target_names=gs.classes_),
          end='\n\n')
    print(docm(y_test, test_predictions), end='\n\n')
    print('-' * 80)

In [None]:
# Collect the per-model summary rows gathered during the grid search into a table.
score_columns = ['model', 'train_score', 'test_score', 'best_score']
df_scores = pd.DataFrame(data=score_list, columns=score_columns)
df_scores

# Write the table to disk (absolute, machine-specific path; no index column).
df_scores.to_csv('/Users/francesco/df_scores', index=False, encoding='utf-8')

# Neural Network

In [None]:
# Pair each distinct value of df.subreddit with an integer code 0-3
# (at most four pairs, because the code range has four entries).
d = {label: code for label, code in zip(df.subreddit.unique(), range(4))}
d

In [None]:
# Replace the label values in both target vectors with their integer codes
# from `d`, as required by the sparse categorical cross-entropy loss used below.
# Values that are not keys of `d` become NaN; na_action='ignore' passes
# existing NaN values through without looking them up.
y_train = y_train.map(d, na_action='ignore')
y_test = y_test.map(d, na_action='ignore')

In [None]:
# Register PlaidML as the Keras backend and fix NumPy's random seed.
# NOTE(review): the import cell imports keras before this cell runs; backend
# selection normally has to happen before keras is first imported, so this
# may have no effect on an already-imported keras — confirm with the
# backend check in the next cell.
plaidml.keras.install_backend()
os.environ["KERAS_BACKEND"] = "plaidml.keras.backend"
np.random.seed(1337)

In [None]:
# Display the name of the backend Keras is actually using.
keras.backend.backend()

In [None]:
# Fully connected network: 256 -> dropout -> 128 -> 64 -> 4-way softmax.
model = keras.Sequential()
alpha = 0.001  # L2 regularisation factor shared by every Dense layer

# The input width is taken from the scaled feature array, so the network must
# be trained and evaluated on X_train_std / X_test_std.
model.add(keras.layers.Dense(256, activation='relu', input_shape=(X_train_std.shape[1], ), kernel_regularizer=keras.regularizers.l2(alpha)))
model.add(keras.layers.Dropout(0.2))
model.add(keras.layers.Dense(128, activation='relu', kernel_regularizer=keras.regularizers.l2(alpha)))
model.add(keras.layers.Dense(64, activation='relu', kernel_regularizer=keras.regularizers.l2(alpha)))

# One output unit per class code (the label mapping uses codes 0-3).
model.add(keras.layers.Dense(4, activation='softmax', kernel_regularizer=keras.regularizers.l2(alpha)))

# The model has always been compiled with Adadelta. The previous
# `optimizer = SGD(lr=0.001)` object was never passed to compile(), so
# `optimizer` now names the optimiser that is really used; training behaviour
# is unchanged.
optimizer = keras.optimizers.Adadelta()

# Integer class codes as targets -> sparse categorical cross-entropy.
model.compile(loss='sparse_categorical_crossentropy',
              optimizer=optimizer,
              metrics=['accuracy'])

In [None]:
# Stop training once the validation loss has not improved for 30 epochs.
early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=30)

EPOCHS = 100

# Train on the scaled feature array: the network's input layer was sized with
# X_train_std.shape[1] and it is later evaluated on X_test_std, so fitting on
# the TF-IDF matrix (as before) does not match the model's input shape.
# 20% of the training rows are held out for validation.
history = model.fit(
  X_train_std, y_train,
  epochs=EPOCHS, validation_split = 0.2, verbose=1, callbacks=[early_stop], batch_size=256)

In [None]:
# Draw the training loss and then the validation loss, one point per epoch.
for curve in ('loss', 'val_loss'):
    plt.plot(history.history[curve])
plt.show()

In [None]:
# Model output for every test row; with the softmax output layer this is one
# probability per class. (The misspelt name is used by later cells, so it is kept.)
predicitions = model.predict(X_test_std)

In [None]:
# Names of the values that model.evaluate() reports, in order.
model.metrics_names

In [None]:
# Loss and metrics on the held-out test split (y_test holds integer codes here).
model.evaluate(X_test_std, y_test, verbose=2)

In [None]:
# Test accuracy computed independently of Keras: the arg-max over the class
# axis turns each row of probabilities into a predicted class code.
accuracy_score(y_test, predicitions.argmax(axis=1))

# Visualise Results

In [None]:
def docm(y_true, y_pred, labels=None):
    '''Return the confusion matrix of the predictions as a labelled DataFrame.

    Rows are the true classes, columns the predicted classes (prefixed ``p_``).

    Parameters
    ----------
    y_true, y_pred : array-like
        True and predicted class labels.
    labels : sequence, optional
        Class labels in display order. When omitted, ``model.classes_`` is
        used if the notebook-level ``model`` has such an attribute (a Keras
        ``Sequential`` does not, which made the previous version raise
        AttributeError); otherwise classes are numbered 0..n-1.

    Returns
    -------
    pandas.DataFrame
    '''
    if labels is None:
        labels = getattr(globals().get('model'), 'classes_', None)

    if labels is None:
        cm = confusion_matrix(y_true, y_pred)
        labels = list(range(len(cm)))
    else:
        labels = list(labels)
        # Fix the row/column order to the given labels.
        cm = confusion_matrix(y_true, y_pred, labels=labels)

    cols = [f'p_{c}' for c in labels]
    return pd.DataFrame(cm, index=labels, columns=cols)

In [None]:
# Reverse lookup of `d`: integer code -> label.
inv_d = dict(zip(d.values(), d.keys()))
inv_d

In [None]:
# Turn the integer codes in y_test back into their labels using inv_d.
# After this cell y_test no longer holds integer codes, so it cannot be
# compared directly with integer arg-max predictions.
y_test = y_test.map(inv_d, na_action='ignore')
y_test

In [None]:
def docm(y_true, y_pred, labels=None):
    '''Return the confusion matrix as a DataFrame labelled with class names.

    Both inputs may contain either class names or the integer codes defined in
    the notebook-level mapping ``d`` (name -> code); codes are translated to
    names first, so mixed inputs (e.g. named ``y_true`` and arg-max codes as
    ``y_pred``) are handled.

    Parameters
    ----------
    y_true, y_pred : iterable
        True and predicted classes, as names or codes.
    labels : sequence, optional
        Class names in display order; defaults to the keys of ``d``.

    Returns
    -------
    pandas.DataFrame
        Rows are true classes, columns predicted classes.
    '''
    names = list(d) if labels is None else list(labels)
    code_to_name = {code: name for name, code in d.items()}

    def _as_names(values):
        # Values that are not known codes (i.e. already names) pass through.
        return [code_to_name.get(value, value) for value in values]

    # Passing `labels` pins the row/column order to `names`; without it the
    # matrix is ordered by sorted label value, which need not match d's order.
    cm = confusion_matrix(_as_names(y_true), _as_names(y_pred), labels=names)
    return pd.DataFrame(cm, index=names, columns=names)

In [None]:
def plot_confusion_matrix(cm,
                          target_names,
                          title='Confusion matrix',
                          cmap=None,
                          normalize=True,
                          save_path="/Users/francesco/cmat_.png"):
    '''Draw a confusion matrix as an annotated heat map, save it and show it.

    Parameters
    ----------
    cm : array-like, shape (n_classes, n_classes)
        Confusion matrix of counts; rows are drawn as true labels, columns as
        predicted labels.
    target_names : sequence of str or None
        Tick labels for both axes; ticks are left numeric when None.
    title : str
        Figure title.
    cmap : matplotlib colormap, optional
        Defaults to 'BuGn'.
    normalize : bool
        If True, show each row as fractions of its total instead of counts.
    save_path : str or None
        Where the PNG is written; pass None to skip saving.
    '''
    import itertools

    cm = np.asarray(cm)
    total = float(np.sum(cm))
    # Accuracy is always computed from the raw counts.
    accuracy = np.trace(cm) / total if total else float('nan')
    misclass = 1 - accuracy

    if cmap is None:
        cmap = plt.get_cmap('BuGn')

    if normalize:
        # Row-normalise; rows without any samples stay at 0 instead of
        # producing NaN through a division by zero.
        row_sums = cm.sum(axis=1, keepdims=True)
        cm = np.divide(cm.astype('float'), row_sums,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_sums != 0)

    # A single figure (the previous extra plt.figure call left an empty one
    # behind). The heat map is drawn from the same matrix that is annotated,
    # so the white/black text threshold matches the cell colours.
    plt.figure(figsize=(8, 6))
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.colorbar()

    if target_names is not None:
        tick_marks = np.arange(len(target_names))
        plt.xticks(tick_marks, target_names, rotation=45)
        plt.yticks(tick_marks, target_names)

    # Cells darker than the threshold get white text for legibility.
    thresh = cm.max() / 1.5 if normalize else cm.max() / 2
    cell_format = "{:0.4f}" if normalize else "{:,}"
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cell_format.format(cm[i, j]),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    # Honour the caller's title (it used to be overwritten by a fixed string).
    plt.title(title, fontsize=38, color='orange')
    plt.ylabel('True label', fontsize=18)
    plt.xlabel('Predicted label\naccuracy={:0.4f}; misclass={:0.4f}'.format(accuracy, misclass), fontsize=18)
    plt.tight_layout()
    if save_path:
        plt.savefig(save_path, transparent=False, dpi=300)

    plt.show()

In [None]:
# Hard-coded 4x4 count matrix for the neural network; the plotting function
# draws rows as true labels and columns as predicted labels.
# NOTE(review): presumably copied from an earlier run's output — confirm.
nn_cm = np.array([
    [16420, 1102, 10293, 401],
    [1392, 7687, 4834, 462],
    [4191, 2042, 69423, 7041],
    [565, 366, 17166, 13743],
])
nn_class_names = ['anxiety', 'bipolar', 'depression', 'suicidewatch']

# Plot raw counts rather than row-normalised fractions.
plot_confusion_matrix(
    cm=nn_cm,
    target_names=nn_class_names,
    title="Confusion Matrix NN",
    normalize=False,
)

In [None]:
# y_test was decoded back to class names in an earlier cell, while the
# network's arg-max output is an integer code; comparing the two directly
# mixes label types. Decode both sides through inv_d (values that are already
# names pass through unchanged) so the report always compares names with names.
class_names = list(d)
true_labels = [inv_d.get(value, value) for value in y_test]
pred_labels = [inv_d.get(code, code) for code in predicitions.argmax(axis=1)]

# `labels` pins the row order of the report to the order of the names in d.
print(classification_report(true_labels, pred_labels,
                            labels=class_names, target_names=class_names))
print()
print(docm(true_labels, pred_labels))

In [None]:
# ROC curves for the neural network, drawn with scikit-plot.
import scikitplot as skplt
import matplotlib.pyplot as plt


# True labels and the model outputs for every test row.
# NOTE(review): y_test holds class names at this point, while the columns of
# `predicitions` follow the integer codes in `d`; presumably plot_roc matches
# columns to the sorted class labels — confirm that both orders agree.
y_true = y_test
y_probas = predicitions
skplt.metrics.plot_roc(y_true, y_probas, figsize=(10, 10), text_fontsize=18)
plt.title('ROC Curves NN', fontsize=38, color='orange')

# Save with a transparent background to an absolute, machine-specific path, then display.
plt.savefig("/Users/francesco/ROC.png", transparent=True, dpi=300)
plt.show()