In [None]:
import os
import sys
from pathlib import Path

In [None]:
project_name = 'gsq'
# The project root is taken to be the parent of the current working directory,
# i.e. the notebook is expected to run from a direct sub-folder of the project.
project_path = Path(os.getcwd()).parent
data_path = Path(project_path, 'dataset')

# Make the project folder and its utils folder importable.
# Each candidate is compared against the sys.path entries exactly (not as a
# substring of the joined path), so an unrelated entry that merely contains the
# project name cannot suppress the insertion, and re-running the cell does not
# add duplicates.
for extra_path in (str(project_path), str(Path(project_path, 'utils'))):
    if extra_path not in sys.path:
        sys.path.append(extra_path)

print('project path = {0}'.format(project_path))
print('data path = {0}'.format(data_path))
print('sys.path =')
sys.path

In [None]:
import pandas as pd
import numpy as np
from sklearn import metrics
from utils.datapath import data_path_scripts
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import precision_recall_fscore_support, classification_report
from numpy import interp 

In [None]:
# Location of the query dataset inside the project's data folder.
queries_path = data_path / 'final_master_aug.csv'
# Load the CSV into a DataFrame.
data = pd.read_csv(queries_path)

In [None]:
# Preview the first rows to sanity-check the load.
data.head(n=5)

In [None]:
def class_report(y_true, y_pred, y_score=None, average='micro'):
    """Build a per-class classification report, optionally with ROC AUC.

    The report has one row per label found in ``y_pred`` plus a final
    ``'avg / total'`` row, and the columns ``precision``, ``recall``,
    ``f1-score``, ``support`` and ``pred-cnt`` (number of predictions per
    label). When ``y_score`` is given an ``AUC`` column is added.

    Parameters
    ----------
    y_true : array-like exposing ``.shape``
        Ground-truth labels.
    y_pred : array-like exposing ``.shape``
        Predicted labels; must have the same shape as ``y_true``.
    y_score : 2-D array, optional
        Per-class scores, read as ``y_score[:, k]``. Column ``k`` is paired
        with the k-th sorted unique value of ``y_pred``.
        NOTE(review): this only lines up with classifier output such as
        ``predict_proba`` if every class is predicted at least once -- confirm
        against the callers.
    average : str, default 'micro'
        How the ``'avg / total'`` AUC is computed: ``'micro'`` or ``'macro'``.
        Any other value leaves that cell empty (NaN). This only affects the
        AUC; the precision/recall/f1 summary row always uses the weighted
        average.

    Returns
    -------
    pandas.DataFrame or None
        The report, or ``None`` (after printing a message) when ``y_true`` and
        ``y_pred`` differ in shape.
    """
    if y_true.shape != y_pred.shape:
        print("Error! y_true %s is not the same shape as y_pred %s" % (
              y_true.shape,
              y_pred.shape)
        )
        return

    lb = LabelBinarizer()

    # The binarizer is only fitted for 1-D targets; it is used by the
    # micro-averaged AUC below.
    if len(y_true.shape) == 1:
        lb.fit(y_true)

    # Value counts of predictions
    labels, cnt = np.unique(
        y_pred,
        return_counts=True)
    n_classes = len(labels)
    pred_cnt = pd.Series(cnt, index=labels)

    metrics_summary = precision_recall_fscore_support(
            y_true=y_true,
            y_pred=y_pred,
            labels=labels)

    avg = list(precision_recall_fscore_support(
            y_true=y_true,
            y_pred=y_pred,
            average='weighted'))

    metrics_sum_index = ['precision', 'recall', 'f1-score', 'support']
    class_report_df = pd.DataFrame(
        list(metrics_summary),
        index=metrics_sum_index,
        columns=labels)

    support = class_report_df.loc['support']
    total = support.sum()
    # The weighted call returns no usable support value, so the summary
    # column takes the summed per-class support instead.
    class_report_df['avg / total'] = avg[:-1] + [total]

    class_report_df = class_report_df.T
    class_report_df['pred-cnt'] = pred_cnt
    # Single-step .loc write: chained `df[col].iloc[-1] = ...` assigns to a
    # temporary under pandas Copy-on-Write and would leave this cell as NaN.
    class_report_df.loc['avg / total', 'pred-cnt'] = total

    if y_score is not None:
        fpr = dict()
        tpr = dict()
        roc_auc = dict()
        # One-vs-rest ROC curve per label.
        for label_it, label in enumerate(labels):
            fpr[label], tpr[label], _ = roc_curve(
                (y_true == label).astype(int),
                y_score[:, label_it])

            roc_auc[label] = auc(fpr[label], tpr[label])

        if average == 'micro':
            if n_classes <= 2:
                fpr["avg / total"], tpr["avg / total"], _ = roc_curve(
                    lb.transform(y_true).ravel(),
                    y_score[:, 1].ravel())
            else:
                fpr["avg / total"], tpr["avg / total"], _ = roc_curve(
                        lb.transform(y_true).ravel(),
                        y_score.ravel())

            roc_auc["avg / total"] = auc(
                fpr["avg / total"],
                tpr["avg / total"])

        elif average == 'macro':
            # First aggregate all false positive rates
            all_fpr = np.unique(np.concatenate([
                fpr[i] for i in labels]
            ))

            # Then interpolate all ROC curves at these points
            mean_tpr = np.zeros_like(all_fpr)
            for i in labels:
                mean_tpr += interp(all_fpr, fpr[i], tpr[i])

            # Finally average it and compute AUC
            mean_tpr /= n_classes

            fpr["macro"] = all_fpr
            tpr["macro"] = mean_tpr

            roc_auc["avg / total"] = auc(fpr["macro"], tpr["macro"])

        # Aligned on the report's index; rows without an AUC entry become NaN.
        class_report_df['AUC'] = pd.Series(roc_auc)

    return class_report_df

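## Testing health vs. non-health classification

A multinomial Naive Bayes model is trained on bag-of-words counts of the queries to predict the `health` column.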

In [None]:
# Split queries and their `health` labels into train/test parts. No test_size
# is passed, so train_test_split's default proportions apply; the fixed seed
# keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    data['query'],
    data['health'],
    random_state=7,
)

In [None]:
# Bag-of-words features; the vocabulary is learned from the training split only.
vect = CountVectorizer().fit(x_train)
x_train_vectorized = vect.transform(x_train)
# Vectorize the test split once and reuse it for every evaluation below.
x_test_vectorized = vect.transform(x_test)

clfrNB = MultinomialNB(alpha = 0.1)
clfrNB.fit(x_train_vectorized, y_train)

preds = clfrNB.predict(x_test_vectorized)
probas = clfrNB.predict_proba(x_test_vectorized)

# ROC AUC is a ranking metric, so it is scored with the predicted probability
# of the positive class rather than with hard 0/1 predictions (which collapse
# the ROC curve to a single threshold).
# Assumes `health` is a binary 0/1 column, so column 1 of predict_proba is the
# positive class -- TODO confirm.
pos_scores = probas[:, 1]

score = roc_auc_score(y_test, pos_scores)
print('roc auc score = {0}'.format(score))

# Same metric via the explicit ROC curve.
fpr, tpr, thresholds = metrics.roc_curve(y_test, pos_scores)
auc_score = metrics.auc(fpr, tpr)
print('auc = {0}\n'.format(auc_score))


print(classification_report(y_true=y_test, y_pred=preds, labels=np.unique(y_test)))

report_with_auc = class_report(
    y_true=y_test,
    y_pred=preds,
    y_score=probas,
    average='macro')

print(report_with_auc)
print()

## Testing all categories

The same model is trained to predict the integer-cast `label` column.

In [None]:
# Split queries and their integer-cast `label` values into train/test parts,
# with the same fixed seed as the health split.
x_train, x_test, y_train, y_test = train_test_split(
    data['query'],
    data['label'].astype(int),
    random_state=7,
)

In [None]:
# Bag-of-words features; the vocabulary is learned from the training split only.
vect = CountVectorizer().fit(x_train)
x_train_vectorized = vect.transform(x_train)
# Vectorize the test split once instead of once per evaluation call.
x_test_vectorized = vect.transform(x_test)

clfrNB = MultinomialNB(alpha = 0.1)
clfrNB.fit(x_train_vectorized, y_train)

# No `average` is passed, so class_report uses its default ('micro') for the
# summary AUC.
report_with_auc = class_report(
    y_true=y_test,
    y_pred=clfrNB.predict(x_test_vectorized),
    y_score=clfrNB.predict_proba(x_test_vectorized))

print(report_with_auc)