Simple sentence classification

In [None]:
import os
from pathlib import Path
import sys

In [None]:
# Project layout: the project root is taken to be the parent of the current
# working directory, with `src` and `utils` folders directly beneath it.
project_name = 'clpsych'
project_path = Path(os.getcwd()).parent
src_path = Path(project_path, 'src')
utils_path = Path(project_path, 'utils')

# Platform-specific locations of the dataset, saved models and embeddings.
# Every branch must define data_path, model_path and embedding_path because
# all three are printed (and data_path is read from) further down.
if sys.platform == "win32":
    # Raw strings: the backslashes are path separators, not escape sequences.
    data_path = r'D:\Dataset\{0}\dataset'.format(project_name)
    embedding_path = r'D:\Dataset\{0}\embedding'.format(project_name)
    # NOTE(review): model_path points at the embedding folder here; this looks
    # like a copy-paste slip but is kept unchanged - confirm the intended folder.
    model_path = r'D:\Dataset\{0}\embedding'.format(project_name)

elif sys.platform == 'darwin':
    data_path = '/Volumes/Dataset/{0}/dataset'.format(project_name)
    embedding_path = '/Volumes/Dataset/{0}/embedding'.format(project_name)
    # model_path was previously undefined on macOS, which made the print below
    # fail with NameError. The folder name mirrors the generic branch.
    # NOTE(review): confirm this is where models are stored on this volume.
    model_path = '/Volumes/Dataset/{0}/models'.format(project_name)

else:
    data_path = Path(project_path, 'dataset')
    model_path = Path(project_path, 'models')
    embedding_path = Path(project_path, 'embedding')

# Make the project, utils and src folders importable. Each entry is checked
# individually against the list (not a substring of the joined string), so
# re-running this cell never adds duplicates.
for extra_path in (project_path, utils_path, src_path):
    if str(extra_path) not in sys.path:
        sys.path.append(str(extra_path))

print('project path = {0}'.format(project_path))
print('data path = {0}'.format(data_path))
print('model path = {0}'.format(model_path))
print('embedding path = {0}'.format(embedding_path))
print('sys.path = {0}'.format(sys.path))

In [None]:
# utils
import pandas as pd
import re
import numpy as np
# from utils.result_vis import class_report
# flatten list
from functools import reduce
from operator import iconcat
# processing
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# model
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV



# visualization
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import auc
from sklearn.metrics import roc_curve
from numpy import interp 

In [None]:
def clean_text(text):
    """
    Normalise a raw text string before vectorisation.

    The steps run in this order:
    - strip anything that looks like an HTML tag (``<...>``)
    - delete backslashes, single quotes and double quotes outright
    - trim surrounding whitespace and lowercase
    - turn every remaining punctuation character, tab and newline into a space

    Because punctuation is replaced *after* trimming, the result can still
    start or end with spaces.
    """
    # drop HTML tags (non-greedy, so each tag is removed separately)
    cleaned = re.sub(r'<.*?>', '', text)

    # remove the characters [\], ['] and ["] without leaving a gap
    for pattern in (r"\\", r"\'", r"\""):
        cleaned = re.sub(pattern, "", cleaned)

    # trim, then lowercase
    cleaned = cleaned.strip().lower()

    # every character listed here becomes a single space
    filters = '!"\'#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
    to_space = str.maketrans(dict.fromkeys(filters, " "))

    return cleaned.translate(to_space)

In [None]:
# user_info = pd.read_csv(Path(data_path, 'post_user_risk_tb.csv'))
# user_info = user_info.set_index('user_id')
# user_info.drop(['post_id', 'subreddits_x', 'subreddits_y', 'timestamp'], axis=1, inplace=True)
# title_body = user_info['post_title'].astype(str) + user_info['post_body'].astype(str)
# n_title_body = [x.replace('nan', '') for x in title_body]
# user_info['title_body'] = n_title_body
# user_info.drop(['post_title', 'post_body'], axis=1, inplace=True)
# user_info.to_csv(Path(data_path, 'user_id_risk_label_title_body.csv'))
# grouping all the title and body
# data = data.groupby('user_id').agg({'title_body': ' '.join, 
#                                               'risk_label':'first' }).reset_index()
# # removing control
# print(data.shape)
# data = data[~pd.isnull(data['risk_label'])]
# print(data.shape)
# data = data.set_index('user_id')
# data.columns = ['title_body', 'target']

In [None]:
def class_report(y_true, y_pred, y_score=None, average='macro', labels=None):
    """
    Build a per-class classification report as a DataFrame.

    The result has one row per class label plus a final 'avg / total' row,
    and the columns 'precision', 'recall', 'f1-score', 'support' and
    'pred-cnt' (how often each label was predicted). When ``y_score`` is
    given an 'AUC' column is added as well.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels; must have the same shape as ``y_true``.
    y_score : 2-D array-like, optional
        Per-class scores with one column per entry of ``labels``, in the same
        order (e.g. the output of ``predict_proba``).
    average : str, default 'macro'
        Averaging mode passed to ``precision_recall_fscore_support`` for the
        'avg / total' row. It also selects how the averaged AUC is computed:
        'micro' and 'macro' are handled; for any other value no averaged AUC
        is produced.
    labels : array-like, optional
        Class labels to report, in the column order of ``y_score``. Defaults
        to the sorted union of the labels found in ``y_true`` and ``y_pred``,
        so a class that is never predicted still gets a row (with a
        'pred-cnt' of 0) and score columns stay aligned with their class.

    Returns
    -------
    pandas.DataFrame or None
        The report, or ``None`` (after printing a message) when ``y_true``
        and ``y_pred`` differ in shape.

    Raises
    ------
    ValueError
        If ``y_score`` is not 2-D or its number of columns differs from the
        number of labels, since the AUC values would otherwise be computed
        against the wrong class.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    if y_true.shape != y_pred.shape:
        print("Error! y_true {0} is not the same shape as y_pred {1}".format(
              y_true.shape,
              y_pred.shape)
        )
        return

    # Use every label that occurs, not only the predicted ones: deriving the
    # labels from y_pred alone drops never-predicted classes from the report
    # and shifts the y_score column that each remaining class is paired with.
    if labels is None:
        labels = np.union1d(y_true, y_pred)
    else:
        labels = np.asarray(labels)
    n_classes = len(labels)

    lb = LabelBinarizer()
    if y_true.ndim == 1:
        # Fit on the full label set so the binarised y_true has the same
        # column layout as y_score in the micro-average branch below.
        lb.fit(labels)

    # Value counts of predictions; labels that were never predicted count 0
    pred_labels, cnt = np.unique(y_pred, return_counts=True)
    pred_cnt = pd.Series(cnt, index=pred_labels).reindex(labels, fill_value=0)

    metrics_summary = precision_recall_fscore_support(
            y_true=y_true,
            y_pred=y_pred,
            labels=labels)

    avg = list(precision_recall_fscore_support(
            y_true=y_true,
            y_pred=y_pred,
            labels=labels,
            average=average))

    metrics_sum_index = ['precision', 'recall', 'f1-score', 'support']
    class_report_df = pd.DataFrame(
        list(metrics_summary),
        index=metrics_sum_index,
        columns=labels)

    support = class_report_df.loc['support']
    total = support.sum()
    # The averaged call returns no support value, so the total is used instead
    class_report_df['avg / total'] = avg[:-1] + [total]

    class_report_df = class_report_df.T
    class_report_df['pred-cnt'] = pred_cnt
    # Single .loc assignment instead of chained `df[col].iloc[-1] = ...`,
    # which writes to a temporary under pandas Copy-on-Write.
    class_report_df.loc['avg / total', 'pred-cnt'] = total

    if y_score is not None:
        y_score = np.asarray(y_score)
        if y_score.ndim != 2 or y_score.shape[1] != n_classes:
            raise ValueError(
                "y_score has shape {0} but {1} label(s) are being reported; "
                "pass labels= in the column order of y_score "
                "(e.g. model.classes_)".format(y_score.shape, n_classes))

        # false positive rate
        fpr = dict()
        # true positive rate
        tpr = dict()
        roc_auc = dict()

        # One-vs-rest ROC curve per class
        for label_ix, label in enumerate(labels):
            fpr[label], tpr[label], _ = roc_curve(
                (y_true == label).astype(int),
                y_score[:, label_ix])

            roc_auc[label] = auc(fpr[label], tpr[label])

        if average == 'micro':
            if n_classes <= 2:
                # Binary case: the binarised y_true is a single column, so it
                # is compared against the score of the second label only.
                fpr["avg / total"], tpr["avg / total"], _ = roc_curve(
                    lb.transform(y_true).ravel(),
                    y_score[:, 1].ravel())
            else:
                fpr["avg / total"], tpr["avg / total"], _ = roc_curve(
                        lb.transform(y_true).ravel(),
                        y_score.ravel())

            roc_auc["avg / total"] = auc(
                fpr["avg / total"],
                tpr["avg / total"])

        elif average == 'macro':
            # First aggregate all false positive rates
            all_fpr = np.unique(np.concatenate([
                fpr[label] for label in labels]
            ))

            # Then interpolate all ROC curves at these points
            mean_tpr = np.zeros_like(all_fpr)
            for label in labels:
                mean_tpr += np.interp(all_fpr, fpr[label], tpr[label])

            # Finally average it and compute AUC
            mean_tpr /= n_classes

            fpr["macro"] = all_fpr
            tpr["macro"] = mean_tpr

            roc_auc["avg / total"] = auc(fpr["macro"], tpr["macro"])

        # Aligned on the row labels; rows without an entry stay NaN
        class_report_df['AUC'] = pd.Series(roc_auc)

    return class_report_df

In [None]:
# Load the modelling table; later cells read its `user_id`, `title_body`
# and `target` columns.
dataset_file = Path(data_path, 'user_id_risk_label_title_body.csv')
dataset = pd.read_csv(dataset_file)

In [None]:
# Cross-validation split assignments. The file has no header row, so the
# column names are supplied here: the fold number, the split name (compared
# against 'training' / 'test' later on) and the user id.
split_columns = ['fold', 'train_text', 'user_id']
splits_file = Path(data_path, 'clpsych19_public_crossvalidation_splits.csv')

# keep non-control user ids (rows whose user_id is greater than 0)
k_fold = (
    pd.read_csv(splits_file, header=None, names=split_columns)
    .loc[lambda splits: splits['user_id'] > 0]
)

In [None]:
# Train and evaluate a calibrated linear SVM on TF-IDF features for each of
# the predefined folds, collecting one classification report per fold.
data = dataset.copy()
ave_f1_scores = list()
# Per-fold reports are gathered here and concatenated once after the loop;
# DataFrame.append no longer exists in pandas >= 2.0.
fold_reports = list()

print('5 fold CV starting')
for fold_ix in range(1, 6):

    print('fold = {0}'.format(fold_ix))

    # user ids on the training / test side of this fold
    in_fold = k_fold['fold'] == fold_ix
    train_ix = k_fold[in_fold & (k_fold['train_text'] == 'training')]['user_id']
    test_ix = k_fold[in_fold & (k_fold['train_text'] == 'test')]['user_id']

    train_df = data[data.user_id.isin(train_ix)]
    test_df = data[data.user_id.isin(test_ix)]

    y_train = train_df['target']
    y_test = test_df['target']

#     vectorizer = CountVectorizer(stop_words="english",
#                          preprocessor=clean_text)
    vectorizer = TfidfVectorizer(stop_words="english",
                                 preprocessor=clean_text,
                                 ngram_range=(1, 2))

    # The vocabulary is fitted on the training fold only and then reused
    # unchanged for the test fold.
    x_train = vectorizer.fit_transform(train_df['title_body'])
    x_test = vectorizer.transform(test_df['title_body'])

    # Training: LinearSVC has no predict_proba, so it is wrapped in a
    # calibrator to obtain the per-class scores used for the AUC.
    svm = LinearSVC()
    model = CalibratedClassifierCV(svm, cv=None)
    model.fit(x_train, y_train)

    y_pred_test = model.predict(x_test)
    y_score = model.predict_proba(x_test)

    report_with_auc = class_report(y_true=y_test, y_pred=y_pred_test, y_score=y_score, average='macro')

    # macro-averaged F1 of this fold (the 'avg / total' row of the report)
    ave_f1_scores.append(report_with_auc.loc['avg / total', 'f1-score'])

    # Index the report by (Fold, Risk-Factor); the fold number is written on
    # the first row only so the stacked table reads like grouped sections.
    cv_column = [fold_ix]
    cv_column.extend([''] * (report_with_auc.index.shape[0] - 1))
    report_with_auc['Fold'] = cv_column
    report_with_auc['Risk-Factor'] = report_with_auc.index
    report_with_auc = report_with_auc.set_index(['Fold', 'Risk-Factor'])

    fold_reports.append(report_with_auc)

# Stack the per-fold reports in fold order
report_with_auc_df = pd.concat(fold_reports)


In [None]:
# Per-fold classification reports stacked into one table, indexed by (Fold, Risk-Factor)
report_with_auc_df

In [None]:
# Mean of the per-fold averaged F1 scores collected during cross-validation
mean_f1_all_folds = np.mean(ave_f1_scores)
print('average f1-score (all folds) = {0}'.format(mean_f1_all_folds))