In [1]:

import os
import pandas as pd
import numpy as np

from matplotlib import pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import (roc_curve, roc_auc_score, auc, 
                            precision_recall_curve, classification_report
                            )
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.naive_bayes import MultinomialNB


In [3]:

# Read the labelled corpus from ./data/detect.csv (path is relative to the
# current working directory).
DATA_FILEPATH = os.path.join(os.curdir, "data", "detect.csv")
data = pd.read_csv(DATA_FILEPATH)

# Hold out 30% of the rows for evaluation. The "text" column supplies the
# documents and the "label" column the targets; the fixed random_state keeps
# the split identical from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    data["text"],
    data["label"],
    test_size=0.3,
    random_state=0,
)



In [1]:



def show_plots(params):
    """Draw one evaluation curve (ROC or precision-recall) and show it.

    Parameters
    ----------
    params : dict
        Must contain the keys:
        - "x", "y": the horizontal and vertical coordinates of the curve.
        - "curve": 'auc' labels the axes as false/true positive rate, 'prc'
          labels them as recall/precision. Any other value leaves the axes
          unlabelled.
        - "title": the figure title.

    Returns
    -------
    None. The figure is rendered with ``plt.show()``.

    Notes
    -----
    The dashed diagonal from (0, 0) to (1, 1) is the chance line of a ROC
    curve. It is not a meaningful baseline for a precision-recall curve, so it
    is omitted when ``params["curve"] == 'prc'``.
    """
    curve_type = params["curve"]

    plt.figure().set_figwidth(5)
    plt.plot(params["x"], params["y"], color='darkorange', lw=2)

    # Reference diagonal: kept for every curve type except precision-recall,
    # where it would suggest a baseline that does not apply.
    if curve_type != 'prc':
        plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')

    plt.xlim([0.0, 1.0])
    # Slight headroom above 1.0 so a curve touching the top stays visible.
    plt.ylim([0.0, 1.05])

    if curve_type == 'auc':
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
    elif curve_type == 'prc':
        plt.xlabel('Recall')
        plt.ylabel('Precision')

    plt.title(params["title"])
    plt.show()

def create_model(x_train, y_train, x_test, y_test, min_df=1, stop_words=None,
                 print_result=True, algorithm_para=1.0):
    """Fit a TF-IDF + multinomial naive Bayes classifier and score it.

    Parameters
    ----------
    x_train, x_test : raw text documents for fitting and for evaluation.
    y_train, y_test : the corresponding labels. The thresholding and the
        report's target names below assume a binary 0/1 encoding.
    min_df : forwarded to ``TfidfVectorizer(min_df=...)``.
    stop_words : forwarded to ``TfidfVectorizer(stop_words=...)``.
    print_result : when true, print the scores and a classification report
        and draw the ROC and precision-recall curves.
    algorithm_para : smoothing value passed to ``MultinomialNB(alpha=...)``.

    Returns
    -------
    (auc_score, prc_score) : ROC AUC and the area under the
        precision-recall curve, both computed on the test documents.
    """
    model_name = 'Naive-Bayes Model'

    # The vocabulary and IDF weights are learned from the training documents
    # only; the test documents are merely projected onto them.
    vectorizer = TfidfVectorizer(stop_words=stop_words, min_df=min_df)
    train_matrix = vectorizer.fit_transform(x_train)
    test_matrix = vectorizer.transform(x_test)

    classifier = MultinomialNB(alpha=algorithm_para)
    classifier.fit(train_matrix, y_train)

    # Column 1 of predict_proba is used as the positive-class score.
    pred_scores = classifier.predict_proba(test_matrix)[:, 1]

    # ROC curve points and the area under it.
    fpr, tpr, _ = roc_curve(y_test, pred_scores)
    auc_score = roc_auc_score(y_test, pred_scores)

    # Precision-recall curve points and the area under it.
    precision, recall, _ = precision_recall_curve(y_test, pred_scores)
    prc_score = auc(recall, precision)

    if not print_result:
        return auc_score, prc_score

    # Hard 0/1 predictions: a score strictly above 0.5 counts as class 1.
    bin_preds = np.where(pred_scores > 0.5, 1, 0)

    print(' AUC:{:.2%}\n'.format(auc_score), 'PRC:{:.2%}\n'.format(prc_score))
    print(classification_report(y_test, bin_preds, target_names=['0', '1']))

    roc_plot = {'x': fpr,
                'y': tpr,
                'curve': 'auc',
                'title': 'AUC of ' + model_name}
    prc_plot = {'x': recall,
                'y': precision,
                'curve': 'prc',
                'title': 'PRC of ' + model_name}
    show_plots(roc_plot)
    show_plots(prc_plot)

    return auc_score, prc_score

# Show the impact of sample size
def sample_size_impact(docs, y, print_result=False):
    """Plot test-set AUC as the share of data used for training grows.

    The data is re-split nine times with held-out fractions 0.9, 0.8, ..., 0.1
    (so the training share grows from 0.1 to 0.9). For each split a model is
    built with ``create_model`` using English stop words, and its AUC is
    recorded. The AUC values are then plotted against the held-out fraction on
    a reversed x-axis.

    Parameters
    ----------
    docs : the text documents to split.
    y : the labels aligned with ``docs``.
    print_result : when true, print the training fraction for each split and
        let ``create_model`` print its report and plots. Defaults to False,
        which matches the previous hard-coded behaviour.

    Returns
    -------
    None. The summary figure is rendered with ``plt.show()``.
    """
    # Held-out (test) fractions, largest first so the training share grows.
    test_fractions = [tenths / 10 for tenths in range(9, 0, -1)]

    performance = []
    for fraction in test_fractions:
        if print_result:
            # Rounded so float noise (e.g. 0.30000000000000004) is not shown.
            print('Training sample size: ', round(1 - fraction, 1))

        # Fixed random_state keeps each split reproducible.
        x_train, x_test, y_train, y_test = train_test_split(
            docs, y, test_size=fraction, random_state=0)

        # Named auc_score so the imported sklearn `auc` function is not shadowed.
        auc_score, _ = create_model(x_train, y_train, x_test, y_test, min_df=1,
                                    stop_words='english',
                                    print_result=print_result,
                                    algorithm_para=1.0)
        performance.append(auc_score)

    # Keep the usual 0.8 floor, but lower it when a split scores below that so
    # the curve is never clipped out of view.
    lowest = min(performance)
    y_floor = 0.8 if lowest >= 0.8 else max(0.0, lowest - 0.05)

    plt.figure().set_figwidth(5)
    plt.plot(test_fractions, performance, color='navy', lw=2,
             label='Model Performance')
    # x runs from 1 down to 0: moving right means less test data, more training data.
    plt.axis([1, 0, y_floor, 1])
    plt.xlabel('testing sample percentage')
    plt.ylabel('AUC')
    plt.legend()
    plt.show()


In [None]:

# baseline run: default vectorizer settings, i.e. no stop-word list is applied
auc_score, prc_score = create_model(
    x_train=x_train,
    y_train=y_train,
    x_test=x_test,
    y_test=y_test,
)



In [None]:


# same model and split, but with the vectorizer's 'english' stop-word list enabled
auc_score, prc_score = create_model(
    x_train=x_train,
    y_train=y_train,
    x_test=x_test,
    y_test=y_test,
    stop_words='english',
)



In [None]:

# re-split the full dataset at several train/test ratios and plot AUC for each
sample_size_impact(docs=data["text"], y=data["label"])
