In [None]:
%load_ext autoreload
%autoreload 2

import copy
    
import numpy as np
import pandas as pd
# pandas display settings: allow up to 150 rows and 150 columns in rendered output
pd.set_option("display.max_rows", 150)
pd.set_option("display.max_columns", 150)

# plotting
import matplotlib.pyplot as plt
import seaborn as sns

# processing scripts
from src.processing import data_pipeline

# training and inference code
from src.algorithms.bayes.inference import inference_naive_bayes
from src.algorithms.bayes.training import train_naive_bayes

## Pipeline

Read in the corpus and process it into a dataframe of lines for the selected doctors.

In [None]:
# path where text is stored
path = '/path/to/data'

# set doctors to use
doctors_list = [1, 4]
# set a minimum length (in tokens): only use lines that have >= min_line_len words
min_line_len = 10

# file names -> corpus -> one-row-per-line dataframe -> lines of the selected doctors
doctors_episodes = data_pipeline.get_all_episode_filenames(path)
corpus_doctors = data_pipeline.read_corpus(doctors_episodes)
df_lines_all = data_pipeline.make_line_dataframe(corpus_doctors)
df_doc = data_pipeline.get_doctor_lines(df_lines_all,
                                        min_line_len=min_line_len,
                                        doctors_list=doctors_list)

# number of stories for each doctor
for k, v in doctors_episodes.items():
    print(k.ljust(50), len(v))

print("\n", df_doc.shape)

# show a random preview of up to 20 rows; the sample size is clamped because
# DataFrame.sample raises ValueError when n exceeds the number of rows
display(df_doc.sample(n=min(20, len(df_doc))))


In [None]:
# peek at the first five rows of df_doc
df_doc.head(n=5)

### Training


In [None]:
# split into train and test sets
# fixed seed so the shuffle -- and therefore the split and everything trained on it --
# is the same on every run
split_seed = 42
train_frac = 0.7

labels = list(df_doc.sample(frac=1.0, random_state=split_seed).index)
n_train = int(train_frac * len(labels))

# rows whose index label falls in the first n_train shuffled labels go to train;
# test is the exact complement, so no row can land in both sets even if index
# labels repeat
is_train = df_doc.index.isin(labels[:n_train])
# filter first, then copy: gives independent frames without duplicating all of df_doc
df_train = df_doc[is_train].copy(deep=True)
df_test = df_doc[~is_train].copy(deep=True)
print(len(df_train), len(df_test))

In [None]:
# training configuration, forwarded unchanged to train_naive_bayes below
# (exact meaning of each option is defined in src.algorithms.bayes.training)
n_splits = 10
n_iter = 10
alpha = 0.3
upsample = True
use_stop_words = False
use_lemmatizer = True
use_stemmer = False
use_singularizer = False
tf_idf = False

# NOTE: df_train is rebound to the frame returned by train_naive_bayes (the code
# below reads a "preds_NB_final" column from it), so re-running this cell on its
# own feeds that returned frame back in -- re-run the split cell first.
df_train, fitted_models = train_naive_bayes(
    df_train,
    sample_col="Line",
    class_col="Doctor",
    alpha=alpha,
    n_splits=n_splits,
    n_iter=n_iter,
    upsample=upsample,
    use_stop_words=use_stop_words,
    use_lemmatizer=use_lemmatizer,
    use_stemmer=use_stemmer,
    use_singularizer=use_singularizer,
    tf_idf=tf_idf,
)

print("")
# training rows where the final prediction disagrees with the true label
misclassified = df_train.loc[df_train["preds_NB_final"] != df_train["Doctor"],
                             ["Doctor", "preds_NB_final", "Line"]]
# print up to 20 of them at random as "actual - predicted:   line"; the sample
# size is clamped because DataFrame.sample raises ValueError when n exceeds the
# number of available rows
n_show = min(20, len(misclassified))
preds = [f"{actual} - {pred}:   {line}"
         for actual, pred, line in misclassified.sample(n=n_show).values]
print("\n\n".join(preds))

In [None]:
# hand inference a deep copy, so df_test itself is not the object being passed in
df_test_for_inference = df_test.copy(deep=True)
inference_naive_bayes(df_test_for_inference, fitted_models, "Line", "Doctor")