In [None]:
import pandas as pd
from pandasql import sqldf
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import precision_recall_curve
from sklearn.model_selection import cross_val_score
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [None]:
# Notebook display settings: truncate long text cells at 70 characters and
# remove the cap on how many rows a DataFrame repr may show.
pd.options.display.max_colwidth = 70
pd.options.display.max_rows = None

In [None]:
def pysqldf(q):
    """Run the SQL query `q` through pandasql's sqldf.

    The notebook's global namespace is passed as the environment, so the
    query can refer to module-level variables by name.
    """
    return sqldf(q, globals())

In [None]:
#nltk.download('stopwords')
#nltk.download('wordnet')

In [None]:
# Workbook-level settings:
#   text_source - name of the column whose text is fed to the model
#   tag         - substring matched against `subject` to build the label
text_source, tag = "transcript", "opioid"

In [None]:
# WordNet lemmatizer instance (not referenced again in the visible cells).
lemmatizer = WordNetLemmatizer()
# English stop-word list from NLTK; passed to CountVectorizer further down.
stop = stopwords.words('english')

In [None]:
# Load the two input tables, in order: the human classifications and the
# transcript-accuracy rows that carry the text column used for modelling.
df_human, df_ml = map(
    pd.read_csv,
    ("data/classifications.csv", "data/transcript_accuracy.csv"),
)

In [None]:
# Peek at the first rows loaded from transcript_accuracy.csv.
df_ml.head()

In [None]:
# Show a single row loaded from classifications.csv to check its columns.
df_human.head(1)

In [None]:
# TODO: decide whether to denormalize this, or to get a list of distinct subjects instead.

In [None]:
# Inner-join the human classifications (df_human) to the ML rows (df_ml) on
# their ID columns, keeping every df_human column plus the {text_source}
# column from df_ml. Rows whose description is NULL or equal to the literal
# 'error code 224003' are dropped.
# NOTE(review): `description` is not table-qualified, so it presumably exists
# in only one of the two tables; confirm which one.
df_dataset = pysqldf(f"""
SELECT dh.*, dml.{text_source}
FROM 
    df_human dh
JOIN
    df_ml dml
ON dh.ID = dml.id
    WHERE description is not NULL
    AND description != 'error code 224003'
""")

In [None]:
# Number of rows left after the join and the description filters.
len(df_dataset)

In [None]:
# Inspect the first two joined rows.
df_dataset.head(2)

In [None]:
#df_dataset.set_index("ID", inplace=True)

In [None]:
# Build the modelling frame from df_dataset: keep ID, subject and the
# {text_source} column, and derive a binary `category` label that is 1 when
# `subject` matches LIKE '%{tag}%' and 0 otherwise.
df = pysqldf(f"""
SELECT 
    ID,
    subject,  
    {text_source},
    CASE
        WHEN subject LIKE '%{tag}%' THEN 1
        ELSE 0
    END AS category
FROM df_dataset
""")

In [None]:
# Index the modelling frame by ID. The guard keeps this cell safe to re-run:
# once ID is the index it is no longer a column, so a second set_index("ID")
# would raise KeyError.
if df.index.name != "ID":
    df = df.set_index("ID")

In [None]:
# Show one row of the ID-indexed modelling frame.
df.head(1)

In [None]:
# Split the labelled rows 50/50 into training and test halves. A fixed
# random_state makes the split identical on every run, so results can be
# compared between runs.
df_train, df_test = train_test_split(df, test_size=0.5, random_state=42)

In [None]:
# Text-classification pipeline:
#   vect - bag-of-words counts, with the NLTK English stop words removed
#   clf  - random forest trained on those counts
# The forest is seeded so feature importances and scores are reproducible.
text_clf = Pipeline([
    ('vect', CountVectorizer(stop_words=stop)),
    ('clf', RandomForestClassifier(random_state=42))
    ])

In [None]:
# Fit only the vectorizer step on the training text, keeping the resulting
# document-term matrix so it can be inspected below.
X_train = text_clf.named_steps['vect'].fit_transform(df_train[text_source])

In [None]:
# Vocabulary terms learned by the vectorizer step.
text_clf['vect'].get_feature_names_out()

In [None]:
#X_train.todense()

In [None]:
# Dense view of the document-term matrix, with one column per vocabulary
# term, labelled with the term itself.
df_vector = pd.DataFrame(
    X_train.todense(),
    columns=text_clf['vect'].get_feature_names_out(),
)

In [None]:
# Lift the row-display cap so the transposed term table can be shown in full
# when the line below is uncommented.
pd.options.display.max_rows = None
#df_vector.T

In [None]:
# Raw text of the first training row.
df_train.iloc[0][text_source]

In [None]:
# Fit the whole pipeline (vectorizer + forest) on the training half.
# fit() returns the pipeline itself, so the result is deliberately not bound to
# X_train: that name holds the document-term matrix used by the df_vector cell.
# The trailing semicolon suppresses the estimator repr in the cell output.
text_clf.fit(df_train[text_source], df_train['category']);

In [None]:
# Pair each vocabulary term with the importance the fitted forest gave it.
feature_df = pd.DataFrame(dict(
    feature=text_clf.named_steps['vect'].get_feature_names_out(),
    importance=text_clf.named_steps['clf'].feature_importances_,
))

In [None]:
# Ten most important features, ranked by descending importance (SQL version).
pysqldf("""
SELECT 
    *
FROM
    feature_df
ORDER BY
    importance DESC
""").head(10)

In [None]:
# Twenty most important features, ranked by descending importance (pandas version).
feature_df.sort_values('importance', ascending=False).head(20)

In [None]:
# 4-fold cross-validation scores for the pipeline.
# NOTE(review): the folds are drawn from df_test, not df_train; confirm that
# cross-validating on the held-out half is intended.
cross_val = cross_val_score(
    estimator=text_clf,
    X=df_test[text_source],
    y=df_test['category'],
    cv=4,
)

In [None]:
# One score per cross-validation fold.
cross_val

In [None]:
# Score the test half with the fitted pipeline: hard class predictions first,
# then the per-class probabilities behind them.
y_pred = text_clf.predict(df_test[text_source])
y_proba = text_clf.predict_proba(df_test[text_source])

In [None]:
# Only list the first ten probability rows here; the full set is attached to
# df_test as columns further down.
y_proba[:10]

In [None]:
# Class labels seen by the fitted pipeline; used to check which y_proba column
# corresponds to which label.
text_clf.classes_

In [None]:
# Split the probability matrix into its two columns: column 0 is taken as the
# "no match" probability and column 1 as the "match" probability.
# (`prob_legal` keeps its existing name because later cells reference it.)
prob_not = list(y_proba[:, 0])
prob_legal = list(y_proba[:, 1])

In [None]:
# Attach the probabilities and the hard prediction to the test rows.
# assign() builds a new frame rather than writing into the object returned by
# train_test_split, which avoids pandas' SettingWithCopyWarning and keeps the
# cell idempotent. The dict form is needed because 'y-pred' is not a valid
# keyword name.
df_test = df_test.assign(**{
    'prob_no_match': prob_not,
    'prob_match': prob_legal,
    'y-pred': y_pred,
})

In [None]:
# Decision threshold on the "match" probability.
threshold = 0.5

# Thresholded label per test row: 1 only when the probability is strictly
# above the threshold, otherwise 0.
y_adj = [0 if p <= threshold else 1 for p in prob_legal]

df_test['y_adj'] = y_adj

In [None]:
# Compare the true label with both predictions and their probabilities.
df_test[['category', 'y-pred', 'y_adj', 'prob_no_match', 'prob_match']].head()

In [None]:
# Join the scored test rows back to df on ID.
# NOTE(review): df_test was split from df, so it presumably already carries
# the {text_source} and category columns; selecting them again from df would
# duplicate those column names in tag_results. Confirm this is intended.
tag_results = pysqldf(f"""
SELECT df_test.*, df.{text_source}, df.category
FROM df_test
JOIN df
ON df_test.ID = df.ID
""")

In [None]:
# First rows of the joined results.
tag_results.head()

In [None]:
# Let's look into where the model missed it: rows of tag_results whose
# category differs from the thresholded prediction y_adj.
pysqldf("""
SELECT 
    *
FROM
    tag_results
WHERE 
    category != `y_adj`
""")

In [None]:
# Hard class predictions for the test half.
# NOTE(review): y_pred is already computed by the same call in an earlier
# cell; this looks redundant unless the pipeline is refit in between. Confirm.
y_pred = text_clf.predict(df_test[text_source])

In [None]:
# everything below this line will vary based on the run

In [None]:
# Score the thresholded predictions (y_adj) against the true labels, treating
# category 1 as the positive class. train_support is kept only to unpack the
# fourth element of the returned tuple.
precision, recall, fscore, train_support = precision_recall_fscore_support(
    df_test['category'], y_adj, pos_label=1, average='binary')
# Accuracy is computed from y_adj too, so all four figures describe the same
# set of predictions. Using y_pred here would disagree with precision/recall
# as soon as `threshold` is moved away from 0.5.
print('Precision: {} / Recall: {} / Accuracy: {} / fscore: {}'.format(
    round(precision, 3),
    round(recall, 3),
    round((df_test['category'] == y_adj).mean(), 3),
    round(fscore, 3)))

In [None]:
# Recompute the F-score by hand as 2 * P * R / (P + R), as a cross-check
# against the fscore value reported above. The ratio is undefined when
# precision + recall is 0.
print('f-score', 2 * ((precision * recall) / (precision + recall)))