In [6]:
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

import seaborn as sns

# Pool the two pickled frames into a single DataFrame.
# NOTE(review): the two files are read from different directories ("./uti/" vs "./");
# confirm that the second path is intentional.
# SECURITY: unpickling can execute arbitrary code -- only load files from a trusted source.
df = pd.concat([pd.read_pickle("./uti/utiTrain.pkl"), pd.read_pickle("./utiTest.pkl")])
# Shuffle every row (frac=1). The seed is fixed so that a fresh kernel run
# reproduces the same row order instead of a new random permutation each time.
df = df.sample(frac=1, random_state=42)

def plotConfMatrix(actual, pred, labels=None):
    """Draw the confusion matrix of `actual` vs. `pred` as an annotated heatmap.

    Parameters
    ----------
    actual : sequence
        Ground-truth class labels.
    pred : sequence
        Predicted class labels, aligned element-wise with `actual`.
    labels : sequence of str, optional
        Display names used for both the row ("Actual") and column
        ("Predicted") headers. They are applied positionally to the matrix
        returned by `confusion_matrix`, i.e. in that function's class order.
        Defaults to ["Exclude", "Include"].

    Raises
    ------
    ValueError
        Raised by pandas if ``len(labels)`` does not match the size of the
        computed confusion matrix (e.g. only one class is present).
    """
    # Honour the caller's labels; fall back to the notebook's default pair only
    # when none are supplied, rather than silently discarding the argument.
    if labels is None:
        labels = ["Exclude", "Include"]
    labels = list(labels)

    conf_matrix = confusion_matrix(actual, pred)
    df_cm = pd.DataFrame(conf_matrix, columns=labels, index=labels)
    df_cm.index.name = 'Actual'
    df_cm.columns.name = 'Predicted'
    plt.figure(figsize = (4,2))
    # Note: sns.set changes seaborn's global settings, not just this figure.
    sns.set(font_scale=1.2)  # label size
    # fmt='d' renders the cell counts as integers; annot_kws sets the annotation font size.
    sns.heatmap(df_cm, fmt='d', cmap="Blues", annot=True, annot_kws={"size": 12})
    plt.show()

In [7]:
# Hold out half of the pooled data for evaluation. The split is seeded so the
# fitted model and both confusion matrices are reproducible across kernel restarts.
train, test = train_test_split(df, test_size=0.5, random_state=42)

# Features come from the 'scibert' column and targets from the 'code' column.
# NOTE(review): presumably each 'scibert' entry is a fixed-length embedding vector
# and 'code' is 0 (exclude) / 1 (include) -- confirm against the data.
train_data = train['scibert'].tolist()
train_labels = train['code'].tolist()
test_data = test['scibert'].tolist()
test_labels = test['code'].tolist()

# C is the inverse regularisation strength (smaller = stronger);
# class_weight='balanced' reweights classes inversely to their frequency.
model = LogisticRegression(C=0.05, class_weight='balanced', max_iter=1000)
model.fit(train_data, train_labels)

# Display names shared by both confusion matrices.
labels = ["Exclude", "Include"]

# Confusion matrix on the training half (how well the model fits what it saw).
pred = model.predict(train_data)
plotConfMatrix(train_labels, pred, labels)

# Confusion matrix on the held-out half.
pred = model.predict(test_data)
plotConfMatrix(test_labels, pred, labels)

# Predict probability [exclusion, inclusion]
# NOTE(review): predict_proba orders its columns by model.classes_; reading them as
# [exclusion, inclusion] assumes the classes are 0 = exclude, 1 = include -- confirm.
pred = model.predict_proba(test_data)
# Calculate score (positive = exclude, negative = include)
def score(i):
    """Return the first entry of a probability row minus the second entry."""
    return i[0] - i[1]
pred = list(map(score, pred))
# Attach label to each one and sort by most likely to be included
# (ascending score, so the most negative -- most "include"-like -- rows come first).
pred = sorted(list(zip(pred, test_labels)))
# Zero-based rank positions of every item whose true label is 1.
include_positions = [x for x, y in enumerate(pred) if y[1] == 1]
if include_positions:
    last_index = include_positions[-1]
    # The printed value is a fraction in [0, 1) built from the zero-based index
    # of the last true include, not a 0-100 percentage.
    print("Percentage until last include:", last_index / len(pred))
else:
    # With no label equal to 1 in the test split there is no "last include";
    # indexing [-1] on the empty list would raise IndexError, so report it instead.
    last_index = None
    print("Percentage until last include: n/a (no includes in test labels)")
print(pred)

NameError: name 'plt' is not defined