# Setup

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# df = pd.concat([pd.read_pickle("./uti/utiTrain.pkl"), pd.read_pickle("./utiTest.pkl")])
# use just training (screening) data
df = pd.read_pickle("./uti/utiTrain.pkl")
# Shuffle df
df = df.sample(frac=1).reset_index(drop=True)
df.head()

Unnamed: 0,code,abstract,scibert
0,0,"A randomized, placebo-controlled, double-blind...","[0.41320926, -0.3184793, 0.016785104, 0.625384..."
1,0,One hundred women who underwent vaginal hyster...,"[0.2548079, -0.6519264, -0.20492533, 0.2531292..."
2,0,Screening for bacteriuria by culture of voided...,"[0.14358793, -0.2668293, -0.3464644, 0.3380412..."
3,0,"In a randomized, double-blind clinical trial, ...","[0.1024194, -0.43026537, -0.34492365, 0.208245..."
4,0,BACKGROUND: The aim of this study is to compar...,"[0.102118134, 0.23197177, -0.12228052, 0.75677..."


# Helper Functions

In [2]:
# Function to plot confusion matrix
def plotConfMatrix(actual, pred, labels):
    conf_matrix = confusion_matrix(actual, pred)
    labels = ["Exclude", "Include"]
    df_cm = pd.DataFrame(conf_matrix, columns=labels, index=labels)
    df_cm.index.name = 'Actual'
    df_cm.columns.name = 'Predicted'
    plt.figure(figsize = (4,2))
    sns.set(font_scale=1.2)#for label size
    sns.heatmap(df_cm, fmt='d', cmap="Blues", annot=True, annot_kws={"size": 12})# font size
    plt.show()
    return
 
# Function to calculate probabilities of each remaining article
def calcProb(model, initial, remaining):
    # Get initial training data and labels 
    initial_data = initial['scibert'].tolist()
    initial_labels = initial['code'].tolist()

    # Fit model to initial training data
    model.fit(initial_data, initial_labels)

    # Get remaining data for testing
    remaining_data = remaining['scibert'].tolist()

    # Predict probability [exclusion, inclusion] on remaining articles
    pred = model.predict_proba(remaining_data)

    # Calculate score (positive = exclude, negative = include)
    pred = list(map(lambda x: x[0]-x[1], pred))
    # Add probability to dataframe
    remaining['prob'] = pred
    # Sort by probability
    remaining = remaining.sort_values(by=['prob']).reset_index(drop=True)
    
    return remaining

# Calculate the total number of articles screened so far (train / train + test)
def calcPercentScreened(initial, remaining):
    return (initial.index[-1] + 1) / ((remaining.index[-1] + 1) + (initial.index[-1] + 1))

# Calculate the number of articles needed to screen out of entire dataset to find all included articles
# (train + last_index / train + test)
def calcPercentNeedToScreen(initial, remaining, last_index):
    return ((initial.index[-1] + 1) + (last_index + 1)) / ((initial.index[-1] + 1) + (remaining.index[-1] + 1))

# Print stats (return false when all included articles found)
def printStats(initial, remaining):
    # Find index of last included article
    include_indicies = remaining[remaining.code == 1].index
    # Calculate total percentage of articles that need to be screened
    if (len(include_indicies) == 0):
        print("All included articles found after screening: %.2f" % (calcPercentScreened(initial, remaining) * 100))
        return False
    else:
        print("Screened: %.2f Need to screen: %.2f" % (calcPercentScreened(initial, remaining) * 100, calcPercentNeedToScreen(initial, remaining, include_indicies[-1]) * 100))
        return True

# Iterative Predition using Probabilities

In [4]:
model = LogisticRegression(C=0.05, class_weight='balanced', max_iter=1000)
all_data = df['scibert'].tolist()
all_labels = df['code'].tolist()

# Find first included article index
start = df[df.code == 1].index[0] + 1
print("Initial number screened:", start)

# Split df by minimum size to screen
initial = df.iloc[:start,:].reset_index(drop=True)
remaining = df.iloc[start:,:].reset_index(drop=True)

while(remaining.index[-1] > 0):
    remaining = calcProb(model, initial, remaining)
    if(not printStats(initial, remaining)):
        break
    # Take first remaining and append to initial (to account for extra screened article)
    initial = initial.append(remaining.iloc[0], ignore_index=True)
    remaining.drop(0, inplace=True)

Initial number screened: 18
Screened: 1.63 Need to screen: 90.60
Screened: 1.72 Need to screen: 90.24
Screened: 1.81 Need to screen: 90.96
Screened: 1.90 Need to screen: 91.14
Screened: 1.99 Need to screen: 90.24
Screened: 2.08 Need to screen: 90.05
Screened: 2.17 Need to screen: 91.14
Screened: 2.26 Need to screen: 91.32
Screened: 2.35 Need to screen: 90.14
Screened: 2.44 Need to screen: 89.96
Screened: 2.53 Need to screen: 87.34
Screened: 2.62 Need to screen: 86.44
Screened: 2.71 Need to screen: 78.84
Screened: 2.80 Need to screen: 72.51
Screened: 2.89 Need to screen: 76.31
Screened: 2.98 Need to screen: 79.20
Screened: 3.07 Need to screen: 81.56
Screened: 3.16 Need to screen: 81.19
Screened: 3.25 Need to screen: 79.75
Screened: 3.35 Need to screen: 79.93
Screened: 3.44 Need to screen: 79.57
Screened: 3.53 Need to screen: 80.47
Screened: 3.62 Need to screen: 79.93
Screened: 3.71 Need to screen: 79.75
Screened: 3.80 Need to screen: 79.75
Screened: 3.89 Need to screen: 79.02
Screened: 

Screened: 21.79 Need to screen: 78.75
Screened: 21.88 Need to screen: 79.66
Screened: 21.97 Need to screen: 78.39
Screened: 22.06 Need to screen: 78.03
Screened: 22.15 Need to screen: 78.75
Screened: 22.24 Need to screen: 78.75
Screened: 22.33 Need to screen: 78.39
Screened: 22.42 Need to screen: 78.93
Screened: 22.51 Need to screen: 75.05
Screened: 22.60 Need to screen: 72.51
Screened: 22.69 Need to screen: 60.94
Screened: 22.78 Need to screen: 55.97
Screened: 22.88 Need to screen: 56.33
Screened: 22.97 Need to screen: 55.33
Screened: 23.06 Need to screen: 55.79
Screened: 23.15 Need to screen: 58.14
Screened: 23.24 Need to screen: 58.41
Screened: 23.33 Need to screen: 59.86
Screened: 23.42 Need to screen: 60.22
Screened: 23.51 Need to screen: 61.84
Screened: 23.60 Need to screen: 61.30
Screened: 23.69 Need to screen: 65.82
Screened: 23.78 Need to screen: 65.64
Screened: 23.87 Need to screen: 65.82
Screened: 23.96 Need to screen: 49.64
Screened: 24.05 Need to screen: 42.31
Screened: 24

Screened: 41.59 Need to screen: 42.13
Screened: 41.68 Need to screen: 42.68
Screened: 41.77 Need to screen: 42.22
Screened: 41.86 Need to screen: 42.04
Screened: 41.95 Need to screen: 42.04
All included articles found after screening: 42.04
