# Setup

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# df = pd.concat([pd.read_pickle("./uti/utiTrain.pkl"), pd.read_pickle("./utiTest.pkl")])
# use just training (screening) data
df = pd.read_pickle("./copper/copperTrain.pkl")
# Shuffle df
df = df.sample(frac=1).reset_index(drop=True)
df.head()

Unnamed: 0,code,abstract,scibert
0,0,Introduction / Patient Information: Intrauteri...,"[-0.2047277, 0.05474719, -0.17026988, 0.635536..."
1,0,Background: Recent studies have suggested that...,"[0.35181525, -0.39828637, -0.19908452, 0.62762..."
2,0,LEARNING OBJECTIVE 1: Diagnose drug induced li...,"[-0.052812707, -0.36207545, -0.110467605, 0.48..."
3,0,The proceedings contain 631 papers. The topics...,"[-0.38313106, -0.08930479, -0.42180964, 0.4287..."
4,0,OBJECTIVES: Reproductive tract actinomyces hav...,"[0.15391812, -0.10781988, -0.35515946, 0.79182..."


# Helper Functions

In [26]:
# Function to plot confusion matrix
def plotConfMatrix(actual, pred, labels):
    conf_matrix = confusion_matrix(actual, pred)
    labels = ["Exclude", "Include"]
    df_cm = pd.DataFrame(conf_matrix, columns=labels, index=labels)
    df_cm.index.name = 'Actual'
    df_cm.columns.name = 'Predicted'
    plt.figure(figsize = (4,2))
    sns.set(font_scale=1.2)#for label size
    sns.heatmap(df_cm, fmt='d', cmap="Blues", annot=True, annot_kws={"size": 12})# font size
    plt.show()
    return
 
# Function to calculate probabilities of each remaining article
def calcProb(model, initial, remaining):
    # Get initial training data and labels 
    initial_data = initial['scibert'].tolist()
    initial_labels = initial['code'].tolist()

    # Fit model to initial training data
    model.fit(initial_data, initial_labels)

    # Get remaining data for testing
    remaining_data = remaining['scibert'].tolist()

    # Predict probability [exclusion, inclusion] on remaining articles
    pred = model.predict_proba(remaining_data)

    # Calculate score (x[1] = probability of inclusion)
    pred = list(map(lambda x: x[1], pred))
    # Add probability to dataframe
    remaining['prob'] = pred
    # Sort by probability
    remaining = remaining.sort_values(by=['prob'], ascending=False).reset_index(drop=True)
    
    return remaining

# Calculate the total number of articles screened so far (train / train + test)
def calcPercentScreened(initial, remaining):
    return (initial.index[-1] + 1) / ((remaining.index[-1] + 1) + (initial.index[-1] + 1))

# Calculate the number of articles needed to screen out of entire dataset to find all included articles
# (train + last_index / train + test)
def calcPercentNeedToScreen(initial, remaining, last_index):
    return ((initial.index[-1] + 1) + (last_index + 1)) / ((initial.index[-1] + 1) + (remaining.index[-1] + 1))

# Print stats (return false when all included articles found)
def printStats(initial, remaining):
    # Find index of last included article
    include_indicies = remaining[remaining.code == 1].index
    # Calculate total percentage of articles that need to be screened
    if (len(include_indicies) == 0):
        print("All included articles found after screening: %d (%.2f percent)" % (len(initial), calcPercentScreened(initial, remaining) * 100))
        return False
    else:
        print("Screened: %.2f (%d) Need to screen: %.2f (%d)" % (
            calcPercentScreened(initial, remaining) * 100,
            len(initial),
            calcPercentNeedToScreen(initial, remaining, include_indicies[-1]) * 100,
            len(remaining)
         ))
        return True

# Iterative Predition using Probabilities

In [27]:
model = LogisticRegression(C=0.05, class_weight='balanced', max_iter=1000)
all_data = df['scibert'].tolist()
all_labels = df['code'].tolist()

# Find first included article index
start = df[df.code == 1].index[0] + 1
print("Initial number screened:", start)

# Split df by minimum size to screen
initial = df.iloc[:start,:].reset_index(drop=True)
remaining = df.iloc[start:,:].reset_index(drop=True)

while(remaining.index[-1] > 0):
    remaining = calcProb(model, initial, remaining)
    if(not printStats(initial, remaining)):
        break
    # Take first remaining and append to initial (to account for extra screened article)
    initial = initial.append(remaining.iloc[0], ignore_index=True)
    remaining.drop(0, inplace=True)

Initial number screened: 21
Screened: 6.03 (21) Need to screen: 58.33 (327)
Screened: 6.32 (22) Need to screen: 51.72 (326)
Screened: 6.61 (23) Need to screen: 47.70 (325)
Screened: 6.90 (24) Need to screen: 48.28 (324)
Screened: 7.18 (25) Need to screen: 47.70 (323)
Screened: 7.47 (26) Need to screen: 35.34 (322)
Screened: 7.76 (27) Need to screen: 35.06 (321)
Screened: 8.05 (28) Need to screen: 38.22 (320)
Screened: 8.33 (29) Need to screen: 36.78 (319)
Screened: 8.62 (30) Need to screen: 36.21 (318)
Screened: 8.91 (31) Need to screen: 33.62 (317)
Screened: 9.20 (32) Need to screen: 30.46 (316)
Screened: 9.48 (33) Need to screen: 32.18 (315)
Screened: 9.77 (34) Need to screen: 31.03 (314)
Screened: 10.06 (35) Need to screen: 29.89 (313)
Screened: 10.34 (36) Need to screen: 21.55 (312)
Screened: 10.63 (37) Need to screen: 22.41 (311)
Screened: 10.92 (38) Need to screen: 21.26 (310)
Screened: 11.21 (39) Need to screen: 22.41 (309)
Screened: 11.49 (40) Need to screen: 19.54 (308)
Screen