# Setup

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# df = pd.concat([pd.read_pickle("./uti/utiTrain.pkl"), pd.read_pickle("./utiTest.pkl")])
# use just training (screening) data
df = pd.read_pickle("./uti/utiScreeningTitleAbstractKeywords.pkl")
# Shuffle df
df = df.sample(frac=1).reset_index(drop=True)
df.head()

Unnamed: 0,code,title,abstract,keywords,title-scibert,abstract-scibert,keywords-scibert
0,0,No. 275-Antibiotic Prophylaxis in Gynaecologic...,BACKGROUND AND PURPOSE: To evaluate the effica...,Adult; Aged; Benzhydryl Compounds/*administrat...,"[-0.49748182, -0.14884977, 0.04522864, 0.30262...","[0.06336339, -0.082822874, -0.22407193, 0.6200...","[0.71015, -0.14704962, -1.0521674, 0.7442387, ..."
1,0,Efficacy of antibiotic prophylaxis in patients...,PURPOSE: A prospective trial was done to test ...,"Adolescent; Anemia, Iron-Deficiency/*etiology/...","[0.032058507, -0.9371696, -0.6062687, -0.37296...","[0.35011402, -0.27300212, -0.43561977, 0.29421...","[0.67190146, 0.0018364671, -0.75949883, 1.0019..."
2,0,Bacterial characteristics as predictors of pos...,A randomised control trial was undertaken in 1...,Adult; Anti-Bacterial Agents/economics/*therap...,"[-0.068114705, -1.0336735, 0.46364272, 0.22023...","[0.20900984, -0.102352716, 0.07756065, 0.64433...","[0.5547323, -0.12673795, -0.49045548, 1.360038..."
3,0,Weight Gain and Obesity in Infants and Young C...,BACKGROUND: Schistosomiasis remains a public h...,Adolescent; Adult; Anti-Bacterial Agents/thera...,"[-0.4257004, -0.74438167, 0.011787895, 0.14221...","[0.050684925, 0.052820504, -0.045896288, 0.770...","[0.91119504, -0.11157902, -1.2334038, 0.807331..."
4,0,[The use of tolterodine in patients with recur...,BACKGROUND AND OBJECTIVES: Amputations of the ...,Drug Evaluation; Female; Humans; Nicotinic Aci...,"[-0.014838389, -0.46644005, -0.25475898, 0.507...","[-0.022081599, -0.32358795, -0.2160063, 0.4527...","[0.7019616, -0.3110929, -0.8108957, 0.91787827..."


# Helper Functions

In [9]:
# Function to plot confusion matrix
def plotConfMatrix(actual, pred, labels):
    conf_matrix = confusion_matrix(actual, pred)
    labels = ["Exclude", "Include"]
    df_cm = pd.DataFrame(conf_matrix, columns=labels, index=labels)
    df_cm.index.name = 'Actual'
    df_cm.columns.name = 'Predicted'
    plt.figure(figsize = (4,2))
    sns.set(font_scale=1.2)#for label size
    sns.heatmap(df_cm, fmt='d', cmap="Blues", annot=True, annot_kws={"size": 12})# font size
    plt.show()
    return
 
# Function to calculate probabilities of each remaining article
def calcProb(model, initial, remaining, field):
    # Get initial training data and labels 
    initial_data = initial[field + '-scibert'].tolist()
    initial_labels = initial['code'].tolist()

    # Fit model to initial training data
    model.fit(initial_data, initial_labels)

    # Get remaining data for testing
    remaining_data = remaining[field + '-scibert'].tolist()

    # Predict probability [exclusion, inclusion] on remaining articles
    pred = model.predict_proba(remaining_data)

    # Calculate score (positive = exclude, negative = include)
    pred = list(map(lambda x: x[0]-x[1], pred))
    # Add probability to dataframe
    remaining['prob'] = pred
    # Sort by probability
    remaining = remaining.sort_values(by=['prob']).reset_index(drop=True)
    
    return remaining

In [16]:
# Function to calculate probabilities of each remaining article
def calcProbTitleAbstract(model, initial, remaining):
    # Get initial training data and labels 
    initial_data = initial['title-scibert'].tolist()
    initial_labels = initial['code'].tolist()
    # Fit model to initial training data
    model.fit(initial_data, initial_labels)
    # Get remaining data for testing
    remaining_data = remaining['title-scibert'].tolist()
    # Predict probability [exclusion, inclusion] on remaining articles
    pred_title = model.predict_proba(remaining_data)
    # Calculate score (positive = exclude, negative = include)
    pred_title = list(map(lambda x: x[0]-x[1], pred_title))
    
    # Get initial training data and labels 
    initial_data = initial['abstract-scibert'].tolist()
    initial_labels = initial['code'].tolist()
    # Fit model to initial training data
    model.fit(initial_data, initial_labels)
    # Get remaining data for testing
    remaining_data = remaining['abstract-scibert'].tolist()
    # Predict probability [exclusion, inclusion] on remaining articles
    pred_abstract = model.predict_proba(remaining_data)
    # Calculate score (positive = exclude, negative = include)
    pred_abstract = list(map(lambda x: x[0]-x[1], pred_abstract))
    
    # Add title and abstract scores
    pred = list(map(lambda x: x[0]+x[1], zip(pred_title, pred_abstract)))
    
    # Add probability to dataframe
    remaining['prob'] = pred
    # Sort by probability
    remaining = remaining.sort_values(by=['prob']).reset_index(drop=True)
    
    return remaining

In [18]:
# Function to calculate probabilities of each remaining article
def calcProbAppend(model, initial, remaining):
    # Get initial training data and labels 
    initial_data = (initial['title-scibert'] + initial['abstract-scibert']).tolist()
    initial_labels = initial['code'].tolist()

    # Fit model to initial training data
    model.fit(initial_data, initial_labels)

    # Get remaining data for testing
    remaining_data = (remaining['title-scibert'] + remaining['abstract-scibert']).tolist()

    # Predict probability [exclusion, inclusion] on remaining articles
    pred = model.predict_proba(remaining_data)

    # Calculate score (positive = exclude, negative = include)
    pred = list(map(lambda x: x[0]-x[1], pred))
    # Add probability to dataframe
    remaining['prob'] = pred
    # Sort by probability
    remaining = remaining.sort_values(by=['prob']).reset_index(drop=True)
    
    return remaining

In [None]:
# Calculate the total number of articles screened so far (train / train + test)
def calcPercentScreened(initial, remaining):
    return (initial.index[-1] + 1) / ((remaining.index[-1] + 1) + (initial.index[-1] + 1))

# Calculate the number of articles needed to screen out of entire dataset to find all included articles
# (train + last_index / train + test)
def calcPercentNeedToScreen(initial, remaining, last_index):
    return ((initial.index[-1] + 1) + (last_index + 1)) / ((initial.index[-1] + 1) + (remaining.index[-1] + 1))

# Print stats (return false when all included articles found)
def printStats(initial, remaining):
    # Find index of last included article
    include_indicies = remaining[remaining.code == 1].index
    # Calculate total percentage of articles that need to be screened
    if (len(include_indicies) == 0):
        print("All included articles found after screening: %.2f" % (calcPercentScreened(initial, remaining) * 100))
        return False
    else:
        print("Screened: %.2f Need to screen: %.2f" % (calcPercentScreened(initial, remaining) * 100, calcPercentNeedToScreen(initial, remaining, include_indicies[-1]) * 100))
        return True

# Iterative Predition using Probabilities

## Title

In [8]:
model = LogisticRegression(C=0.05, class_weight='balanced', max_iter=1000)

# Find first included article index
start = df[df.code == 1].index[0] + 1
print("Initial number screened:", start)

# Split df by minimum size to screen
initial = df.iloc[:start,:].reset_index(drop=True)
remaining = df.iloc[start:,:].reset_index(drop=True)

while(remaining.index[-1] > 0):
    remaining = calcProb(model, initial, remaining, 'title')
    if(not printStats(initial, remaining)):
        break
    # Take first remaining and append to initial (to account for extra screened article)
    initial = initial.append(remaining.iloc[0], ignore_index=True)
    remaining.drop(0, inplace=True)

Initial number screened: 69
Screened: 6.37 Need to screen: 63.28
Screened: 6.46 Need to screen: 63.28
Screened: 6.55 Need to screen: 65.59
Screened: 6.64 Need to screen: 66.42
Screened: 6.73 Need to screen: 65.96
Screened: 6.83 Need to screen: 67.80
Screened: 6.92 Need to screen: 68.08
Screened: 7.01 Need to screen: 68.54
Screened: 7.10 Need to screen: 68.91
Screened: 7.20 Need to screen: 67.07
Screened: 7.29 Need to screen: 67.34
Screened: 7.38 Need to screen: 65.04
Screened: 7.47 Need to screen: 64.02
Screened: 7.56 Need to screen: 62.36
Screened: 7.66 Need to screen: 61.99
Screened: 7.75 Need to screen: 61.44
Screened: 7.84 Need to screen: 61.07
Screened: 7.93 Need to screen: 68.36
Screened: 8.03 Need to screen: 68.91
Screened: 8.12 Need to screen: 69.46
Screened: 8.21 Need to screen: 58.30
Screened: 8.30 Need to screen: 56.09
Screened: 8.39 Need to screen: 56.73
Screened: 8.49 Need to screen: 57.56
Screened: 8.58 Need to screen: 57.10
Screened: 8.67 Need to screen: 56.18
Screened: 

Screened: 26.48 Need to screen: 64.21
Screened: 26.57 Need to screen: 64.21
Screened: 26.66 Need to screen: 64.21
Screened: 26.75 Need to screen: 63.84
Screened: 26.85 Need to screen: 63.84
Screened: 26.94 Need to screen: 64.21
Screened: 27.03 Need to screen: 64.48
Screened: 27.12 Need to screen: 64.67
Screened: 27.21 Need to screen: 64.67
Screened: 27.31 Need to screen: 64.67
Screened: 27.40 Need to screen: 64.67
Screened: 27.49 Need to screen: 64.11
Screened: 27.58 Need to screen: 64.02
Screened: 27.68 Need to screen: 63.75
Screened: 27.77 Need to screen: 63.56
Screened: 27.86 Need to screen: 63.93
Screened: 27.95 Need to screen: 63.28
Screened: 28.04 Need to screen: 63.28
Screened: 28.14 Need to screen: 63.28
Screened: 28.23 Need to screen: 62.92
Screened: 28.32 Need to screen: 63.01
Screened: 28.41 Need to screen: 63.01
Screened: 28.51 Need to screen: 63.01
Screened: 28.60 Need to screen: 62.73
Screened: 28.69 Need to screen: 62.73
Screened: 28.78 Need to screen: 62.82
Screened: 28

Screened: 46.49 Need to screen: 49.08
Screened: 46.59 Need to screen: 49.08
Screened: 46.68 Need to screen: 48.89
Screened: 46.77 Need to screen: 48.89
Screened: 46.86 Need to screen: 49.08
Screened: 46.96 Need to screen: 49.08
Screened: 47.05 Need to screen: 49.08
Screened: 47.14 Need to screen: 49.08
Screened: 47.23 Need to screen: 49.08
Screened: 47.32 Need to screen: 49.08
Screened: 47.42 Need to screen: 49.08
Screened: 47.51 Need to screen: 48.89
Screened: 47.60 Need to screen: 48.80
Screened: 47.69 Need to screen: 48.80
Screened: 47.79 Need to screen: 48.80
Screened: 47.88 Need to screen: 48.80
Screened: 47.97 Need to screen: 48.80
Screened: 48.06 Need to screen: 48.80
Screened: 48.15 Need to screen: 48.80
Screened: 48.25 Need to screen: 48.80
Screened: 48.34 Need to screen: 48.80
Screened: 48.43 Need to screen: 48.71
Screened: 48.52 Need to screen: 48.71
Screened: 48.62 Need to screen: 48.71
All included articles found after screening: 48.71


## Abstract

In [10]:
model = LogisticRegression(C=0.05, class_weight='balanced', max_iter=1000)

# Find first included article index
start = df[df.code == 1].index[0] + 1
print("Initial number screened:", start)

# Split df by minimum size to screen
initial = df.iloc[:start,:].reset_index(drop=True)
remaining = df.iloc[start:,:].reset_index(drop=True)

while(remaining.index[-1] > 0):
    remaining = calcProb(model, initial, remaining, 'abstract')
    if(not printStats(initial, remaining)):
        break
    # Take first remaining and append to initial (to account for extra screened article)
    initial = initial.append(remaining.iloc[0], ignore_index=True)
    remaining.drop(0, inplace=True)

Initial number screened: 69
Screened: 6.37 Need to screen: 93.08
Screened: 6.46 Need to screen: 93.08
Screened: 6.55 Need to screen: 93.27
Screened: 6.64 Need to screen: 93.27
Screened: 6.73 Need to screen: 92.34
Screened: 6.83 Need to screen: 92.53
Screened: 6.92 Need to screen: 92.34
Screened: 7.01 Need to screen: 92.34
Screened: 7.10 Need to screen: 91.97
Screened: 7.20 Need to screen: 92.34
Screened: 7.29 Need to screen: 92.34
Screened: 7.38 Need to screen: 89.76
Screened: 7.47 Need to screen: 88.84
Screened: 7.56 Need to screen: 82.01
Screened: 7.66 Need to screen: 81.09
Screened: 7.75 Need to screen: 81.37
Screened: 7.84 Need to screen: 82.20
Screened: 7.93 Need to screen: 82.56
Screened: 8.03 Need to screen: 82.75
Screened: 8.12 Need to screen: 81.27
Screened: 8.21 Need to screen: 81.46
Screened: 8.30 Need to screen: 82.20
Screened: 8.39 Need to screen: 89.39
Screened: 8.49 Need to screen: 90.96
Screened: 8.58 Need to screen: 90.59
Screened: 8.67 Need to screen: 90.04
Screened: 

Screened: 26.66 Need to screen: 78.60
Screened: 26.75 Need to screen: 76.75
Screened: 26.85 Need to screen: 77.58
Screened: 26.94 Need to screen: 77.77
Screened: 27.03 Need to screen: 67.07
Screened: 27.12 Need to screen: 63.10
Screened: 27.21 Need to screen: 63.84
Screened: 27.31 Need to screen: 62.92
Screened: 27.40 Need to screen: 63.65
Screened: 27.49 Need to screen: 63.84
Screened: 27.58 Need to screen: 64.02
Screened: 27.68 Need to screen: 66.33
Screened: 27.77 Need to screen: 65.96
Screened: 27.86 Need to screen: 67.99
Screened: 27.95 Need to screen: 67.62
Screened: 28.04 Need to screen: 67.80
Screened: 28.14 Need to screen: 69.10
Screened: 28.23 Need to screen: 70.20
Screened: 28.32 Need to screen: 70.94
Screened: 28.41 Need to screen: 70.76
Screened: 28.51 Need to screen: 70.20
Screened: 28.60 Need to screen: 72.23
Screened: 28.69 Need to screen: 72.88
Screened: 28.78 Need to screen: 72.60
Screened: 28.87 Need to screen: 72.88
Screened: 28.97 Need to screen: 73.43
Screened: 29

## Keywords

In [11]:
model = LogisticRegression(C=0.05, class_weight='balanced', max_iter=1000)

# Find first included article index
start = df[df.code == 1].index[0] + 1
print("Initial number screened:", start)

# Split df by minimum size to screen
initial = df.iloc[:start,:].reset_index(drop=True)
remaining = df.iloc[start:,:].reset_index(drop=True)

while(remaining.index[-1] > 0):
    remaining = calcProb(model, initial, remaining, 'keywords')
    if(not printStats(initial, remaining)):
        break
    # Take first remaining and append to initial (to account for extra screened article)
    initial = initial.append(remaining.iloc[0], ignore_index=True)
    remaining.drop(0, inplace=True)

Initial number screened: 69
Screened: 6.37 Need to screen: 73.71
Screened: 6.46 Need to screen: 73.71
Screened: 6.55 Need to screen: 77.31
Screened: 6.64 Need to screen: 78.60
Screened: 6.73 Need to screen: 76.94
Screened: 6.83 Need to screen: 74.35
Screened: 6.92 Need to screen: 74.26
Screened: 7.01 Need to screen: 73.43
Screened: 7.10 Need to screen: 73.15
Screened: 7.20 Need to screen: 73.89
Screened: 7.29 Need to screen: 73.99
Screened: 7.38 Need to screen: 73.80
Screened: 7.47 Need to screen: 73.99
Screened: 7.56 Need to screen: 73.62
Screened: 7.66 Need to screen: 73.99
Screened: 7.75 Need to screen: 74.72
Screened: 7.84 Need to screen: 74.63
Screened: 7.93 Need to screen: 75.28
Screened: 8.03 Need to screen: 74.54
Screened: 8.12 Need to screen: 75.09
Screened: 8.21 Need to screen: 73.62
Screened: 8.30 Need to screen: 65.77
Screened: 8.39 Need to screen: 62.92
Screened: 8.49 Need to screen: 63.38
Screened: 8.58 Need to screen: 63.01
Screened: 8.67 Need to screen: 60.98
Screened: 

Screened: 26.48 Need to screen: 79.61
Screened: 26.57 Need to screen: 79.61
Screened: 26.66 Need to screen: 79.61
Screened: 26.75 Need to screen: 80.54
Screened: 26.85 Need to screen: 80.07
Screened: 26.94 Need to screen: 80.07
Screened: 27.03 Need to screen: 80.07
Screened: 27.12 Need to screen: 80.07
Screened: 27.21 Need to screen: 80.07
Screened: 27.31 Need to screen: 80.07
Screened: 27.40 Need to screen: 79.98
Screened: 27.49 Need to screen: 88.93
Screened: 27.58 Need to screen: 91.33
Screened: 27.68 Need to screen: 91.33
Screened: 27.77 Need to screen: 91.14
Screened: 27.86 Need to screen: 91.14
Screened: 27.95 Need to screen: 91.14
Screened: 28.04 Need to screen: 91.14
Screened: 28.14 Need to screen: 91.14
Screened: 28.23 Need to screen: 91.14
Screened: 28.32 Need to screen: 91.14
Screened: 28.41 Need to screen: 91.14
Screened: 28.51 Need to screen: 91.14
Screened: 28.60 Need to screen: 90.77
Screened: 28.69 Need to screen: 91.14
Screened: 28.78 Need to screen: 91.33
Screened: 28

Screened: 46.49 Need to screen: 93.82
Screened: 46.59 Need to screen: 93.82
Screened: 46.68 Need to screen: 93.82
Screened: 46.77 Need to screen: 93.82
Screened: 46.86 Need to screen: 93.82
Screened: 46.96 Need to screen: 93.82
Screened: 47.05 Need to screen: 93.82
Screened: 47.14 Need to screen: 93.82
Screened: 47.23 Need to screen: 93.82
Screened: 47.32 Need to screen: 93.82
Screened: 47.42 Need to screen: 93.82
Screened: 47.51 Need to screen: 93.82
Screened: 47.60 Need to screen: 93.82
Screened: 47.69 Need to screen: 93.82
Screened: 47.79 Need to screen: 93.82
Screened: 47.88 Need to screen: 93.82
Screened: 47.97 Need to screen: 93.82
Screened: 48.06 Need to screen: 93.82
Screened: 48.15 Need to screen: 93.82
Screened: 48.25 Need to screen: 93.82
Screened: 48.34 Need to screen: 93.82
Screened: 48.43 Need to screen: 93.82
Screened: 48.52 Need to screen: 93.82
Screened: 48.62 Need to screen: 93.82
Screened: 48.71 Need to screen: 93.82
Screened: 48.80 Need to screen: 93.82
Screened: 48

Screened: 66.42 Need to screen: 98.62
Screened: 66.51 Need to screen: 98.62
Screened: 66.61 Need to screen: 98.62
Screened: 66.70 Need to screen: 98.62
Screened: 66.79 Need to screen: 98.62
Screened: 66.88 Need to screen: 98.62
Screened: 66.97 Need to screen: 98.62
Screened: 67.07 Need to screen: 98.62
Screened: 67.16 Need to screen: 98.62
Screened: 67.25 Need to screen: 98.62
Screened: 67.34 Need to screen: 98.62
Screened: 67.44 Need to screen: 98.62
Screened: 67.53 Need to screen: 98.62
Screened: 67.62 Need to screen: 98.62
Screened: 67.71 Need to screen: 98.62
Screened: 67.80 Need to screen: 98.62
Screened: 67.90 Need to screen: 98.62
Screened: 67.99 Need to screen: 98.62
Screened: 68.08 Need to screen: 98.62
Screened: 68.17 Need to screen: 98.62
Screened: 68.27 Need to screen: 98.62
Screened: 68.36 Need to screen: 98.62
Screened: 68.45 Need to screen: 98.62
Screened: 68.54 Need to screen: 98.62
Screened: 68.63 Need to screen: 98.62
Screened: 68.73 Need to screen: 98.62
Screened: 68

Screened: 86.44 Need to screen: 98.06
Screened: 86.53 Need to screen: 98.06
Screened: 86.62 Need to screen: 98.06
Screened: 86.72 Need to screen: 98.06
Screened: 86.81 Need to screen: 98.06
Screened: 86.90 Need to screen: 98.06
Screened: 86.99 Need to screen: 98.06
Screened: 87.08 Need to screen: 98.06
Screened: 87.18 Need to screen: 98.06
Screened: 87.27 Need to screen: 98.06
Screened: 87.36 Need to screen: 98.06
Screened: 87.45 Need to screen: 98.06
Screened: 87.55 Need to screen: 98.06
Screened: 87.64 Need to screen: 98.06
Screened: 87.73 Need to screen: 98.06
Screened: 87.82 Need to screen: 98.06
Screened: 87.92 Need to screen: 98.06
Screened: 88.01 Need to screen: 98.06
Screened: 88.10 Need to screen: 98.06
Screened: 88.19 Need to screen: 98.06
Screened: 88.28 Need to screen: 98.06
Screened: 88.38 Need to screen: 98.06
Screened: 88.47 Need to screen: 98.06
Screened: 88.56 Need to screen: 98.06
Screened: 88.65 Need to screen: 98.06
Screened: 88.75 Need to screen: 98.06
Screened: 88

## Title and Abstract

In [17]:
# Simple combine
model = LogisticRegression(C=0.05, class_weight='balanced', max_iter=1000)

# Find first included article index
start = df[df.code == 1].index[0] + 1
print("Initial number screened:", start)

# Split df by minimum size to screen
initial = df.iloc[:start,:].reset_index(drop=True)
remaining = df.iloc[start:,:].reset_index(drop=True)

while(remaining.index[-1] > 0):
    remaining = calcProbTitleAbstract(model, initial, remaining)
    if(not printStats(initial, remaining)):
        break
    # Take first remaining and append to initial (to account for extra screened article)
    initial = initial.append(remaining.iloc[0], ignore_index=True)
    remaining.drop(0, inplace=True)

Initial number screened: 69
Screened: 6.37 Need to screen: 89.21
Screened: 6.46 Need to screen: 89.21
Screened: 6.55 Need to screen: 87.45
Screened: 6.64 Need to screen: 85.33
Screened: 6.73 Need to screen: 84.23
Screened: 6.83 Need to screen: 84.04
Screened: 6.92 Need to screen: 82.29
Screened: 7.01 Need to screen: 80.63
Screened: 7.10 Need to screen: 81.00
Screened: 7.20 Need to screen: 79.52
Screened: 7.29 Need to screen: 78.51
Screened: 7.38 Need to screen: 82.38
Screened: 7.47 Need to screen: 79.70
Screened: 7.56 Need to screen: 79.15
Screened: 7.66 Need to screen: 77.58
Screened: 7.75 Need to screen: 77.03
Screened: 7.84 Need to screen: 76.66
Screened: 7.93 Need to screen: 76.38
Screened: 8.03 Need to screen: 75.83
Screened: 8.12 Need to screen: 75.83
Screened: 8.21 Need to screen: 76.20
Screened: 8.30 Need to screen: 76.48
Screened: 8.39 Need to screen: 76.66
Screened: 8.49 Need to screen: 76.48
Screened: 8.58 Need to screen: 76.38
Screened: 8.67 Need to screen: 82.66
Screened: 

Screened: 26.48 Need to screen: 75.18
Screened: 26.57 Need to screen: 75.74
Screened: 26.66 Need to screen: 79.15
Screened: 26.75 Need to screen: 80.54
Screened: 26.85 Need to screen: 80.72
Screened: 26.94 Need to screen: 80.90
Screened: 27.03 Need to screen: 81.64
Screened: 27.12 Need to screen: 81.27
Screened: 27.21 Need to screen: 81.83
Screened: 27.31 Need to screen: 81.83
Screened: 27.40 Need to screen: 81.46
Screened: 27.49 Need to screen: 81.73
Screened: 27.58 Need to screen: 81.27
Screened: 27.68 Need to screen: 81.00
Screened: 27.77 Need to screen: 80.54
Screened: 27.86 Need to screen: 80.90
Screened: 27.95 Need to screen: 80.35
Screened: 28.04 Need to screen: 80.07
Screened: 28.14 Need to screen: 79.98
Screened: 28.23 Need to screen: 79.52
Screened: 28.32 Need to screen: 79.70
Screened: 28.41 Need to screen: 79.70
Screened: 28.51 Need to screen: 79.70
Screened: 28.60 Need to screen: 79.89
Screened: 28.69 Need to screen: 79.89
Screened: 28.78 Need to screen: 79.70
Screened: 28

Screened: 46.40 Need to screen: 49.91
Screened: 46.49 Need to screen: 49.82
Screened: 46.59 Need to screen: 49.82
Screened: 46.68 Need to screen: 49.63
Screened: 46.77 Need to screen: 49.45
Screened: 46.86 Need to screen: 49.26
Screened: 46.96 Need to screen: 49.26
Screened: 47.05 Need to screen: 49.35
Screened: 47.14 Need to screen: 48.71
Screened: 47.23 Need to screen: 48.99
Screened: 47.32 Need to screen: 49.08
Screened: 47.42 Need to screen: 49.08
Screened: 47.51 Need to screen: 49.17
Screened: 47.60 Need to screen: 49.17
Screened: 47.69 Need to screen: 49.35
Screened: 47.79 Need to screen: 49.72
Screened: 47.88 Need to screen: 49.63
Screened: 47.97 Need to screen: 49.54
Screened: 48.06 Need to screen: 49.45
Screened: 48.15 Need to screen: 49.54
Screened: 48.25 Need to screen: 49.91
Screened: 48.34 Need to screen: 49.63
Screened: 48.43 Need to screen: 49.45
Screened: 48.52 Need to screen: 49.45
Screened: 48.62 Need to screen: 49.54
Screened: 48.71 Need to screen: 49.54
Screened: 48

In [19]:
# Simple combine
model = LogisticRegression(C=0.05, class_weight='balanced', max_iter=1000)

# Find first included article index
start = df[df.code == 1].index[0] + 1
print("Initial number screened:", start)

# Split df by minimum size to screen
initial = df.iloc[:start,:].reset_index(drop=True)
remaining = df.iloc[start:,:].reset_index(drop=True)

while(remaining.index[-1] > 0):
    remaining = calcProbAppend(model, initial, remaining)
    if(not printStats(initial, remaining)):
        break
    # Take first remaining and append to initial (to account for extra screened article)
    initial = initial.append(remaining.iloc[0], ignore_index=True)
    remaining.drop(0, inplace=True)

Initial number screened: 69
Screened: 6.37 Need to screen: 51.29
Screened: 6.46 Need to screen: 51.29
Screened: 6.55 Need to screen: 58.30
Screened: 6.64 Need to screen: 56.83
Screened: 6.73 Need to screen: 61.44
Screened: 6.83 Need to screen: 60.15
Screened: 6.92 Need to screen: 62.82
Screened: 7.01 Need to screen: 65.13
Screened: 7.10 Need to screen: 67.71
Screened: 7.20 Need to screen: 68.82
Screened: 7.29 Need to screen: 68.63
Screened: 7.38 Need to screen: 68.27
Screened: 7.47 Need to screen: 69.74
Screened: 7.56 Need to screen: 70.11
Screened: 7.66 Need to screen: 69.93
Screened: 7.75 Need to screen: 70.48
Screened: 7.84 Need to screen: 71.77
Screened: 7.93 Need to screen: 73.71
Screened: 8.03 Need to screen: 74.08
Screened: 8.12 Need to screen: 73.89
Screened: 8.21 Need to screen: 75.00
Screened: 8.30 Need to screen: 75.46
Screened: 8.39 Need to screen: 75.46
Screened: 8.49 Need to screen: 69.74
Screened: 8.58 Need to screen: 71.31
Screened: 8.67 Need to screen: 70.66
Screened: 

Screened: 26.29 Need to screen: 40.50
Screened: 26.38 Need to screen: 40.41
Screened: 26.48 Need to screen: 40.41
Screened: 26.57 Need to screen: 40.50
Screened: 26.66 Need to screen: 40.50
Screened: 26.75 Need to screen: 40.04
Screened: 26.85 Need to screen: 40.22
Screened: 26.94 Need to screen: 40.41
Screened: 27.03 Need to screen: 40.41
Screened: 27.12 Need to screen: 40.50
Screened: 27.21 Need to screen: 40.50
Screened: 27.31 Need to screen: 40.41
Screened: 27.40 Need to screen: 40.22
Screened: 27.49 Need to screen: 40.50
Screened: 27.58 Need to screen: 40.50
Screened: 27.68 Need to screen: 40.50
Screened: 27.77 Need to screen: 40.50
Screened: 27.86 Need to screen: 40.50
Screened: 27.95 Need to screen: 40.50
Screened: 28.04 Need to screen: 40.31
Screened: 28.14 Need to screen: 40.31
Screened: 28.23 Need to screen: 40.31
Screened: 28.32 Need to screen: 40.31
Screened: 28.41 Need to screen: 40.31
Screened: 28.51 Need to screen: 40.13
Screened: 28.60 Need to screen: 40.50
Screened: 28