In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

### Helper Functions

In [2]:
# Function to plot confusion matrix
def plotConfMatrix(actual, pred, labels=None):
    """Plot the confusion matrix of `actual` vs `pred` as an annotated heatmap.

    Args:
        actual: True class labels, passed to sklearn's `confusion_matrix`.
        pred: Predicted class labels, passed to sklearn's `confusion_matrix`.
        labels: Display names used for both the rows ("Actual") and the
            columns ("Predicted") of the heatmap. Must contain one name per
            class present in the confusion matrix. Defaults to
            ["Exclude", "Include"] when None.

    Returns:
        None. The figure is rendered with `plt.show()`.
    """
    # Previously the argument was overwritten unconditionally, so callers
    # could never change the tick labels; only fall back when none are given.
    if labels is None:
        labels = ["Exclude", "Include"]

    conf_matrix = confusion_matrix(actual, pred)
    df_cm = pd.DataFrame(conf_matrix, columns=labels, index=labels)
    df_cm.index.name = 'Actual'
    df_cm.columns.name = 'Predicted'

    plt.figure(figsize=(4, 2))
    sns.set(font_scale=1.2)  # label size
    sns.heatmap(df_cm, fmt='d', cmap="Blues", annot=True, annot_kws={"size": 12})  # cell font size
    plt.show()
 
# Function to calculate probabilities of each remaining article
def calcProb(model, initial, remaining):
    """Fit `model` on the screened articles and rank the unscreened ones.

    Args:
        model: Estimator exposing `fit(X, y)` and `predict_proba(X)`.
        initial: DataFrame of already-screened articles; its 'scibert' column
            supplies the features and its 'code' column the labels.
        remaining: DataFrame of unscreened articles with a 'scibert' column.
            It is NOT modified; a new frame is returned.

    Returns:
        A copy of `remaining` with a 'prob' column added (or replaced), sorted
        by 'prob' in descending order and re-indexed 0..n-1.
    """
    # Fit model to the articles screened so far
    model.fit(initial['scibert'].tolist(), initial['code'].tolist())

    # Predict class probabilities for every remaining article
    proba = model.predict_proba(remaining['scibert'].tolist())

    # Column 1 is taken as the probability of inclusion (assumes the model
    # orders its classes as [exclusion, inclusion] -- TODO confirm for
    # label encodings other than 0/1).
    inclusion_prob = [row[1] for row in proba]

    # assign() builds a new frame, so the caller's `remaining` is left
    # untouched (the old in-place column write leaked a 'prob' column into
    # the argument and could trigger SettingWithCopyWarning on slices).
    return (
        remaining
        .assign(prob=inclusion_prob)
        .sort_values(by=['prob'], ascending=False)
        .reset_index(drop=True)
    )

# Calculate the fraction of articles screened so far (train / (train + test))
def calcPercentScreened(initial, remaining):
    """Return the fraction (0..1) of all articles that have been screened.

    Args:
        initial: DataFrame of articles screened so far.
        remaining: DataFrame of articles not yet screened.

    Returns:
        len(initial) / (len(initial) + len(remaining)) as a float.

    Raises:
        ZeroDivisionError: if both frames are empty.
    """
    # Use row counts rather than `index[-1] + 1`: the index-based count is
    # wrong whenever the index is not a fresh 0..n-1 range (e.g. after a
    # drop) and raises IndexError on an empty frame.
    n_screened = len(initial)
    return n_screened / (n_screened + len(remaining))

# Calculate the fraction of the entire dataset that must be screened to find all included articles
# ((train + last_index + 1) / (train + test))
def calcPercentNeedToScreen(initial, remaining, last_index):
    """Return the fraction (0..1) of all articles that must be screened.

    Args:
        initial: DataFrame of articles screened so far.
        remaining: DataFrame of articles not yet screened.
        last_index: 0-based position within `remaining` of the last article
            that still has to be reached; `last_index + 1` further articles
            are counted as needing screening.

    Returns:
        (len(initial) + last_index + 1) / (len(initial) + len(remaining)).

    Raises:
        ZeroDivisionError: if both frames are empty.
    """
    # Row counts come from len(), not `index[-1] + 1`, so the result is
    # correct for any index and does not raise on an empty frame.
    n_screened = len(initial)
    return (n_screened + (last_index + 1)) / (n_screened + len(remaining))

# Print stats (return false when all included articles found)
def printStats(initial, remaining):
    """Print screening progress and report whether included articles remain.

    Args:
        initial: DataFrame of articles screened so far.
        remaining: DataFrame of unscreened articles with a 'code' column;
            rows with code == 1 are treated as included.

    Returns:
        True if `remaining` still holds at least one row with code == 1,
        False otherwise.
    """
    # Index labels of the included articles that are still unscreened
    included_positions = remaining.loc[remaining.code == 1].index

    if len(included_positions) > 0:
        screened_pct = calcPercentScreened(initial, remaining) * 100
        n_screened = len(initial)
        # The last included label is passed on as the position that must be reached
        need_pct = calcPercentNeedToScreen(initial, remaining, included_positions[-1]) * 100
        # NOTE(review): the count printed next to "Need to screen" is the size
        # of `remaining`, not the number of articles behind the percentage --
        # confirm this is intended.
        n_remaining = len(remaining)
        print("Screened: %.2f (%d) Need to screen: %.2f (%d)" % (
            screened_pct, n_screened, need_pct, n_remaining
        ))
        return True

    # Nothing left to find
    print("All included articles found after screening: %d (%.2f percent)" % (len(initial), calcPercentScreened(initial, remaining) * 100))
    return False

### Simulate Screening

In [72]:
def simulateScreening(df, random_state=None):
    """Simulate model-assisted screening of one random ordering of `df`.

    The rows are shuffled and "screened" in order until the first included
    article (code == 1) is found. From then on a logistic-regression model is
    refit on everything screened so far, the unscreened articles are ranked by
    predicted inclusion probability, and the top-ranked one is screened next.
    This repeats until no included article is left unscreened.

    Args:
        df: DataFrame with a 'scibert' feature column and a 'code' label
            column (1 = included).
        random_state: Optional seed forwarded to `DataFrame.sample` so a run
            can be reproduced. The default (None) keeps the shuffle random.

    Returns:
        Tuple `(n_screened, n_remaining)`: the number of articles screened
        when the last included article was found, and the number left
        unscreened at that point.

    Raises:
        IndexError: if `df` contains no row with code == 1.
    """
    # Shuffle df
    df = df.sample(frac=1, random_state=random_state).reset_index(drop=True)

    # Load model
    model = LogisticRegression(C=0.05, class_weight='balanced', max_iter=1000)

    # Screen up to and including the first included article
    start = df[df.code == 1].index[0] + 1
    # The classifier needs both classes to fit. If the shuffle put included
    # articles first, keep screening until the first non-included one is seen.
    # When the first included row is not row 0 this leaves `start` unchanged.
    excluded_index = df[df.code != 1].index
    if len(excluded_index) > 0:
        start = max(start, excluded_index[0] + 1)

    # Split df by minimum size to screen
    initial = df.iloc[:start, :].reset_index(drop=True)
    remaining = df.iloc[start:, :].reset_index(drop=True)

    # Loop while an included article is still unscreened. This is also False
    # for an empty frame, where the old `remaining.index[-1] > 0` test raised
    # IndexError, and it no longer skips a lone remaining row at index 0.
    while (remaining.code == 1).any():
        remaining = calcProb(model, initial, remaining)
        # Screen the top-ranked article: move it from remaining to initial
        initial = pd.concat([initial, remaining.iloc[[0]]], ignore_index=True)
        # Non-mutating removal; re-index so the result is an independent 0..n-1 frame
        remaining = remaining.iloc[1:].reset_index(drop=True)

    # Return number screened and number remaining
    return len(initial), len(remaining)

In [74]:
# Dataset to simulate; data is read from ./<name>/<name>Train.pkl
name = "copper"
# Number of independent simulation runs (each run reshuffles the data)
N_RUNS = 5
stats = []
# use just training (screening) data
# NOTE: unpickling can execute arbitrary code -- only load pickle files you trust
df = pd.read_pickle("./" + name + "/" + name + "Train.pkl")
total = len(df)
# Simulate screening N_RUNS times
for i in range(N_RUNS):
    # Report progress every 5 runs as a percentage of N_RUNS
    if i % 5 == 0:
        print(i * 100 // N_RUNS, "%")
    stats.append(simulateScreening(df))

stats_df = pd.DataFrame(stats, columns=["initial", "remaining"])
# score = fraction of the dataset still unscreened once every included article was found
stats_df["score"] = stats_df["remaining"] / total
stats_df

0 %


Unnamed: 0,initial,remaining,score
0,59,289,0.83046
1,56,292,0.83908
2,43,305,0.876437
3,82,266,0.764368
4,97,251,0.721264
