In [1]:
import bz2
import pandas as pd
import numpy as np

In [2]:
from originality import original

In [3]:
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression

In [4]:
import warnings


def warn(*args, **kwargs):
    """No-op stand-in for ``warnings.warn``: accept any arguments, do nothing."""
    return None


# Rebind the module attribute so code that calls ``warnings.warn(...)`` through
# the module from here on reaches the no-op above instead of emitting a warning.
warnings.warn = warn

In [5]:
# Open the two bz2-compressed sample CSVs as file objects; the data is
# decompressed as it is read by the next cell.
training_file = bz2.open("data/sample_training.csv.bz2")
tournament_file = bz2.open("data/sample_tournament.csv.bz2")

In [6]:
# Parse both decompressed CSV streams into DataFrames. pandas does not close
# file objects that the caller opened, so release the bz2 handles here once
# parsing has finished (or failed) instead of leaking them for the lifetime of
# the kernel.
try:
    tournament_data = pd.read_csv(tournament_file)
    training_data = pd.read_csv(training_file)
finally:
    tournament_file.close()
    training_file.close()

In [7]:
# Feature columns are the training columns whose name contains "feature".
f = [column for column in training_data.columns if "feature" in column]

# Training inputs and labels.
X = training_data[f]
Y = training_data["target"]

# The same feature columns taken from the tournament data; the models below
# produce their predictions on this frame.
X_cv = tournament_data[f]

In [8]:
def similar_model(model, num_models=21):
    """Refit ``model`` repeatedly and collect one prediction vector per fit.

    Each round fits ``model`` on the notebook-level ``X``/``Y`` and then takes
    column 1 of ``model.predict_proba(X_cv)``.

    Parameters
    ----------
    model : object
        Estimator exposing ``fit(X, Y)`` and ``predict_proba(X_cv)``. The same
        instance is refit on every round.
    num_models : int, default 21
        Number of fit/predict rounds to run.

    Returns
    -------
    list
        ``num_models`` prediction arrays, in the order they were produced.
    """

    def _refit_and_predict():
        # Fit first, then predict: the order matters because the same
        # estimator instance is reused for every round.
        model.fit(X, Y)
        return model.predict_proba(X_cv)[:, 1]

    return [_refit_and_predict() for _ in range(num_models)]
    
    

In [9]:
def score_similar_models(predictions):
    """Return the share of predictions that ``original`` flags against the first.

    The first element of ``predictions`` is the baseline. A sorted copy of it
    is passed as the first argument to ``original`` together with each of the
    remaining predictions, and the mean of the resulting truth values is
    returned.

    Parameters
    ----------
    predictions : sequence of 1-D array-like
        Prediction vectors; element 0 is the baseline, the rest are compared
        against it. The input is not modified.

    Returns
    -------
    float
        Fraction of comparisons for which ``original`` returned a truthy
        value, or NaN when there is nothing to compare the baseline against.

    Raises
    ------
    IndexError
        If ``predictions`` is empty.
    """
    # np.sort returns a sorted copy. The earlier in-place ``.sort()`` silently
    # reordered the caller's first prediction array.
    # NOTE(review): the baseline is sorted before the comparison exactly as
    # before; presumably ``original`` expects a sorted first argument - confirm
    # against the originality module.
    baseline = np.sort(predictions[0])

    candidates = predictions[1:]
    if len(candidates) == 0:
        # Nothing to compare: return NaN explicitly instead of dividing by zero.
        return np.nan

    verdicts = np.array([original(baseline, candidate) for candidate in candidates])
    return np.mean(verdicts)
    

In [10]:
def find_noise_threshold(model, step=0.01, max_scale=0.5, rng=None):
    """Find the smallest uniform-noise scale at which ``original`` flags the model.

    ``model`` is fit once on the notebook-level ``X``/``Y``. Column 1 of its
    ``predict_proba(X_cv)`` output, sorted, is the baseline. For increasing
    noise scales ``0, step, 2*step, ...`` uniform noise in ``[-scale, scale]``
    is added to the baseline, the result is clipped to ``[0, 1]``, and
    ``original(baseline, noised)`` is evaluated. The first scale for which it
    is truthy is returned.

    Parameters
    ----------
    model : object
        Estimator exposing ``fit(X, Y)`` and ``predict_proba(X_cv)``.
    step : float, default 0.01
        Increment between successive noise scales; must be positive.
    max_scale : float, default 0.5
        Largest noise scale that is tried (inclusive).
    rng : object, optional
        Source of noise exposing ``uniform(low=, high=, size=)``, e.g. a
        ``numpy.random.Generator`` for reproducible runs. Defaults to the
        global ``numpy.random`` state.

    Returns
    -------
    float or str
        The first noise scale that passes, or the string ``"NAN"`` when no
        scale up to ``max_scale`` does (sentinel kept for compatibility with
        existing callers).

    Raises
    ------
    ValueError
        If ``step`` is not positive (the search could never terminate).
    """
    if step <= 0:
        raise ValueError("step must be positive")

    noise_source = np.random if rng is None else rng

    model.fit(X, Y)
    baseline = np.sort(model.predict_proba(X_cv)[:, 1])

    step_index = 0
    while True:
        # Derive the scale from an integer counter rather than repeated
        # ``+= step``: accumulated float error produced values such as
        # 0.19000000000000003 and made the max_scale cut-off depend on
        # rounding noise. Rounding removes the representation error of the
        # product itself.
        noise_scale = round(step_index * step, 12)
        if noise_scale > max_scale:
            return "NAN"

        noise = noise_source.uniform(low=-noise_scale, high=noise_scale, size=baseline.shape)
        # Keep the perturbed values inside the valid probability range.
        noised = np.clip(baseline + noise, 0, 1)

        if original(baseline, noised):
            return noise_scale

        step_index += 1
    

In [11]:
# Classifiers under test, as (display name, unfitted estimator) pairs. The
# evaluation loops below iterate this list and print one line per entry.
models = [
    ("LogisticRegression", LogisticRegression()),
    ("NaiveBayes", GaussianNB()),
    ("RandomForest", RandomForestClassifier(n_estimators=20, n_jobs=-1)),
    ("QDA", QuadraticDiscriminantAnalysis()),
    # GaussianProcessClassifier ("GP") is deliberately left out of the run.
    # NOTE(review): presumably disabled for runtime reasons - confirm.
    ("KNN", KNeighborsClassifier()),
    # probability=True is required for SVC to expose predict_proba.
    ("SVC", SVC(probability=True)),
    ("MLP", MLPClassifier()),
    ("AdaBoosting", AdaBoostClassifier()),
    ("DecisionTree", DecisionTreeClassifier()),
]

In [14]:
# Test to make sure that multiple submissions from the same model aren't
# original.
#
# Ideally, all these models should return 0.
for name, model in models:
    predictions = similar_model(model, num_models=21)
    score = score_similar_models(predictions)
    print(name, score)
    

LogisticRegression 0.0
NaiveBayes 0.0
RandomForest 1.0
QDA 0.0
KNN 1.0
SVC 1.0
MLP 0.975
AdaBoosting 0.0
DecisionTree 1.0


In [13]:
# Find a small noise threshold where it passes originality.
#
# This tests robustness, and ideally these numbers should be high.
for name, model in models:
    thresh = find_noise_threshold(model)
    print(name, thresh)
    

LogisticRegression 0.03
NaiveBayes 0.19000000000000003
RandomForest 0.0
QDA 0.19000000000000003
KNN 0.0
SVC 0.0
MLP 0.02
AdaBoosting 0.01
DecisionTree 0.0
