# Generate and evaluate homework models

### Import training and test data

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.metrics import f1_score, mean_squared_error as mse
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from PU_Learning import *

def ignore_warnings():
    """Install process-wide 'ignore' filters for FutureWarning and for Warning.

    Two filters are registered, in this order: one for ``FutureWarning`` and
    one for the base ``Warning`` class. ``warnings.simplefilter`` inserts each
    new entry at the front of the filter list, so the ``Warning`` entry ends
    up first. Since every warning category derives from ``Warning``, the net
    effect is that all warnings are suppressed.
    """
    import warnings

    for category in (FutureWarning, Warning):
        warnings.simplefilter('ignore', category)


# Silence warnings for everything executed after this cell.
ignore_warnings()

In [2]:
# --- Training split ---------------------------------------------------------
# Every column but the last is a feature; the last column holds the label.
data = pd.read_csv('clean_data/players_17_clean_train.csv')
train_x = data.iloc[:, :-1].values
train_y = data.iloc[:, -1].values

# NOTE(review): `s` is read from the very same column as `train_y`, so the two
# arrays are identical and `c` below always evaluates to 1.0. The recorded
# outputs of the "Non-Traditional Classifier" cell (same F1 as the golden
# classifier, MSE of 0.0) are consistent with that. If `s` is meant to be a
# partially-labeled (positive/unlabeled) indicator it has to come from a
# different source -- TODO confirm the intended column / labeling procedure.
s = data.iloc[:, -1].values

# Ratio of the count of 1s in `s` to the count of 1s in `train_y`.
labeled_positive_count = Counter(s)[1]
true_positive_count = Counter(train_y)[1]
c = labeled_positive_count / true_positive_count

# --- Test split -------------------------------------------------------------
# Same layout as the training file; note that `data` now refers to the test frame.
data = pd.read_csv('clean_data/players_17_clean_test.csv')
test_x = data.iloc[:, :-1].values
test_y = data.iloc[:, -1].values

### Golden Standard Classifier

In [3]:
# Best-case baseline: treat the training set as fully labeled and fit an
# RBF-kernel SVM directly on `train_y`.
name = "Golden Standard Classifier:"
golden_clf = svm.SVC(kernel='rbf', probability=True, random_state=331)
golden_clf.fit(np.copy(train_x), np.copy(train_y))

# Hard predictions feed the F1 score; the column-1 probabilities are kept in
# `best_prob_y` as the reference for the MSE scores computed in later cells.
best_pred_y = golden_clf.predict(np.copy(test_x))
best_prob_y = golden_clf.predict_proba(np.copy(test_x))[:, 1]

golden_f1 = f1_score(test_y, best_pred_y)
print(name, "F1 score:", golden_f1)

Golden Standard Classifier: F1 score: 0.6966759002770083


### Non-Traditional Classifier

In [4]:
# Same SVM configuration as the golden classifier, but fitted on `s` instead
# of `train_y`.
name = "Non-Traditional Classifier"
non_trad_clf = svm.SVC(kernel='rbf', probability=True, random_state=331)
non_trad_clf.fit(np.copy(train_x), np.copy(s))

pred_y = non_trad_clf.predict(np.copy(test_x))
pred_prob_y = non_trad_clf.predict_proba(np.copy(test_x))[:, 1]

# F1 is measured against the test labels; MSE is measured against the golden
# classifier's predicted probabilities (`best_prob_y`).
non_trad_f1 = f1_score(test_y, pred_y)
non_trad_mse = mse(best_prob_y, pred_prob_y)
print(name, "F1 score:", non_trad_f1)
print(name, "MSE score:", non_trad_mse)

Non-Traditional Classifier F1 score: 0.6966759002770083
Non-Traditional Classifier MSE score: 0.0


### Spy Expectation Maximization (S-EM)

In [5]:
# S-EM estimator from PU_Learning, wrapping a default LogisticRegression.
name = "SEM"
pu_classifier = SEM(
    tol=1.0e-10,
    max_iter=1000,
    spy_prop=0.1,
    l=0.15,
    classifier=LogisticRegression(),
    seed=331,
)

# Fit on `s`; each call receives its own copy of the input arrays.
pu_classifier.fit(np.copy(train_x), np.copy(s))

pred_y = pu_classifier.predict(np.copy(test_x))
pred_prob_y = pu_classifier.predict_proba(np.copy(test_x))[:, 1]

# F1 against the test labels, MSE against the golden classifier's probabilities.
sem_f1 = f1_score(test_y, pred_y)
sem_mse = mse(best_prob_y, pred_prob_y)
print(name, "F1 score:", sem_f1)
print(name, "MSE score:", sem_mse)

Number of iterations first step: 229
Number of iterations second step: 212
SEM F1 score: 0.3798687089715536
SEM MSE score: 0.7290650355068272


### Modified Logistic Regression (MLR)

In [6]:
# Modified Logistic Regression estimator from PU_Learning, trained on `s`.
# Note: this rebinds `pu_classifier`, `name`, `pred_y` and `pred_prob_y` from the previous cell.
pu_classifier = ModifiedLogisticRegression(max_iter=1000, l_rate=0.001, seed=331)
name = "MLR"

# NOTE(review): `parameters_update` is called once before `fit`, and
# `estimate_c` once after it; presumably this order is required by the
# PU_Learning implementation -- confirm against that module before reordering.
pu_classifier.parameters_update(np.copy(train_x), np.copy(s))
pu_classifier.fit(np.copy(train_x), np.copy(s))
pu_classifier.estimate_c()

pred_y = pu_classifier.predict(np.copy(test_x))
# Unlike the other cells, the result of predict_proba is used as-is (no `[:, 1]`
# column selection) -- presumably this estimator returns a 1-D array of
# positive-class probabilities; TODO confirm.
pred_prob_y = pu_classifier.predict_proba(np.copy(test_x))

# F1 against the test labels, MSE against the golden classifier's probabilities.
print(name,"F1 score:", f1_score(test_y, pred_y))
print(name,"MSE score:", mse(best_prob_y, pred_prob_y))

MLR F1 score: 0.3798687089715536
MLR MSE score: 0.7303110999703546
