# Generate models

In [1]:
%reload_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.metrics import f1_score, mean_squared_error as mse
from sklearn import svm
from sklearn.linear_model import LogisticRegression

def ignore_warnings():
    """Install 'ignore' filters so warnings are not shown in the notebook output.

    A filter is registered for ``FutureWarning`` first and then for the base
    ``Warning`` class; the latter also matches every other warning category.
    """
    from warnings import simplefilter

    for category in (FutureWarning, Warning):
        simplefilter('ignore', category)


# Silence warnings before the remaining cells run.
ignore_warnings()

from PU_Learning import *

In [2]:
# Training split: the last CSV column is the label, every column before it is a feature.
data = pd.read_csv('players_17_clean_train.csv')
X1_train, y1_train = data.iloc[:, :-1].values, data.iloc[:, -1].values

# NOTE(review): s1 is read from the same last column as y1_train, so both label
# vectors hold the same values and c1 below evaluates to 1.0 whenever that column
# contains at least one label equal to 1 -- confirm this is the intended PU setup.
s1 = data.iloc[:, -1].values
c1 = Counter(s1)[1] / Counter(y1_train)[1]

# Test split, sliced the same way; `data` is rebound to the test frame from here on.
data = pd.read_csv('players_17_clean_test.csv')
X1_test, y1_test = data.iloc[:, :-1].values, data.iloc[:, -1].values

In [3]:
# Show how many test rows carry each label value.
# NOTE(review): the recorded output lists only the label 1.0 (150 rows). If the
# test split really has no negatives, there can be no false positives and the F1
# scores reported further down depend only on how many rows are predicted
# positive -- confirm the test split is what was intended.
Counter(y1_test)

Counter({1.0: 150})

### Golden Standard Classifier

In [4]:
# Best-case reference: train an RBF-kernel SVC on (X, y) as if the dataset were
# fully labeled, so the other models can be compared against it.
golden_clf = svm.SVC(random_state=331, probability=True, kernel='rbf')
golden_clf.fit(np.copy(X1_train), np.copy(y1_train))
name = "Golden Standard Classifier:"

# Hard predictions feed the F1 score; column 1 of predict_proba is kept as the
# reference vector for the MSE comparisons in the later cells.
y_best_pred_1, y_best_prob_1 = (
    golden_clf.predict(np.copy(X1_test)),
    golden_clf.predict_proba(np.copy(X1_test))[:, 1],
)

print(name, "F1 score:", f1_score(y1_test, y_best_pred_1))

Golden Standard Classifier: F1 score: 0.2758620689655173


### Non-Traditional Classifier (first dataset)

In [5]:
# Baseline: train the same kind of SVC directly on (X, s1) and score it so it can
# be compared with the two PU methods below.
non_trad_clf = svm.SVC(random_state=331, probability=True, kernel='rbf')
non_trad_clf.fit(np.copy(X1_train), np.copy(s1))
name = "Non-Traditional Classifier"

y_pred, y_pred_prob = (
    non_trad_clf.predict(np.copy(X1_test)),
    non_trad_clf.predict_proba(np.copy(X1_test))[:, 1],
)

# F1 is measured against the test labels; the MSE compares the column-1
# probabilities with those of the golden classifier.
# NOTE(review): the recorded MSE of 0.0 is consistent with s1 and y1_train being
# read from the same column in the data-loading cell -- confirm.
print(name, "F1 score:", f1_score(y1_test, y_pred))
print(name, "MSE score:", mse(y_best_prob_1, y_pred_prob))

Non-Traditional Classifier F1 score: 0.2758620689655173
Non-Traditional Classifier MSE score: 0.0


### Spy Expectation Maximization (S-EM)

In [6]:
# S-EM model wrapping a logistic-regression base classifier.
# `SEM` is presumably provided by the PU_Learning star import -- verify.
pu_classifier = SEM(
    classifier=LogisticRegression(),
    spy_prop=0.1,
    l=0.15,
    tol=1.0e-10,
    max_iter=10000,
    seed=331,
)
name = "SEM"

# Fit on the features and the s1 labels.
pu_classifier.fit(np.copy(X1_train), np.copy(s1))

# Hard predictions and column-1 probabilities on the test features.
y_pred, y_pred_prob = (
    pu_classifier.predict(np.copy(X1_test)),
    pu_classifier.predict_proba(np.copy(X1_test))[:, 1],
)

# Same two metrics as the cells above: F1 against the test labels, MSE against
# the golden classifier's probabilities.
print(name, "F1 score:", f1_score(y1_test, y_pred))
print(name, "MSE score:", mse(y_best_prob_1, y_pred_prob))

Number of iterations first step: 786
Number of iterations second step: 581
SEM F1 score: 1.0
SEM MSE score: 0.7118504579726814


### Modified Logistic Regression (MLR)

In [7]:
from PU_Learning import *

# Build the Modified Logistic Regression model.
# `ModifiedLogisticRegression` is presumably provided by the PU_Learning star import -- verify.
pu_classifier = ModifiedLogisticRegression(max_iter = 10000, l_rate = 0.001, seed = 331)
name = "MLR"

# Call parameters_update directly on the training data and print the resulting
# b, w and log-likelihood before fit is invoked.
# NOTE(review): whether fit re-initialises the parameters afterwards is not
# visible here -- confirm this diagnostic call does not influence the fitted model.
pu_classifier.parameters_update(np.copy(X1_train), np.copy(s1))
print("parameter_b", pu_classifier.b)
print("parameter_w", pu_classifier.w)
print("log-likelihood", pu_classifier.log_likelihood(np.copy(X1_train), np.copy(s1)))

# Fit on the features and the s1 labels.
pu_classifier.fit(np.copy(X1_train), np.copy(s1))

# The return value of estimate_c is not captured; presumably the estimate is
# stored on the object for predict/predict_proba -- verify.
pu_classifier.estimate_c()

y_pred = pu_classifier.predict(np.copy(X1_test))

# Unlike the SVC and SEM cells, the predict_proba result is used as-is (no
# [:, 1] column selection); presumably this class returns a 1-D array -- confirm.
y_pred_prob = pu_classifier.predict_proba(np.copy(X1_test))

# Same two metrics as the cells above: F1 against the test labels, MSE against
# the golden classifier's probabilities.
print(name,"F1 score:", f1_score(y1_test, y_pred))
print(name,"MSE score:", mse(y_best_prob_1, y_pred_prob))

parameter_b 12.003
parameter_w [1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
log-likelihood -1596.2726101803817
MLR F1 score: 1.0
MLR MSE score: 0.8156439219978434
