# Generate and evaluate models

### Import training and test data

In [21]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.metrics import f1_score, mean_squared_error as mse
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from PU_Learning import *

def ignore_warnings():
    import warnings
    warnings.simplefilter(action='ignore', category=FutureWarning)
    warnings.simplefilter(action='ignore', category=Warning)

ignore_warnings()

In [2]:
# import training data
data = pd.read_csv('clean_data/players_17_clean_train.csv')
train_x = data.iloc[:, :-1].values
train_y = data.iloc[:, -1].values
s = data.iloc[:, -1].values
c = Counter(s)[1]/Counter(train_y)[1]

# import test data 
data = pd.read_csv('clean_data/players_17_clean_test.csv')
test_x = data.iloc[:, :-1].values
test_y = data.iloc[:, -1].values

### Golden Standard Classifier

In [3]:
# consider the dataset as fully labeled and use this as the best case in the comparison.
golden_clf = svm.SVC(kernel='rbf', probability=True, random_state=331).fit(np.copy(train_x),np.copy(train_y))
name = "Golden Standard Classifier:"

best_pred_y = golden_clf.predict(np.copy(test_x))
best_prob_y = golden_clf.predict_proba(np.copy(test_x))[:,1]

print(name,"F1 score:", f1_score(test_y, best_pred_y))

Golden Standard Classifier: F1 score: 0.45197740112994345


### Non-Traditional Classifier

In [4]:
non_trad_clf = svm.SVC(kernel='rbf', probability=True, random_state=331).fit(np.copy(train_x), np.copy(s))
name = "Non-Traditional Classifier"

pred_y = non_trad_clf.predict(np.copy(test_x))
pred_prob_y = non_trad_clf.predict_proba(np.copy(test_x))[:,1]

print(name,"F1 score:", f1_score(test_y, pred_y))
print(name,"MSE score:", mse(best_prob_y, pred_prob_y))

Non-Traditional Classifier F1 score: 0.45197740112994345
Non-Traditional Classifier MSE score: 0.0


### Spy Expectation Maximization S-EM

In [5]:
pu_classifier = SEM(tol=1.0e-10, max_iter=100, spy_prop=0.1, l=0.15, classifier=LogisticRegression(), seed=331)
name = "SEM"

pu_classifier.fit(np.copy(train_x), np.copy(s))

pred_y = pu_classifier.predict(np.copy(test_x))
pred_prob_y = pu_classifier.predict_proba(np.copy(test_x))[:,1]

print(name,"F1 score:", f1_score(test_y, pred_y))
print(name,"MSE score:", mse(best_prob_y, pred_prob_y))

Number of iterations first step: 100
Number of iterations second step: 100
SEM F1 score: 0.8607068607068608
SEM MSE score: 0.10447528062734837


### Modified Logistic Regression MLR

In [9]:
pu_classifier = ModifiedLogisticRegression(max_iter=100, l_rate=0.001, seed=331)
name = "MLR"

pu_classifier.parameters_update(np.copy(train_x), np.copy(s))
pu_classifier.fit(np.copy(train_x), np.copy(s))
pu_classifier.estimate_c()

pred_y = pu_classifier.predict(np.copy(test_x))
pred_prob_y = pu_classifier.predict_proba(np.copy(test_x))

print(name,"F1 score:", f1_score(test_y, pred_y))
print(name,"MSE score:", mse(best_prob_y, pred_prob_y))

MLR F1 score: 1.0
MLR MSE score: 0.44010903760372555


### 1-DNFII

In [10]:
# deviation indicates whether two feature values are considered the same
# if deviation = 0, the two feature values need to be exactly the same
# if deviation = 1, feature value one need to be equal to feature value two - 1
#                   OR feature value one need to be equal to feature value two + 1
#                   OR the two feature values are exactly the same
global deviation 
deviation = 0.1

# the parameter lambda used in 1-DNFII
global parameter_lambda 
parameter_lambda = 0.1

In [34]:
# calculate the frequency of a certain score for a certain feature
def frequency(feature, score, collection):
    count = 0
    for element in collection:
        if (element[feature] > score - deviation) and (element[feature] < score + deviation):
            count += 1
    return count

# algorithm improved 1-DNF (1-DNFII)
def dnf(train_x, train_y):
    num_rows, num_cols = train_x.shape
    
    # PF = positive feature set
    PF = np.zeros(shape=(0, 2))
    
    # RN = reliable negatives
    RN = np.zeros(shape=(0, num_cols))
    RN_y = np.array([])
    
    # U = unlabeled
    U = np.zeros(shape=(0, num_cols))
    U_y = np.array([])
    
    # P = positive
    P = np.zeros(shape=(0, num_cols))
    P_y = np.array([])
    
    # construct P, U and RN
    # at this moment: RN = U
    for i in range(0, len(train_y)-1):
        if (train_y[i] == 1):
            P = np.vstack([P, train_x[i]])
            P_y = np.append(P_y, train_y[i])
        else:
            U = np.vstack([U, train_x[i]])
            U_y = np.append(U_y, train_y[i])
            RN = np.vstack([RN, train_x[i]])
            RN_y = np.append(RN_y, train_y[i])
            
    print("1-DNFII: RN is initialized")

    # construct PF
    for i in range(0, num_cols-1):
        print("Progress constructing PF: {} %".format(round((i / num_cols)*100), 2))
        for j in range(0, num_rows-1):
            symbol = train_x[j][i]
            if (not [i, symbol] in PF):
                constraint_1 = frequency(i, symbol, P) / len(P)
                constraint_2 = frequency(i, symbol, U) / len(U)
                if (constraint_1 > constraint_2) and (constraint_1 > parameter_lambda):
                    new = [i, symbol]
                    PF = np.vstack([PF, new])
                
    # remove duplicates in PF
    # PF = np.unique(PF)
                
    print("1-DNFII: PF is constructed")
              
    # construct list with indices of elements to remove from RN
    # (based on PF)
    index = []        
    for i in range(0, len(RN)-1):
        for constraint in PF:
            element = RN[i]
            if element[int(constraint[0])] == constraint[1]:
                index.append(i)
              
    RN = np.delete(RN, index, 0)
    RN_y = np.delete(RN_y, index, 0)
                
    print("1-DNFII: RN was finalized")
    
    return P, P_y, U, U_y, RN, RN_y, PF

In [35]:
P, P_y, U, U_y, RN, RN_y, PF = dnf(train_x, train_y)

1-DNFII: RN is initialized
Progress constructing PF: 0 %
Progress constructing PF: 5 %
Progress constructing PF: 9 %
Progress constructing PF: 14 %
Progress constructing PF: 18 %
Progress constructing PF: 23 %
Progress constructing PF: 27 %
Progress constructing PF: 32 %
Progress constructing PF: 36 %
Progress constructing PF: 41 %
Progress constructing PF: 45 %
Progress constructing PF: 50 %
Progress constructing PF: 55 %
Progress constructing PF: 59 %
Progress constructing PF: 64 %
Progress constructing PF: 68 %
Progress constructing PF: 73 %
Progress constructing PF: 77 %
Progress constructing PF: 82 %
Progress constructing PF: 86 %
Progress constructing PF: 91 %
1-DNFII: PF is constructed
1-DNFII: RN was finalized


In [42]:
training_x = np.vstack([P, RN])
training_y = np.append(P_y, RN_y)

clf = make_pipeline(StandardScaler(), svm.SVC(gamma='auto'))
clf.fit(np.copy(training_x), np.copy(training_y))
name = "1-DNFII"

pred_y = clf.predict(np.copy(test_x))
#pred_prob_y = clf.predict_proba(np.copy(test_x))
            
print(name,"F1 score:", f1_score(test_y, pred_y))
#print(name,"MSE score:", mse(best_prob_y, pred_prob_y))

1-DNFII F1 score: 0.9757009345794392


In [15]:
print(PF)

[[ 0.          0.69387755]
 [ 1.          0.57692308]
 [ 2.          0.        ]
 [ 3.          0.20183486]
 [ 4.          0.75      ]
 [ 5.          0.5       ]
 [ 6.          0.76      ]
 [ 7.          0.69230769]
 [ 8.          0.82608696]
 [ 9.          0.74285714]
 [10.          0.33333333]
 [11.          0.77777778]
 [12.          0.94117647]
 [13.          0.72043011]
 [14.          0.79569892]
 [15.          0.89534884]
 [16.          0.76744186]
 [17.          0.75294118]
 [18.          0.76923077]
 [19.          0.79268293]
 [20.          0.77659574]]
