# Generate and evaluate models

### Import training and test data

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.metrics import f1_score, mean_squared_error as mse
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from PU_Learning import *

def ignore_warnings():
    import warnings
    warnings.simplefilter(action='ignore', category=FutureWarning)
    warnings.simplefilter(action='ignore', category=Warning)

ignore_warnings()

In [2]:
# import training data
data = pd.read_csv('clean_data/players_17_clean_train.csv')
train_x = data.iloc[:, :-1].values
train_y = data.iloc[:, -1].values
s = data.iloc[:, -1].values
c = Counter(s)[1]/Counter(train_y)[1]

# import test data 
data = pd.read_csv('clean_data/players_17_clean_test.csv')
test_x = data.iloc[:, :-1].values
test_y = data.iloc[:, -1].values

### Golden Standard Classifier

In [3]:
# consider the dataset as fully labeled and use this as the best case in the comparison.
golden_clf = svm.SVC(kernel='rbf', probability=True, random_state=331).fit(np.copy(train_x),np.copy(train_y))
name = "Golden Standard Classifier:"

best_pred_y = golden_clf.predict(np.copy(test_x))
best_prob_y = golden_clf.predict_proba(np.copy(test_x))[:,1]

print(name,"F1 score:", f1_score(test_y, best_pred_y))

Golden Standard Classifier: F1 score: 0.6432673899170389


### Non-Traditional Classifier

In [None]:
non_trad_clf = svm.SVC(kernel='rbf', probability=True, random_state=331).fit(np.copy(train_x), np.copy(s))
name = "Non-Traditional Classifier"

pred_y = non_trad_clf.predict(np.copy(test_x))
pred_prob_y = non_trad_clf.predict_proba(np.copy(test_x))[:,1]

print(name,"F1 score:", f1_score(test_y, pred_y))
print(name,"MSE score:", mse(best_prob_y, pred_prob_y))

### Spy Expectation Maximization S-EM

In [None]:
pu_classifier = SEM(tol=1.0e-10, max_iter=1000, spy_prop=0.1, l=0.15, classifier=LogisticRegression(), seed=331)
name = "SEM"

pu_classifier.fit(np.copy(train_x), np.copy(s))

pred_y = pu_classifier.predict(np.copy(test_x))
pred_prob_y = pu_classifier.predict_proba(np.copy(test_x))[:,1]

print(name,"F1 score:", f1_score(test_y, pred_y))
print(name,"MSE score:", mse(best_prob_y, pred_prob_y))

### Modified Logistic Regression MLR

In [3]:
pu_classifier = ModifiedLogisticRegression(max_iter=1000, l_rate=0.001, seed=331)
name = "MLR"

pu_classifier.parameters_update(np.copy(train_x), np.copy(s))
pu_classifier.fit(np.copy(train_x), np.copy(s))
pu_classifier.estimate_c()

pred_y = pu_classifier.predict(np.copy(test_x))
pred_prob_y = pu_classifier.predict_proba(np.copy(test_x))

print(name,"F1 score:", f1_score(test_y, pred_y))
print(name,"MSE score:", mse(best_prob_y, pred_prob_y))

KeyboardInterrupt: 

### 1-DNFII

In [3]:
# deviation indicates whether two feature values are considered the same
# if deviation = 0, the two feature values need to be exactly the same
# if deviation = 1, feature value one need to be equal to feature value two - 1
#                   OR feature value one need to be equal to feature value two + 1
#                   OR the two feature values are exactly the same
global deviation 
deviation = 0.1

# the parameter lambda used in 1-DNFII
global parameter_lambda 
parameter_lambda = 0.1

In [4]:
# calculate the frequency of a certain score for a certain feature
def frequency(feature, score, collection):
    count = 0
    for element in collection:
        if (element[feature] > score - deviation) and (element[feature] < score + deviation):
            count += 1
    return count

# algorithm improved 1-DNF (1-DNFII)
def dnf(train_x, train_y):
    num_rows, num_cols = train_x.shape
    
    # PF = positive feature set
    PF = np.zeros(shape=(0, 2))
    
    # RN = reliable negatives
    RN = np.zeros(shape=(0, num_cols))
    RN_y = np.array([])
    
    # U = unlabeled
    U = np.zeros(shape=(0, num_cols))
    U_y = np.array([])
    
    # P = positive
    P = np.zeros(shape=(0, num_cols))
    P_y = np.array([])
    
    # construct P, U and RN
    # at this moment: RN = U
    for i in range(0, len(train_y)-1):
        if (train_y[i] == 1):
            P = np.vstack([P, train_x[i]])
            P_y = np.append(P_y, train_y[i])
        else:
            U = np.vstack([U, train_x[i]])
            U_y = np.append(U_y, train_y[i])
            RN = np.vstack([RN, train_x[i]])
            RN_y = np.append(RN_y, train_y[i])
            
    print("1-DNFII: RN is initialized")

    # construct PF
    for i in range(0, num_cols-1):
        for j in range(0, num_rows-1):
            symbol = train_x[j][i]
            if (not [i, symbol] in PF):
                constraint_1 = frequency(i, symbol, P) / len(P)
                constraint_2 = frequency(i, symbol, U) / len(U)
                if (constraint_1 > constraint_2) and (constraint_1 > parameter_lambda):
                    new = [i, symbol]
                    PF = np.vstack([PF, new])
                
    print("1-DNFII: PF is constructed")
              
    # construct list with indices of elements to remove from RN
    # (based on PF)
    index = []        
    for i in range(0, len(RN)-1):
        for constraint in PF:
            element = RN[i]
            if element[int(constraint[0])] == constraint[1]:
                index.append(i)
              
    RN = np.delete(RN, index, 0)
    RN_y = np.delete(RN_y, index, 0)
                
    print("1-DNFII: RN was finalized")
    
    return P, P_y, U, U_y, RN, RN_y, PF

In [5]:
P, P_y, U, U_y, RN, RN_y, PF = dnf(train_x, train_y)

1-DNFII: RN is initialized
1-DNFII: PF is constructed
1-DNFII: RN was finalized


In [6]:
# returns a numpy array that holds all the elements of a that are not in b
def setdiff_nd_positivenums(a, b):
    s = np.maximum(a.max(0)+1,b.max(0)+1)
    return a[~np.isin(a.dot(s),b.dot(s))]

In [7]:
FinalWVClassifier = []
FinalWVClassifier_weights = []

# shuffle rows of P and then delete the first 10%
P = np.insert(P, len(P[0]), P_y, axis=1)
np.random.shuffle(P)
PP = P[:round(len(P)/10),:]
PP_y = PP[:, -1]
PP = np.delete(PP, -1, 1)
index = [x for x in range(0, round(len(P)/10))]
P = np.delete(P, index, 0)
P_y = P[:, -1]
P = np.delete(P, -1, 1)

PON = np.vstack([P, RN])
PON_y = np.append(P_y, RN_y)

RN = np.insert(RN, len(RN[0]), RN_y, axis=1)

U = np.insert(U, len(U[0]), U_y, axis=1)
U = setdiff_nd_positivenums(U, RN)
U_y = U[:, -1]
U = np.delete(U, -1, 1)

allPrecision = 0

last_U = 0

while (True):
    if (len(U) == 0):
        break
    
    # create a new SVM
    new_clf = make_pipeline(StandardScaler(), svm.SVC(gamma='auto'))
    
    # round elements of PON_y to their nearest integer
    PON_y = np.rint(PON_y)
    
    # train the new SVM
    new_clf.fit(np.copy(PON), np.copy(PON_y))
    
    # predict the labels of the class U
    NEG = new_clf.predict(np.copy(U))
    
    # predict the labels of the class PP
    predictions = new_clf.predict(np.copy(PP))
    
    # calculate the precision of PP on the trained SVM
    precision = len(predictions[predictions == 1]) / len(predictions)
    #precision = f1_score(np.ones(len(predictions)), predictions)
    allPrecision += precision
    
    # initialize list with indexes that will be removed from U
    index = []
    
    # check which elements can be removed from U
    for i in range(0, len(NEG)):
        if (NEG[i] == 0):
            index.append(i)
            PON = np.vstack([PON, U[i, :]])
            PON_y = np.append(PON_y, NEG[i])
    
    # remove elements of U
    U = np.delete(U, index, 0)
    
    if (len(U) == last_U):
        break
        
    last_U = len(U)
    
    print("size U: " + str(len(U)) + " precision: " + str(precision) + " all: " + str(allPrecision))
    
    # add classifier to list of classifiers
    FinalWVClassifier.append(new_clf)
    FinalWVClassifier_weights.append(precision)
    
for i in range(0, len(FinalWVClassifier_weights)):
    FinalWVClassifier_weights[i] = FinalWVClassifier_weights[i] / allPrecision

print("------------------")
print("A weighted voting method with " + str(len(FinalWVClassifier)) + " classifiers was created.")

size U: 7547 precision: 0.9303482587064676 all: 0.9303482587064676
size U: 6950 precision: 0.9203980099502488 all: 1.8507462686567164
size U: 6234 precision: 0.900497512437811 all: 2.7512437810945274
size U: 5606 precision: 0.8756218905472637 all: 3.626865671641791
size U: 4635 precision: 0.845771144278607 all: 4.472636815920398
size U: 3285 precision: 0.8159203980099502 all: 5.288557213930349
size U: 2020 precision: 0.7711442786069652 all: 6.0597014925373145
size U: 1161 precision: 0.746268656716418 all: 6.805970149253732
size U: 743 precision: 0.6915422885572139 all: 7.497512437810946
size U: 529 precision: 0.6716417910447762 all: 8.169154228855723
size U: 430 precision: 0.6417910447761194 all: 8.810945273631843
size U: 383 precision: 0.6268656716417911 all: 9.437810945273634
size U: 350 precision: 0.6169154228855721 all: 10.054726368159207
size U: 338 precision: 0.6169154228855721 all: 10.67164179104478
size U: 325 precision: 0.6169154228855721 all: 11.288557213930352
size U: 316 pr

In [8]:
# predict the label of the elements of the given list with the given classifiers and weigths
# returns a list with these labels
def predict_weighted_voting_list(classifiers, weights, listt):
    result = []
    for elem in listt:
        result.append(predict_weighted_voting(classifiers, weights, elem))
    return result
    
# predict the label of x with the given classifiers and weigths
def predict_weighted_voting(classifiers, weigths, x):
    score = 0
    for i in range(0, len(FinalWVClassifier)):
        score += FinalWVClassifier_weights[i] * FinalWVClassifier[i].predict([x])
    return round(score[0])

In [9]:
name = "1-DNFII"
pred_y = predict_weighted_voting_list(FinalWVClassifier, FinalWVClassifier_weights, np.copy(test_x))         
print(name,"F1 score:", f1_score(test_y, pred_y))

1-DNFII F1 score: 0.7019027484143764
