# Course Section K1. Weekcurve 1-NN Classification with 10-fold CV

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from dtaidistance.dtw import distance as dtw_dist

# Filtering warnings
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter(action='ignore', category=FutureWarning)

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_validate
%matplotlib inline

In [33]:
import Fred as fred
import math

# Create own Distance Measures and import from Fred-Frechet

# Discrete Frechet
def disc_frechet(x, y):
    a = fred.Curve(x)
    b = fred.Curve(y)
    dist = fred.discrete_frechet(a, b)
    return dist.value

# Discrete Dynamic Time Warping
def disc_dtw(x, y):
    a = fred.Curve(x)
    b = fred.Curve(y)
    dist = fred.discrete_dynamic_time_warping(a, b)
    return dist.value

# Discrete Dynamic Time Warping with traversal constraint
def window_dtw(x, y, w=4):
    n = len(x)
    m = len(y)

    # maximal possible windowsize
    w = max(w, abs(n-m))

    # distancematrix filled with infinity
    dtw = np.full((n, m), math.inf)
    dtw[0, 0] = 0

    # all possible paths filled with zeros
    for i in range(1, n):
        for j in range(max(1, i-w), min(m, i+w)):
            cost = (x[i] - y[j])**2
            dtw[i, j] = cost + min(dtw[i-1, j],
                                   dtw[i, j-1],
                                   dtw[i-1, j-1])

    return math.sqrt(dtw[n-1, m-1])

# Discrete Frechet with traversal constraint
def window_disc_frechet(x, y, w=4):
    n = len(x)
    m = len(y)

    # maximal possible windowsize
    w = max(w, abs(n-m))

    # distancematrix filled with infinity
    dfre = np.full((n, m), math.inf)
    dfre[0, 0] = 0

    # all possible paths filled with zeros
    for i in range(1, n):
        for j in range(max(1, i-w), min(m, i+w)):
            cost = (x[i] - y[j])**2
            dfre[i, j] = max(cost, min(dfre[i-1, j],
                                       dfre[i, j-1],
                                       dfre[i-1, j-1]))

    return math.sqrt(dfre[n-1, m-1])

def window_df(x, y, w=4, p=2):
    n = len(x) 
    m = len(y) 

    # maximal possible windowsize
    w = max(w, abs(n-m))

    # distancematrix filled with infinity
    dtw = np.full((n, m), math.inf)
    dtw[0, 0] = 0
    
    for i in range(0,n):
        for j in range(max(0, i-w), min(m, i+w)):
            dtw[i, j] = 0
    
    
    # all possible paths filled with zeros
    for i in range(n):
        for j in range(max(0, i-w), min(m, i+w)):
            dtw[i,j] = abs(x[i] - y[j]) ** p
            if i > 0 or j > 0:
                dtw[i, j] = max(dtw[i,j], min(dtw[i-1, j] if i > 0 else math.inf,
                                              dtw[i, j-1] if j > 0 else math.inf,
                                              dtw[i-1, j-1] if (i > 0 and j > 0) else math.inf
                                             )
                               )
     
    return (dtw[n-1, m-1]) ** (1/p)

# k-greatest-distances-Mahnhattandistanz
def k_greatest_manhattan(x, y, w=6):
    dists = np.abs(x - y)
    return np.sum(np.sort(dists)[-w:][::-1])

# Earth mover's distance
from scipy.stats import wasserstein_distance

def emd(u, v):
    # create bin-vectors
    bin_u = [i for i in range(0, len(u))]
    bin_v = [i for i in range(0, len(v))]
    
    # normalize input vectors to unit-vectors
    u_norm = u / np.linalg.norm(u, ord=1)
    v_norm = v / np.linalg.norm(v, ord=1)
    
    return wasserstein_distance(bin_u, bin_v, u_norm, v_norm)

def dtw_ai(x, y, w=None):
    return dtw_dist(x, y, window=w, use_c=True)

In [15]:
ofEx = pd.read_csv("")

In [16]:
# -97 rausschmeissen
ofEx = ofEx.drop(ofEx[ofEx.final_result < 0].index)

In [17]:
y = ofEx['final_result']
X = ofEx.drop(['id', 'final_result', 'fem', 'fg', 'urm'], axis=1)

In [18]:
X.head()

Unnamed: 0,week0,week1,week2,week3,week4,week5,week6,week7,week8,week9,week10,week11
0,315,241,93,366,117,117,309,167,223,114,327,65
1,444,285,223,374,103,82,93,233,74,59,492,65
2,292,112,91,379,90,93,43,321,25,42,204,209
3,304,161,159,171,85,70,89,272,44,67,212,3
4,106,114,96,359,98,53,81,275,41,68,60,78


## A vs. C bis F

In [19]:
X = X.drop(X[(y > 6) & (y < 10)].index)
y = y[X.index]

In [20]:
X.head()

Unnamed: 0,week0,week1,week2,week3,week4,week5,week6,week7,week8,week9,week10,week11
1,444,285,223,374,103,82,93,233,74,59,492,65
3,304,161,159,171,85,70,89,272,44,67,212,3
5,131,13,66,243,18,0,15,145,0,23,7,39
6,62,91,97,191,76,44,69,208,28,33,188,24
7,177,259,272,665,218,172,50,428,67,84,335,30


In [21]:
len(X)

221

In [22]:
len(y)

221

In [23]:
# Logarithmieren
X_l = np.log1p(X) 

In [24]:
# Zu binärem Problem machen
y[y < 10] = 0
y[y > 0] = 1

In [25]:
y.value_counts()

1    166
0     55
Name: final_result, dtype: int64

In [39]:
distances = {'Manhattan':'cityblock', 'Euclidean': 'euclidean', 'Maximum': 'chebyshev',
             'DF': disc_frechet, 'DTW': dtw_ai, 'WDF': window_df, 'WDTW': window_dtw,
             'k_g_Manhattan': k_greatest_manhattan, 'EMD': emd
                     }
nachbar = 1
scoring = ['accuracy', 'roc_auc', 'precision', 'recall', 'f1']

df = pd.DataFrame(columns = ['distance', 'accuracy', 'roc_auc', 'precision', 'recall', 'f1'], 
                  index = [0,1,2,3,4,5,6,7,8])
 
df['distance'] = distances

row = 0
for key, dist in distances.items():
    knn = KNeighborsClassifier(n_neighbors=nachbar, metric=dist)
    scores = cross_validate(knn, X_l, y, scoring=scoring, n_jobs=-1,
                            cv=10, return_train_score=True)
    score = [key, np.mean(scores['test_accuracy']), np.mean(scores['test_roc_auc']), 
             np.mean(scores['test_precision']), np.mean(scores['test_recall']), 
             np.mean(scores['test_f1'])]

    df.loc[row, 0:] = score

    row = row + 1

print(df.to_latex(index=False))
df


\begin{tabular}{llllll}
\toprule
     distance &  accuracy &   roc\_auc & precision &    recall &        f1 \\
\midrule
    Manhattan &  0.687945 &  0.535049 &  0.770649 &  0.836765 &  0.799495 \\
    Euclidean &  0.710672 &  0.575392 &  0.792221 &  0.844118 &  0.814267 \\
      Maximum &  0.719763 &    0.6019 &  0.801401 &  0.837132 &    0.8165 \\
           DF &   0.74249 &  0.602463 &  0.796425 &  0.884926 &  0.836605 \\
          DTW &  0.720158 &  0.596507 &   0.79769 &  0.843015 &   0.81852 \\
          WDF &  0.760474 &  0.644338 &  0.818563 &  0.878676 &  0.845525 \\
         WDTW &  0.738538 &  0.616066 &   0.80503 &  0.862132 &  0.830078 \\
k\_g\_Manhattan &  0.724111 &  0.596483 &  0.803037 &  0.849632 &  0.822417 \\
          EMD &  0.725099 &  0.624902 &  0.816801 &  0.826471 &  0.818853 \\
\bottomrule
\end{tabular}



Unnamed: 0,distance,accuracy,roc_auc,precision,recall,f1
0,Manhattan,0.687945,0.535049,0.770649,0.836765,0.799495
1,Euclidean,0.710672,0.575392,0.792221,0.844118,0.814267
2,Maximum,0.719763,0.6019,0.801401,0.837132,0.8165
3,DF,0.74249,0.602463,0.796425,0.884926,0.836605
4,DTW,0.720158,0.596507,0.79769,0.843015,0.81852
5,WDF,0.760474,0.644338,0.818563,0.878676,0.845525
6,WDTW,0.738538,0.616066,0.80503,0.862132,0.830078
7,k_g_Manhattan,0.724111,0.596483,0.803037,0.849632,0.822417
8,EMD,0.725099,0.624902,0.816801,0.826471,0.818853


In [38]:
df = pd.DataFrame(columns = ['window', 'accuracy', 'roc_auc', 'precision', 'recall', 'f1'], 
                  index = [0,1,2,3,4,5,6,7,8,9,10,11])
 
df['window'] = [0,1,2,3,4,5,6,7,8,9,10,11]
scoring = ['accuracy', 'roc_auc', 'precision', 'recall', 'f1']

for i in range(0,12):
    knn = KNeighborsClassifier(n_neighbors=nachbar, metric=dtw_ai, metric_params={'w': i})
    scores = cross_validate(knn, X_l, y, scoring=scoring, n_jobs=-1,
                            cv=10, return_train_score=True)
    score = [i,np.mean(scores['test_accuracy']), np.mean(scores['test_roc_auc']), 
             np.mean(scores['test_precision']), np.mean(scores['test_recall']), 
             np.mean(scores['test_f1'])]
    df.loc[i, 0:] = score

print('WDTW')
print(df.to_latex(index=False))
df

WDTW
\begin{tabular}{rlllll}
\toprule
 window &  accuracy &   roc\_auc & precision &    recall &        f1 \\
\midrule
      0 &  0.720158 &  0.596507 &   0.79769 &  0.843015 &   0.81852 \\
      1 &  0.710672 &  0.575392 &  0.792221 &  0.844118 &  0.814267 \\
      2 &  0.724506 &  0.610049 &  0.806519 &  0.836765 &  0.819886 \\
      3 &  0.733597 &  0.624841 &  0.811832 &  0.843015 &  0.825744 \\
      4 &  0.729051 &  0.616507 &  0.807739 &  0.843015 &  0.823391 \\
      5 &  0.720158 &  0.598174 &  0.798425 &  0.843015 &  0.818731 \\
      6 &  0.724704 &  0.606507 &  0.803204 &  0.843015 &  0.821193 \\
      7 &  0.724704 &  0.606507 &  0.803204 &  0.843015 &  0.821193 \\
      8 &  0.720158 &  0.596507 &   0.79769 &  0.843015 &   0.81852 \\
      9 &  0.720158 &  0.596507 &   0.79769 &  0.843015 &   0.81852 \\
     10 &  0.720158 &  0.596507 &   0.79769 &  0.843015 &   0.81852 \\
     11 &  0.720158 &  0.596507 &   0.79769 &  0.843015 &   0.81852 \\
\bottomrule
\end{tabular}



Unnamed: 0,window,accuracy,roc_auc,precision,recall,f1
0,0,0.720158,0.596507,0.79769,0.843015,0.81852
1,1,0.710672,0.575392,0.792221,0.844118,0.814267
2,2,0.724506,0.610049,0.806519,0.836765,0.819886
3,3,0.733597,0.624841,0.811832,0.843015,0.825744
4,4,0.729051,0.616507,0.807739,0.843015,0.823391
5,5,0.720158,0.598174,0.798425,0.843015,0.818731
6,6,0.724704,0.606507,0.803204,0.843015,0.821193
7,7,0.724704,0.606507,0.803204,0.843015,0.821193
8,8,0.720158,0.596507,0.79769,0.843015,0.81852
9,9,0.720158,0.596507,0.79769,0.843015,0.81852


In [40]:
df = pd.DataFrame(columns = ['window', 'accuracy', 'roc_auc', 'precision', 'recall', 'f1'], 
                  index = [0,1,2,3,4,5,6,7,8,9,10,11])
 
df['window'] = [0,1,2,3,4,5,6,7,8,9,10,11]
scoring = ['accuracy', 'roc_auc', 'precision', 'recall', 'f1']

for i in range(0,12):
    knn = KNeighborsClassifier(n_neighbors=nachbar, metric=window_df, metric_params={'w': i})
    scores = cross_validate(knn, X_l, y, scoring=scoring, n_jobs=-1,
                            cv=10, return_train_score=True)
    score = [i,np.mean(scores['test_accuracy']), np.mean(scores['test_roc_auc']), 
             np.mean(scores['test_precision']), np.mean(scores['test_recall']), 
             np.mean(scores['test_f1'])]
    df.loc[i, 0:] = score

print('WDF')
print(df.to_latex(index=False))
df

WDF
\begin{tabular}{rlllll}
\toprule
 window &  accuracy &   roc\_auc & precision &    recall &        f1 \\
\midrule
      0 &  0.751186 &       0.5 &  0.751186 &       1.0 &   0.85774 \\
      1 &  0.760672 &  0.632255 &  0.813286 &  0.891176 &  0.847576 \\
      2 &  0.760672 &  0.651581 &    0.8235 &  0.873162 &  0.844816 \\
      3 &  0.751383 &  0.638456 &  0.816631 &  0.866912 &  0.839177 \\
      4 &  0.760474 &  0.644338 &  0.818563 &  0.878676 &  0.845525 \\
      5 &  0.751581 &  0.620797 &  0.806644 &  0.884926 &  0.841778 \\
      6 &  0.751581 &  0.620797 &  0.806644 &  0.884926 &  0.841778 \\
      7 &  0.751581 &  0.620797 &  0.806644 &  0.884926 &  0.841778 \\
      8 &   0.74249 &  0.602463 &  0.796425 &  0.884926 &  0.836605 \\
      9 &   0.74249 &  0.602463 &  0.796425 &  0.884926 &  0.836605 \\
     10 &   0.74249 &  0.602463 &  0.796425 &  0.884926 &  0.836605 \\
     11 &   0.74249 &  0.602463 &  0.796425 &  0.884926 &  0.836605 \\
\bottomrule
\end{tabular}



Unnamed: 0,window,accuracy,roc_auc,precision,recall,f1
0,0,0.751186,0.5,0.751186,1.0,0.85774
1,1,0.760672,0.632255,0.813286,0.891176,0.847576
2,2,0.760672,0.651581,0.8235,0.873162,0.844816
3,3,0.751383,0.638456,0.816631,0.866912,0.839177
4,4,0.760474,0.644338,0.818563,0.878676,0.845525
5,5,0.751581,0.620797,0.806644,0.884926,0.841778
6,6,0.751581,0.620797,0.806644,0.884926,0.841778
7,7,0.751581,0.620797,0.806644,0.884926,0.841778
8,8,0.74249,0.602463,0.796425,0.884926,0.836605
9,9,0.74249,0.602463,0.796425,0.884926,0.836605


In [41]:
nachbar=1
df = pd.DataFrame(columns = ['window', 'accuracy', 'roc_auc', 'precision', 'recall', 'f1'], 
                  index = [0,1,2,3,4,5,6,7,8,9,10,11])
 
df['window'] = [0,1,2,3,4,5,6,7,8,9,10,11]
scoring = ['accuracy', 'roc_auc', 'precision', 'recall', 'f1']

for i in range(0,12):
    knn = KNeighborsClassifier(n_neighbors=nachbar,metric=k_greatest_manhattan, metric_params={'w':i})
    scores = cross_validate(knn, X_l, y, scoring=scoring, n_jobs=-1,
                            cv=5, return_train_score=True)
    score = [i,np.mean(scores['test_accuracy']), np.mean(scores['test_roc_auc']), 
             np.mean(scores['test_precision']), np.mean(scores['test_recall']), 
             np.mean(scores['test_f1'])]
    df.loc[i, 0:] = score

print('kgMan')
print(df.to_latex(index=False))
df  

kgMan
\begin{tabular}{rlllll}
\toprule
 window &  accuracy &   roc\_auc & precision &    recall &        f1 \\
\midrule
      0 &  0.719293 &  0.569964 &  0.783853 &  0.867201 &  0.822713 \\
      1 &  0.728384 &  0.612745 &  0.806635 &  0.843672 &  0.823474 \\
      2 &  0.746566 &  0.630838 &  0.814236 &  0.861676 &  0.836052 \\
      3 &  0.728889 &  0.606863 &  0.801527 &  0.850089 &  0.823463 \\
      4 &  0.719596 &  0.582442 &  0.789085 &  0.855793 &  0.820238 \\
      5 &  0.728384 &  0.600446 &  0.799011 &  0.855437 &  0.825096 \\
      6 &  0.737475 &  0.612567 &  0.804894 &  0.861497 &  0.831066 \\
      7 &  0.741919 &  0.609447 &  0.802879 &   0.87344 &  0.835653 \\
      8 &  0.741919 &  0.609447 &  0.801802 &   0.87344 &  0.835389 \\
      9 &  0.723636 &  0.585116 &  0.791344 &  0.861141 &  0.823638 \\
     10 &  0.705758 &  0.560963 &  0.780456 &  0.849198 &   0.81237 \\
     11 &  0.714747 &  0.566934 &  0.782606 &  0.861141 &  0.819094 \\
\bottomrule
\end{tabular}



Unnamed: 0,window,accuracy,roc_auc,precision,recall,f1
0,0,0.719293,0.569964,0.783853,0.867201,0.822713
1,1,0.728384,0.612745,0.806635,0.843672,0.823474
2,2,0.746566,0.630838,0.814236,0.861676,0.836052
3,3,0.728889,0.606863,0.801527,0.850089,0.823463
4,4,0.719596,0.582442,0.789085,0.855793,0.820238
5,5,0.728384,0.600446,0.799011,0.855437,0.825096
6,6,0.737475,0.612567,0.804894,0.861497,0.831066
7,7,0.741919,0.609447,0.802879,0.87344,0.835653
8,8,0.741919,0.609447,0.801802,0.87344,0.835389
9,9,0.723636,0.585116,0.791344,0.861141,0.823638
