# Course Section K. Daycurve 1-NN Classification with 10-fold CV

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from dtaidistance.dtw import distance as dtw_dist

# Filtering warnings
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter(action='ignore', category=FutureWarning)

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_validate
%matplotlib inline

In [2]:
import Fred as fred
import math

# Create own Distance Measures and import from Fred-Frechet

# Discrete Frechet
def disc_frechet(x, y):
    a = fred.Curve(x)
    b = fred.Curve(y)
    dist = fred.discrete_frechet(a, b)
    return dist.value

# Discrete Dynamic Time Warping
def disc_dtw(x, y):
    a = fred.Curve(x)
    b = fred.Curve(y)
    dist = fred.discrete_dynamic_time_warping(a, b)
    return dist.value

# Discrete Dynamic Time Warping with traversal constraint
def window_dtw(x, y, w=4):
    n = len(x)
    m = len(y)

    # maximal possible windowsize
    w = max(w, abs(n-m))

    # distancematrix filled with infinity
    dtw = np.full((n, m), math.inf)
    dtw[0, 0] = 0

    # all possible paths filled with zeros
    for i in range(1, n):
        for j in range(max(1, i-w), min(m, i+w)):
            cost = (x[i] - y[j])**2
            dtw[i, j] = cost + min(dtw[i-1, j],
                                   dtw[i, j-1],
                                   dtw[i-1, j-1])

    return math.sqrt(dtw[n-1, m-1])

# Discrete Frechet with traversal constraint
def window_disc_frechet(x, y, w=4):
    n = len(x)
    m = len(y)

    # maximal possible windowsize
    w = max(w, abs(n-m))

    # distancematrix filled with infinity
    dfre = np.full((n, m), math.inf)
    dfre[0, 0] = 0

    # all possible paths filled with zeros
    for i in range(1, n):
        for j in range(max(1, i-w), min(m, i+w)):
            cost = (x[i] - y[j])**2
            dfre[i, j] = max(cost, min(dfre[i-1, j],
                                       dfre[i, j-1],
                                       dfre[i-1, j-1]))

    return math.sqrt(dfre[n-1, m-1])

def window_df(x, y, w=4, p=2):
    n = len(x) 
    m = len(y) 

    # maximal possible windowsize
    w = max(w, abs(n-m))

    # distancematrix filled with infinity
    dtw = np.full((n, m), math.inf)
    dtw[0, 0] = 0
    
    for i in range(0,n):
        for j in range(max(0, i-w), min(m, i+w)):
            dtw[i, j] = 0
    
    
    # all possible paths filled with zeros
    for i in range(n):
        for j in range(max(0, i-w), min(m, i+w)):
            dtw[i,j] = abs(x[i] - y[j]) ** p
            if i > 0 or j > 0:
                dtw[i, j] = max(dtw[i,j], min(dtw[i-1, j] if i > 0 else math.inf,
                                              dtw[i, j-1] if j > 0 else math.inf,
                                              dtw[i-1, j-1] if (i > 0 and j > 0) else math.inf
                                             )
                               )
     
    return (dtw[n-1, m-1]) ** (1/p)

# k-greatest-distances-Mahnhattandistanz
def k_greatest_manhattan(x, y, w=6):
    dists = np.abs(x - y)
    return np.sum(np.sort(dists)[-w:][::-1])

# Earth mover's distance
from scipy.stats import wasserstein_distance

def emd(u, v):
    # create bin-vectors
    bin_u = [i for i in range(0, len(u))]
    bin_v = [i for i in range(0, len(v))]
    
    # normalize input vectors to unit-vectors
    u_norm = u / np.linalg.norm(u, ord=1)
    v_norm = v / np.linalg.norm(v, ord=1)
    
    return wasserstein_distance(bin_u, bin_v, u_norm, v_norm)

def dtw_ai(x, y, w=None):
    return dtw_dist(x, y, window=w, use_c=True)

In [3]:
ofEx = pd.read_csv("")

In [4]:
# -97 rausschmeissen
ofEx = ofEx.drop(ofEx[ofEx.final_result < 0].index)

In [5]:
y = ofEx['final_result']
X = ofEx.drop(['id', 'final_result', 'fem', 'fg', 'urm'], axis=1)

In [6]:
X.head()

Unnamed: 0,week0,week1,week2,week3,week4,week5,week6,week7,week8,week9,week10,week11
0,149,176,131,417,6,8,6,138,2,23,424,13
1,169,272,199,397,253,182,157,328,146,173,210,347
2,118,163,217,173,98,65,94,269,210,186,468,69
3,303,337,126,454,80,110,131,269,76,102,138,116
4,461,92,108,571,17,32,0,314,85,55,129,62


## A vs. C bis F

In [7]:
X = X.drop(X[(y > 6) & (y < 10)].index)
y = y[X.index]

In [8]:
X.head()

Unnamed: 0,week0,week1,week2,week3,week4,week5,week6,week7,week8,week9,week10,week11
1,169,272,199,397,253,182,157,328,146,173,210,347
8,84,81,58,309,31,54,84,365,40,85,275,24
10,0,187,97,171,34,90,23,157,133,70,348,163
11,92,169,174,323,132,122,110,219,55,158,167,13
12,135,269,518,917,111,105,224,688,122,152,472,200


In [9]:
len(X)

213

In [10]:
len(y)

213

In [11]:
# Logarithmieren
X_l = np.log1p(X) 

In [12]:
# Zu binärem Problem machen
y[y < 10] = 0
y[y > 0] = 1

In [13]:
y.value_counts()

0    119
1     94
Name: final_result, dtype: int64

In [14]:
distances = {'Manhattan':'cityblock', 'Euclidean': 'euclidean', 'Maximum': 'chebyshev',
             'DF': disc_frechet, 'DTW': dtw_ai, 'WDF': window_df, 'WDTW': window_dtw,
             'k_g_Manhattan': k_greatest_manhattan, 'EMD': emd
                     }
nachbar = 1
scoring = ['accuracy', 'roc_auc', 'precision', 'recall', 'f1']

df = pd.DataFrame(columns = ['distance', 'accuracy', 'roc_auc', 'precision', 'recall', 'f1'], 
                  index = [0,1,2,3,4,5,6,7,8])
 
df['distance'] = distances

row = 0
for key, dist in distances.items():
    knn = KNeighborsClassifier(n_neighbors=nachbar, metric=dist)
    scores = cross_validate(knn, X_l, y, scoring=scoring, n_jobs=-1,
                            cv=10, return_train_score=True)
    score = [key, np.mean(scores['test_accuracy']), np.mean(scores['test_roc_auc']), 
             np.mean(scores['test_precision']), np.mean(scores['test_recall']), 
             np.mean(scores['test_f1'])]

    df.loc[row, 0:] = score

    row = row + 1

print(df.to_latex(index=False))
df


\begin{tabular}{llllll}
\toprule
     distance &  accuracy &   roc\_auc & precision &    recall &        f1 \\
\midrule
    Manhattan &  0.713853 &  0.715606 &  0.664006 &  0.743333 &  0.693613 \\
    Euclidean &  0.676623 &  0.676717 &  0.626567 &  0.682222 &  0.645677 \\
      Maximum &  0.681169 &  0.681465 &  0.632105 &  0.681111 &  0.648211 \\
           DF &  0.606061 &  0.609697 &  0.552078 &      0.63 &  0.583342 \\
          DTW &   0.65303 &  0.655076 &  0.598432 &      0.67 &  0.628797 \\
          WDF &  0.629654 &  0.631364 &  0.577186 &      0.64 &  0.600463 \\
         WDTW &  0.681602 &  0.683409 &  0.624806 &  0.693333 &  0.651565 \\
k\_g\_Manhattan &  0.699784 &  0.702096 &  0.645996 &  0.732222 &  0.680155 \\
          EMD &  0.648268 &  0.650429 &  0.587478 &  0.662222 &   0.61596 \\
\bottomrule
\end{tabular}



Unnamed: 0,distance,accuracy,roc_auc,precision,recall,f1
0,Manhattan,0.713853,0.715606,0.664006,0.743333,0.693613
1,Euclidean,0.676623,0.676717,0.626567,0.682222,0.645677
2,Maximum,0.681169,0.681465,0.632105,0.681111,0.648211
3,DF,0.606061,0.609697,0.552078,0.63,0.583342
4,DTW,0.65303,0.655076,0.598432,0.67,0.628797
5,WDF,0.629654,0.631364,0.577186,0.64,0.600463
6,WDTW,0.681602,0.683409,0.624806,0.693333,0.651565
7,k_g_Manhattan,0.699784,0.702096,0.645996,0.732222,0.680155
8,EMD,0.648268,0.650429,0.587478,0.662222,0.61596


In [15]:
df = pd.DataFrame(columns = ['window', 'accuracy', 'roc_auc', 'precision', 'recall', 'f1'], 
                  index = [0,1,2,3,4,5,6,7,8,9,10,11])
 
df['window'] = [0,1,2,3,4,5,6,7,8,9,10,11]
scoring = ['accuracy', 'roc_auc', 'precision', 'recall', 'f1']

for i in range(0,12):
    knn = KNeighborsClassifier(n_neighbors=nachbar, metric=dtw_ai, metric_params={'w': i})
    scores = cross_validate(knn, X_l, y, scoring=scoring, n_jobs=-1,
                            cv=10, return_train_score=True)
    score = [i,np.mean(scores['test_accuracy']), np.mean(scores['test_roc_auc']), 
             np.mean(scores['test_precision']), np.mean(scores['test_recall']), 
             np.mean(scores['test_f1'])]
    df.loc[i, 0:] = score

print('WDTW')
print(df.to_latex(index=False))
df

WDTW
\begin{tabular}{rlllll}
\toprule
 window &  accuracy &   roc\_auc & precision &    recall &        f1 \\
\midrule
      0 &   0.65303 &  0.655076 &  0.598432 &      0.67 &  0.628797 \\
      1 &  0.676623 &  0.676717 &  0.626567 &  0.682222 &  0.645677 \\
      2 &  0.675758 &  0.677955 &  0.621818 &      0.69 &  0.651637 \\
      3 &  0.675974 &   0.67851 &  0.623757 &  0.691111 &  0.649628 \\
      4 &  0.671429 &  0.672576 &  0.616867 &      0.68 &   0.64314 \\
      5 &   0.65303 &  0.655076 &  0.598432 &      0.67 &  0.628797 \\
      6 &   0.65303 &  0.655076 &  0.598432 &      0.67 &  0.628797 \\
      7 &   0.65303 &  0.655076 &  0.598432 &      0.67 &  0.628797 \\
      8 &   0.65303 &  0.655076 &  0.598432 &      0.67 &  0.628797 \\
      9 &   0.65303 &  0.655076 &  0.598432 &      0.67 &  0.628797 \\
     10 &   0.65303 &  0.655076 &  0.598432 &      0.67 &  0.628797 \\
     11 &   0.65303 &  0.655076 &  0.598432 &      0.67 &  0.628797 \\
\bottomrule
\end{tabular}



Unnamed: 0,window,accuracy,roc_auc,precision,recall,f1
0,0,0.65303,0.655076,0.598432,0.67,0.628797
1,1,0.676623,0.676717,0.626567,0.682222,0.645677
2,2,0.675758,0.677955,0.621818,0.69,0.651637
3,3,0.675974,0.67851,0.623757,0.691111,0.649628
4,4,0.671429,0.672576,0.616867,0.68,0.64314
5,5,0.65303,0.655076,0.598432,0.67,0.628797
6,6,0.65303,0.655076,0.598432,0.67,0.628797
7,7,0.65303,0.655076,0.598432,0.67,0.628797
8,8,0.65303,0.655076,0.598432,0.67,0.628797
9,9,0.65303,0.655076,0.598432,0.67,0.628797


In [16]:
df = pd.DataFrame(columns = ['window', 'accuracy', 'roc_auc', 'precision', 'recall', 'f1'], 
                  index = [0,1,2,3,4,5,6,7,8,9,10,11])
 
df['window'] = [0,1,2,3,4,5,6,7,8,9,10,11]
scoring = ['accuracy', 'roc_auc', 'precision', 'recall', 'f1']

for i in range(0,12):
    knn = KNeighborsClassifier(n_neighbors=nachbar, metric=window_df, metric_params={'w': i})
    scores = cross_validate(knn, X_l, y, scoring=scoring, n_jobs=-1,
                            cv=10, return_train_score=True)
    score = [i,np.mean(scores['test_accuracy']), np.mean(scores['test_roc_auc']), 
             np.mean(scores['test_precision']), np.mean(scores['test_recall']), 
             np.mean(scores['test_f1'])]
    df.loc[i, 0:] = score

print('WDF')
print(df.to_latex(index=False))
df

WDF
\begin{tabular}{rlllll}
\toprule
 window &  accuracy &   roc\_auc & precision &    recall &        f1 \\
\midrule
      0 &  0.441126 &       0.5 &  0.441126 &       1.0 &  0.612016 \\
      1 &  0.615152 &  0.618586 &  0.563153 &  0.631111 &  0.587918 \\
      2 &  0.619913 &  0.623409 &  0.565809 &      0.64 &  0.592976 \\
      3 &  0.634199 &  0.633687 &  0.592741 &  0.618889 &  0.596665 \\
      4 &  0.629654 &  0.631364 &  0.577186 &      0.64 &  0.600463 \\
      5 &  0.605844 &  0.609141 &  0.552778 &  0.628889 &  0.583553 \\
      6 &  0.610823 &  0.614697 &  0.555412 &      0.64 &  0.590185 \\
      7 &  0.606061 &  0.609697 &  0.552078 &      0.63 &  0.583342 \\
      8 &  0.606061 &  0.609697 &  0.552078 &      0.63 &  0.583342 \\
      9 &  0.606061 &  0.609697 &  0.552078 &      0.63 &  0.583342 \\
     10 &  0.606061 &  0.609697 &  0.552078 &      0.63 &  0.583342 \\
     11 &  0.606061 &  0.609697 &  0.552078 &      0.63 &  0.583342 \\
\bottomrule
\end{tabular}



Unnamed: 0,window,accuracy,roc_auc,precision,recall,f1
0,0,0.441126,0.5,0.441126,1.0,0.612016
1,1,0.615152,0.618586,0.563153,0.631111,0.587918
2,2,0.619913,0.623409,0.565809,0.64,0.592976
3,3,0.634199,0.633687,0.592741,0.618889,0.596665
4,4,0.629654,0.631364,0.577186,0.64,0.600463
5,5,0.605844,0.609141,0.552778,0.628889,0.583553
6,6,0.610823,0.614697,0.555412,0.64,0.590185
7,7,0.606061,0.609697,0.552078,0.63,0.583342
8,8,0.606061,0.609697,0.552078,0.63,0.583342
9,9,0.606061,0.609697,0.552078,0.63,0.583342


In [29]:
nachbar=1
df = pd.DataFrame(columns = ['window', 'accuracy', 'roc_auc', 'precision', 'recall', 'f1'], 
                  index = [0,1,2,3,4,5,6,7,8,9,10,11])
 
df['window'] = [0,1,2,3,4,5,6,7,8,9,10,11]
scoring = ['accuracy', 'roc_auc', 'precision', 'recall', 'f1']

for i in range(0,12):
    knn = KNeighborsClassifier(n_neighbors=nachbar,metric=k_greatest_manhattan, metric_params={'w':i})
    scores = cross_validate(knn, X_l, y, scoring=scoring, n_jobs=-1,
                            cv=5, return_train_score=True)
    score = [i,np.mean(scores['test_accuracy']), np.mean(scores['test_roc_auc']), 
             np.mean(scores['test_precision']), np.mean(scores['test_recall']), 
             np.mean(scores['test_f1'])]
    df.loc[i, 0:] = score

print('kgMan')
print(df.to_latex(index=False))
df  

kgMan
\begin{tabular}{rlllll}
\toprule
 window &  accuracy &   roc\_auc & precision &    recall &        f1 \\
\midrule
      0 &  0.727796 &  0.728372 &  0.679144 &  0.733918 &  0.704155 \\
      1 &  0.680509 &  0.681223 &   0.63051 &  0.681287 &   0.65229 \\
      2 &  0.675858 &  0.678706 &   0.61866 &  0.702339 &  0.657086 \\
      3 &  0.685604 &  0.688428 &  0.629596 &   0.71345 &  0.668051 \\
      4 &  0.681063 &  0.686273 &  0.621886 &  0.734503 &  0.671656 \\
      5 &  0.681285 &   0.68572 &  0.623667 &  0.723977 &  0.668905 \\
      6 &  0.690587 &  0.692957 &  0.637079 &   0.71345 &   0.67206 \\
      7 &   0.69546 &   0.69639 &   0.65019 &  0.702924 &  0.672704 \\
      8 &  0.681174 &  0.682138 &  0.632377 &  0.691813 &  0.658966 \\
      9 &  0.685936 &   0.68539 &  0.639557 &  0.681287 &  0.658361 \\
     10 &  0.685825 &   0.68539 &  0.636728 &  0.681287 &  0.657602 \\
     11 &  0.699668 &  0.700083 &  0.649057 &  0.702339 &  0.673604 \\
     12 &  0.727796 &  0.728

Unnamed: 0,window,accuracy,roc_auc,precision,recall,f1
0,0,0.727796,0.728372,0.679144,0.733918,0.704155
1,1,0.680509,0.681223,0.63051,0.681287,0.65229
2,2,0.675858,0.678706,0.61866,0.702339,0.657086
3,3,0.685604,0.688428,0.629596,0.71345,0.668051
4,4,0.681063,0.686273,0.621886,0.734503,0.671656
5,5,0.681285,0.68572,0.623667,0.723977,0.668905
6,6,0.690587,0.692957,0.637079,0.71345,0.67206
7,7,0.69546,0.69639,0.65019,0.702924,0.672704
8,8,0.681174,0.682138,0.632377,0.691813,0.658966
9,9,0.685936,0.68539,0.639557,0.681287,0.658361
