# Course Section K2. Daycurve 1-NN Classification with 10-fold CV

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from dtaidistance.dtw import distance as dtw_dist

# Filtering warnings
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter(action='ignore', category=FutureWarning)

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_validate
%matplotlib inline

In [2]:
import Fred as fred
import math

# Create own Distance Measures and import from Fred-Frechet

# Discrete Frechet
def disc_frechet(x, y):
    a = fred.Curve(x)
    b = fred.Curve(y)
    dist = fred.discrete_frechet(a, b)
    return dist.value

# Discrete Dynamic Time Warping
def disc_dtw(x, y):
    a = fred.Curve(x)
    b = fred.Curve(y)
    dist = fred.discrete_dynamic_time_warping(a, b)
    return dist.value

# Discrete Dynamic Time Warping with traversal constraint
def window_dtw(x, y, w=4):
    n = len(x)
    m = len(y)

    # maximal possible windowsize
    w = max(w, abs(n-m))

    # distancematrix filled with infinity
    dtw = np.full((n, m), math.inf)
    dtw[0, 0] = 0

    # all possible paths filled with zeros
    for i in range(1, n):
        for j in range(max(1, i-w), min(m, i+w)):
            cost = (x[i] - y[j])**2
            dtw[i, j] = cost + min(dtw[i-1, j],
                                   dtw[i, j-1],
                                   dtw[i-1, j-1])

    return math.sqrt(dtw[n-1, m-1])

# Discrete Frechet with traversal constraint
def window_disc_frechet(x, y, w=4):
    n = len(x)
    m = len(y)

    # maximal possible windowsize
    w = max(w, abs(n-m))

    # distancematrix filled with infinity
    dfre = np.full((n, m), math.inf)
    dfre[0, 0] = 0

    # all possible paths filled with zeros
    for i in range(1, n):
        for j in range(max(1, i-w), min(m, i+w)):
            cost = (x[i] - y[j])**2
            dfre[i, j] = max(cost, min(dfre[i-1, j],
                                       dfre[i, j-1],
                                       dfre[i-1, j-1]))

    return math.sqrt(dfre[n-1, m-1])

def window_df(x, y, w=4, p=2):
    n = len(x) 
    m = len(y) 

    # maximal possible windowsize
    w = max(w, abs(n-m))

    # distancematrix filled with infinity
    dtw = np.full((n, m), math.inf)
    dtw[0, 0] = 0
    
    for i in range(0,n):
        for j in range(max(0, i-w), min(m, i+w)):
            dtw[i, j] = 0
    
    
    # all possible paths filled with zeros
    for i in range(n):
        for j in range(max(0, i-w), min(m, i+w)):
            dtw[i,j] = abs(x[i] - y[j]) ** p
            if i > 0 or j > 0:
                dtw[i, j] = max(dtw[i,j], min(dtw[i-1, j] if i > 0 else math.inf,
                                              dtw[i, j-1] if j > 0 else math.inf,
                                              dtw[i-1, j-1] if (i > 0 and j > 0) else math.inf
                                             )
                               )
     
    return (dtw[n-1, m-1]) ** (1/p)

# k-greatest-distances-Mahnhattandistanz
def k_greatest_manhattan(x, y, w=6):
    dists = np.abs(x - y)
    return np.sum(np.sort(dists)[-w:][::-1])

# Earth mover's distance
from scipy.stats import wasserstein_distance

def emd(u, v):
    # create bin-vectors
    bin_u = [i for i in range(0, len(u))]
    bin_v = [i for i in range(0, len(v))]
    
    # normalize input vectors to unit-vectors
    u_norm = u / np.linalg.norm(u, ord=1)
    v_norm = v / np.linalg.norm(v, ord=1)
    
    return wasserstein_distance(bin_u, bin_v, u_norm, v_norm)

def dtw_ai(x, y, w=None):
    return dtw_dist(x, y, window=w, use_c=True)

In [3]:
ofEx = pd.read_csv("")

In [4]:
# -97 rausschmeissen
ofEx = ofEx.drop(ofEx[ofEx.final_result < 0].index)

In [5]:
y = ofEx['final_result']
X = ofEx.drop(['id', 'final_result', 'fem', 'fg', 'urm'], axis=1)

In [6]:
X.head()

Unnamed: 0,week0,week1,week2,week3,week4,week5,week6,week7,week8,week9,week10,week11
0,116,313,92,47,122,181,113,79,105,227,174,58
1,125,341,146,84,347,127,95,79,34,86,229,145
2,289,269,174,110,55,26,126,107,60,252,225,297
3,0,29,154,58,338,121,146,38,60,180,147,52
5,58,122,80,55,134,64,109,61,122,85,114,172


## A vs. C bis F

In [7]:
X = X.drop(X[(y > 6) & (y < 10)].index)
y = y[X.index]

In [8]:
X.head()

Unnamed: 0,week0,week1,week2,week3,week4,week5,week6,week7,week8,week9,week10,week11
0,116,313,92,47,122,181,113,79,105,227,174,58
1,125,341,146,84,347,127,95,79,34,86,229,145
2,289,269,174,110,55,26,126,107,60,252,225,297
5,58,122,80,55,134,64,109,61,122,85,114,172
7,132,204,147,202,154,177,155,126,60,247,345,137


In [9]:
len(X)

205

In [10]:
len(y)

205

In [11]:
# Logarithmieren
X_l = np.log1p(X) 

In [12]:
# Zu binärem Problem machen
y[y < 10] = 0
y[y > 0] = 1

In [13]:
y.value_counts()

0    103
1    102
Name: final_result, dtype: int64

In [14]:
distances = {'Manhattan':'cityblock', 'Euclidean': 'euclidean', 'Maximum': 'chebyshev',
             'DF': disc_frechet, 'DTW': dtw_ai, 'WDF': window_df, 'WDTW': window_dtw,
             'k_g_Manhattan': k_greatest_manhattan, 'EMD': emd
                     }
nachbar = 1
scoring = ['accuracy', 'roc_auc', 'precision', 'recall', 'f1']

df = pd.DataFrame(columns = ['distance', 'accuracy', 'roc_auc', 'precision', 'recall', 'f1'], 
                  index = [0,1,2,3,4,5,6,7,8])
 
df['distance'] = distances

row = 0
for key, dist in distances.items():
    knn = KNeighborsClassifier(n_neighbors=nachbar, metric=dist)
    scores = cross_validate(knn, X_l, y, scoring=scoring, n_jobs=-1,
                            cv=10, return_train_score=True)
    score = [key, np.mean(scores['test_accuracy']), np.mean(scores['test_roc_auc']), 
             np.mean(scores['test_precision']), np.mean(scores['test_recall']), 
             np.mean(scores['test_f1'])]

    df.loc[row, 0:] = score

    row = row + 1

print(df.to_latex(index=False))
df


\begin{tabular}{llllll}
\toprule
     distance &  accuracy &   roc\_auc & precision &    recall &        f1 \\
\midrule
    Manhattan &  0.610476 &  0.610909 &  0.595459 &  0.678182 &  0.631074 \\
    Euclidean &  0.630952 &  0.630455 &  0.624718 &  0.667273 &  0.641255 \\
      Maximum &     0.645 &     0.645 &  0.633452 &  0.685455 &  0.650895 \\
           DF &  0.615238 &  0.615909 &  0.600746 &  0.628182 &  0.608223 \\
          DTW &  0.625714 &  0.626818 &  0.625581 &  0.610909 &  0.615343 \\
          WDF &  0.630476 &  0.630909 &  0.627152 &  0.628182 &  0.620097 \\
         WDTW &  0.669286 &  0.671364 &  0.667544 &  0.698182 &  0.675041 \\
k\_g\_Manhattan &   0.60619 &  0.606364 &  0.594079 &  0.658182 &  0.620978 \\
          EMD &  0.643571 &  0.645455 &   0.63343 &  0.709091 &  0.658536 \\
\bottomrule
\end{tabular}



Unnamed: 0,distance,accuracy,roc_auc,precision,recall,f1
0,Manhattan,0.610476,0.610909,0.595459,0.678182,0.631074
1,Euclidean,0.630952,0.630455,0.624718,0.667273,0.641255
2,Maximum,0.645,0.645,0.633452,0.685455,0.650895
3,DF,0.615238,0.615909,0.600746,0.628182,0.608223
4,DTW,0.625714,0.626818,0.625581,0.610909,0.615343
5,WDF,0.630476,0.630909,0.627152,0.628182,0.620097
6,WDTW,0.669286,0.671364,0.667544,0.698182,0.675041
7,k_g_Manhattan,0.60619,0.606364,0.594079,0.658182,0.620978
8,EMD,0.643571,0.645455,0.63343,0.709091,0.658536


In [15]:
df = pd.DataFrame(columns = ['window', 'accuracy', 'roc_auc', 'precision', 'recall', 'f1'], 
                  index = [0,1,2,3,4,5,6,7,8,9,10,11])
 
df['window'] = [0,1,2,3,4,5,6,7,8,9,10,11]
scoring = ['accuracy', 'roc_auc', 'precision', 'recall', 'f1']

for i in range(0,12):
    knn = KNeighborsClassifier(n_neighbors=nachbar, metric=dtw_ai, metric_params={'w': i})
    scores = cross_validate(knn, X_l, y, scoring=scoring, n_jobs=-1,
                            cv=10, return_train_score=True)
    score = [i,np.mean(scores['test_accuracy']), np.mean(scores['test_roc_auc']), 
             np.mean(scores['test_precision']), np.mean(scores['test_recall']), 
             np.mean(scores['test_f1'])]
    df.loc[i, 0:] = score

print('WDTW')
print(df.to_latex(index=False))
df

WDTW
\begin{tabular}{rlllll}
\toprule
 window &  accuracy &   roc\_auc & precision &    recall &        f1 \\
\midrule
      0 &  0.625714 &  0.626818 &  0.625581 &  0.610909 &  0.615343 \\
      1 &  0.630952 &  0.630455 &  0.624718 &  0.667273 &  0.641255 \\
      2 &  0.669524 &  0.670455 &  0.664661 &  0.698182 &  0.676106 \\
      3 &  0.630476 &  0.632273 &  0.637505 &  0.630909 &  0.628247 \\
      4 &      0.64 &  0.640909 &  0.650025 &      0.62 &  0.630798 \\
      5 &  0.630476 &  0.631364 &  0.632525 &  0.610909 &  0.618267 \\
      6 &  0.625714 &  0.626818 &  0.625581 &  0.610909 &  0.615343 \\
      7 &  0.625714 &  0.626818 &  0.625581 &  0.610909 &  0.615343 \\
      8 &  0.625714 &  0.626818 &  0.625581 &  0.610909 &  0.615343 \\
      9 &  0.625714 &  0.626818 &  0.625581 &  0.610909 &  0.615343 \\
     10 &  0.625714 &  0.626818 &  0.625581 &  0.610909 &  0.615343 \\
     11 &  0.625714 &  0.626818 &  0.625581 &  0.610909 &  0.615343 \\
\bottomrule
\end{tabular}



Unnamed: 0,window,accuracy,roc_auc,precision,recall,f1
0,0,0.625714,0.626818,0.625581,0.610909,0.615343
1,1,0.630952,0.630455,0.624718,0.667273,0.641255
2,2,0.669524,0.670455,0.664661,0.698182,0.676106
3,3,0.630476,0.632273,0.637505,0.630909,0.628247
4,4,0.64,0.640909,0.650025,0.62,0.630798
5,5,0.630476,0.631364,0.632525,0.610909,0.618267
6,6,0.625714,0.626818,0.625581,0.610909,0.615343
7,7,0.625714,0.626818,0.625581,0.610909,0.615343
8,8,0.625714,0.626818,0.625581,0.610909,0.615343
9,9,0.625714,0.626818,0.625581,0.610909,0.615343


In [16]:
df = pd.DataFrame(columns = ['window', 'accuracy', 'roc_auc', 'precision', 'recall', 'f1'], 
                  index = [0,1,2,3,4,5,6,7,8,9,10,11])
 
df['window'] = [0,1,2,3,4,5,6,7,8,9,10,11]
scoring = ['accuracy', 'roc_auc', 'precision', 'recall', 'f1']

for i in range(0,12):
    knn = KNeighborsClassifier(n_neighbors=nachbar, metric=window_df, metric_params={'w': i})
    scores = cross_validate(knn, X_l, y, scoring=scoring, n_jobs=-1,
                            cv=10, return_train_score=True)
    score = [i,np.mean(scores['test_accuracy']), np.mean(scores['test_roc_auc']), 
             np.mean(scores['test_precision']), np.mean(scores['test_recall']), 
             np.mean(scores['test_f1'])]
    df.loc[i, 0:] = score

print('WDF')
print(df.to_latex(index=False))
df

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

WDF
\begin{tabular}{rlllll}
\toprule
 window &  accuracy &   roc\_auc & precision &    recall &        f1 \\
\midrule
      0 &  0.502381 &       0.5 &       0.0 &       0.0 &       0.0 \\
      1 &  0.669286 &  0.670455 &  0.650615 &  0.719091 &  0.676996 \\
      2 &  0.664286 &  0.665455 &  0.664102 &      0.66 &  0.653676 \\
      3 &  0.630476 &  0.631818 &  0.630342 &  0.629091 &  0.619958 \\
      4 &  0.630476 &  0.630909 &  0.627152 &  0.628182 &  0.620097 \\
      5 &  0.630238 &  0.630455 &  0.622733 &  0.628182 &  0.619034 \\
      6 &  0.625238 &  0.625909 &  0.617452 &  0.628182 &  0.615175 \\
      7 &  0.620238 &  0.620909 &  0.609674 &  0.628182 &  0.611491 \\
      8 &  0.615238 &  0.615909 &  0.600746 &  0.628182 &  0.608223 \\
      9 &  0.615238 &  0.615909 &  0.600746 &  0.628182 &  0.608223 \\
     10 &  0.615238 &  0.615909 &  0.600746 &  0.628182 &  0.608223 \\
     11 &  0.615238 &  0.615909 &  0.600746 &  0.628182 &  0.608223 \\
\bottomrule
\end{tabular}



Unnamed: 0,window,accuracy,roc_auc,precision,recall,f1
0,0,0.502381,0.5,0.0,0.0,0.0
1,1,0.669286,0.670455,0.650615,0.719091,0.676996
2,2,0.664286,0.665455,0.664102,0.66,0.653676
3,3,0.630476,0.631818,0.630342,0.629091,0.619958
4,4,0.630476,0.630909,0.627152,0.628182,0.620097
5,5,0.630238,0.630455,0.622733,0.628182,0.619034
6,6,0.625238,0.625909,0.617452,0.628182,0.615175
7,7,0.620238,0.620909,0.609674,0.628182,0.611491
8,8,0.615238,0.615909,0.600746,0.628182,0.608223
9,9,0.615238,0.615909,0.600746,0.628182,0.608223


In [17]:
nachbar=1
df = pd.DataFrame(columns = ['window', 'accuracy', 'roc_auc', 'precision', 'recall', 'f1'], 
                  index = [0,1,2,3,4,5,6,7,8,9,10,11])
 
df['window'] = [0,1,2,3,4,5,6,7,8,9,10,11]
scoring = ['accuracy', 'roc_auc', 'precision', 'recall', 'f1']

for i in range(0,12):
    knn = KNeighborsClassifier(n_neighbors=nachbar,metric=k_greatest_manhattan, metric_params={'w':i})
    scores = cross_validate(knn, X_l, y, scoring=scoring, n_jobs=-1,
                            cv=5, return_train_score=True)
    score = [i,np.mean(scores['test_accuracy']), np.mean(scores['test_roc_auc']), 
             np.mean(scores['test_precision']), np.mean(scores['test_recall']), 
             np.mean(scores['test_f1'])]
    df.loc[i, 0:] = score

print('kgMan')
print(df.to_latex(index=False))
df  

kgMan
\begin{tabular}{rlllll}
\toprule
 window &  accuracy &   roc\_auc & precision &    recall &        f1 \\
\midrule
      0 &  0.629268 &  0.629524 &  0.618925 &  0.665238 &  0.640399 \\
      1 &  0.629268 &      0.63 &  0.621775 &  0.667143 &  0.639682 \\
      2 &  0.658537 &  0.659048 &   0.64677 &  0.695714 &  0.668884 \\
      3 &  0.668293 &  0.669048 &  0.655753 &  0.715714 &  0.681438 \\
      4 &  0.653659 &  0.654524 &  0.639837 &   0.70619 &  0.670402 \\
      5 &   0.62439 &  0.625476 &   0.61865 &  0.657143 &   0.63586 \\
      6 &  0.619512 &  0.620714 &  0.614106 &   0.66619 &  0.636778 \\
      7 &   0.62439 &  0.624762 &  0.613043 &  0.665714 &  0.638095 \\
      8 &  0.629268 &  0.629524 &  0.618746 &  0.665714 &  0.641126 \\
      9 &  0.653659 &   0.65381 &  0.640749 &  0.695238 &  0.666404 \\
     10 &  0.634146 &  0.634524 &  0.624534 &  0.675238 &  0.647951 \\
     11 &  0.629268 &  0.629524 &  0.619697 &  0.665238 &  0.640945 \\
\bottomrule
\end{tabular}



Unnamed: 0,window,accuracy,roc_auc,precision,recall,f1
0,0,0.629268,0.629524,0.618925,0.665238,0.640399
1,1,0.629268,0.63,0.621775,0.667143,0.639682
2,2,0.658537,0.659048,0.64677,0.695714,0.668884
3,3,0.668293,0.669048,0.655753,0.715714,0.681438
4,4,0.653659,0.654524,0.639837,0.70619,0.670402
5,5,0.62439,0.625476,0.61865,0.657143,0.63586
6,6,0.619512,0.620714,0.614106,0.66619,0.636778
7,7,0.62439,0.624762,0.613043,0.665714,0.638095
8,8,0.629268,0.629524,0.618746,0.665714,0.641126
9,9,0.653659,0.65381,0.640749,0.695238,0.666404
