# Course Section K4. Weekcurve 1-NN Classification with 10-fold CV

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from dtaidistance.dtw import distance as dtw_dist

# Filtering warnings
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter(action='ignore', category=FutureWarning)

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_validate
%matplotlib inline

In [2]:
import Fred as fred
import math

# Create own Distance Measures and import from Fred-Frechet

# Discrete Frechet
def disc_frechet(x, y):
    a = fred.Curve(x)
    b = fred.Curve(y)
    dist = fred.discrete_frechet(a, b)
    return dist.value

# Discrete Dynamic Time Warping
def disc_dtw(x, y):
    a = fred.Curve(x)
    b = fred.Curve(y)
    dist = fred.discrete_dynamic_time_warping(a, b)
    return dist.value

# Discrete Dynamic Time Warping with traversal constraint
def window_dtw(x, y, w=4):
    n = len(x)
    m = len(y)

    # maximal possible windowsize
    w = max(w, abs(n-m))

    # distancematrix filled with infinity
    dtw = np.full((n, m), math.inf)
    dtw[0, 0] = 0

    # all possible paths filled with zeros
    for i in range(1, n):
        for j in range(max(1, i-w), min(m, i+w)):
            cost = (x[i] - y[j])**2
            dtw[i, j] = cost + min(dtw[i-1, j],
                                   dtw[i, j-1],
                                   dtw[i-1, j-1])

    return math.sqrt(dtw[n-1, m-1])

# Discrete Frechet with traversal constraint
def window_disc_frechet(x, y, w=4):
    n = len(x)
    m = len(y)

    # maximal possible windowsize
    w = max(w, abs(n-m))

    # distancematrix filled with infinity
    dfre = np.full((n, m), math.inf)
    dfre[0, 0] = 0

    # all possible paths filled with zeros
    for i in range(1, n):
        for j in range(max(1, i-w), min(m, i+w)):
            cost = (x[i] - y[j])**2
            dfre[i, j] = max(cost, min(dfre[i-1, j],
                                       dfre[i, j-1],
                                       dfre[i-1, j-1]))

    return math.sqrt(dfre[n-1, m-1])

def window_df(x, y, w=4, p=2):
    n = len(x) 
    m = len(y) 

    # maximal possible windowsize
    w = max(w, abs(n-m))

    # distancematrix filled with infinity
    dtw = np.full((n, m), math.inf)
    dtw[0, 0] = 0
    
    for i in range(0,n):
        for j in range(max(0, i-w), min(m, i+w)):
            dtw[i, j] = 0
    
    
    # all possible paths filled with zeros
    for i in range(n):
        for j in range(max(0, i-w), min(m, i+w)):
            dtw[i,j] = abs(x[i] - y[j]) ** p
            if i > 0 or j > 0:
                dtw[i, j] = max(dtw[i,j], min(dtw[i-1, j] if i > 0 else math.inf,
                                              dtw[i, j-1] if j > 0 else math.inf,
                                              dtw[i-1, j-1] if (i > 0 and j > 0) else math.inf
                                             )
                               )
     
    return (dtw[n-1, m-1]) ** (1/p)

# k-greatest-distances-Mahnhattandistanz
def k_greatest_manhattan(x, y, w=6):
    dists = np.abs(x - y)
    return np.sum(np.sort(dists)[-w:][::-1])

# Earth mover's distance
from scipy.stats import wasserstein_distance

def emd(u, v):
    # create bin-vectors
    bin_u = [i for i in range(0, len(u))]
    bin_v = [i for i in range(0, len(v))]
    
    # normalize input vectors to unit-vectors
    u_norm = u / np.linalg.norm(u, ord=1)
    v_norm = v / np.linalg.norm(v, ord=1)
    
    return wasserstein_distance(bin_u, bin_v, u_norm, v_norm)

def dtw_ai(x, y, w=None):
    return dtw_dist(x, y, window=w, use_c=True)

In [3]:
ofEx = pd.read_csv("")

In [4]:
# -97 rausschmeissen
ofEx = ofEx.drop(ofEx[ofEx.final_result < 0].index)

In [5]:
y = ofEx['final_result']
X = ofEx.drop(['id', 'final_result', 'fem', 'fg', 'urm'], axis=1)

In [6]:
X.head()

Unnamed: 0,week0,week1,week2,week3,week4,week5,week6,week7,week8,week9,week10,week11
0,0,96,38,333,22,0,0,228,0,0,0,0
1,218,270,394,860,395,239,179,497,191,112,546,335
2,274,222,204,351,91,90,133,213,67,74,75,13
3,101,157,155,208,104,130,60,191,107,14,291,25
4,116,43,0,67,0,0,0,0,0,0,0,0


## A vs. C bis F

In [7]:
X = X.drop(X[(y > 6) & (y < 10)].index)
y = y[X.index]

In [8]:
X.head()

Unnamed: 0,week0,week1,week2,week3,week4,week5,week6,week7,week8,week9,week10,week11
0,0,96,38,333,22,0,0,228,0,0,0,0
1,218,270,394,860,395,239,179,497,191,112,546,335
2,274,222,204,351,91,90,133,213,67,74,75,13
3,101,157,155,208,104,130,60,191,107,14,291,25
4,116,43,0,67,0,0,0,0,0,0,0,0


In [9]:
len(X)

224

In [10]:
len(y)

224

In [11]:
# Logarithmieren
X_l = np.log1p(X) 

In [12]:
# Zu binärem Problem machen
y[y < 10] = 0
y[y > 0] = 1

In [13]:
y.value_counts()

1    122
0    102
Name: final_result, dtype: int64

In [14]:
distances = {'Manhattan':'cityblock', 'Euclidean': 'euclidean', 'Maximum': 'chebyshev',
             'DF': disc_frechet, 'DTW': dtw_ai, 'WDF': window_df, 'WDTW': window_dtw,
             'k_g_Manhattan': k_greatest_manhattan, 'EMD': emd
                     }
nachbar = 1
scoring = ['accuracy', 'roc_auc', 'precision', 'recall', 'f1']

df = pd.DataFrame(columns = ['distance', 'accuracy', 'roc_auc', 'precision', 'recall', 'f1'], 
                  index = [0,1,2,3,4,5,6,7,8])
 
df['distance'] = distances

row = 0
for key, dist in distances.items():
    knn = KNeighborsClassifier(n_neighbors=nachbar, metric=dist)
    scores = cross_validate(knn, X_l, y, scoring=scoring, n_jobs=-1,
                            cv=10, return_train_score=True)
    score = [key, np.mean(scores['test_accuracy']), np.mean(scores['test_roc_auc']), 
             np.mean(scores['test_precision']), np.mean(scores['test_recall']), 
             np.mean(scores['test_f1'])]

    df.loc[row, 0:] = score

    row = row + 1

print(df.to_latex(index=False))
df


\begin{tabular}{llllll}
\toprule
     distance &  accuracy &   roc\_auc & precision &    recall &        f1 \\
\midrule
    Manhattan &  0.668972 &  0.665629 &  0.694058 &  0.723077 &  0.700577 \\
    Euclidean &  0.654941 &  0.652547 &  0.687901 &  0.687821 &  0.681203 \\
      Maximum &  0.620158 &  0.621783 &  0.679131 &  0.615385 &  0.639163 \\
           DF &  0.601779 &  0.598887 &  0.639876 &   0.63141 &  0.631979 \\
          DTW &  0.633597 &  0.632104 &  0.676351 &  0.666026 &  0.660492 \\
          WDF &  0.588538 &  0.585361 &    0.6241 &  0.624359 &  0.619869 \\
         WDTW &   0.61996 &  0.616597 &  0.660698 &  0.664103 &  0.650809 \\
k\_g\_Manhattan &  0.633399 &  0.630309 &  0.667738 &  0.672436 &  0.663341 \\
          EMD &  0.682609 &  0.679988 &  0.719899 &  0.721795 &  0.710095 \\
\bottomrule
\end{tabular}



Unnamed: 0,distance,accuracy,roc_auc,precision,recall,f1
0,Manhattan,0.668972,0.665629,0.694058,0.723077,0.700577
1,Euclidean,0.654941,0.652547,0.687901,0.687821,0.681203
2,Maximum,0.620158,0.621783,0.679131,0.615385,0.639163
3,DF,0.601779,0.598887,0.639876,0.63141,0.631979
4,DTW,0.633597,0.632104,0.676351,0.666026,0.660492
5,WDF,0.588538,0.585361,0.6241,0.624359,0.619869
6,WDTW,0.61996,0.616597,0.660698,0.664103,0.650809
7,k_g_Manhattan,0.633399,0.630309,0.667738,0.672436,0.663341
8,EMD,0.682609,0.679988,0.719899,0.721795,0.710095


In [15]:
df = pd.DataFrame(columns = ['window', 'accuracy', 'roc_auc', 'precision', 'recall', 'f1'], 
                  index = [0,1,2,3,4,5,6,7,8,9,10,11])
 
df['window'] = [0,1,2,3,4,5,6,7,8,9,10,11]
scoring = ['accuracy', 'roc_auc', 'precision', 'recall', 'f1']

for i in range(0,12):
    knn = KNeighborsClassifier(n_neighbors=nachbar, metric=dtw_ai, metric_params={'w': i})
    scores = cross_validate(knn, X_l, y, scoring=scoring, n_jobs=-1,
                            cv=10, return_train_score=True)
    score = [i,np.mean(scores['test_accuracy']), np.mean(scores['test_roc_auc']), 
             np.mean(scores['test_precision']), np.mean(scores['test_recall']), 
             np.mean(scores['test_f1'])]
    df.loc[i, 0:] = score

print('WDTW')
print(df.to_latex(index=False))
df

WDTW
\begin{tabular}{rlllll}
\toprule
 window &  accuracy &   roc\_auc & precision &    recall &        f1 \\
\midrule
      0 &  0.633597 &  0.632104 &  0.676351 &  0.666026 &  0.660492 \\
      1 &  0.654941 &  0.652547 &  0.687901 &  0.687821 &  0.681203 \\
      2 &  0.673518 &  0.670629 &  0.706536 &  0.723077 &  0.704737 \\
      3 &  0.642292 &   0.64127 &  0.684968 &  0.674359 &  0.669515 \\
      4 &  0.625099 &  0.624866 &  0.671436 &  0.650641 &  0.649156 \\
      5 &  0.633597 &  0.632104 &  0.676351 &  0.666026 &  0.660492 \\
      6 &  0.633597 &  0.632104 &  0.676351 &  0.666026 &  0.660492 \\
      7 &  0.633597 &  0.632104 &  0.676351 &  0.666026 &  0.660492 \\
      8 &  0.633597 &  0.632104 &  0.676351 &  0.666026 &  0.660492 \\
      9 &  0.633597 &  0.632104 &  0.676351 &  0.666026 &  0.660492 \\
     10 &  0.633597 &  0.632104 &  0.676351 &  0.666026 &  0.660492 \\
     11 &  0.633597 &  0.632104 &  0.676351 &  0.666026 &  0.660492 \\
\bottomrule
\end{tabular}



Unnamed: 0,window,accuracy,roc_auc,precision,recall,f1
0,0,0.633597,0.632104,0.676351,0.666026,0.660492
1,1,0.654941,0.652547,0.687901,0.687821,0.681203
2,2,0.673518,0.670629,0.706536,0.723077,0.704737
3,3,0.642292,0.64127,0.684968,0.674359,0.669515
4,4,0.625099,0.624866,0.671436,0.650641,0.649156
5,5,0.633597,0.632104,0.676351,0.666026,0.660492
6,6,0.633597,0.632104,0.676351,0.666026,0.660492
7,7,0.633597,0.632104,0.676351,0.666026,0.660492
8,8,0.633597,0.632104,0.676351,0.666026,0.660492
9,9,0.633597,0.632104,0.676351,0.666026,0.660492


In [16]:
df = pd.DataFrame(columns = ['window', 'accuracy', 'roc_auc', 'precision', 'recall', 'f1'], 
                  index = [0,1,2,3,4,5,6,7,8,9,10,11])
 
df['window'] = [0,1,2,3,4,5,6,7,8,9,10,11]
scoring = ['accuracy', 'roc_auc', 'precision', 'recall', 'f1']

for i in range(0,12):
    knn = KNeighborsClassifier(n_neighbors=nachbar, metric=window_df, metric_params={'w': i})
    scores = cross_validate(knn, X_l, y, scoring=scoring, n_jobs=-1,
                            cv=10, return_train_score=True)
    score = [i,np.mean(scores['test_accuracy']), np.mean(scores['test_roc_auc']), 
             np.mean(scores['test_precision']), np.mean(scores['test_recall']), 
             np.mean(scores['test_f1'])]
    df.loc[i, 0:] = score

print('WDF')
print(df.to_latex(index=False))
df

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

WDF
\begin{tabular}{rlllll}
\toprule
 window &  accuracy &   roc\_auc & precision &    recall &        f1 \\
\midrule
      0 &  0.455336 &       0.5 &       0.0 &       0.0 &       0.0 \\
      1 &  0.629249 &  0.624476 &  0.661888 &  0.680769 &  0.663306 \\
      2 &  0.628854 &    0.6269 &  0.663124 &  0.647436 &  0.652321 \\
      3 &  0.597628 &  0.596649 &  0.633448 &  0.616026 &  0.620654 \\
      4 &  0.588538 &  0.585361 &    0.6241 &  0.624359 &  0.619869 \\
      5 &  0.592885 &  0.589207 &  0.627422 &  0.632051 &  0.625565 \\
      6 &  0.597431 &  0.594207 &  0.633482 &  0.632051 &  0.628463 \\
      7 &  0.601779 &  0.598887 &  0.639876 &   0.63141 &  0.631979 \\
      8 &  0.601779 &  0.598887 &  0.639876 &   0.63141 &  0.631979 \\
      9 &  0.601779 &  0.598887 &  0.639876 &   0.63141 &  0.631979 \\
     10 &  0.601779 &  0.598887 &  0.639876 &   0.63141 &  0.631979 \\
     11 &  0.601779 &  0.598887 &  0.639876 &   0.63141 &  0.631979 \\
\bottomrule
\end{tabular}



Unnamed: 0,window,accuracy,roc_auc,precision,recall,f1
0,0,0.455336,0.5,0.0,0.0,0.0
1,1,0.629249,0.624476,0.661888,0.680769,0.663306
2,2,0.628854,0.6269,0.663124,0.647436,0.652321
3,3,0.597628,0.596649,0.633448,0.616026,0.620654
4,4,0.588538,0.585361,0.6241,0.624359,0.619869
5,5,0.592885,0.589207,0.627422,0.632051,0.625565
6,6,0.597431,0.594207,0.633482,0.632051,0.628463
7,7,0.601779,0.598887,0.639876,0.63141,0.631979
8,8,0.601779,0.598887,0.639876,0.63141,0.631979
9,9,0.601779,0.598887,0.639876,0.63141,0.631979


In [17]:
nachbar=1
df = pd.DataFrame(columns = ['window', 'accuracy', 'roc_auc', 'precision', 'recall', 'f1'], 
                  index = [0,1,2,3,4,5,6,7,8,9,10,11])
 
df['window'] = [0,1,2,3,4,5,6,7,8,9,10,11]
scoring = ['accuracy', 'roc_auc', 'precision', 'recall', 'f1']

for i in range(0,12):
    knn = KNeighborsClassifier(n_neighbors=nachbar,metric=k_greatest_manhattan, metric_params={'w':i})
    scores = cross_validate(knn, X_l, y, scoring=scoring, n_jobs=-1,
                            cv=5, return_train_score=True)
    score = [i,np.mean(scores['test_accuracy']), np.mean(scores['test_roc_auc']), 
             np.mean(scores['test_precision']), np.mean(scores['test_recall']), 
             np.mean(scores['test_f1'])]
    df.loc[i, 0:] = score

print('kgMan')
print(df.to_latex(index=False))
df  

kgMan
\begin{tabular}{rlllll}
\toprule
 window &  accuracy &   roc\_auc & precision &    recall &        f1 \\
\midrule
      0 &  0.669596 &  0.662476 &  0.682745 &  0.737333 &   0.70722 \\
      1 &   0.60697 &  0.603952 &  0.644406 &  0.639333 &  0.640259 \\
      2 &  0.642525 &   0.64031 &  0.674304 &     0.673 &  0.672028 \\
      3 &  0.629091 &   0.62381 &  0.651524 &      0.69 &  0.667572 \\
      4 &  0.633737 &  0.628048 &  0.653545 &     0.698 &  0.673448 \\
      5 &  0.629394 &  0.624119 &  0.651574 &  0.689667 &  0.667043 \\
      6 &  0.633737 &  0.627952 &  0.653545 &  0.697333 &  0.673248 \\
      7 &  0.647172 &  0.639048 &  0.659362 &      0.73 &  0.692265 \\
      8 &  0.665152 &  0.657881 &  0.676312 &  0.737667 &  0.704812 \\
      9 &  0.642929 &   0.63419 &  0.654937 &  0.729333 &  0.689107 \\
     10 &  0.633939 &  0.624714 &   0.64897 &  0.721333 &  0.680643 \\
     11 &  0.656162 &  0.648714 &   0.67097 &  0.729333 &  0.696523 \\
\bottomrule
\end{tabular}



Unnamed: 0,window,accuracy,roc_auc,precision,recall,f1
0,0,0.669596,0.662476,0.682745,0.737333,0.70722
1,1,0.60697,0.603952,0.644406,0.639333,0.640259
2,2,0.642525,0.64031,0.674304,0.673,0.672028
3,3,0.629091,0.62381,0.651524,0.69,0.667572
4,4,0.633737,0.628048,0.653545,0.698,0.673448
5,5,0.629394,0.624119,0.651574,0.689667,0.667043
6,6,0.633737,0.627952,0.653545,0.697333,0.673248
7,7,0.647172,0.639048,0.659362,0.73,0.692265
8,8,0.665152,0.657881,0.676312,0.737667,0.704812
9,9,0.642929,0.63419,0.654937,0.729333,0.689107


In [38]:
from scipy.spatial.distance import cityblock
mat = np.zeros((X_l.shape[0], X_l.shape[0]))
               
for i in range(X_l.shape[0]):
    for j in range(X_l.shape[0]):
        mat[i, j] = cityblock(X_l.iloc[i], X_l.iloc[j])

In [40]:

mat2 = np.zeros((X_l.shape[0], X_l.shape[0]))
               
for i in range(X_l.shape[0]):
    for j in range(X_l.shape[0]):
        mat2[i, j] = k_greatest_manhattan(X_l.iloc[i], X_l.iloc[j], w=0)

In [44]:
np.max(mat - mat2)

2.1316282072803006e-14

Wegen Rundungsfehlern ab der 14. Nachkommastelle unterscheiden sich die 1-NN Ergebnisse von L_1 und L_1^0!!!

In [49]:
len(mat[(mat-mat2) > 0])

11594

In [28]:
k_greatest_manhattan(X_l.iloc[42], X_l.iloc[71], w=0 )

9.240414926761137

In [32]:
X_l.shape[0]

224