# Course Section K4. Daycurve 1-NN Classification with 10-fold CV

In [1]:
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from dtaidistance.dtw import distance as dtw_dist

# Filtering warnings
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter(action='ignore', category=FutureWarning)

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_validate
%matplotlib inline

In [2]:
import Fred as fred
import math

# Create own Distance Measures and import from Fred-Frechet

# Discrete Frechet
def disc_frechet(x, y):
    a = fred.Curve(x)
    b = fred.Curve(y)
    dist = fred.discrete_frechet(a, b)
    return dist.value

# Discrete Dynamic Time Warping
def disc_dtw(x, y):
    a = fred.Curve(x)
    b = fred.Curve(y)
    dist = fred.discrete_dynamic_time_warping(a, b)
    return dist.value

# Discrete Dynamic Time Warping with traversal constraint
def window_dtw(x, y, w=4):
    n = len(x)
    m = len(y)

    # maximal possible windowsize
    w = max(w, abs(n-m))

    # distancematrix filled with infinity
    dtw = np.full((n, m), math.inf)
    dtw[0, 0] = 0

    # all possible paths filled with zeros
    for i in range(1, n):
        for j in range(max(1, i-w), min(m, i+w)):
            cost = (x[i] - y[j])**2
            dtw[i, j] = cost + min(dtw[i-1, j],
                                   dtw[i, j-1],
                                   dtw[i-1, j-1])

    return math.sqrt(dtw[n-1, m-1])

# Discrete Frechet with traversal constraint
def window_disc_frechet(x, y, w=4):
    n = len(x)
    m = len(y)

    # maximal possible windowsize
    w = max(w, abs(n-m))

    # distancematrix filled with infinity
    dfre = np.full((n, m), math.inf)
    dfre[0, 0] = 0

    # all possible paths filled with zeros
    for i in range(1, n):
        for j in range(max(1, i-w), min(m, i+w)):
            cost = (x[i] - y[j])**2
            dfre[i, j] = max(cost, min(dfre[i-1, j],
                                       dfre[i, j-1],
                                       dfre[i-1, j-1]))

    return math.sqrt(dfre[n-1, m-1])

def window_df(x, y, w=4, p=2):
    n = len(x) 
    m = len(y) 

    # maximal possible windowsize
    w = max(w, abs(n-m))

    # distancematrix filled with infinity
    dtw = np.full((n, m), math.inf)
    dtw[0, 0] = 0
    
    for i in range(0,n):
        for j in range(max(0, i-w), min(m, i+w)):
            dtw[i, j] = 0
    
    
    # all possible paths filled with zeros
    for i in range(n):
        for j in range(max(0, i-w), min(m, i+w)):
            dtw[i,j] = abs(x[i] - y[j]) ** p
            if i > 0 or j > 0:
                dtw[i, j] = max(dtw[i,j], min(dtw[i-1, j] if i > 0 else math.inf,
                                              dtw[i, j-1] if j > 0 else math.inf,
                                              dtw[i-1, j-1] if (i > 0 and j > 0) else math.inf
                                             )
                               )
     
    return (dtw[n-1, m-1]) ** (1/p)

# k-greatest-distances-Mahnhattandistanz
def k_greatest_manhattan(x, y, w=6):
    dists = np.abs(x - y)
    return np.sum(np.sort(dists)[-w:][::-1])

# Earth mover's distance
from scipy.stats import wasserstein_distance

def emd(u, v):
    # create bin-vectors
    bin_u = [i for i in range(0, len(u))]
    bin_v = [i for i in range(0, len(v))]
    
    # normalize input vectors to unit-vectors
    u_norm = u / np.linalg.norm(u, ord=1)
    v_norm = v / np.linalg.norm(v, ord=1)
    
    return wasserstein_distance(bin_u, bin_v, u_norm, v_norm)

def dtw_ai(x, y, w=None):
    return dtw_dist(x, y, window=w, use_c=True)

In [3]:
ofEx = pd.read_csv("")

In [4]:
# -97 rausschmeissen
ofEx = ofEx.drop(ofEx[ofEx.final_result < 0].index)

In [5]:
y = ofEx['final_result']
X = ofEx.drop(['id', 'final_result'], axis=1)

In [6]:
X.head()

Unnamed: 0,day1,day2,day3,day4,day5,day6,day7,day8,day9,day10,...,day71,day72,day73,day74,day75,day76,day77,day78,day79,day80
0,0,0,0,0,28,45,20,3,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,118,29,71,48,58,23,9,100,8,...,244,58,102,0,89,6,9,101,125,5
2,87,90,41,56,23,7,48,0,92,22,...,16,5,6,0,3,0,0,10,0,0
3,0,86,15,0,39,5,14,8,91,0,...,30,0,9,0,10,0,4,4,7,0
4,103,13,0,0,14,27,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## A vs. C bis F

In [7]:
X = X.drop(X[(y > 6) & (y < 10)].index)
y = y[X.index]

In [8]:
X.head()

Unnamed: 0,day1,day2,day3,day4,day5,day6,day7,day8,day9,day10,...,day71,day72,day73,day74,day75,day76,day77,day78,day79,day80
0,0,0,0,0,28,45,20,3,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,118,29,71,48,58,23,9,100,8,...,244,58,102,0,89,6,9,101,125,5
2,87,90,41,56,23,7,48,0,92,22,...,16,5,6,0,3,0,0,10,0,0
3,0,86,15,0,39,5,14,8,91,0,...,30,0,9,0,10,0,4,4,7,0
4,103,13,0,0,14,27,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
len(X)

224

In [10]:
len(y)

224

In [11]:
# Logarithmieren
X_l = np.log1p(X) 

In [12]:
# Zu binärem Problem machen
y[y < 10] = 0
y[y > 0] = 1

In [13]:
y.value_counts()

1    122
0    102
Name: final_result, dtype: int64

In [14]:
start = time.time()

distances = {'Manhattan':'cityblock', 'Euclidean': 'euclidean', 'Maximum': 'chebyshev',
             'DF': disc_frechet, 'DTW': dtw_ai, 'WDF': window_df, 'WDTW': window_dtw,
             'k_g_Manhattan': k_greatest_manhattan, 'EMD': emd
                     }
nachbar = 1
scoring = ['accuracy', 'roc_auc', 'precision', 'recall', 'f1']

df = pd.DataFrame(columns = ['distance', 'accuracy', 'roc_auc', 'precision', 'recall', 'f1'], 
                  index = [0,1,2,3,4,5,6,7,8])
 
df['distance'] = distances

row = 0
for key, dist in distances.items():
    knn = KNeighborsClassifier(n_neighbors=nachbar, metric=dist)
    scores = cross_validate(knn, X_l, y, scoring=scoring, n_jobs=-1,
                            cv=10, return_train_score=True)
    score = [key, np.mean(scores['test_accuracy']), np.mean(scores['test_roc_auc']), 
             np.mean(scores['test_precision']), np.mean(scores['test_recall']), 
             np.mean(scores['test_f1'])]

    df.loc[row, 0:] = score

    row = row + 1

print(df.to_latex(index=False))
df


end = time.time()
total = end - start
print(total)

\begin{tabular}{llllll}
\toprule
     distance &  accuracy &   roc\_auc & precision &    recall &        f1 \\
\midrule
    Manhattan &   0.71917 &  0.722296 &   0.78851 &   0.70641 &  0.735188 \\
    Euclidean &  0.683202 &   0.68595 &   0.74552 &  0.673718 &  0.696688 \\
      Maximum &  0.583399 &  0.576707 &  0.611026 &  0.657051 &  0.630876 \\
           DF &   0.58834 &  0.576224 &  0.603501 &  0.711538 &  0.651422 \\
          DTW &  0.633202 &  0.636253 &  0.693783 &  0.607051 &  0.641976 \\
          WDF &  0.587945 &  0.584802 &  0.609573 &  0.620513 &   0.61035 \\
         WDTW &  0.633399 &  0.633566 &  0.677291 &  0.630769 &  0.645388 \\
k\_g\_Manhattan &  0.624506 &  0.622407 &  0.656993 &  0.649359 &  0.651118 \\
          EMD &  0.615415 &  0.608001 &  0.644104 &  0.687821 &  0.658972 \\
\bottomrule
\end{tabular}

791.6011710166931


In [15]:
start = time.time()

df = pd.DataFrame(columns = ['window', 'accuracy', 'roc_auc', 'precision', 'recall', 'f1'], 
                  index = [0,1,2,3,4,5,6,7,8,9,10,11])
 
df['window'] = [0,1,2,3,4,5,6,7,8,9,10,11]
scoring = ['accuracy', 'roc_auc', 'precision', 'recall', 'f1']

for i in range(0,81):
    knn = KNeighborsClassifier(n_neighbors=nachbar, metric=dtw_ai, metric_params={'w': i})
    scores = cross_validate(knn, X_l, y, scoring=scoring, n_jobs=-1,
                            cv=10, return_train_score=True)
    score = [i,np.mean(scores['test_accuracy']), np.mean(scores['test_roc_auc']), 
             np.mean(scores['test_precision']), np.mean(scores['test_recall']), 
             np.mean(scores['test_f1'])]
    #df.loc[i, 0:] = score
    print(score)
    
print('WDTW')
print(df.to_latex(index=False))
df


end = time.time()
total = end - start
print(total)

[0, 0.633201581027668, 0.6362529137529138, 0.6937834387834388, 0.607051282051282, 0.6419760765550239]
[1, 0.6832015810276679, 0.6859498834498835, 0.745519758019758, 0.6737179487179488, 0.6966880945141813]
[2, 0.6555335968379447, 0.6554778554778555, 0.6979220779220779, 0.6564102564102564, 0.6723949528732136]
[3, 0.6065217391304347, 0.6082400932400932, 0.6470127095127095, 0.591025641025641, 0.6112194616977226]
[4, 0.6154150197628458, 0.6125407925407924, 0.6532284382284382, 0.6487179487179487, 0.6480388275170884]
[5, 0.673913043478261, 0.6728088578088578, 0.7066450216450216, 0.6974358974358974, 0.698614067092328]
[6, 0.633399209486166, 0.6298543123543123, 0.6611255411255411, 0.6724358974358974, 0.664322662844402]
[7, 0.6428853754940711, 0.6426165501165502, 0.6839416139416139, 0.657051282051282, 0.6620883464361724]
[8, 0.6243083003952569, 0.6213869463869465, 0.6629727135609488, 0.6564102564102564, 0.6539821193299453]
[9, 0.63300395256917, 0.6322785547785548, 0.680986790986791, 0.6391025641

[80, 0.633201581027668, 0.6362529137529138, 0.6937834387834388, 0.607051282051282, 0.6419760765550239]
WDTW
\begin{tabular}{rlllll}
\toprule
 window & accuracy & roc\_auc & precision & recall &  f1 \\
\midrule
      0 &      NaN &     NaN &       NaN &    NaN & NaN \\
      1 &      NaN &     NaN &       NaN &    NaN & NaN \\
      2 &      NaN &     NaN &       NaN &    NaN & NaN \\
      3 &      NaN &     NaN &       NaN &    NaN & NaN \\
      4 &      NaN &     NaN &       NaN &    NaN & NaN \\
      5 &      NaN &     NaN &       NaN &    NaN & NaN \\
      6 &      NaN &     NaN &       NaN &    NaN & NaN \\
      7 &      NaN &     NaN &       NaN &    NaN & NaN \\
      8 &      NaN &     NaN &       NaN &    NaN & NaN \\
      9 &      NaN &     NaN &       NaN &    NaN & NaN \\
     10 &      NaN &     NaN &       NaN &    NaN & NaN \\
     11 &      NaN &     NaN &       NaN &    NaN & NaN \\
\bottomrule
\end{tabular}

546.40429854393


In [17]:
start = time.time()

df = pd.DataFrame(columns = ['window', 'accuracy', 'roc_auc', 'precision', 'recall', 'f1'], 
                  index = [0,1,2,3,4,5,6,7,8,9,10,11])
 
df['window'] = [0,1,2,3,4,5,6,7,8,9,10,11]
scoring = ['accuracy', 'roc_auc', 'precision', 'recall', 'f1']

for i in range(0,12):
    knn = KNeighborsClassifier(n_neighbors=nachbar, metric=window_df, metric_params={'w': i})
    scores = cross_validate(knn, X_l, y, scoring=scoring, n_jobs=-1,
                            cv=10, return_train_score=True)
    score = [i,np.mean(scores['test_accuracy']), np.mean(scores['test_roc_auc']), 
             np.mean(scores['test_precision']), np.mean(scores['test_recall']), 
             np.mean(scores['test_f1'])]
    df.loc[i, 0:] = score
    print(i)

print('WDF')
print(df.to_latex(index=False))
df

end = time.time()
total = end - start
print(total)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

0
1
2
3
4
5
6
7
8
9
10
11
WDF
\begin{tabular}{rlllll}
\toprule
 window &  accuracy &   roc\_auc & precision &    recall &        f1 \\
\midrule
      0 &  0.455336 &       0.5 &       0.0 &       0.0 &       0.0 \\
      1 &  0.655336 &  0.647739 &  0.671324 &  0.728205 &  0.694561 \\
      2 &  0.619368 &  0.619266 &  0.660974 &  0.623077 &  0.639802 \\
      3 &   0.61581 &   0.61595 &  0.659089 &  0.623718 &    0.6351 \\
      4 &  0.587945 &  0.584802 &  0.609573 &  0.620513 &   0.61035 \\
      5 &   0.64249 &  0.633322 &  0.649282 &  0.738462 &   0.68879 \\
      6 &   0.58419 &   0.57461 &  0.605458 &  0.680128 &  0.639516 \\
      7 &  0.628854 &    0.6187 &  0.644197 &  0.730128 &   0.67979 \\
      8 &  0.637352 &  0.630425 &  0.656503 &  0.704487 &  0.676353 \\
      9 &  0.632609 &  0.625484 &  0.654939 &  0.695513 &  0.670328 \\
     10 &  0.601186 &  0.596125 &  0.634303 &  0.646795 &  0.633072 \\
     11 &  0.605929 &  0.600425 &  0.633256 &  0.654487 &  0.637262 \\
\bot

In [16]:
start = time.time()

nachbar=1
df = pd.DataFrame(columns = ['window', 'accuracy', 'roc_auc', 'precision', 'recall', 'f1'], 
                  index = [0,1,2,3,4,5,6,7,8,9,10,11])
 
df['window'] = [0,1,2,3,4,5,6,7,8,9,10,11]
scoring = ['accuracy', 'roc_auc', 'precision', 'recall', 'f1']

for i in range(0,81):
    knn = KNeighborsClassifier(n_neighbors=nachbar,metric=k_greatest_manhattan, metric_params={'w':i})
    scores = cross_validate(knn, X_l, y, scoring=scoring, n_jobs=-1,
                            cv=5, return_train_score=True)
    score = [i,np.mean(scores['test_accuracy']), np.mean(scores['test_roc_auc']), 
             np.mean(scores['test_precision']), np.mean(scores['test_recall']), 
             np.mean(scores['test_f1'])]
    #df.loc[i, 0:] = score
    print(score)

print('kgMan')
print(df.to_latex(index=False))
df  

end = time.time()
total = end - start
print(total)

[0, 0.7138383838383839, 0.716547619047619, 0.8000267284897268, 0.6883333333333332, 0.725107003485503]
[1, 0.5845454545454546, 0.576547619047619, 0.6090114942528736, 0.665, 0.6351309290044029]
[2, 0.5621212121212121, 0.5542619047619047, 0.5913520553520553, 0.639, 0.6140755472000121]
[3, 0.5933333333333333, 0.5862380952380952, 0.6234367484367485, 0.6553333333333333, 0.6378615300477483]
[4, 0.6112121212121211, 0.6044761904761906, 0.6355248657722422, 0.6713333333333333, 0.6521010058869885]
[5, 0.6022222222222222, 0.5966428571428571, 0.6328864468864469, 0.6556666666666667, 0.6435414781297135]
[6, 0.6244444444444444, 0.6195952380952381, 0.6581989849231229, 0.663, 0.6582825088415152]
[7, 0.6466666666666667, 0.6451904761904762, 0.6911455211455211, 0.6546666666666667, 0.6699545955365731]
[8, 0.6467676767676768, 0.6461904761904763, 0.6939438250799412, 0.6466666666666667, 0.6660174964441138]
[9, 0.6246464646464647, 0.6257619047619047, 0.6702380952380953, 0.6053333333333334, 0.6319007424597488]
[1