In [196]:
import pandas as pd
import numpy as np
import scipy.cluster.hierarchy as hc
import matplotlib.pyplot as plt
import scipy.spatial.distance as ssd
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from scipy.io import loadmat
from sklearn.cluster import AgglomerativeClustering, DBSCAN
from sklearn.preprocessing import LabelBinarizer
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from collections import Counter
from collections import defaultdict

# Question 1

In [198]:
# Load both MATLAB files; loadmat returns a dict keyed by MATLAB variable name,
# so the array stored in each file is pulled out by its key.
pine_corrected = loadmat('./data/Indian_pines_corrected.mat')['indian_pines_corrected']
# presumably the per-position ground-truth labels ("gt") - confirm against the dataset docs
pine_gt = loadmat('./data/Indian_pines_gt.mat')['indian_pines_gt']

In [197]:
# Inspect the dimensions of the loaded array before it is flattened.
pine_corrected.shape

(145, 145, 200)

In [199]:
# Flatten the 3-D array into a 2-D matrix (one row per position along the first
# two axes, one column per entry of the last axis) and the label array into a
# matching 1-D vector. The sizes are derived from the arrays instead of being
# hard-coded, so the cell keeps working if a different file is loaded.
X = pine_corrected.reshape(-1, pine_corrected.shape[-1])
y = pine_gt.reshape(-1)
assert X.shape[0] == y.shape[0], "data and ground truth must have the same number of samples"

In [200]:
# Encode the labels as binary indicator columns, one column per distinct
# value found in y; the fitted binarizer is kept in `lbl`.
lbl = LabelBinarizer()
y_bin = lbl.fit(y).transform(y)

In [201]:
# Sanity check: X and y_bin must have the same number of rows; y_bin has one
# column per class produced by the LabelBinarizer.
print(X.shape)
y_bin.shape

(21025, 200)


(21025, 17)

In [98]:
def modelEvaluate(model_type, X, y_bin, test_size=0.8, cv=10, random_state=None):
    """Cross-validate one classifier family on every binary label column.

    For each column of ``y_bin`` the samples are split with
    ``train_test_split`` and the estimator selected by ``model_type`` is
    scored with ``cross_val_score`` on the ``test_size`` share of that split
    (the remaining share is not used, exactly as in the original code).

    Parameters
    ----------
    model_type : {'LR', 'SVC', 'KNN'}
        Which estimator to evaluate: liblinear logistic regression, an RBF
        support-vector classifier, or a 7-nearest-neighbours classifier.
    X : array-like of shape (n_samples, n_features)
        Feature matrix.
    y_bin : 2-D array of shape (n_samples, n_classes)
        One binary indicator column per class.
    test_size : float, default 0.8
        Share of the samples handed to ``cross_val_score``.
    cv : int, default 10
        Number of cross-validation folds.
    random_state : int or None, default None
        Forwarded to ``train_test_split``. ``None`` keeps the previous
        unseeded behaviour; pass an int for reproducible splits.

    Returns
    -------
    tuple of (DataFrame, DataFrame, DataFrame)
        ``(df_lr, df_svc, df_knn)``. The frame matching ``model_type`` has
        one row per class and one column per fold; the other two are empty.

    Raises
    ------
    ValueError
        If ``model_type`` is not one of the supported keys. Previously an
        unknown key silently produced three empty frames.
    """
    # Factories are lazy so that only the requested estimator is constructed.
    factories = {
        'LR': lambda: LogisticRegression(max_iter=200, solver='liblinear'),
        'SVC': lambda: SVC(kernel='rbf', C=1.0),
        'KNN': lambda: KNeighborsClassifier(n_neighbors=7),
    }
    if model_type not in factories:
        raise ValueError(
            "model_type must be one of %s, got %r" % (sorted(factories), model_type)
        )

    scores = {key: [] for key in factories}
    # Iterate over however many class columns y_bin actually has instead of
    # assuming exactly 17.
    for class_idx in range(y_bin.shape[1]):
        _, X_eval, _, y_eval = train_test_split(
            X, y_bin[:, class_idx], test_size=test_size, random_state=random_state
        )
        # cross_val_score clones the estimator and fits the clone on every
        # fold, so fitting the model beforehand has no effect on the scores
        # and was pure wasted computation.
        fold_scores = cross_val_score(factories[model_type](), X_eval, y_eval, cv=cv)
        scores[model_type].append(fold_scores)

    return (
        pd.DataFrame(scores['LR']),
        pd.DataFrame(scores['SVC']),
        pd.DataFrame(scores['KNN']),
    )

# Accuracy scores of the Logistic Regression model for each of the 17 classes

In [84]:
# modelEvaluate returns (LR, SVC, KNN) frames; slot 0 holds the Logistic Regression scores.
df_LogReg = modelEvaluate(model_type='LR', X=X, y_bin=y_bin)[0]

In [122]:
# Keep the layout returned by modelEvaluate: one row per class, one column per
# CV fold. That is the layout of the table displayed below and the one the
# per-class averages computed later rely on. Reassigning the transpose to the
# same name flipped the frame on every execution of this cell, so a fresh
# Restart & Run All would have produced per-fold instead of per-class averages.
assert df_LogReg.shape[0] == y_bin.shape[1], "expected one row of fold scores per class"

In [123]:
# Show the Logistic Regression cross-validation scores.
df_LogReg

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.771106,0.764566,0.758026,0.772889,0.772889,0.763377,0.757432,0.777646,0.760999,0.778835
1,1.0,0.999405,1.0,0.998811,0.999405,1.0,0.999405,0.999405,0.999405,0.998216
2,0.963734,0.96195,0.962545,0.956005,0.958383,0.959572,0.960166,0.961356,0.958383,0.962545
3,0.970273,0.972652,0.967301,0.967895,0.972057,0.966706,0.976219,0.970868,0.970868,0.964923
4,0.989893,0.990488,0.989298,0.989893,0.989893,0.986326,0.991677,0.988109,0.991677,0.987515
5,0.989298,0.988704,0.98692,0.988109,0.988109,0.991677,0.986326,0.988109,0.988704,0.988704
6,0.986326,0.987515,0.986326,0.985731,0.987515,0.985731,0.985731,0.988109,0.992271,0.98157
7,0.998216,0.999405,0.997027,0.998811,0.999405,0.998216,0.998216,0.999405,0.998811,0.997622
8,0.994055,0.996433,0.99346,0.991677,0.99346,0.994649,0.99346,0.996433,0.997027,0.996433
9,1.0,0.999405,0.998811,0.999405,0.998811,0.997622,0.999405,0.998216,0.998811,0.999405


# Accuracy scores of the SVC model for each of the 17 classes

In [100]:
# modelEvaluate returns (LR, SVC, KNN) frames; slot 1 holds the SVC scores.
df_SVC = modelEvaluate(model_type='SVC', X=X, y_bin=y_bin)[1]

In [126]:
# Keep the layout returned by modelEvaluate: one row per class, one column per
# CV fold. That is the layout of the table displayed below and the one the
# per-class averages computed later rely on. Reassigning the transpose to the
# same name flipped the frame on every execution of this cell, so a fresh
# Restart & Run All would have produced per-fold instead of per-class averages.
assert df_SVC.shape[0] == y_bin.shape[1], "expected one row of fold scores per class"

In [127]:
# Show the SVC cross-validation scores.
df_SVC

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.753864,0.764566,0.771106,0.755054,0.76635,0.768728,0.739001,0.765161,0.753864,0.737218
1,0.998216,0.997622,0.997622,0.997622,0.997622,0.997622,0.997622,0.997622,0.997622,0.997622
2,0.931629,0.931629,0.931629,0.931629,0.931034,0.931034,0.931034,0.931034,0.931034,0.931034
3,0.960761,0.960761,0.960761,0.960761,0.960761,0.960761,0.960761,0.960761,0.960166,0.960166
4,0.989298,0.989298,0.989298,0.989298,0.989298,0.988704,0.988704,0.988704,0.988704,0.988704
5,0.977408,0.977408,0.977408,0.977408,0.976813,0.976813,0.976813,0.976813,0.976813,0.976813
6,0.964328,0.964328,0.964328,0.964328,0.964328,0.964328,0.963734,0.963734,0.963734,0.963734
7,0.999405,0.999405,0.998811,0.998811,0.998811,0.998811,0.998811,0.998811,0.998811,0.998811
8,0.979191,0.979191,0.978002,0.978597,0.978002,0.98038,0.978597,0.978597,0.978002,0.977408
9,0.999405,0.999405,0.999405,0.999405,0.998811,0.998811,0.998811,0.998811,0.998811,0.998811


# Accuracy scores of the KNN model for each of the 17 classes

In [102]:
# modelEvaluate returns (LR, SVC, KNN) frames; slot 2 holds the KNN scores.
df_KNN = modelEvaluate(model_type='KNN', X=X, y_bin=y_bin)[2]

In [130]:
# Keep the layout returned by modelEvaluate: one row per class, one column per
# CV fold. That is the layout of the table displayed below and the one the
# per-class averages computed later rely on. Reassigning the transpose to the
# same name flipped the frame on every execution of this cell, so a fresh
# Restart & Run All would have produced per-fold instead of per-class averages.
assert df_KNN.shape[0] == y_bin.shape[1], "expected one row of fold scores per class"

In [156]:
# Show the KNN cross-validation scores.
df_KNN

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.788347,0.79786,0.805589,0.81629,0.818074,0.802616,0.808561,0.806183,0.80321,0.817479
1,0.997622,0.998811,0.998216,1.0,0.998216,0.996433,0.998216,0.999405,0.998216,0.998216
2,0.951843,0.945303,0.950059,0.958977,0.953032,0.953627,0.945303,0.945303,0.951249,0.953032
3,0.966706,0.969084,0.974435,0.969084,0.972652,0.972057,0.967895,0.971463,0.972652,0.969084
4,0.991677,0.990488,0.989298,0.989893,0.988704,0.989893,0.989298,0.989298,0.989298,0.98692
5,0.987515,0.988109,0.985731,0.987515,0.990488,0.990488,0.983948,0.988704,0.988109,0.992866
6,0.980975,0.972652,0.979191,0.979191,0.982164,0.980975,0.983948,0.980975,0.975624,0.977408
7,0.999405,1.0,1.0,1.0,0.998216,1.0,0.998811,0.999405,0.998811,1.0
8,0.995244,0.997027,0.994649,0.992866,0.991677,0.991677,0.99346,0.99346,0.992271,0.992866
9,0.999405,0.999405,0.999405,0.999405,0.998811,0.998811,0.998811,0.998811,0.998811,0.998811


In [167]:
# Average the scores across the columns of each row of the Logistic Regression table.
df_LogReg.mean(axis=1)

0     0.767776
1     0.999405
2     0.960464
3     0.969976
4     0.989477
5     0.988466
6     0.986683
7     0.998514
8     0.994709
9     0.998989
10    0.966766
11    0.931510
12    0.981153
13    0.997741
14    0.956361
15    0.980856
16    0.998157
dtype: float64

In [168]:
# Average the scores across the columns of each row of the SVC table.
df_SVC.mean(axis=1)

0     0.757491
1     0.997681
2     0.931272
3     0.960642
4     0.989001
5     0.977051
6     0.964090
7     0.998930
8     0.978597
9     0.999049
10    0.953151
11    0.882164
12    0.971760
13    0.989596
14    0.941855
15    0.981332
16    0.997562
dtype: float64

In [169]:
# Average the scores across the columns of each row of the KNN table.
df_KNN.mean(axis=1)

0     0.806421
1     0.998335
2     0.950773
3     0.970511
4     0.989477
5     0.988347
6     0.979310
7     0.999465
8     0.993520
9     0.999049
10    0.968847
11    0.936742
12    0.976992
13    0.996433
14    0.944649
15    0.981510
16    0.997384
dtype: float64

### For class 0: KNN is the best model.
### For class 1: Logistic Regression is the best model.
### For class 2: Logistic Regression is the best model.
### For class 3: KNN is the best model.
### For class 4: KNN and Logistic Regression have the same score.
### For class 5: Logistic Regression is the best model.
### For class 6: Logistic Regression is the best model.
### For class 7: KNN is the best model.
### For class 8: Logistic Regression is the best model.
### For class 9: KNN and SVC have the same score.
### For class 10: KNN is the best model.
### For class 11: KNN is the best model.
### For class 12: Logistic Regression is the best model.
### For class 13: Logistic Regression is the best model.
### For class 14: Logistic Regression is the best model.
### For class 15: KNN is the best model.
### For class 16: Logistic Regression is the best model.
### According to these scores, Logistic Regression is the best model overall: it has the highest mean accuracy for 9 of the 17 classes, versus 6 for KNN, with 2 ties.

# Question 2

In [173]:
# Download the CSV (file name: 2018-19_pbp.csv) into a DataFrame and list its
# columns. NOTE(review): this hits the network on every run - consider caching
# a local copy.
NBA = pd.read_csv('https://sports-statistics.com/database/basketball-data/nba/2018-19_pbp.csv')
NBA.columns

Index(['Unnamed: 0', 'EVENTMSGACTIONTYPE', 'EVENTMSGTYPE', 'EVENTNUM',
       'GAME_ID', 'HOMEDESCRIPTION', 'NEUTRALDESCRIPTION', 'PCTIMESTRING',
       'PERIOD', 'PERSON1TYPE', 'PERSON2TYPE', 'PERSON3TYPE', 'PLAYER1_ID',
       'PLAYER1_NAME', 'PLAYER1_TEAM_ABBREVIATION', 'PLAYER1_TEAM_CITY',
       'PLAYER1_TEAM_ID', 'PLAYER1_TEAM_NICKNAME', 'PLAYER2_ID',
       'PLAYER2_NAME', 'PLAYER2_TEAM_ABBREVIATION', 'PLAYER2_TEAM_CITY',
       'PLAYER2_TEAM_ID', 'PLAYER2_TEAM_NICKNAME', 'PLAYER3_ID',
       'PLAYER3_NAME', 'PLAYER3_TEAM_ABBREVIATION', 'PLAYER3_TEAM_CITY',
       'PLAYER3_TEAM_ID', 'PLAYER3_TEAM_NICKNAME', 'SCORE', 'SCOREMARGIN',
       'VISITORDESCRIPTION', 'WCTIMESTRING'],
      dtype='object')

## Question 2.1

In [253]:
# Count the rows for every (player-1 team, player-2 team) combination.
pd.crosstab(
    index=NBA['PLAYER1_TEAM_ABBREVIATION'],
    columns=NBA['PLAYER2_TEAM_ABBREVIATION'],
)

PLAYER2_TEAM_ABBREVIATION,ATL,BKN,BOS,CHA,CHI,CLE,DAL,DEN,DET,GSW,...,OKC,ORL,PHI,PHX,POR,SAC,SAS,TOR,UTA,WAS
PLAYER1_TEAM_ABBREVIATION,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ATL,4603,114,142,132,129,102,80,70,99,58,...,69,133,130,67,64,70,48,100,68,140
BKN,99,3955,118,119,100,108,69,59,97,63,...,60,76,137,52,67,57,52,130,58,113
BOS,132,112,4180,111,60,103,59,49,122,62,...,69,75,117,54,53,60,39,129,61,72
CHA,111,111,102,3903,76,84,55,57,104,39,...,66,93,133,63,51,48,42,67,50,117
CHI,137,118,85,84,3526,107,75,59,97,57,...,85,106,107,56,48,63,43,110,49,106
CLE,94,124,114,106,97,3454,48,53,105,39,...,65,79,77,62,45,49,55,122,53,124
DAL,58,57,53,41,56,59,3880,82,58,100,...,137,45,55,122,114,85,116,64,129,53
DEN,57,71,55,63,43,61,95,4122,52,106,...,132,51,56,128,111,88,115,46,136,52
DET,100,96,125,111,108,129,62,56,3488,44,...,63,100,141,58,54,58,47,89,68,95
GSW,65,54,63,59,72,56,123,146,61,4247,...,100,47,61,135,114,128,74,58,87,59


In [254]:
# Count the rows for every (player-1 team, player-3 team) combination.
pd.crosstab(
    index=NBA['PLAYER1_TEAM_ABBREVIATION'],
    columns=NBA['PLAYER3_TEAM_ABBREVIATION'],
)

PLAYER3_TEAM_ABBREVIATION,ATL,BKN,BOS,CHA,CHI,CLE,DAL,DEN,DET,GSW,...,OKC,ORL,PHI,PHX,POR,SAC,SAS,TOR,UTA,WAS
PLAYER1_TEAM_ABBREVIATION,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ATL,40,17,33,20,24,6,16,11,21,14,...,11,24,22,10,12,5,17,11,10,16
BKN,14,39,34,21,25,13,6,14,15,14,...,13,25,16,8,15,5,9,15,14,15
BOS,25,20,32,19,8,7,5,10,15,11,...,13,17,20,12,5,7,9,16,9,13
CHA,31,30,28,20,16,14,10,19,29,18,...,12,21,36,11,20,8,12,25,8,23
CHI,24,9,16,19,32,16,10,12,21,16,...,9,28,31,10,10,11,7,30,19,30
CLE,20,26,24,25,19,27,10,10,23,14,...,21,13,19,18,13,9,11,22,15,14
DAL,8,4,8,14,8,4,37,15,6,25,...,16,8,15,24,23,17,13,10,29,8
DEN,17,15,15,13,10,6,5,27,11,33,...,22,7,8,26,15,12,22,9,27,8
DET,15,12,18,21,19,9,12,6,51,10,...,12,19,30,10,9,14,12,8,9,16
GSW,8,6,10,10,4,3,15,15,4,36,...,17,12,9,10,17,11,15,10,16,3


In [251]:
# Count the rows for every (player-2 team, player-3 team) combination.
pd.crosstab(
    index=NBA['PLAYER2_TEAM_ABBREVIATION'],
    columns=NBA['PLAYER3_TEAM_ABBREVIATION'],
)

PLAYER3_TEAM_ABBREVIATION,ATL,BKN,BOS,CHA,CHI,CLE,DAL,DEN,DET,GSW,...,OKC,ORL,PHI,PHX,POR,SAC,SAS,TOR,UTA,WAS
PLAYER2_TEAM_ABBREVIATION,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ATL,27,2,2,0,3,3,2,0,2,0,...,1,2,2,0,3,0,1,0,1,3
BKN,0,36,0,0,1,4,1,0,1,1,...,0,0,2,1,0,1,0,0,0,1
BOS,3,2,35,0,2,1,2,0,3,0,...,1,0,1,1,1,1,1,1,1,1
CHA,4,5,1,24,1,1,0,1,2,1,...,1,3,3,0,1,1,1,2,1,2
CHI,4,1,0,2,36,1,0,0,3,1,...,3,1,1,0,1,0,0,1,2,3
CLE,0,3,1,2,2,22,1,1,3,2,...,0,1,1,0,1,1,1,0,1,4
DAL,0,1,1,0,0,0,28,1,1,1,...,1,1,1,0,2,2,3,1,0,0
DEN,1,2,0,2,1,0,0,28,1,3,...,3,1,0,3,1,1,1,0,4,1
DET,0,2,1,1,0,1,0,0,37,0,...,0,1,1,0,0,1,0,0,1,3
GSW,1,1,1,1,0,0,2,1,1,36,...,0,1,1,1,1,0,1,2,2,0


## Question 2.4

In [264]:
# Co-occurrence counts of player-1 names (rows) against player-2 names (columns).
mat_1 = pd.crosstab(
    index=NBA['PLAYER1_NAME'],
    columns=NBA['PLAYER2_NAME'],
)

In [265]:
# Co-occurrence counts of player-1 names (rows) against player-3 names (columns).
mat_2 = pd.crosstab(
    index=NBA['PLAYER1_NAME'],
    columns=NBA['PLAYER3_NAME'],
)

In [266]:
# Co-occurrence counts of player-2 names (rows) against player-3 names (columns).
mat_3 = pd.crosstab(
    index=NBA['PLAYER2_NAME'],
    columns=NBA['PLAYER3_NAME'],
)