## 1. Multi-class and Multi-Label Classification Using Support Vector Machines

Import packages

In [1]:
import pandas as pd
import numpy as np
import collections
from sklearn.metrics import calinski_harabasz_score
from sklearn.model_selection import KFold
from IPython.display import Image
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn import preprocessing
from imblearn.over_sampling import SMOTE
from sklearn.cluster import KMeans

### (a) Download the Anuran Calls (MFCCs) Data Set

In [2]:
df = pd.read_csv('../data/Anuran Calls (MFCCs)/Frogs_MFCCs.csv')

In [3]:
family_label_encoder = preprocessing.LabelEncoder()
family_label_encoder.fit(df['Family'])
genus_label_encoder = preprocessing.LabelEncoder()
genus_label_encoder.fit(df['Genus'])
species_label_encoder = preprocessing.LabelEncoder()
species_label_encoder.fit(df['Species'])
df['Family'] = family_label_encoder.transform(df['Family'])
df['Genus'] = genus_label_encoder.transform(df['Genus'])
df['Species'] = species_label_encoder.transform(df['Species'])
display(df)

Unnamed: 0,MFCCs_ 1,MFCCs_ 2,MFCCs_ 3,MFCCs_ 4,MFCCs_ 5,MFCCs_ 6,MFCCs_ 7,MFCCs_ 8,MFCCs_ 9,MFCCs_10,...,MFCCs_17,MFCCs_18,MFCCs_19,MFCCs_20,MFCCs_21,MFCCs_22,Family,Genus,Species,RecordID
0,1.0,0.152936,-0.105586,0.200722,0.317201,0.260764,0.100945,-0.150063,-0.171128,0.124676,...,-0.108351,-0.077623,-0.009568,0.057684,0.118680,0.014038,3,0,0,1
1,1.0,0.171534,-0.098975,0.268425,0.338672,0.268353,0.060835,-0.222475,-0.207693,0.170883,...,-0.090974,-0.056510,-0.035303,0.020140,0.082263,0.029056,3,0,0,1
2,1.0,0.152317,-0.082973,0.287128,0.276014,0.189867,0.008714,-0.242234,-0.219153,0.232538,...,-0.050691,-0.023590,-0.066722,-0.025083,0.099108,0.077162,3,0,0,1
3,1.0,0.224392,0.118985,0.329432,0.372088,0.361005,0.015501,-0.194347,-0.098181,0.270375,...,-0.136009,-0.177037,-0.130498,-0.054766,-0.018691,0.023954,3,0,0,1
4,1.0,0.087817,-0.068345,0.306967,0.330923,0.249144,0.006884,-0.265423,-0.172700,0.266434,...,-0.048885,-0.053074,-0.088550,-0.031346,0.108610,0.079244,3,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7190,1.0,-0.554504,-0.337717,0.035533,0.034511,0.443451,0.093889,-0.100753,0.037087,0.081075,...,0.069430,0.071001,0.021591,0.052449,-0.021860,-0.079860,2,7,9,60
7191,1.0,-0.517273,-0.370574,0.030673,0.068097,0.402890,0.096628,-0.116460,0.063727,0.089034,...,0.061127,0.068978,0.017745,0.046461,-0.015418,-0.101892,2,7,9,60
7192,1.0,-0.582557,-0.343237,0.029468,0.064179,0.385596,0.114905,-0.103317,0.070370,0.081317,...,0.082474,0.077771,-0.009688,0.027834,-0.000531,-0.080425,2,7,9,60
7193,1.0,-0.519497,-0.307553,-0.004922,0.072865,0.377131,0.086866,-0.115799,0.056979,0.089316,...,0.051796,0.069073,0.017963,0.041803,-0.027911,-0.096895,2,7,9,60


In [4]:
X = df.iloc[:, :-4].to_numpy()
y = df.iloc[:, -4:-1].to_numpy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

### (b) Train a classifier for each label

#### (i) Research

__Exact Match:__ Calculate subset accuracy meaning the prdeicted set of labels should exactly match with the true set of labels.<br>
__Hamming Loss:__ The fraction of the wrong labels to the total number of labels.<br>

#### (ii) Train a SVM for each of the labels

In [5]:
def gaissianSVC_best_paras(X, y):
    best_score = float('-inf')
    best_c = None
    best_gamma = None
    for c in c_range:
        for gamma in gamma_range:
            score = 0
            kf = KFold(n_splits=10)
            for train_index, test_index in kf.split(X):
                cur_X_train, cur_X_val = X[train_index], X[test_index]
                cur_y_train, cur_y_val = y[train_index], y[test_index]

                clf = OneVsRestClassifier(SVC(kernel='rbf', random_state=42, gamma=gamma, C=c))
                clf.fit(cur_X_train, cur_y_train)
                score += clf.score(cur_X_val, cur_y_val)
            cur_score = score / 10
            if cur_score > best_score:
                best_score = cur_score
                best_c = c
                best_gamma = gamma
    return best_score, best_c, best_gamma

c_range = np.logspace(start=-3, stop=6, num=5, base=10)
gamma_range = np.linspace(start=0.1, stop=2, num=5)
family_loss, family_c, family_gamma = gaissianSVC_best_paras(X_train, y_train[:, 0])
genus_loss, genus_c, genus_gamma = gaissianSVC_best_paras(X_train, y_train[:, 1])
species_loss, species_c, species_gamma = gaissianSVC_best_paras(X_train, y_train[:, 2])

In [6]:
print("Family label: weight of the SVM penalty = %f , width of the Gaussian Kernel = %f" % (family_gamma, family_c))
print("Genus label: weight of the SVM penalty = %f , width of the Gaussian Kernel = %f" % (genus_gamma, genus_c))
print("Species label: weight of the SVM penalty = %f , width of the Gaussian Kernel = %f" % (species_gamma, species_c))

Family label: weight of the SVM penalty = 2.000000 , width of the Gaussian Kernel = 31.622777
Genus label: weight of the SVM penalty = 2.000000 , width of the Gaussian Kernel = 31.622777
Species label: weight of the SVM penalty = 1.525000 , width of the Gaussian Kernel = 31.622777


In [7]:
def train_on_best(cur_X, cur_y, cur_X_test, gamma, c):
    clf = OneVsRestClassifier(SVC(kernel='rbf', random_state=42, gamma=gamma, C=c))
    clf.fit(cur_X, cur_y)
    return clf.predict(cur_X_test)

family_predicts = train_on_best(X_train, y_train[:, 0], X_test, family_gamma, family_c)
genus_predicts = train_on_best(X_train, y_train[:, 1], X_test, genus_gamma, genus_c)
species_predicts = train_on_best(X_train, y_train[:, 2], X_test, species_gamma, species_c)
final_predicts = np.column_stack((family_predicts, genus_predicts, species_predicts))
gaussian_exact_match = np.all(final_predicts == y_test, axis=1).mean()
gaussian_hamming_loss = np.sum(np.not_equal(y_test, final_predicts)) / float(y_test.size)
print("Gaussian kernel SVM's exact match score: %f " % gaussian_exact_match)
print("Gaussian kernel SVM's hamming loss: %f " %gaussian_hamming_loss)

Gaussian kernel SVM's exact match score: 0.990736 
Gaussian kernel SVM's hamming loss: 0.006948 


#### (iii) Repeat 1(b)ii with L1-penalized SVMs

In [None]:
def l1SVC_best_paras(X, y):
    best_score = float('-inf')
    best_c = None
    for c in c_range:
        score = 0
        kf = KFold(n_splits=10)
        for train_index, test_index in kf.split(X):
            cur_X_train, cur_X_val = X[train_index], X[test_index]
            cur_y_train, cur_y_val = y[train_index], y[test_index]

            clf = OneVsRestClassifier(LinearSVC(penalty='l1', random_state=42, C=c, dual=False))
            clf.fit(cur_X_train, cur_y_train)
            score += clf.score(cur_X_val, cur_y_val)
        cur_score = score / 10
        if cur_score > best_score:
            best_score = cur_score
            best_c = c
    return best_score, best_c

scaler = preprocessing.StandardScaler()
scaler.fit(X_train)
X_train_standardized = scaler.transform(X_train)
scaler.fit(X_test)
X_test_standardized = scaler.transform(X_test)
c_range = np.logspace(start=-3, stop=6, num=5, base=10)
family_loss, family_c = l1SVC_best_paras(X_train_standardized, y_train[:, 0])
genus_loss, genus_c = l1SVC_best_paras(X_train_standardized, y_train[:, 1])
species_loss, species_c = l1SVC_best_paras(X_train_standardized, y_train[:, 2])

In [9]:
print("Family label: weight of the L1 SVM penalty = %f" % (family_c))
print("Genus label: weight of the L1 SVM penalty = %f" % (genus_c))
print("Species label: weight of the L1 SVM penalty = %f" % (species_c))

Family label: weight of the L1 SVM penalty = 31.622777
Genus label: weight of the L1 SVM penalty = 31.622777
Species label: weight of the L1 SVM penalty = 31.622777


In [None]:
def train_on_best_l1(cur_X, cur_y, cur_X_test, c):
    clf = OneVsRestClassifier(LinearSVC(penalty='l1', random_state=42, C=c, dual=False))
    clf.fit(cur_X, cur_y)
    return clf.predict(cur_X_test)

family_predicts = train_on_best_l1(X_train_standardized, y_train[:, 0], X_test_standardized, family_c)
genus_predicts = train_on_best_l1(X_train_standardized, y_train[:, 1], X_test_standardized, genus_c)
species_predicts = train_on_best_l1(X_train_standardized, y_train[:, 2], X_test_standardized, species_c)
final_predicts = np.column_stack((family_predicts, genus_predicts, species_predicts))
l1_exact_match = np.all(final_predicts == y_test, axis=1).mean()
l1_hamming_loss = np.sum(np.not_equal(y_test, final_predicts)) / float(y_test.size)


In [11]:
print("L1 linear SVM's exact match score: %f " % l1_exact_match)
print("L1 linear SVM's hamming loss: %f " %l1_hamming_loss)

L1 linear SVM's exact match score: 0.909217 
L1 linear SVM's hamming loss: 0.058515 


#### (iv) Repeat 1(b)iii by using SMOTE or any other method for imbalance

In [None]:
oversample = SMOTE()
f_X_train, f_y_train = oversample.fit_resample(X_train_standardized, y_train[:, 0])
g_X_train, g_y_train = oversample.fit_resample(X_train_standardized, y_train[:, 1])
s_X_train, s_y_train = oversample.fit_resample(X_train_standardized, y_train[:, 2])
family_loss, family_c = l1SVC_best_paras(f_X_train, f_y_train)
genus_loss, genus_c = l1SVC_best_paras(g_X_train, g_y_train)
species_loss, species_c = l1SVC_best_paras(s_X_train, s_y_train)

In [13]:
print("Family label: weight of the L1 SVM penalty with SMOTE = %f" % (family_c))
print("Genus label: weight of the L1 SVM penalty with SMOTE = %f" % (genus_c))
print("Species label: weight of the L1 SVM penalty with SMOTE = %f" % (species_c))

Family label: weight of the L1 SVM penalty with SMOTE = 5623.413252
Genus label: weight of the L1 SVM penalty with SMOTE = 31.622777
Species label: weight of the L1 SVM penalty with SMOTE = 31.622777


In [None]:
family_predicts = train_on_best_l1(f_X_train, f_y_train, X_test_standardized, family_c)
genus_predicts = train_on_best_l1(g_X_train, g_y_train, X_test_standardized, genus_c)
species_predicts = train_on_best_l1(s_X_train, s_y_train, X_test_standardized, species_c)
final_predicts = np.column_stack((family_predicts, genus_predicts, species_predicts))
l1_exact_match_SMOTE = np.all(final_predicts == y_test, axis=1).mean()
l1_hamming_loss_SMOTE = np.sum(np.not_equal(y_test, final_predicts)) / float(y_test.size)

In [15]:
print("L1 linear SVM's exact match score with SMOTE: %f " % l1_exact_match_SMOTE)
print("L1 linear SVM's hamming loss with SMOTE: %f " % l1_hamming_loss_SMOTE)

L1 linear SVM's exact match score with SMOTE: 0.863826 
L1 linear SVM's hamming loss with SMOTE: 0.073954 


### Q: Report your conclusions about the classifiers you trained. <br>
Ans: Looks like gaussian kernal SVM has the best result, and for the two L1 penalized SVM, without SMOTE has a better result.

## 2. K-Means Clustering on a Multi-Class and Multi-Label Data Set

In [16]:
kMean_X = df.iloc[:, :-4].to_numpy()
kMean_y = df.iloc[:, -4:-1].to_numpy()
kMean_y_f = df.iloc[:, -4].to_numpy()
kMean_y_g = df.iloc[:, -3].to_numpy()
kMean_y_s = df.iloc[:, -2].to_numpy()

### (a) Use k-means clustering

In [17]:
def runKmeans(k, training_data):
    kmeans = KMeans(n_clusters=k)
    kmeans.fit(training_data)
    cluster_labels = kmeans.predict(training_data)
    ch_index = calinski_harabasz_score(X, cluster_labels)

    return cluster_labels, ch_index

### (b) Determine which family is the majority

In [18]:
def find_majority(members):
    vote_count = collections.Counter(members)
    return vote_count.most_common(1)[0][0]

def determine_label(predicts_clusters, true_labels):
    cluster_members = collections.defaultdict(list)
    label_match = {}
    for index, cluster in enumerate(predicts_clusters):
        cluster_members[cluster].append(true_labels[index])

    for clus, members in cluster_members.items():
        label_match[clus] = find_majority(members)
    
    predict_labels = [label_match[i] for i in predicts_clusters]
    return np.array(predict_labels)

### (c) Calculate the average Hamming distance, Hamming score, and Hamming loss

In [19]:
def hamming_result(predicts, truth):
    hamming_dis = np.sum(np.not_equal(truth, predicts))/float(predicts.size)*3
    hamming_score = 1 - np.sum(np.not_equal(truth, predicts))/float(predicts.size)
    hamming_loss = np.sum(np.not_equal(truth, predicts)) / float(predicts.size)
    return hamming_dis, hamming_score, hamming_loss

__Monte-Carlo Simulation: Perform the following procedures 50 times, and report the average and standard deviation of the 50 Hamming Distances that you calculate.__

In [None]:
dis_list = []
score_list = []
loss_list = []
for _ in range(50):
    best_ch = float('-inf')
    best_k = None
    # Determine best k
    for k in range(2, 51):
        cluster_labels, ch = runKmeans(k, kMean_X)
        if ch > best_ch:
            best_ch = ch
            best_k = k

    # Determine label by best k mean
    cluster_labels, ch = runKmeans(best_k, kMean_X)
    f_predict_labels = determine_label(cluster_labels, kMean_y_f)
    g_predict_labels = determine_label(cluster_labels, kMean_y_g)
    s_predict_labels = determine_label(cluster_labels, kMean_y_s)

    # Hamming result
    final_predicts = np.column_stack((f_predict_labels, g_predict_labels, s_predict_labels))
    hamming_dis, hamming_score, hamming_loss = hamming_result(final_predicts, kMean_y)
    dis_list.append(hamming_dis)
    score_list.append(hamming_score)
    loss_list.append(hamming_loss)


In [21]:
print('Average for hamming distance: ', np.mean(np.array(dis_list)))
print('Average for hamming score: ', np.mean(np.array(score_list)))
print('Average for hamming loss: ', np.mean(np.array(loss_list)))

print('Std for hamming distance: ', np.std(np.array(dis_list)))
print('Std for hamming score: ', np.std(np.array(score_list)))
print('Std for hamming loss: ', np.std(np.array(loss_list)))

Average for hamming distance:  0.895621959694232
Average for hamming score:  0.7014593467685891
Average for hamming loss:  0.29854065323141077
Std for hamming distance:  1.1102230246251565e-16
Std for hamming score:  1.1102230246251565e-16
Std for hamming loss:  5.551115123125783e-17
