HW6
Name:Qifan Chen
USCID:9166932624


Programming Part: Multi-class and Multi-Label Classification Using Sup-
port Vector Machines

In [164]:
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn import preprocessing
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from collections import Counter
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

(a)Download the Anuran Calls (MFCCs) Data Set

In [41]:
data_path = "../data/Frogs_MFCCs.csv"
df = pd.read_csv(data_path)
X = df.iloc[:, 0:22]
y = df.iloc[:, 22:-1]
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=1)

enc = preprocessing.OrdinalEncoder().fit(y)
y_train = enc.transform(y_train)
y_test = enc.transform(y_test)

X_train.shape, X_test.shape, y_train.shape, y_test.shape


((5036, 22), (2159, 22), (5036, 3), (2159, 3))

(b)Each instance has three labels: Families, Genus, and Species.
    i.Research exact match and hamming score/ loss methods for evaluating multi-
       label classification and use them in evaluating the classifiers in this problem.

In [32]:
def exact_match(y_true, y_pred):

    mach = np.all(y_true==y_pred, axis=1) # match result for each samples
    return np.count_nonzero(mach)/y_true.shape[0]

def hamming_score(y_true, y_pred):

    d = y_true.shape[0]*y_true.shape[1]
    return np.count_nonzero(y_true==y_pred) / d

In [45]:
# Example1
a = np.array([[1, 2, 3], [0, 1, 1], [1, 3, 1]])
b = np.array([[1, 2, 0], [0, 1, 2], [1, 3, 1]])
print(exact_match(a, b), 1/3) # 1 / 3
print(hamming_score(a, b), 7/9) # 7 / 9

# Example2
print(exact_match(y_train,y_train), 1.0)
print(hamming_score(y_train,y_train), 1.0)

0.3333333333333333 0.3333333333333333
0.7777777777777778 0.7777777777777778
1.0 1.0
1.0 1.0


 ii.Train a SVM for each of the labels, using Gaussian kernels and one versus all
    classifier

In [97]:
def find_Cg(X_, y_):
    '''find the best C and gamma
    '''
    log_Cs = np.arange(-1.0, 3.0)
    Gs = np.arange(1.5, 4.5, 0.1) # gamma list
    max_score = 0
    for g in Gs:
        for C in log_Cs:
            svc = SVC(C=10**C, gamma=g)
            cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)
            scores = cross_val_score(svc, X_, y_, scoring='accuracy', cv=cv, n_jobs=-1)
            if np.mean(scores) > max_score:
                C_res = C
                g_res = g
                max_score = np.mean(scores)
    return (C_res, g_res)

In [98]:
X_ = X_train[:]
para_res = []
svc_list = []
for i in range(3):
    y_ = y_train[:, i]
    res = find_Cg(X_, y_)
    svc = SVC(C=10**res[0], gamma=res[1]).fit(X_, y_)
    svc_list.append(svc)
    para_res.append(res)

In [105]:
y_pred_train = np.array([svc_list[i].predict(X_train) for i in range(3)]).T
print(f"Exact match score of training is {exact_match(y_train, y_pred_train)*100:.2f}%")
print(f"Hamming score of training is {hamming_score(y_train, y_pred_train)*100:.2f}%")

y_pred_test = np.array([svc_list[i].predict(X_test) for i in range(3)]).T
print(f"Exact match score of test is {exact_match(y_test, y_pred_test)*100:.2f}%")
print(f"Hamming score of test is {hamming_score(y_test, y_pred_test)*100:.2f}%")

Exact match score of training is 100.00%
Hamming score of training is 100.00%
Exact match score of test is 98.66%
Hamming score of test is 99.03%


 iii. Repeat 6(b)ii with L1-penalized SVMs. Remember to normalize the attributes.

In [190]:
def find_C_L1(X_, y_):
    '''find the best C for L1-penalized svm
    '''
    log_Cs = np.arange(-1.0, 9.0)
    max_score = 0
    for C in log_Cs:
        svc_L1 = LinearSVC(penalty='l1', C=10**C, dual=False, max_iter=100)
        cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)
        scores = cross_val_score(svc_L1, X_, y_, scoring='accuracy', 
        cv=cv, n_jobs=1)
        if np.mean(scores) > max_score:
            C_res = C
            max_score = np.mean(scores)
    return C_res

In [191]:
X_ = X_train[:]
para_L1_res = []
svc_L1_list = []
for i in range(3):
    y_ = y_train[:, i]
    res = find_C_L1(X_, y_)
    svc_L1 = LinearSVC(penalty='l1', C=10**res, dual=False).fit(X_, y_)
    svc_L1_list.append(svc_L1)
    para_L1_res.append(res)

In [192]:
para_L1_res

[7.0, 2.0, 1.0]

In [195]:
print("L1 penalized SVM:")
y_pred_train = np.array([svc_L1_list[i].predict(X_train) for i in range(3)]).T
print(f"Exact match score of training is {exact_match(y_train, y_pred_train)*100:.2f}%")
print(f"Hamming score of training is {hamming_score(y_train, y_pred_train)*100:.2f}%")

y_pred_test = np.array([svc_L1_list[i].predict(X_test) for i in range(3)]).T
print(f"Exact match score of test is {exact_match(y_test, y_pred_test)*100:.2f}%")
print(f"Hamming score of test is {hamming_score(y_test, y_pred_test)*100:.2f}%")

L1 penalized SVM:
Exact match score of training is 92.36%
Hamming score of training is 95.39%
Exact match score of test is 91.20%
Hamming score of test is 94.46%


iv. Repeat 6(b)iii by using SMOTE or any other method you know to remedy
class imbalance. Report your conclusions about the classifiers you trained.

In [197]:
print("Original data : ", Counter(y_train[:, 0]))
print("Original data : ", Counter(y_train[:, 1]))
print("Original data : ", Counter(y_train[:, 2]))

Original data :  Counter({3.0: 3092, 2.0: 1509, 1.0: 387, 0.0: 48})
Original data :  Counter({0.0: 2891, 3.0: 1111, 1.0: 387, 2.0: 209, 4.0: 201, 7.0: 109, 5.0: 80, 6.0: 48})
Original data :  Counter({1.0: 2404, 5.0: 777, 0.0: 487, 2.0: 387, 4.0: 334, 3.0: 209, 6.0: 201, 9.0: 109, 7.0: 80, 8.0: 48})


In [206]:
ss = {3.0: 3092, 2.0: 1509, 1.0: 1000, 0.0: 500}
sm = SMOTE(sampling_strategy=ss, random_state=42, k_neighbors=5)
X_train_sm0, y_train_sm0 = sm.fit_resample(X_train, y_train[:, 0])

ss = {0.0: 2891, 3.0: 1111, 1.0: 800, 2.0: 800,
      4.0: 800, 7.0: 600, 5.0: 500, 6.0: 500}
sm = SMOTE(sampling_strategy=ss, random_state=42, k_neighbors=5)
X_train_sm1, y_train_sm1 = sm.fit_resample(X_train, y_train[:, 1])

ss = {1.0: 2404, 5.0: 1000, 0.0: 1000, 2.0: 900, 4.0: 900,
      3.0: 800, 6.0: 799, 9.0: 600, 7.0: 500, 8.0: 500}
sm = SMOTE(sampling_strategy=ss, random_state=42, k_neighbors=5)
X_train_sm2, y_train_sm2 = sm.fit_resample(X_train, y_train[:, 2])

X_train_sms = [X_train_sm0, X_train_sm1, X_train_sm2]
y_train_sms = [y_train_sm0, y_train_sm1, y_train_sm2]

In [207]:
para_sm_res = []
svc_sm_list = []
for i in range(3):
    X_ = X_train_sms[i]
    y_ = y_train_sms[i]
    res = find_C_L1(X_, y_)
    svc_sm = LinearSVC(penalty='l1', C=10**res, dual=False).fit(X_, y_)
    svc_sm_list.append(svc_sm)
    para_sm_res.append(res)

In [209]:
print("L1 penalized SVM with smote:")
y_pred_train = np.array([svc_sm_list[i].predict(X_train) for i in range(3)]).T
print(f"Exact match score of training is {exact_match(y_train, y_pred_train)*100:.2f}%")
print(f"Hamming score of training is {hamming_score(y_train, y_pred_train)*100:.2f}%")

y_pred_test = np.array([svc_sm_list[i].predict(X_test) for i in range(3)]).T
print(f"Exact match score of test is {exact_match(y_test, y_pred_test)*100:.2f}%")
print(f"Hamming score of test is {hamming_score(y_test, y_pred_test)*100:.2f}%")

L1 penalized SVM with smote:
Exact match score of training is 91.74%
Hamming score of training is 95.39%
Exact match score of test is 91.20%
Hamming score of test is 94.72%
