## Training SVM classifier on ADD dataset

The first step is to read the dataset from a CSV file and drop any rows that contain missing values.

In [1]:
import pandas as pd
from sklearn import preprocessing


# Load the ADD dataset and discard every row that contains a missing value.
# NOTE(review): the path points into a user's Desktop folder, so the notebook
# only runs where that file exists -- consider a configurable data directory.
add_dataset = pd.read_csv('~/Desktop/d/dataset_add.csv').dropna()

In [2]:
# Preview the first five rows of the cleaned frame.
add_dataset.head(n=5)

Unnamed: 0,F1,F10,F11,F12,F13,F14,F15,F16,F17,F18,...,F61,F62,F63,F64,F7,F8,F9,drug_id,label,target_id
0,-2.776226,0.34892,-1.165489,-0.289754,0.348107,-1.40724,0.397983,-0.145894,-0.077631,-0.168685,...,1.380475,1.097712,-1.445931,-0.634376,0.032874,-0.668759,-2.458569,0.0,1,8229.0
1,-1.967628,-0.146283,2.013751,0.200184,0.552616,-1.906731,-1.759623,-1.751837,-0.693581,-0.345042,...,0.230374,0.832758,-1.382336,1.292443,-0.23941,-0.049024,0.15397,1.0,1,8215.0
2,-2.543488,-0.14897,1.039684,-0.372909,0.630169,-1.921702,-0.792904,-0.949426,-0.829654,-0.682305,...,0.224066,0.193441,-0.105573,1.119177,-0.374138,-0.228932,0.258706,1.0,1,8048.0
3,-1.372312,1.482211,1.192444,0.350745,0.69756,-1.689651,-0.45059,0.200719,-0.126603,-0.215285,...,0.833958,2.028638,-0.384316,1.187856,-0.585721,-0.466567,-1.274458,1.0,1,8230.0
4,-1.369142,1.435262,0.260705,-0.29259,1.278427,-1.234602,-1.428129,-0.693645,-0.680362,-0.347972,...,0.633864,0.40682,-0.721664,1.593717,0.181801,-1.474598,-0.099709,1.0,1,8342.0


Create a training set from the pandas DataFrame

In [3]:
# Min-max normalize the first 64 columns of the frame (taken as a NumPy array),
# using MinMaxScaler with its default settings. The fitted scaler stays
# available as `min_max_scaler`.
min_max_scaler = preprocessing.MinMaxScaler()
train_scaled = min_max_scaler.fit_transform(add_dataset.iloc[:, :64].values)

# `train_data` is the name the later cells use for the normalized matrix.
train_data = train_scaled

# The 2-D shape tuple is (samples, features), matching the two %d placeholders.
print("Number of samples: %d, Number of features: %d" % train_data.shape)

Number of samples: 44254, Number of features: 64


In [4]:
# Show normalized data: wrap the scaled array in a DataFrame so that its
# first five rows render as a table.
train = pd.DataFrame(data=train_data)

train.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,54,55,56,57,58,59,60,61,62,63
0,0.332839,0.414468,0.244435,0.564447,0.480736,0.379224,0.62515,0.587034,0.559226,0.439795,...,0.407095,0.637354,0.514412,0.656972,0.514045,0.268728,0.343273,0.611083,0.485445,0.270499
1,0.436057,0.350062,0.647862,0.626001,0.504606,0.324058,0.382762,0.368773,0.48393,0.417487,...,0.5476,0.639347,0.203183,0.513849,0.481104,0.276226,0.566027,0.584239,0.554136,0.578609
2,0.362548,0.349713,0.524259,0.554,0.513658,0.322404,0.491364,0.477827,0.467296,0.374824,...,0.495149,0.606581,0.414757,0.513064,0.401618,0.426774,0.545996,0.570956,0.534195,0.590961
3,0.51205,0.561864,0.543643,0.644917,0.521524,0.348033,0.52982,0.634142,0.553239,0.4339,...,0.569271,0.57013,0.414446,0.588961,0.629786,0.393906,0.553936,0.550096,0.507856,0.410147
4,0.512455,0.555757,0.425411,0.564091,0.589324,0.398291,0.420002,0.51259,0.485546,0.417116,...,0.45553,0.545761,0.362075,0.56406,0.428147,0.354128,0.600857,0.625766,0.396126,0.548691


## SVM classifier

In [5]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

# Hyperparameters
# Kernel passed to the classifier below; the commented-out grid-search setup
# also checks it to decide whether a gamma range is needed.
kernel = 'rbf'

# An instance of SVM classifier
# Only the kernel is set explicitly; every other SVC argument keeps its default.
svm_cl = SVC(kernel=kernel)

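Define the range of parameters for the grid search (the code in the next cell is currently commented out, so no grid search is actually configured)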

In [6]:
# NOTE: every statement in this cell is commented out, so running it does
# nothing and `result` is never created. The classifier is fitted directly
# (without a grid search) in the following cell.
# If re-enabled as written, the search would cover 28 values of C and, for the
# RBF kernel, 28 values of gamma (powers of two from 2**-14 to 2**13), i.e.
# 784 combinations with 2-fold cross-validation -- presumably expensive on
# this dataset; TODO confirm before turning it back on.

# Penalty parameter
#c_range = {'C': [float(2**i) for i in range(-14, 14)]}
# Gamma parameter for RBF kernel
#gamma_range = {'gamma': [float(2**i) for i in range(-14, 14)]} if kernel == 'rbf' else {}

#param_range = {**c_range, **gamma_range}

# Arguments for grid search
#cv_fold = 2
#n_workers = -1 # Number of CPU threads

#result = GridSearchCV(svm_cl, param_range, cv=cv_fold, n_jobs=n_workers, refit=True,
 #                     verbose=1)

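Fit the SVM classifier. Note that the grid search above is commented out, so the model is trained once with its default hyperparameters rather than tuned.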

In [7]:
# Fit the classifier on the normalized features against the `label` column.
# `fit` returns the estimator itself, so the cell displays its repr.
svm_cl.fit(X=train_data, y=add_dataset['label'])

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [12]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
# Predict on the same matrix the classifier was fitted on, so the figure
# printed below is accuracy on the training data, not on held-out samples.
y_pred = svm_cl.predict(train_data)

# Express the fraction of correct predictions as a percentage.
print("Accuracy: ", 100 * accuracy_score(y_true=add_dataset['label'], y_pred=y_pred))


Accuracy:  63.544538346816104


In [13]:
# Per-class precision, recall and F1 for the predictions made on the training data.
print(classification_report(y_true=add_dataset['label'], y_pred=y_pred))

             precision    recall  f1-score   support

         -1       0.65      0.58      0.61     22106
          1       0.62      0.69      0.66     22148

avg / total       0.64      0.64      0.63     44254

