In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import confusion_matrix, accuracy_score, zero_one_loss

%config InlineBackend.figure_format='svg'
%matplotlib inline

In [3]:
covertype_path = './covtype/covtype.data'

names = [
    'Elevation',
    'Aspect',
    'Slope',
    'Horizontal_Distance_To_Hydrology',
    'Vertical_Distance_To_Hydrology',
    'Horizontal_Distance_To_Roadways',
    'Hillshade_9am',
    'Hillshade_Noon',
    'Hillshade_3pm',
    'Horizontal_Distance_To_Fire_Points',
]

Wilderness_Area = []
for i in range(4):
    Wilderness_Area.append('Wilderness_Area_{}'.format(i+1))

Soil_Type = []
for i in range(40):
    Soil_Type.append('Soil_Type_{}'.format(i+1))

names.extend(Wilderness_Area + Soil_Type + ['Cover_Type'])
covertype_df = pd.read_csv(covertype_path, names=names)

In [3]:
X = covertype_df.drop('Cover_Type',axis=1)
y = covertype_df['Cover_Type']

X_train, X_test, y_train, y_test = train_test_split(X.as_matrix(), y.as_matrix(), test_size=0.25, random_state=42)

In [4]:
subset_ratio=0.5#0.75,1

In [5]:
num_train=len(X_train)
print num_train
num_test=len(X_test)
print num_test

435759
145253


In [6]:

# Xsample_train=X_train[0:100000]
# Xsampe_test=X_test
# ysample_train=y_train[0:100000]
# ysample_test=y_test

Xsample_train=X_train
Xsampe_test=X_test
ysample_train=y_train
ysample_test=y_test

In [7]:
ysample_test

array([1, 2, 2, ..., 3, 1, 6])

In [8]:
def compute_distances_no_loops(X_train,X_test):
    """
    Compute the distance between each test point in X and each training point
    in self.X_train using no explicit loops.
    Input / Output: Same as compute_distances_two_loops
    """
    num_train=len(X_train)
    num_test=len(X_test)
    dists = np.zeros((num_test, num_train)) 
    
    # split (p-q)^2 to p^2 + q^2 - 2pq
    dists = np.sqrt((X_test**2).sum(axis=1, keepdims=True) + (X_train**2).sum(axis=1) - 2 * X_test.dot(X_train.T))
    
    #########################################################################
    #                         END OF YOUR CODE                              #
    #########################################################################
    return dists


In [9]:
 def predict_labels(dists, k,y_train):
    """
    Given a matrix of distances between test points and training points,
    predict a label for each test point.
    Inputs:
    - dists: A numpy array of shape (num_test, num_train) where dists[i, j]
      gives the distance betwen the ith test point and the jth training point.
    Returns:
    - y: A numpy array of shape (num_test,) containing predicted labels for the
      test data, where y[i] is the predicted label for the test point X[i].  
    """
    num_test = dists.shape[0]
    y_pred = np.zeros(num_test)
    for i in xrange(num_test):
      
      closest_y = []
      
      closest_y = y_train[np.argsort(dists[i])][:k]
      
      y_pred[i] = np.argmax(np.bincount(closest_y))
     

    return y_pred

In [10]:
#dists=compute_distances_no_loops(Xsample_train,Xsampe_test)

In [11]:
#y_test_pred=predict_labels(dists,5,ysample_train)

# KNN without using sklearn
num_folds = 5
k_choices = [1, 3, 5, 8, 10, 12, 15, 20, 50, 100]

X_train_folds = []
y_train_folds = []
X_train_folds = np.array(np.array_split(Xsample_train, num_folds))  #X_train
y_train_folds = np.array(np.array_split(ysample_train, num_folds))  #y_train
k_to_accuracies = {}
num_test_sample=len(Xsample_train)/5
for k in k_choices:
    for n in range(0,num_folds):
        temp= [x for x in range(0,num_folds) if x != n]
        X_train_dat=np.concatenate(X_train_folds[temp])
        y_train_dat=np.concatenate(y_train_folds[temp])
        dists=compute_distances_no_loops(X_train_dat,X_train_folds[n])
        y_validation_pred = predict_labels(dists,k,y_train_dat)
        # Compute and print the fraction of correctly predicted examples
        num_correct = np.sum(y_validation_pred == y_train_folds[n])
        accuracy = float(num_correct) / num_test_sample
        k_to_accuracies.setdefault(k, []).append(accuracy)
for k in sorted(k_to_accuracies):
    for accuracy in k_to_accuracies[k]:
        print 'k = %d, accuracy = %f' % (k, accuracy)
    print 'mean of k=%d,accuracy=%f' %(k,np.mean(k_to_accuracies[k]))

In [21]:
from sklearn.neighbors import KNeighborsClassifier

In [44]:
#### full data, 5 folds
num_folds = 5
k_choices = [1, 3, 5, 8, 10, 12, 15, 20, 50, 100]

X_train_folds = []
y_train_folds = []
X_train_folds = np.array(np.array_split(Xsample_train, num_folds))  #X_train
y_train_folds = np.array(np.array_split(ysample_train, num_folds))  #y_train
k_to_accuracies = {}
num_test_sample=len(Xsample_train)/5
for k in k_choices:
    for n in range(0,num_folds):
        temp= [x for x in range(0,num_folds) if x != n]
        X_train_dat=np.concatenate(X_train_folds[temp])
        y_train_dat=np.concatenate(y_train_folds[temp])
        neigh = KNeighborsClassifier(n_neighbors=k,metric='euclidean')
        neigh.fit(X_train_dat, y_train_dat) 
        y_validation_pred = neigh.predict(X_train_folds[n])
        # Compute and print the fraction of correctly predicted examples
        num_correct = np.sum(y_validation_pred == y_train_folds[n])
        accuracy = float(num_correct) / num_test_sample
        k_to_accuracies.setdefault(k, []).append(accuracy)
for k in sorted(k_to_accuracies):
    for accuracy in k_to_accuracies[k]:
        print 'k = %d, accuracy = %f' % (k, accuracy)
    print 'mean of k=%d,accuracy=%f' %(k,np.mean(k_to_accuracies[k]))

k = 1, accuracy = 0.963144
k = 1, accuracy = 0.962295
k = 1, accuracy = 0.963041
k = 1, accuracy = 0.962490
k = 1, accuracy = 0.962800
mean of k=1,accuracy=0.962754
k = 3, accuracy = 0.964957
k = 3, accuracy = 0.965026
k = 3, accuracy = 0.964269
k = 3, accuracy = 0.964315
k = 3, accuracy = 0.965221
mean of k=3,accuracy=0.964758
k = 5, accuracy = 0.962307
k = 5, accuracy = 0.962422
k = 5, accuracy = 0.963248
k = 5, accuracy = 0.962031
k = 5, accuracy = 0.963018
mean of k=5,accuracy=0.962605
k = 8, accuracy = 0.954573
k = 8, accuracy = 0.952967
k = 8, accuracy = 0.955044
k = 8, accuracy = 0.954791
k = 8, accuracy = 0.955950
mean of k=8,accuracy=0.954665
k = 10, accuracy = 0.951429
k = 10, accuracy = 0.948997
k = 10, accuracy = 0.951693
k = 10, accuracy = 0.950672
k = 10, accuracy = 0.952370
mean of k=10,accuracy=0.951032
k = 12, accuracy = 0.948434
k = 12, accuracy = 0.945726
k = 12, accuracy = 0.947597
k = 12, accuracy = 0.947115
k = 12, accuracy = 0.948549
mean of k=12,accuracy=0.94748

In [45]:
#### full data, 2 folds
num_folds = 2
k_choices = [1, 3, 5, 8, 10, 12, 15, 20, 50, 100]

X_train_folds = []
y_train_folds = []
X_train_folds = np.array(np.array_split(Xsample_train, num_folds))  #X_train
y_train_folds = np.array(np.array_split(ysample_train, num_folds))  #y_train
k_to_accuracies = {}
num_test_sample=len(Xsample_train)/2
for k in k_choices:
    for n in range(0,num_folds):
        temp= [x for x in range(0,num_folds) if x != n]
        X_train_dat=np.concatenate(X_train_folds[temp])
        y_train_dat=np.concatenate(y_train_folds[temp])
        neigh = KNeighborsClassifier(n_neighbors=k,metric='euclidean')
        neigh.fit(X_train_dat, y_train_dat) 
        y_validation_pred = neigh.predict(X_train_folds[n])
        # Compute and print the fraction of correctly predicted examples
        num_correct = np.sum(y_validation_pred == y_train_folds[n])
        accuracy = float(num_correct) / num_test_sample
        k_to_accuracies.setdefault(k, []).append(accuracy)
for k in sorted(k_to_accuracies):
    for accuracy in k_to_accuracies[k]:
        print 'k = %d, accuracy = %f' % (k, accuracy)
    print 'mean of k=%d,accuracy=%f' %(k,np.mean(k_to_accuracies[k]))

k = 1, accuracy = 0.956233
k = 1, accuracy = 0.956260
mean of k=1,accuracy=0.956246
k = 3, accuracy = 0.952988
k = 3, accuracy = 0.953561
mean of k=3,accuracy=0.953275
k = 5, accuracy = 0.947352
k = 5, accuracy = 0.948315
mean of k=5,accuracy=0.947833
k = 8, accuracy = 0.935689
k = 8, accuracy = 0.937667
mean of k=8,accuracy=0.936678
k = 10, accuracy = 0.930971
k = 10, accuracy = 0.932765
mean of k=10,accuracy=0.931868
k = 12, accuracy = 0.926519
k = 12, accuracy = 0.927327
mean of k=12,accuracy=0.926923
k = 15, accuracy = 0.921075
k = 15, accuracy = 0.920800
mean of k=15,accuracy=0.920938
k = 20, accuracy = 0.907761
k = 20, accuracy = 0.909037
mean of k=20,accuracy=0.908399
k = 50, accuracy = 0.857572
k = 50, accuracy = 0.860932
mean of k=50,accuracy=0.859252
k = 100, accuracy = 0.813231
k = 100, accuracy = 0.815976
mean of k=100,accuracy=0.814604


In [None]:
###leave one out
Xsample_train=X_train[0:50000]
ysample_train=y_train[0:50000]
num_folds = 50000
k_choices = [1, 3, 5, 8, 10, 12, 15, 20, 50, 100]

X_train_folds = []
y_train_folds = []
X_train_folds = np.array(np.array_split(Xsample_train, num_folds))  #X_train
y_train_folds = np.array(np.array_split(ysample_train, num_folds))  #y_train
k_to_accuracies = {}
num_test_sample=len(Xsample_train)/50000
for k in k_choices:
    for n in range(0,num_folds):
        temp= [x for x in range(0,num_folds) if x != n]
        X_train_dat=np.concatenate(X_train_folds[temp])
        y_train_dat=np.concatenate(y_train_folds[temp])
        neigh = KNeighborsClassifier(n_neighbors=k)
        neigh.fit(X_train_dat, y_train_dat) 
        y_validation_pred = neigh.predict(X_train_folds[n])
        # Compute and print the fraction of correctly predicted examples
        num_correct = np.sum(y_validation_pred == y_train_folds[n])
        accuracy = float(num_correct) / num_test_sample
        k_to_accuracies.setdefault(k, []).append(accuracy)
for k in sorted(k_to_accuracies):
    #for accuracy in k_to_accuracies[k]:
        #print 'k = %d, accuracy = %f' % (k, accuracy)
    print 'mean of k=%d,accuracy=%f' %(k,np.mean(k_to_accuracies[k]))



In [48]:
Xsample_train=X_train[0:435759/2]
Xsampe_test=X_test
ysample_train=y_train[0:435759/2]
ysample_test=y_test

In [49]:
#### 50 percent data, 5 folds
num_folds = 5
k_choices = [1, 3, 5, 8, 10, 12, 15, 20, 50, 100]

X_train_folds = []
y_train_folds = []
X_train_folds = np.array(np.array_split(Xsample_train, num_folds))  #X_train
y_train_folds = np.array(np.array_split(ysample_train, num_folds))  #y_train
k_to_accuracies = {}
num_test_sample=len(Xsample_train)/5
for k in k_choices:
    for n in range(0,num_folds):
        temp= [x for x in range(0,num_folds) if x != n]
        X_train_dat=np.concatenate(X_train_folds[temp])
        y_train_dat=np.concatenate(y_train_folds[temp])
        neigh = KNeighborsClassifier(n_neighbors=k,metric='euclidean')
        neigh.fit(X_train_dat, y_train_dat) 
        y_validation_pred = neigh.predict(X_train_folds[n])
        # Compute and print the fraction of correctly predicted examples
        num_correct = np.sum(y_validation_pred == y_train_folds[n])
        accuracy = float(num_correct) / num_test_sample
        k_to_accuracies.setdefault(k, []).append(accuracy)
for k in sorted(k_to_accuracies):
    for accuracy in k_to_accuracies[k]:
        print 'k = %d, accuracy = %f' % (k, accuracy)
    print 'mean of k=%d,accuracy=%f' %(k,np.mean(k_to_accuracies[k]))

k = 1, accuracy = 0.949765
k = 1, accuracy = 0.952565
k = 1, accuracy = 0.951578
k = 1, accuracy = 0.950866
k = 1, accuracy = 0.952450
mean of k=1,accuracy=0.951445
k = 3, accuracy = 0.945978
k = 3, accuracy = 0.946598
k = 3, accuracy = 0.946437
k = 3, accuracy = 0.945726
k = 3, accuracy = 0.946047
mean of k=3,accuracy=0.946157
k = 5, accuracy = 0.937877
k = 5, accuracy = 0.939598
k = 5, accuracy = 0.938887
k = 5, accuracy = 0.938199
k = 5, accuracy = 0.939782
mean of k=5,accuracy=0.938869
k = 8, accuracy = 0.926013
k = 8, accuracy = 0.927963
k = 8, accuracy = 0.927206
k = 8, accuracy = 0.926954
k = 8, accuracy = 0.927068
mean of k=8,accuracy=0.927041
k = 10, accuracy = 0.920987
k = 10, accuracy = 0.922249
k = 10, accuracy = 0.921767
k = 10, accuracy = 0.920987
k = 10, accuracy = 0.921790
mean of k=10,accuracy=0.921556
k = 12, accuracy = 0.915548
k = 12, accuracy = 0.916489
k = 12, accuracy = 0.915387
k = 12, accuracy = 0.915273
k = 12, accuracy = 0.916076
mean of k=12,accuracy=0.91575

In [15]:
#### 50 percent data, 2 folds
num_folds = 2
k_choices = [1, 3, 5, 8, 10, 12, 15, 20, 50, 100]

X_train_folds = []
y_train_folds = []
X_train_folds = np.array(np.array_split(Xsample_train, num_folds))  #X_train
y_train_folds = np.array(np.array_split(ysample_train, num_folds))  #y_train
k_to_accuracies = {}
num_test_sample=len(Xsample_train)/2
for k in k_choices:
    for n in range(0,num_folds):
        temp= [x for x in range(0,num_folds) if x != n]
        X_train_dat=np.concatenate(X_train_folds[temp])
        y_train_dat=np.concatenate(y_train_folds[temp])
        neigh = KNeighborsClassifier(n_neighbors=k)
        neigh.fit(X_train_dat, y_train_dat) 
        y_validation_pred = neigh.predict(X_train_folds[n])
        # Compute and print the fraction of correctly predicted examples
        num_correct = np.sum(y_validation_pred == y_train_folds[n])
        accuracy = float(num_correct) / num_test_sample
        k_to_accuracies.setdefault(k, []).append(accuracy)
for k in sorted(k_to_accuracies):
    for accuracy in k_to_accuracies[k]:
        print 'k = %d, accuracy = %f' % (k, accuracy)
    print 'mean of k=%d,accuracy=%f' %(k,np.mean(k_to_accuracies[k]))

k = 1, accuracy = 0.937736
k = 1, accuracy = 0.938929
mean of k=1,accuracy=0.938332
k = 3, accuracy = 0.927244
k = 3, accuracy = 0.926968
mean of k=3,accuracy=0.927106
k = 5, accuracy = 0.918147
k = 5, accuracy = 0.918018
mean of k=5,accuracy=0.918083
k = 8, accuracy = 0.902129
k = 8, accuracy = 0.902624
mean of k=8,accuracy=0.902377
k = 10, accuracy = 0.894748
k = 10, accuracy = 0.895024
mean of k=10,accuracy=0.894886
k = 12, accuracy = 0.887598
k = 12, accuracy = 0.888598
mean of k=12,accuracy=0.888098
k = 15, accuracy = 0.878455
k = 15, accuracy = 0.879593
mean of k=15,accuracy=0.879024
k = 20, accuracy = 0.862015
k = 20, accuracy = 0.863988
mean of k=20,accuracy=0.863001
k = 50, accuracy = 0.807397
k = 50, accuracy = 0.811812
mean of k=50,accuracy=0.809604
k = 100, accuracy = 0.771276
k = 100, accuracy = 0.774938
mean of k=100,accuracy=0.773107


In [22]:
Xsample_train=X_train[0:435759*3/4]
Xsampe_test=X_test
ysample_train=y_train[0:435759*3/4]
ysample_test=y_test

In [23]:
#### 75 percent data, 5 folds
num_folds = 5
k_choices = [1, 3, 5, 8, 10, 12, 15, 20, 50, 100]

X_train_folds = []
y_train_folds = []
X_train_folds = np.array(np.array_split(Xsample_train, num_folds))  #X_train
y_train_folds = np.array(np.array_split(ysample_train, num_folds))  #y_train
k_to_accuracies = {}
num_test_sample=len(Xsample_train)/5
for k in k_choices:
    for n in range(0,num_folds):
        temp= [x for x in range(0,num_folds) if x != n]
        X_train_dat=np.concatenate(X_train_folds[temp])
        y_train_dat=np.concatenate(y_train_folds[temp])
        neigh = KNeighborsClassifier(n_neighbors=k)
        neigh.fit(X_train_dat, y_train_dat) 
        y_validation_pred = neigh.predict(X_train_folds[n])
        # Compute and print the fraction of correctly predicted examples
        num_correct = np.sum(y_validation_pred == y_train_folds[n])
        accuracy = float(num_correct) / num_test_sample
        k_to_accuracies.setdefault(k, []).append(accuracy)
for k in sorted(k_to_accuracies):
    for accuracy in k_to_accuracies[k]:
        print 'k = %d, accuracy = %f' % (k, accuracy)
    print 'mean of k=%d,accuracy=%f' %(k,np.mean(k_to_accuracies[k]))

k = 1, accuracy = 0.959059
k = 1, accuracy = 0.959167
k = 1, accuracy = 0.959580
k = 1, accuracy = 0.960559
k = 1, accuracy = 0.958952
mean of k=1,accuracy=0.959463
k = 3, accuracy = 0.959442
k = 3, accuracy = 0.958937
k = 3, accuracy = 0.960314
k = 3, accuracy = 0.958325
k = 3, accuracy = 0.958968
mean of k=3,accuracy=0.959197
k = 5, accuracy = 0.953705
k = 5, accuracy = 0.953322
k = 5, accuracy = 0.954669
k = 5, accuracy = 0.953980
k = 5, accuracy = 0.953582
mean of k=5,accuracy=0.953852
k = 8, accuracy = 0.944693
k = 8, accuracy = 0.942965
k = 8, accuracy = 0.944036
k = 8, accuracy = 0.945275
k = 8, accuracy = 0.943959
mean of k=8,accuracy=0.944186
k = 10, accuracy = 0.940257
k = 10, accuracy = 0.939691
k = 10, accuracy = 0.939308
k = 10, accuracy = 0.939813
k = 10, accuracy = 0.939920
mean of k=10,accuracy=0.939798
k = 12, accuracy = 0.935805
k = 12, accuracy = 0.935116
k = 12, accuracy = 0.935269
k = 12, accuracy = 0.935300
k = 12, accuracy = 0.936554
mean of k=12,accuracy=0.93560

In [17]:
#### 75 percent data, 2 folds
num_folds = 2
k_choices = [1, 3, 5, 8, 10, 12, 15, 20, 50, 100]
X_train_folds = []
y_train_folds = []
X_train_folds = np.array(np.array_split(Xsample_train, num_folds))  #X_train
y_train_folds = np.array(np.array_split(ysample_train, num_folds))  #y_train
k_to_accuracies = {}
num_test_sample=len(Xsample_train)/2
for k in k_choices:
    for n in range(0,num_folds):
        temp= [x for x in range(0,num_folds) if x != n]
        X_train_dat=np.concatenate(X_train_folds[temp])
        y_train_dat=np.concatenate(y_train_folds[temp])
        neigh = KNeighborsClassifier(n_neighbors=k)
        neigh.fit(X_train_dat, y_train_dat) 
        y_validation_pred = neigh.predict(X_train_folds[n])
        # Compute and print the fraction of correctly predicted examples
        num_correct = np.sum(y_validation_pred == y_train_folds[n])
        accuracy = float(num_correct) / num_test_sample
        k_to_accuracies.setdefault(k, []).append(accuracy)
for k in sorted(k_to_accuracies):
    for accuracy in k_to_accuracies[k]:
        print 'k = %d, accuracy = %f' % (k, accuracy)
    print 'mean of k=%d,accuracy=%f' %(k,np.mean(k_to_accuracies[k]))

k = 1, accuracy = 0.950180
k = 1, accuracy = 0.950823
mean of k=1,accuracy=0.950502
k = 3, accuracy = 0.943755
k = 3, accuracy = 0.944330
mean of k=3,accuracy=0.944042
k = 5, accuracy = 0.936319
k = 5, accuracy = 0.937066
mean of k=5,accuracy=0.936693
k = 8, accuracy = 0.923964
k = 8, accuracy = 0.924967
mean of k=8,accuracy=0.924466
k = 10, accuracy = 0.917838
k = 10, accuracy = 0.918083
mean of k=10,accuracy=0.917960
k = 12, accuracy = 0.912018
k = 12, accuracy = 0.912679
mean of k=12,accuracy=0.912349
k = 15, accuracy = 0.904675
k = 15, accuracy = 0.905256
mean of k=15,accuracy=0.904965
k = 20, accuracy = 0.889933
k = 20, accuracy = 0.892442
mean of k=20,accuracy=0.891187
k = 50, accuracy = 0.836386
k = 50, accuracy = 0.839807
mean of k=50,accuracy=0.838096
k = 100, accuracy = 0.794142
k = 100, accuracy = 0.798126
mean of k=100,accuracy=0.796134


In [25]:
##find out the best k = 1
neigh = KNeighborsClassifier(n_neighbors=1)
neigh.fit(X_train, y_train) 
y_pred = neigh.predict(X_test)
num_correct = np.sum(y_pred == y_test)
accuracy = float(num_correct) / (len(X_test))

In [26]:
print accuracy

0.964517083984


In [8]:
############SVM
from sklearn import svm
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import ExtraTreesClassifier
## sample
Xsample_train=X_train[0:1000]
Xsampe_test=X_test[0:1000]
ysample_train=y_train[0:1000]
ysample_test=y_test[0:1000]



In [9]:
from sklearn import svm
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import ExtraTreesClassifier
#feature selection
clf = ExtraTreesClassifier()
clf = clf.fit(Xsample_train, ysample_train)
clf.feature_importances_  
model = SelectFromModel(clf, prefit=True)
Xsample_train_new = model.transform(Xsample_train)
X_test_new=model.transform(X_test)

In [10]:
X_test_new.shape

(145253, 11)

In [13]:
###using linear 5 fold
num_folds = 5
X_train_folds = []
y_train_folds = []
X_train_folds = np.array(np.array_split(Xsample_train_new, num_folds))  #X_train
y_train_folds = np.array(np.array_split(ysample_train, num_folds))  #y_train`
num_test_sample=len(Xsample_train)/5
linear=[]
## linear
for n in range(0,num_folds):
    temp= [x for x in range(0,num_folds) if x != n]
    X_train_dat=np.concatenate(X_train_folds[temp])
    y_train_dat=np.concatenate(y_train_folds[temp])
    C = 1.0  # SVM regularization parameter
    lin_clf = svm.SVC(kernel='linear', C=C).fit(X_train_dat, y_train_dat)### add regularization factor = 1
    y_validation_pred=lin_clf.predict(X_train_folds[n])   
    # Compute and print the fraction of correctly predicted examples
    num_correct = np.sum(y_validation_pred == y_train_folds[n])
    accuracy = float(num_correct) / num_test_sample
    linear.append(accuracy)
    print "accuracy =",accuracy
print "mean accuracy for linear SVM is :", np.mean(linear)

accuracy = 0.66
accuracy = 0.695
accuracy = 0.655
accuracy = 0.63
accuracy = 0.645
mean accuracy for linear SVM is : 0.657


In [11]:
###using poly 2 fold
num_folds = 2
X_train_folds = []
y_train_folds = []
X_train_folds = np.array(np.array_split(Xsample_train_new, num_folds))  #X_train
y_train_folds = np.array(np.array_split(ysample_train, num_folds))  #y_train
num_test_sample=len(Xsample_train)/2
linear=[]
## linear
for n in range(0,num_folds):
    temp= [x for x in range(0,num_folds) if x != n]
    X_train_dat=np.concatenate(X_train_folds[temp])
    y_train_dat=np.concatenate(y_train_folds[temp])
    C = 1.0  # SVM regularization parameter
    lin_clf = svm.SVC(kernel='linear', C=C).fit(X_train_dat, y_train_dat)### add regularization factor = 1
    y_validation_pred=lin_clf.predict(X_train_folds[n])   
    # Compute and print the fraction of correctly predicted examples
    num_correct = np.sum(y_validation_pred == y_train_folds[n])
    accuracy = float(num_correct) / num_test_sample
    linear.append(accuracy)
    print "accuracy =",accuracy
print "mean accuracy for linear SVM is :", np.mean(linear)

accuracy = 0.664
accuracy = 0.642
mean accuracy for linear SVM is : 0.653


In [None]:
###using poly 5 fold
num_folds = 5
X_train_folds = []
y_train_folds = []
X_train_folds = np.array(np.array_split(Xsample_train_new, num_folds))  #X_train
y_train_folds = np.array(np.array_split(ysample_train, num_folds))  #y_train
num_test_sample=len(Xsample_train)/5
poly=[]
## linear
for n in range(0,num_folds):
    temp= [x for x in range(0,num_folds) if x != n]
    X_train_dat=np.concatenate(X_train_folds[temp])
    y_train_dat=np.concatenate(y_train_folds[temp])
    C = 0.01  # SVM regularization parameter
    poly_clf = svm.SVC(kernel='poly', C=C).fit(X_train_dat, y_train_dat)### add regularization factor = 1
    y_validation_pred=poly_clf.predict(X_train_folds[n])   
    # Compute and print the fraction of correctly predicted examples
    num_correct = np.sum(y_validation_pred == y_train_folds[n])
    accuracy = float(num_correct) / num_test_sample
    poly.append(accuracy)
    print "accuracy =",accuracy
print "mean accuracy for poly SVM is :", np.mean(poly)

In [9]:
###using poly 2 fold
num_folds = 2
X_train_folds = []
y_train_folds = []
X_train_folds = np.array(np.array_split(Xsample_train_new, num_folds))  #X_train
y_train_folds = np.array(np.array_split(ysample_train, num_folds))  #y_train
num_test_sample=len(Xsample_train)/2
poly=[]
## linear
for n in range(0,num_folds):
    temp= [x for x in range(0,num_folds) if x != n]
    X_train_dat=np.concatenate(X_train_folds[temp])
    y_train_dat=np.concatenate(y_train_folds[temp])
    C = 0.01  # SVM regularization parameter
    poly_clf = svm.SVC(kernel='poly',degree=3, C=C).fit(X_train_dat, y_train_dat)### add regularization factor = 1
    y_validation_pred=poly_clf.predict(X_train_folds[n])   
    # Compute and print the fraction of correctly predicted examples
    num_correct = np.sum(y_validation_pred == y_train_folds[n])
    accuracy = float(num_correct) / num_test_sample
    poly.append(accuracy)
    print "accuracy =",accuracy
print "mean accuracy for poly SVM is :", np.mean(poly)

accuracy = 0.52
accuracy = 0.64
mean accuracy for poly SVM is : 0.58


In [19]:
###using rbf 5 fold
num_folds = 5
X_train_folds = []
y_train_folds = []
X_train_folds = np.array(np.array_split(Xsample_train_new, num_folds))  #X_train
y_train_folds = np.array(np.array_split(ysample_train, num_folds))  #y_train
num_test_sample=len(Xsample_train)/5
rbf=[]
## linear
for n in range(0,num_folds):
    temp= [x for x in range(0,num_folds) if x != n]
    X_train_dat=np.concatenate(X_train_folds[temp])
    y_train_dat=np.concatenate(y_train_folds[temp])
    C = 0.01  # SVM regularization parameter
    rbf_clf = svm.SVC(kernel='rbf',gamma=0.7, C=C).fit(X_train_dat, y_train_dat)### add regularization factor = 1
    y_validation_pred=rbf_clf.predict(X_train_folds[n])   
    # Compute and print the fraction of correctly predicted examples
    num_correct = np.sum(y_validation_pred == y_train_folds[n])
    accuracy = float(num_correct) / num_test_sample
    rbf.append(accuracy)
    print "accuracy =",accuracy
print "mean accuracy for rbf SVM is :", np.mean(rbf)

accuracy = 0.495
accuracy = 0.501
accuracy = 0.483
accuracy = 0.48
accuracy = 0.461
mean accuracy for rbf SVM is : 0.484


In [20]:
###using rbf 2 fold
num_folds = 2
X_train_folds = []
y_train_folds = []
X_train_folds = np.array(np.array_split(Xsample_train_new, num_folds))  #X_train
y_train_folds = np.array(np.array_split(ysample_train, num_folds))  #y_train
num_test_sample=len(Xsample_train)/2
rbf=[]
## linear
for n in range(0,num_folds):
    temp= [x for x in range(0,num_folds) if x != n]
    X_train_dat=np.concatenate(X_train_folds[temp])
    y_train_dat=np.concatenate(y_train_folds[temp])
    C = 0.01  # SVM regularization parameter
    rbf_clf = svm.SVC(kernel='rbf',gamma=0.7, C=C).fit(X_train_dat, y_train_dat)### add regularization factor = 1
    y_validation_pred=rbf_clf.predict(X_train_folds[n])   
    # Compute and print the fraction of correctly predicted examples
    num_correct = np.sum(y_validation_pred == y_train_folds[n])
    accuracy = float(num_correct) / num_test_sample
    rbf.append(accuracy)
    print "accuracy =",accuracy
print "mean accuracy for rbf SVM is :", np.mean(rbf)

accuracy = 0.4948
accuracy = 0.4732
mean accuracy for rbf SVM is : 0.484


In [38]:
###using sigmoid 5 fold
num_folds = 5
X_train_folds = []
y_train_folds = []
X_train_folds = np.array(np.array_split(Xsample_train_new, num_folds))  #X_train
y_train_folds = np.array(np.array_split(ysample_train, num_folds))  #y_train
num_test_sample=len(Xsample_train)/5
sig=[]
## sigmoid
for n in range(0,num_folds):
    temp= [x for x in range(0,num_folds) if x != n]
    X_train_dat=np.concatenate(X_train_folds[temp])
    y_train_dat=np.concatenate(y_train_folds[temp])
    C = 10  # SVM regularization parameter
    sig_clf = svm.SVC(kernel='sigmoid', C=C).fit(X_train_dat, y_train_dat)### add regularization factor = 1
    y_validation_pred=rbf_clf.predict(X_train_folds[n])   
    # Compute and print the fraction of correctly predicted examples
    num_correct = np.sum(y_validation_pred == y_train_folds[n])
    accuracy = float(num_correct) / num_test_sample
    sig.append(accuracy)
    print "accuracy =",accuracy
print "mean accuracy for sigmoid SVM is :", np.mean(sig)

accuracy = 0.495
accuracy = 0.501
accuracy = 0.483
accuracy = 0.48
accuracy = 0.461
mean accuracy for sigmoid SVM is : 0.484


In [37]:
###using sigmoid 2 fold
num_folds = 2
X_train_folds = []
y_train_folds = []
X_train_folds = np.array(np.array_split(Xsample_train_new, num_folds))  #X_train
y_train_folds = np.array(np.array_split(ysample_train, num_folds))  #y_train
num_test_sample=len(Xsample_train)/2
sig=[]
## sigmoid
for n in range(0,num_folds):
    temp= [x for x in range(0,num_folds) if x != n]
    X_train_dat=np.concatenate(X_train_folds[temp])
    y_train_dat=np.concatenate(y_train_folds[temp])
    C = 10  # SVM regularization parameter
    sig_clf = svm.SVC(kernel='sigmoid', C=C).fit(X_train_dat, y_train_dat)### add regularization factor = 1
    y_validation_pred=rbf_clf.predict(X_train_folds[n])   
    # Compute and print the fraction of correctly predicted examples
    num_correct = np.sum(y_validation_pred == y_train_folds[n])
    accuracy = float(num_correct) / num_test_sample
    sig.append(accuracy)
    print "accuracy =",accuracy
print "mean accuracy for sigmoid SVM is :", np.mean(sig)

accuracy = 0.4948
accuracy = 0.4732
mean accuracy for sigmoid SVM is : 0.484


In [15]:
C=10
lin_clf = svm.SVC(kernel='linear', C=C).fit(Xsample_train_new, ysample_train)

In [11]:
ysample_train.shape

(1000,)

In [19]:
y_pred=lin_clf.predict(X_test_new)
num_correct = np.sum(y_pred == y_test)
accuracy = float(num_correct) / (len(X_test_new))

In [20]:
accuracy

0.6701479487514888