Start with the first predictor (variable) and fit the model M1  

• For k = 1, . . . , p − 1:  
– Consider all p−k models that augment the predictor in Mk with one additional predictor.  
– Choose the best among these p − k models and call it Mk+1. Best is defined as having smallest error on cross-validation.

• Select the best model from among M1, . . . , Mp using cross-validation.  

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.cross_validation import train_test_split
from sklearn import metrics
from sklearn import grid_search
from sklearn.cross_validation import cross_val_score
from scipy.stats import uniform
from sklearn.grid_search import RandomizedSearchCV
from sklearn.svm import SVC
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
import multiprocessing as mp
%matplotlib inline
score_threshold = 0.9



In [2]:
# load data in dataframe
col_names = ['X' + str(i) for i in range(73)]
data = pd.read_excel("ozon.xlsx", header = None, names = col_names)
data.shape

(1847, 73)

In [3]:
#define x and y
feature_cols = ['X' + str(i) for i in range(72)]
x = data[feature_cols]
y = data.X72
data['X72'] = data['X72'].replace([0, 1], [-1, 1]) 
y = data['X72']
standard_scaler = StandardScaler()
x = pd.DataFrame(standard_scaler.fit_transform(x), columns = feature_cols)
x.shape

(1847, 72)

In [4]:
# split data
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 553, train_size = 1292)
Cs = np.linspace(0.001, 1001.0, num=2000, endpoint=False)

# Designate distributions to sample hyperparameters from 
g_range = np.random.uniform(0.0, 2.3, 60).astype(float)
C_range = np.random.normal(0.1, 14.0, 100).astype(float)

# Check that gamma>0 and C>0 
C_range[C_range < 0] = 0.0001

hyperparameters = {'gamma': list(g_range), 
                    'C': list(C_range)}


# Run randomized search with range of hyperparameter values
svm = SVC(kernel='rbf', probability=True)
randomCV = RandomizedSearchCV(svm, param_distributions=hyperparameters, n_iter=20)
randomCV.fit(x_train, y_train)

# Identify optimal hyperparameter values and add to list
best_gamma  = randomCV.best_params_['gamma']
best_C      = randomCV.best_params_['C']

# Train SVM and output predictions
rbfSVM = SVC(kernel='rbf', C=best_C, gamma=best_gamma)
rbfSVM.probability=True
rbfSVM.fit(x_train, y_train)
svm_predictions = rbfSVM.predict(x_test)
y_pred_prob = rbfSVM.predict_proba(x_test)[:, 1]
y_pred = rbfSVM.predict(x_test)
metrics.accuracy_score(y_test, y_pred)

0.94394213381555159

Run 72 different models for each feature, pick the best one and use that as the base

In [5]:
accuracy_scores = np.zeros(shape = (72,2))

for i in range(72):
    # split data
    new_x = x['X' + str(i)]
    x_train, x_test, y_train, y_test = train_test_split(new_x.values[:,np.newaxis], y, test_size = .3, train_size = .7)
    # Train SVM and output predictions
    rbfSVM = SVC(kernel='rbf', C=best_C, gamma=best_gamma)
    rbfSVM.probability=True
    rbfSVM.fit(x_train, y_train)
    svm_predictions = rbfSVM.predict(x_test)
    y_pred_prob = rbfSVM.predict_proba(x_test)[:, 1]
    y_pred = rbfSVM.predict(x_test)
    score = metrics.accuracy_score(y_test, y_pred)
    accuracy_scores[i][0] = i
    accuracy_scores[i][1] = score

    
#get the feature that perform best and use that as the base    
x_max,y_max = accuracy_scores.max(axis=0)
high_score,index = np.where(accuracy_scores == y_max)
max_accuracy = y_max
high_score= high_score[0]
index = index[0]

In [6]:
indices = list(range(index))+ list(range(index+1,72))
new_x = pd.DataFrame(x['X' + str(index)])
print(new_x.shape)
print(x.shape)
best_indices_list = []
best_indices_list.append(index)
previous_score = 0

(1847, 1)
(1847, 72)


In [7]:
def forward_selection(i):

    # add the new feature to x
    temp_x = new_x.copy()
    temp_x['X' + str(i)] = x['X' + str(i)]

    # split data
    x_train, x_test, y_train, y_test = train_test_split(temp_x, y, test_size = 553, train_size = 1292)

    # fit model
    rbfSVM = SVC(kernel='rbf', C=best_C, gamma=best_gamma)
    rbfSVM.probability=True
    rbfSVM.fit(x_train, y_train)
    svm_predictions = rbfSVM.predict(x_test)
    y_pred_prob = rbfSVM.predict_proba(x_test)[:, 1]
    y_pred = rbfSVM.predict(x_test)

    #add performance score to accuracy_list
    score = metrics.accuracy_score(y_test, y_pred)
    return i, score

Use multiprocessing to run forward_selection function and pick the best model

In [8]:
for k in range(71):
    pool = mp.Pool(processes=4)
    results = pool.map(forward_selection, indices)
    pool.terminate()
    
    #get the feature that performs best and add that
    scores_list = [tup[1] for tup in results]
    indices_list = [tup[0] for tup in results]
    high_score = max(scores_list)
    max_index = scores_list.index(high_score) 
    index = indices_list[max_index]

    
    if (high_score > previous_score):
        new_x['X' + str(index)] = x['X' + str(index)]
        indices.remove(index)
        best_indices_list.append(index)
        previous_score = high_score
    
    # split data
    x_train, x_test, y_train, y_test = train_test_split(new_x, y, test_size = 553, train_size = 1292)
    #find optimal parameters for next go around
    Cs = np.linspace(0.001, 1001.0, num=2000, endpoint=False)
    # Designate distributions to sample hyperparameters from 
    g_range = np.random.uniform(0.0, 2.3, 60).astype(float)
    C_range = np.random.normal(0.1, 14.0, 100).astype(float)
    # Check that gamma>0 and C>0 
    C_range[C_range < 0] = 0.0001
    hyperparameters = {'gamma': list(g_range), 
                        'C': list(C_range)}
    # Run randomized search with range of hyperparameter values
    svm = SVC(kernel='rbf', probability=True)
    randomCV = RandomizedSearchCV(svm, param_distributions=hyperparameters, n_iter=5)
    randomCV.fit(x_train, y_train)

    # Identify optimal hyperparameter values and save
    best_gamma  = randomCV.best_params_['gamma']
    best_C      = randomCV.best_params_['C']

In [9]:
print("Indices used in best model", best_indices_list)
print("Best Model Score", previous_score)

Indices used in best model [1, 65, 45, 22, 44, 47]
Best Model Score 0.960216998192
