In [1]:
import pandas 

data_set = pandas.read_csv( '../data_sets/experimental_data/thermo_paper_data_set.csv', index_col=0 )
expression_data = data_set[ [ 'expression' ] ] 

df = pandas.read_csv( '/Users/alex/Documents/bagel-benchmark/data_sets/rosetta/enzyme_design_noncovalent.csv', sep='\s+' )
df.description = df.description.str.split( '_' ).str[1]
df = df.groupby( 'description' ).apply( lambda x: x.sort_values( by='total_score' ).head( 10 ).mean() ) 

df = df.join( expression_data ).dropna() 
y = df.expression
X = df.drop( 'expression', axis=1 ).copy()

In [2]:
from __future__ import print_function

from sklearn import datasets, pipeline , preprocessing, feature_selection
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.svm import SVC

# Split the dataset in two equal parts
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=0)

# Set the parameters by cross-validation
tuned_parameters = [
    { 
        'svm__kernel': ['rbf', 'linear'], 
        'svm__gamma': [ 1e-3, 1e-4, 1e-2 ],
        'svm__C': [ 0.3, 3, 30 ], 
        'kbest__k': [ 1, 3, 9, 21, ], 
    },    
]

scores = ['precision', 'recall']

for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()
    

    pln = pipeline.Pipeline([ 
        ( 'scaler', preprocessing.StandardScaler() ),
        ( 'kbest', feature_selection.SelectKBest( feature_selection.f_classif ) ), 
        ( 'svm', SVC( class_weight='balanced' ) ), 
    ])
    
    clf = GridSearchCV(pln, tuned_parameters, cv=3,
                       scoring='%s_weighted' % score)
    clf.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    for params, mean_score, scores in clf.grid_scores_:
        print("%0.3f (+/-%0.03f) for %r"
              % (mean_score, scores.std() * 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    y_true, y_pred = y_test, clf.predict(X_test)
    print(classification_report(y_true, y_pred))
    print()

# Tuning hyper-parameters for precision



  'precision', 'predicted', average, warn_for)


Best parameters set found on development set:

{'kbest__k': 21, 'svm__C': 3, 'svm__kernel': 'linear', 'svm__gamma': 0.001}

Grid scores on development set:

0.611 (+/-0.017) for {'kbest__k': 1, 'svm__C': 0.3, 'svm__kernel': 'rbf', 'svm__gamma': 0.001}
0.648 (+/-0.202) for {'kbest__k': 1, 'svm__C': 0.3, 'svm__kernel': 'linear', 'svm__gamma': 0.001}
0.611 (+/-0.017) for {'kbest__k': 1, 'svm__C': 0.3, 'svm__kernel': 'rbf', 'svm__gamma': 0.0001}
0.648 (+/-0.202) for {'kbest__k': 1, 'svm__C': 0.3, 'svm__kernel': 'linear', 'svm__gamma': 0.0001}
0.611 (+/-0.017) for {'kbest__k': 1, 'svm__C': 0.3, 'svm__kernel': 'rbf', 'svm__gamma': 0.01}
0.648 (+/-0.202) for {'kbest__k': 1, 'svm__C': 0.3, 'svm__kernel': 'linear', 'svm__gamma': 0.01}
0.611 (+/-0.017) for {'kbest__k': 1, 'svm__C': 3, 'svm__kernel': 'rbf', 'svm__gamma': 0.001}
0.595 (+/-0.142) for {'kbest__k': 1, 'svm__C': 3, 'svm__kernel': 'linear', 'svm__gamma': 0.001}
0.611 (+/-0.017) for {'kbest__k': 1, 'svm__C': 3, 'svm__kernel': 'rbf', 'sv

  'precision', 'predicted', average, warn_for)


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import Normalize

from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_iris
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.grid_search import GridSearchCV


# Utility function to move the midpoint of a colormap to be around
# the values of interest.

class MidpointNormalize(Normalize):

    def __init__(self, vmin=None, vmax=None, midpoint=None, clip=False):
        self.midpoint = midpoint
        Normalize.__init__(self, vmin, vmax, clip)

    def __call__(self, value, clip=None):
        x, y = [self.vmin, self.midpoint, self.vmax], [0, 0.5, 1]
        return np.ma.masked_array(np.interp(value, x, y))

##############################################################################
# Load and prepare data set
#
# dataset for grid search

# Dataset for decision function visualization: we only keep the first two
# features in X and sub-sample the dataset to keep only 2 classes and
# make it a binary classification problem.

X = X.as_matrix()
y = y.ravel()

X_2d = X[:, :2]
X_2d = X_2d[y > 0]
y_2d = y[y > 0]
y_2d -= 1

# It is usually a good idea to scale the data for SVM training.
# We are cheating a bit in this example in scaling all of the data,
# instead of fitting the transformation on the training set and
# just applying it on the test set.

scaler = StandardScaler()
X = scaler.fit_transform(X)
X_2d = scaler.fit_transform(X_2d)

##############################################################################
# Train classifiers
#
# For an initial search, a logarithmic grid with basis
# 10 is often helpful. Using a basis of 2, a finer
# tuning can be achieved but at a much higher cost.

C_range = np.logspace(-2, 10, 13)
gamma_range = np.logspace(-9, 3, 13)
param_grid = dict(gamma=gamma_range, C=C_range)
cv = StratifiedShuffleSplit(y, n_iter=5, test_size=0.2, random_state=42)
grid = GridSearchCV(SVC(class_weight='balanced'), param_grid=param_grid, cv=cv)
grid.fit(X, y)

print("The best parameters are %s with a score of %0.2f"
      % (grid.best_params_, grid.best_score_))

# # Now we need to fit a classifier for all parameters in the 2d version
# # (we use a smaller set of parameters here because it takes a while to train)

# C_2d_range = [1e-2, 1, 1e2]
# gamma_2d_range = [1e-1, 1, 1e1]
# classifiers = []
# for C in C_2d_range:
#     for gamma in gamma_2d_range:
#         clf = SVC(C=C, gamma=gamma)
#         clf.fit(X_2d, y_2d)
#         classifiers.append((C, gamma, clf))

# ##############################################################################
# # visualization
# #
# # draw visualization of parameter effects

# plt.figure(figsize=(8, 6))
# xx, yy = np.meshgrid(np.linspace(-3, 3, 200), np.linspace(-3, 3, 200))
# for (k, (C, gamma, clf)) in enumerate(classifiers):
#     # evaluate decision function in a grid
#     Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
#     Z = Z.reshape(xx.shape)

#     # visualize decision function for these parameters
#     plt.subplot(len(C_2d_range), len(gamma_2d_range), k + 1)
#     plt.title("gamma=10^%d, C=10^%d" % (np.log10(gamma), np.log10(C)),
#               size='medium')

#     # visualize parameter's effect on decision function
#     plt.pcolormesh(xx, yy, -Z, cmap=plt.cm.RdBu)
#     plt.scatter(X_2d[:, 0], X_2d[:, 1], c=y_2d, cmap=plt.cm.RdBu_r)
#     plt.xticks(())
#     plt.yticks(())
#     plt.axis('tight')

# plot the scores of the grid
# grid_scores_ contains parameter settings and scores
# We extract just the scores
scores = [x[1] for x in grid.grid_scores_]
scores = np.array(scores).reshape(len(C_range), len(gamma_range))

# Draw heatmap of the validation accuracy as a function of gamma and C
#
# The score are encoded as colors with the hot colormap which varies from dark
# red to bright yellow. As the most interesting scores are all located in the
# 0.92 to 0.97 range we use a custom normalizer to set the mid-point to 0.92 so
# as to make it easier to visualize the small variations of score values in the
# interesting range while not brutally collapsing all the low score values to
# the same color.

plt.figure(figsize=(8, 6))
plt.subplots_adjust(left=.2, right=0.95, bottom=0.15, top=0.95)
plt.imshow(scores, interpolation='nearest', cmap=plt.cm.hot,
           norm=MidpointNormalize(vmin=0.2, midpoint=0.92))
plt.xlabel('gamma')
plt.ylabel('C')
plt.colorbar()
plt.xticks(np.arange(len(gamma_range)), gamma_range, rotation=45)
plt.yticks(np.arange(len(C_range)), C_range)
plt.title('Validation accuracy')
plt.show()

The best parameters are {'C': 1.0, 'gamma': 0.10000000000000001} with a score of 0.75
