# Volume 3: Sklearn Guide
    Darren Lund
    NaiveBayes
    Single

In [17]:
import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_iris
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
import pandas as pd

## Problem 1

Take your Naive Bayes classifier from your homework and rewrite it as a class that inherits from `BaseEstimator` and `ClassifierMixin`.
Implement `__init__()`, `fit()`, and `predict()` in a way that matches `sklearn` conventions.

Test your model on the iris dataset.

In [2]:
class Naive_Bayes(BaseEstimator, ClassifierMixin) :
    '''
    Gaussian naive Bayes classifier that follows the sklearn estimator API.

    Parameters :
        verbose (bool) - Stored so it can be set/cloned; the model does not use it
        var_smoothing (float) - Fraction of the largest feature variance that is
            added to every per-class variance, so a feature that is constant
            within a class does not cause a division by zero

    Attributes set by fit() :
        classes_ (ndarray, (n_classes,)) - Sorted unique labels seen in y
        mus_ (ndarray, (n_classes,n_features)) - Per-class feature means
        sig2s_ (ndarray, (n_classes,n_features)) - Per-class smoothed variances
        y_probs_ (ndarray, (n_classes,)) - Class priors (relative frequencies)
    '''
    def __init__(self,verbose=False,var_smoothing=1e-9) :
        '''
        Initialize the Naive_Bayes classifier (only stores hyperparameters)
        '''
        self.verbose=verbose
        self.var_smoothing=var_smoothing

    def fit(self,X,y) :
        '''
        Estimates the class priors and per-class feature means/variances

        Inputs :
            X (array-like, (n_samples,n_features)) - Data to fit
            y (array-like, (n_samples,)) - Class label of each row of X

        Returns :
            self
        '''
        X = np.asarray(X,dtype=float)
        y = np.asarray(y)
        self.classes_ = np.unique(y)
        masks = [y == label for label in self.classes_]
        self.mus_ = np.array([X[mask].mean(axis=0) for mask in masks])
        # Same stabiliser sklearn's GaussianNB uses: a tiny multiple of the
        # largest overall feature variance keeps every variance positive.
        epsilon = self.var_smoothing*X.var(axis=0).max()
        self.sig2s_ = np.array([X[mask].var(axis=0) for mask in masks]) + epsilon
        self.y_probs_ = np.array([mask.mean() for mask in masks])
        return self

    def _joint_log_likelihood(self,X) :
        '''
        Returns log P(y=c) + log P(x|y=c) for every row of X and class c,
        as an (n_samples,n_classes) array
        '''
        X = np.asarray(X,dtype=float)
        n = len(self.classes_)
        jll = np.empty((X.shape[0],n))
        for i in range(n) :
            log_prior = np.log(self.y_probs_[i])
            log_norm = -0.5*np.sum(np.log(2*np.pi*self.sig2s_[i]))
            quad = -0.5*np.sum((X-self.mus_[i])**2/self.sig2s_[i],axis=1)
            jll[:,i] = log_prior + log_norm + quad
        return jll

    def predict_proba(self,X) :
        '''
        Predicts the posterior probability of each class for each row of X

        Returns :
            (ndarray, (n_samples,n_classes)) - Rows sum to 1; columns are
            ordered like self.classes_
        '''
        jll = self._joint_log_likelihood(X)
        # Shift by the row maximum before exponentiating so the largest term
        # is exp(0)=1 and the normalisation cannot underflow to 0/0.
        jll -= jll.max(axis=1,keepdims=True)
        probs = np.exp(jll)
        probs /= probs.sum(axis=1,keepdims=True)
        return probs

    def predict(self,X) :
        '''
        Predicts class labels of data X

        Returns :
            (ndarray, (n_samples,)) - Most probable label for each row
        '''
        indices = np.argmax(self._joint_log_likelihood(X),axis=1)
        return self.classes_[indices]

In [3]:
# Load iris and hold out a random test split. No random_state is passed,
# so the split (and therefore the score) changes from run to run.
iris = load_iris()
tr_x, ts_x, tr_y, ts_y = train_test_split(iris.data, iris.target)

# fit() returns the estimator itself, so construction and training can be chained.
mygnb = Naive_Bayes().fit(tr_x, tr_y)

# Score the fitted model on the held-out rows.
mygnb.score(ts_x, ts_y)

0.97368421052631582

## Problem 2

Write a transformer class whose `fit()` and `transform()` methods take in $X$ as a pandas `DataFrame`.
For each numerical column, replace any `nan` entries with the mean of the column.
Drop string columns.
Return the data as a NumPy array.

In [4]:
class Transformers(BaseEstimator, TransformerMixin) :
    '''
    Transforms a pandas DataFrame with numeric and string values to np.array
    Drops non-numeric (e.g. string) columns and fills nans in the remaining
    columns with the column means learned in fit().

    Attributes set by fit() :
        drop_cols_ (pd.Index) - Labels of the non-numeric columns to drop
        mus_ (pd.Series) - Mean of every kept column, indexed by column label
    '''
    def __init__(self) :
        '''
        Initialize transformer (it has no hyperparameters)
        '''
        pass

    @staticmethod
    def _check_frame(X) :
        '''
        Raises TypeError unless X is a pandas DataFrame
        '''
        if not isinstance(X,pd.DataFrame) :
            raise TypeError('X must be a pandas DataFrame, got '+type(X).__name__)

    def fit(self,X,y=None) :
        '''
        Records which columns to drop and the mean of each numeric column
        Ignores y

        Inputs :
            X (pd.DataFrame) - Training data

        Returns :
            self
        '''
        self._check_frame(X)

        # Decide by dtype so the rule also catches string columns that are
        # stored with a dedicated string dtype rather than `object`.
        non_numeric = np.array([not pd.api.types.is_numeric_dtype(dt) for dt in X.dtypes],dtype=bool)
        self.drop_cols_ = X.columns[non_numeric]
        # DataFrame.mean skips nans, so this is the mean of the observed values.
        self.mus_ = X.drop(columns=self.drop_cols_).mean(axis=0)
        return self

    def transform(self,X) :
        '''
        Transforms the data

        Inputs :
            X (pd.DataFrame) - Data with the same columns as the frame given to fit()

        Returns :
            (ndarray) - Kept columns with nans replaced by the fitted means.
            The caller's DataFrame is not modified.
        '''
        self._check_frame(X)

        kept = X.drop(columns=self.drop_cols_)
        # fillna with a Series aligns on column labels, so each column gets
        # its own fitted mean regardless of column order.
        return kept.fillna(self.mus_).values

## Problem 3

Use `cross_validate()` to score your class from Problem 1 on the iris dataset.
Do the same for a `LogisticRegression` classifier.

In [15]:
# 4-fold cross-validation of the hand-written Naive Bayes model on iris.
results = cross_validate(mygnb, iris.data, iris.target, cv=4)

# cross_validate returns one array per metric; report the mean of each.
for metric, per_fold in results.items() :
    print(str(metric)+'\t: '+str(np.average(per_fold)))

fit_time	: 0.00148272514343
score_time	: 0.000649988651276
test_score	: 0.953525641026
train_score	: 0.964438122333




In [16]:
# Baseline for comparison: 4-fold cross-validation of logistic regression on iris.
lgr = LogisticRegression()
results = cross_validate(lgr, iris.data, iris.target, cv=4)

# cross_validate returns one array per metric; report the mean of each.
for metric, per_fold in results.items() :
    print(str(metric)+'\t: '+str(np.average(per_fold)))

fit_time	: 0.0011699795723
score_time	: 0.000410795211792
test_score	: 0.952457264957
train_score	: 0.962126600284




## Problem 4

Take the cancer data set (`datasets.load_breast_cancer()`) and do a grid search on an SVM (`sklearn.svm.SVC`) with the parameter `C` as .01, .1, or 1, and the parameter `kernel` as `"linear"`, `"poly"`, `"rbf"`, or `"sigmoid"`.

What is the best choice of parameters?
How well does the corresponding model do?

In [20]:
cancer = load_breast_cancer()

# Candidate hyperparameters; the grid search tries every (C, kernel) pair.
Cs = [0.01,0.1,1]
kerns = ['linear','poly','rbf','sigmoid']
search_space = {'C':Cs,'kernel':kerns}

# max_iter is passed explicitly so the solver stops after a bounded number
# of iterations on the unscaled features.
svc = SVC(max_iter=10000000)
clf = GridSearchCV(estimator=svc, param_grid=search_space, n_jobs=4)
clf.fit(cancer.data,cancer.target)



GridSearchCV(cv=None, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=10000000, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=4,
       param_grid={'C': [0.01, 0.1, 1], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [22]:
# Show the winning estimator, then its best_score_, one per line.
print(clf.best_estimator_, clf.best_score_, sep='\n')

SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=10000000, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
0.949033391916


Best parameters are $C=1$ with a linear kernel, giving a mean cross-validated accuracy of $\approx 0.94903339$.

## Problem 5

Make a pipeline of your transformer from Problem 2, a normalizing scaler transformer (`preprocessing.StandardScaler`), a PCA transformer (`decomposition.PCA`), and an SVM classifier (`svm.SVC`).
Using the titanic dataset (read in as a pandas DataFrame), do a grid search for the best model, varying your parameters however you see fit.

What is your best choice of parameters?
How well does the corresponding model do?

**Extra credit** to the student with the very best model!
To compete, pick your best parameters, do a cross validation with 10 folds, and take the average of the test scores.