In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.metrics import make_scorer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import ExtraTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.decomposition import PCA


In [2]:
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

In [3]:
train_data.head()

Unnamed: 0,response,make,address,all,num3d,our,over,remove,internet,order,...,conference,charSemicolon,charRoundbracket,charSquarebracket,charExclamation,charDollar,charHash,capitalAve,capitalLong,capitalTotal
0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.925,0.0,0.0,1.833,6,11
1,2,0.0,0.0,0.0,0.0,2.94,0.0,0.0,0.0,0.0,...,0.0,0.335,0.335,0.0,0.671,0.0,0.0,4.0,12,28
2,2,0.0,0.0,1.15,0.0,0.38,0.38,0.0,0.0,0.0,...,0.0,0.0,0.196,0.0,0.261,0.0,0.0,5.666,56,272
3,2,0.1,0.3,0.4,0.0,0.2,0.9,0.2,0.5,0.8,...,0.0,0.0,0.175,0.0,0.307,0.175,0.014,6.937,669,1214
4,1,0.1,0.0,0.0,0.0,0.0,0.1,0.0,0.52,0.0,...,0.0,0.027,0.138,0.0,0.041,0.041,0.0,2.321,31,469


In [4]:
y_train = np.array(train_data.response)
X_train = np.array(train_data.iloc[:,1:])

X_test = np.array(test_data.iloc[:,:])

print(X_test.shape, X_train.shape)

(1500, 57) (3101, 57)


In [5]:
def customLoss(y,y_hat):
    return np.array([1 if ((real==2)&(pred==1)) else 5 if ((real==1)&(pred==2)) else 0 for real, pred in zip(y,y_hat)]).sum()  

# testing custom loss
# customLoss(y_train, np.random.randint(1, high=2, size = (y_train.shape)))

In [114]:
def get_models():
    models = list()
    models.append(LogisticRegression())
    models.append(RidgeClassifier())
    models.append(SGDClassifier())
    models.append(PassiveAggressiveClassifier())
    models.append(KNeighborsClassifier())
    models.append(DecisionTreeClassifier())
    models.append(ExtraTreeClassifier())
    models.append(LinearSVC())
    models.append(SVC())
    models.append(GaussianNB())
    models.append(AdaBoostClassifier(base_estimator=ExtraTreesClassifier()))
    models.append(BaggingClassifier())
    models.append(RandomForestClassifier())
    models.append(ExtraTreesClassifier())
    models.append(GaussianProcessClassifier())
    models.append(GradientBoostingClassifier())
    models.append(LinearDiscriminantAnalysis())
    models.append(QuadraticDiscriminantAnalysis())
    return models

In [115]:
def evaluate_model(X, y, cv, model):
    # define score
    my_scorer = make_scorer(customLoss, greater_is_better=False)
    # evaluate the model
    scores = cross_val_score(model, X, y, scoring=my_scorer, cv=cv, n_jobs=-1)
    
    return np.mean(scores)

In [116]:
cv = RepeatedKFold(n_splits=10, random_state=1337, n_repeats=3)
# get the list of models to consider
models = get_models()
# store results
cv_results = list()

for model in models:
    # evaluate model using each test condition
    mean_score = evaluate_model(X_train, y_train, cv, model)
    # check for invalid results
    if np.isnan(mean_score):
        continue
    # store results
    cv_results.append(mean_score)
    print('>%s: cv=%.3f' % (type(model).__name__, mean_score))


>LogisticRegression: cv=-64.767
>RidgeClassifier: cv=-68.267
>SGDClassifier: cv=-266.200
>PassiveAggressiveClassifier: cv=-217.300
>KNeighborsClassifier: cv=-186.433
>DecisionTreeClassifier: cv=-83.700
>ExtraTreeClassifier: cv=-107.800
>LinearSVC: cv=-171.033
>SVC: cv=-187.667
>GaussianNB: cv=-250.967
>AdaBoostClassifier: cv=-36.100
>BaggingClassifier: cv=-47.233
>RandomForestClassifier: cv=-38.367
>ExtraTreesClassifier: cv=-35.767
>GaussianProcessClassifier: cv=-172.667
>GradientBoostingClassifier: cv=-44.367
>LinearDiscriminantAnalysis: cv=-68.400
>QuadraticDiscriminantAnalysis: cv=-234.967


In [8]:
# Create a custom RF classifier with adjustable classification threshold 
class customRF(RandomForestClassifier):
    
    def __init__(self,
                 threshold=0.5,
                 n_estimators=10,
                 criterion="gini",
                 max_depth=None,
                 min_samples_split=2,
                 min_samples_leaf=1,
                 max_features="auto",
                 max_leaf_nodes=None,
                 bootstrap=True,
                 n_jobs=1,
                 random_state=None,
                 verbose=0,
                 warm_start=False,
                 class_weight=None):
        super(customRF, self).__init__(
            criterion = criterion,
            max_depth = max_depth,
            min_samples_split = min_samples_split,
            min_samples_leaf = min_samples_leaf,
            max_features = max_features,
            max_leaf_nodes = max_leaf_nodes,
            n_estimators=n_estimators,
            bootstrap=bootstrap,
            n_jobs=n_jobs,
            random_state=random_state,
            verbose=verbose,
            warm_start=warm_start,
            class_weight=class_weight)
        
        self.threshold = threshold

        

    
    def predict(self, X):
        
        proba = self.predict_proba(X)
        
        # We will change the logic for binary classification
        if self.n_outputs_ == 1:            
            return self.classes_.take((proba [:,1] >= self.threshold).astype('int'), axis=0)

        # Keep the logic the default for more classes
        else:
            n_samples = proba[0].shape[0]
            predictions = np.zeros((n_samples, self.n_outputs_))

            for k in range(self.n_outputs_):
                predictions[:, k] = self.classes_[k].take(np.argmax(proba[k],
                                                                    axis=1),
                                                          axis=0)

            return predictions
        
        return proba

In [30]:
# Create a custom ETs classifier with adjustable classification threshold 
class customET(ExtraTreesClassifier):
    
    def __init__(self,
                 threshold=0.5,
                 n_estimators=10,
                 criterion="gini",
                 max_depth=None,
                 min_samples_split=2,
                 min_samples_leaf=1,
                 max_features="auto",
                 max_leaf_nodes=None,
                 bootstrap=True,
                 n_jobs=1,
                 random_state=None,
                 verbose=0,
                 warm_start=False,
                 class_weight=None):
        super(customET, self).__init__(
            criterion = criterion,
            max_depth = max_depth,
            min_samples_split = min_samples_split,
            min_samples_leaf = min_samples_leaf,
            max_features = max_features,
            max_leaf_nodes = max_leaf_nodes,
            n_estimators=n_estimators,
            bootstrap=bootstrap,
            n_jobs=n_jobs,
            random_state=random_state,
            verbose=verbose,
            warm_start=warm_start,
            class_weight=class_weight)
        
        self.threshold = threshold

        

    
    def predict(self, X):
        
        proba = self.predict_proba(X)
        
        # We will change the logic for binary classification
        if self.n_outputs_ == 1:            
            return self.classes_.take((proba [:,1] >= self.threshold).astype('int'), axis=0)

        # Keep the logic the default for more classes
        else:
            n_samples = proba[0].shape[0]
            predictions = np.zeros((n_samples, self.n_outputs_))

            for k in range(self.n_outputs_):
                predictions[:, k] = self.classes_[k].take(np.argmax(proba[k],
                                                                    axis=1),
                                                          axis=0)

            return predictions
        
        return proba

In [101]:
def pipe_maker(classifier):
    '''Takes in a classifier, returns a pipeline'''
    pipe = Pipeline([
                  ('scl', MinMaxScaler()), #Usinng MinMax cuz not sure for the distribution of features
                  ('fs', SelectFromModel(PCA())),
                  ('clf', classifier(random_state=1111, class_weight={1: 1, 2: 5}))
                ])
    return pipe

In [102]:
def gridsearch_maker(pipeline, params):
    '''Takes in a pipeline and param grid, returns GridSearchCV object'''
    
    my_scorer = make_scorer(customLoss, greater_is_better=False)
    
    return GridSearchCV(estimator=pipeline,
                        param_grid=params,
                        scoring=my_scorer,
                        cv=10,
                        verbose = 10,
                        n_jobs=-1)

In [124]:
pipe = pipe_maker(customET)

params = {
    'fs': ['passthrough'], # This treshold corresponds to the 20th percentile of feature importance
    'clf__threshold': np.arange(0.5,0.8, step=0.025),
    'clf__n_estimators': range(50,100,5),
    'clf__max_depth': list(range(35,55)) + [None],
    'clf__min_samples_split': range(2,3),
    'clf__min_samples_leaf': range(1,2),
    'clf__max_features': ["auto", "sqrt", "log2"]
    }

# params = {
#     'fs': ['passthrough', PCA(50), PCA(40), PCA(30)], # This treshold corresponds to the 20th percentile of feature importance
#     'clf__threshold': np.arange(0.6,0.8, step=0.02),
#     'clf__n_estimators': range(50,100,5),
#     'clf__max_depth': list(range(35,55)) + [None],
#     'clf__min_samples_split': range(2,3),
#     'clf__min_samples_leaf': range(1,2),
#     'clf__max_features': ["auto", "sqrt", "log2"]
#     }

gsc = gridsearch_maker(pipe, params)

In [125]:
grid_result = gsc.fit(X_train, y_train)

print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

Fitting 10 folds for each of 8190 candidates, totalling 81900 fits
Best: -24.900000 using {'clf__max_depth': 51, 'clf__max_features': 'auto', 'clf__min_samples_leaf': 1, 'clf__min_samples_split': 2, 'clf__n_estimators': 70, 'clf__threshold': 0.6750000000000002, 'fs': 'passthrough'}


In [126]:
best_classifier = grid_result.best_estimator_
predictions = best_classifier.predict(X_test)

In [127]:
np.savetxt('second_sub.txt', predictions, fmt='%i')

In [129]:
# Best: -24.900000 using: 
params = grid_result.best_params_
params

{'clf__max_depth': 51,
 'clf__max_features': 'auto',
 'clf__min_samples_leaf': 1,
 'clf__min_samples_split': 2,
 'clf__n_estimators': 70,
 'clf__threshold': 0.6750000000000002,
 'fs': 'passthrough'}

In [149]:
# Got score of 40 bet baseline
best_clf = pipe.set_params(
    clf__max_depth=params['clf__max_depth'],
    clf__max_features=params['clf__max_features'],
    clf__min_samples_leaf=params['clf__min_samples_leaf'],
    clf__min_samples_split=params['clf__min_samples_split'],
    clf__n_estimators=params['clf__n_estimators'],
    clf__threshold=params['clf__threshold'],
    fs=params['fs'],
    clf__random_state=111111111
)
best_clf.fit(X_train, y_train)
predictions = best_clf.predict(X_test)
np.savetxt('second_sub.txt', predictions, fmt='%i')