<a href="https://colab.research.google.com/github/fangyiyu/Basic_ML_tasks/blob/main/abalone_voting.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Honour code (attribution): https://www.kaggle.com/fabiendaniel/customer-segmentation
import pandas as pd

# The file is read with header=None, so pandas labels the columns by
# position (0, 1, 2, ...) instead of taking names from the first row.
ABALONE_URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data'
data = pd.read_csv(ABALONE_URL, header=None)

# Preview the first rows as the cell output
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [None]:
# Remove Male abalone rows so that column 0 only holds 'F' and 'I'.
# A filtered copy is rebound to `data` (same rows and index as dropping in
# place) so the cell never mutates the frame an earlier cell displayed.
data = data.loc[data[0] != 'M'].copy()

# Replace the labels with numerical values, in column 0 only.
# A frame-wide replace() would rewrite any cell equal to 'F' or 'I' and
# depends on pandas silently downcasting the object column to integers.
# Values that are already encoded pass through unchanged, so re-running this
# cell is harmless.
SEX_CODES = {'F': 0, 'I': 1}
data[0] = data[0].map(lambda sex: SEX_CODES.get(sex, sex)).astype(int)

# Split dataset into training and validation sets
from sklearn.model_selection import train_test_split
# A fixed seed makes the split (and every score below) reproducible;
# stratifying on the label keeps the class ratio the same in both parts.
train, val = train_test_split(data, test_size=0.2, random_state=42, stratify=data[0])

# Column 0 is the target; the remaining columns are the features
train_labels = train.iloc[:, 0]
train_data = train.iloc[:, 1:]
val_labels = val.iloc[:, 0]
val_data = val.iloc[:, 1:]

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn import preprocessing, model_selection, metrics, feature_selection
class Class_Fit(object):
    """Small wrapper that pairs one estimator with an optional grid search.

    The wrapped estimator is stored in ``clf``. After ``grid_search`` has been
    called, the configured search object is stored in ``grid``; after
    ``grid_predict`` the latest predictions are stored in ``predictions``.
    """

    def __init__(self, clf, params=None):
        """Instantiate the estimator.

        Args:
            clf: Estimator class (not an instance); it is called to build ``self.clf``.
            params: Optional dict of keyword arguments passed to ``clf``.
        """
        # ``params or {}`` treats None and an empty dict alike: default construction
        self.clf = clf(**(params or {}))
        # Filled in by grid_search() / grid_predict()
        self.grid = None
        self.predictions = None

    def train(self, x_train, y_train):
        """Fit the wrapped estimator directly, without any grid search."""
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        """Return the wrapped estimator's predictions for ``x``."""
        return self.clf.predict(x)

    def grid_search(self, parameters, Kfold):
        """Configure (but do not run) a cross-validated grid search.

        Args:
            parameters: Parameter grid forwarded as ``param_grid``.
            Kfold: Cross-validation setting forwarded as ``cv``.
        """
        self.grid = GridSearchCV(estimator=self.clf, param_grid=parameters, cv=Kfold)

    def _require_grid(self, caller):
        """Return the configured search, or explain which step is missing."""
        grid = getattr(self, 'grid', None)
        if grid is None:
            # Same exception type as the bare attribute lookup used to raise,
            # but with a message that names the missing step.
            raise AttributeError("call grid_search() before {}()".format(caller))
        return grid

    def grid_fit(self, X, Y):
        """Run the configured grid search on ``X`` / ``Y``.

        Raises:
            AttributeError: if ``grid_search`` has not been called yet.
        """
        self._require_grid('grid_fit').fit(X, Y)

    def grid_predict(self, X, Y):
        """Predict ``X`` with the fitted search and print the accuracy against ``Y``.

        The predictions are kept in ``self.predictions``.

        Raises:
            AttributeError: if ``grid_search`` has not been called yet.
        """
        self.predictions = self._require_grid('grid_predict').predict(X)
        # accuracy_score is the share of correctly predicted labels, so the
        # value is reported as accuracy rather than precision.
        print("Accuracy: {:.2f} % ".format(100*metrics.accuracy_score(Y, self.predictions)))

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from sklearn import neighbors, linear_model, svm, tree, ensemble
from sklearn.ensemble import AdaBoostClassifier
import numpy as np
import warnings

# Silence every warning raised from this point on
warnings.filterwarnings('ignore')

# Support-vector classifier: ten log-spaced values of C between 1e-2 and 1e2,
# crossed with three kernels; probability estimates are always enabled.
svc_grid_spec = [{
    'C': np.logspace(-2, 2, 10),
    'kernel': ['rbf', 'poly', 'sigmoid'],
    'probability': [True],
}]
svc = Class_Fit(clf=SVC)
svc.grid_search(parameters=svc_grid_spec, Kfold=5)

In [None]:
# Run the SVC grid search on the training split, then report its score on
# the validation split
svc.grid_fit(train_data, train_labels)
svc.grid_predict(X=val_data, Y=val_labels)

Precision: 83.21 % 


In [None]:
# Logistic regression: search twenty log-spaced values of C between 1e-2 and 1e2
lr_grid_spec = [{'C': np.logspace(-2, 2, 20)}]
lr = Class_Fit(clf=linear_model.LogisticRegression)
lr.grid_search(parameters=lr_grid_spec, Kfold=5)
lr.grid_fit(train_data, train_labels)
lr.grid_predict(X=val_data, Y=val_labels)

Precision: 82.26 % 


In [None]:
# k-nearest neighbours: try every neighbourhood size from 1 to 49
knn_grid_spec = [{'n_neighbors': np.arange(1, 50)}]
knn = Class_Fit(clf=neighbors.KNeighborsClassifier)
knn.grid_search(parameters=knn_grid_spec, Kfold=5)
knn.grid_fit(train_data, train_labels)
knn.grid_predict(X=val_data, Y=val_labels)

Precision: 82.26 % 


In [None]:
# Decision tree: compare both split criteria with both max_features settings
tr_grid_spec = [{
    'criterion': ['entropy', 'gini'],
    'max_features': ['sqrt', 'log2'],
}]
tr = Class_Fit(clf=tree.DecisionTreeClassifier)
tr.grid_search(parameters=tr_grid_spec, Kfold=5)
tr.grid_fit(train_data, train_labels)
tr.grid_predict(X=val_data, Y=val_labels)

Precision: 74.53 % 


In [None]:
# Random forest: split criterion x forest size (20..100 in steps of 20) x max_features
param_grid = {
    'criterion': ['entropy', 'gini'],
    'n_estimators': list(range(20, 101, 20)),
    'max_features': ['sqrt', 'log2'],
}
rf = Class_Fit(clf=ensemble.RandomForestClassifier)
rf.grid_search(parameters=param_grid, Kfold=5)
rf.grid_fit(train_data, train_labels)
rf.grid_predict(X=val_data, Y=val_labels)

Precision: 83.40 % 


In [None]:
# AdaBoost: tune the number of estimators from 10 to 100 in steps of 10
param_grid = {'n_estimators': list(range(10, 101, 10))}
ada = Class_Fit(clf=AdaBoostClassifier)
ada.grid_search(parameters=param_grid, Kfold=5)
ada.grid_fit(train_data, train_labels)
ada.grid_predict(X=val_data, Y=val_labels)

Precision: 80.94 % 


In [None]:
# Gradient boosting: tune the number of estimators from 10 to 100 in steps of 10
param_grid = {'n_estimators': list(range(10, 101, 10))}
gb = Class_Fit(clf=ensemble.GradientBoostingClassifier)
gb.grid_search(parameters=param_grid, Kfold=5)
gb.grid_fit(train_data, train_labels)
gb.grid_predict(X=val_data, Y=val_labels)

Precision: 83.77 % 


In [None]:
# Build a fresh, unfitted estimator of each family, configured with the
# hyper-parameters its grid search selected (listed in training order).
svc_best = svm.SVC(**svc.grid.best_params_)
lr_best = linear_model.LogisticRegression(**lr.grid.best_params_)
knn_best = neighbors.KNeighborsClassifier(**knn.grid.best_params_)
tr_best = tree.DecisionTreeClassifier(**tr.grid.best_params_)
rf_best = ensemble.RandomForestClassifier(**rf.grid.best_params_)
ada_best = AdaBoostClassifier(**ada.grid.best_params_)
gb_best = ensemble.GradientBoostingClassifier(**gb.grid.best_params_)


In [None]:
# Soft-voting ensemble over four of the tuned models; 'soft' combines the
# members' predicted class probabilities rather than their hard labels.
ensemble_members = [
    ('rf', rf_best),
    ('knn', knn_best),
    ('ada', ada_best),
    ('lr', lr_best),
]
votingC = ensemble.VotingClassifier(estimators=ensemble_members, voting='soft')

In [None]:
# Train the ensemble on the training split; the result of fit() is rebound to the same name
votingC = votingC.fit(X=train_data, y=train_labels)

In [None]:
# Score the voting ensemble on the validation split
predictions = votingC.predict(val_data)
# accuracy_score is the share of correctly classified samples, so the value
# is reported as accuracy rather than precision.
val_accuracy = metrics.accuracy_score(val_labels, predictions)
print("Accuracy: {:.2f} % ".format(100*val_accuracy))

Precision: 82.64 % 
