In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from time import time
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import fbeta_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Names of the columns handled as categorical features.
cat_feat_names = [
    "workclass",
    "education_level",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "native-country",
]

# Names of the columns handled as numerical features.
num_feat_names = [
    "age",
    "education-num",
    "capital-gain",
    "capital-loss",
    "hours-per-week",
]

In [3]:
class Data():
    """Load a CSV file, preprocess its columns and hold a train/test split.

    On construction the file is read, the two capital columns are
    log-transformed, numerical columns are min-max scaled, categorical columns
    are one-hot encoded, the "income" column is mapped to 0/1 and the result is
    split into ``X_train``/``X_test``/``y_train``/``y_test``.

    Parameters
    ----------
    csv_file : path or buffer accepted by ``pd.read_csv``
        Must contain every column in ``cat_feats``/``num_feats`` plus "income".
    cat_feats : sequence of str
        Columns to one-hot encode.
    num_feats : sequence of str
        Columns to min-max scale.
    test_size : float, default 0.2
        Forwarded to ``train_test_split``.
    random_state : int, default 0
        Forwarded to ``train_test_split``.
    """

    # Raw "income" labels and the binary value each one is encoded as.
    INCOME_LABELS = {"<=50K": 0, ">50K": 1}

    def __init__(self, csv_file, cat_feats, num_feats, test_size=0.2, random_state=0):
        # Copy into lists so ``cat_feats + num_feats`` and DataFrame column
        # selection behave the same for any sequence type.
        self.cat_feats = list(cat_feats)
        self.num_feats = list(num_feats)
        self.test_size = test_size
        self.random_state = random_state

        self.num_of_records = None
        self.features = None
        self.target = None
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None

        self.parse_file(csv_file)
        self.preprocess()
        self.split_train_test()

    def parse_file(self, csv_file):
        """Read ``csv_file`` and populate ``features``, ``target`` and ``num_of_records``."""
        df = pd.read_csv(csv_file)
        self.features = df[self.cat_feats + self.num_feats].copy()
        # Kept as an (n, 1) array until ``encode_target`` converts it to a Series.
        self.target = np.reshape(df["income"].copy().values, (-1, 1))
        self.num_of_records = self.features.shape[0]

    def log_transformation(self, feature):
        """Replace column ``feature`` with ``log(1 + x)`` of its values."""
        # Vectorized and numerically accurate near zero, unlike a per-element
        # ``np.log(x + 1)`` lambda.
        self.features[feature] = np.log1p(self.features[feature])

    def scale_data(self):
        """Min-max scale the numerical columns; categorical columns are kept as-is.

        Afterwards ``features`` holds the scaled numerical columns first,
        followed by the untouched categorical columns.
        """
        # NOTE(review): the scaler is fitted on every row, and ``__init__`` only
        # splits afterwards, so test-set minima/maxima influence the training
        # features. Fitting on the training rows only would avoid that leak.
        scaler = MinMaxScaler()
        scaled_values = scaler.fit_transform(self.features[self.num_feats])
        # Reuse the original index so the axis=1 concat lines rows up by
        # position even if ``features`` does not carry a default RangeIndex.
        scaled_df = pd.DataFrame(
            scaled_values, columns=self.num_feats, index=self.features.index
        )
        self.features = pd.concat([scaled_df, self.features[self.cat_feats]], axis=1)

    def encode_categorical_features(self):
        """One-hot encode the categorical columns with ``pd.get_dummies``.

        Afterwards ``features`` holds the numerical columns first, followed by
        the generated dummy columns.
        """
        encoded_df = pd.get_dummies(self.features[self.cat_feats])
        num_vars = self.features[self.num_feats]
        self.features = pd.concat([num_vars, encoded_df], axis=1)

    def encode_target(self):
        """Map the raw income labels to 0 ("<=50K") and 1 (">50K").

        Raises
        ------
        ValueError
            If a label is missing or is not one of ``INCOME_LABELS``; previously
            such rows silently became NaN in the target.
        """
        labels = pd.Series(np.ravel(self.target), name="income")
        encoded = labels.map(self.INCOME_LABELS)
        unmapped = encoded.isna()
        if unmapped.any():
            unknown = sorted(set(labels[unmapped].astype(str)))
            raise ValueError(
                "Unexpected income label(s): {}; expected one of {}".format(
                    unknown, sorted(self.INCOME_LABELS)
                )
            )
        self.target = encoded.astype("int64")

    def preprocess(self):
        """Run the full preprocessing pipeline on ``features`` and ``target``."""
        self.log_transformation("capital-loss")
        self.log_transformation("capital-gain")
        self.scale_data()
        self.encode_categorical_features()
        self.encode_target()

    def split_train_test(self):
        """Split ``features``/``target`` into the train and test attributes."""
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.features,
            self.target,
            test_size=self.test_size,
            random_state=self.random_state,
        )

In [4]:
class Model_Selection():
    """Score a list of classifiers with F-beta (beta=0.5) and pick the best one.

    Parameters
    ----------
    data_ : object exposing ``X_train``, ``X_test``, ``y_train`` and ``y_test``
        The X/y attributes are combined with ``pd.concat`` for cross-validation.
    k : int, default 5
        Number of cross-validation folds used by ``run``.

    Attributes
    ----------
    models : list
        Estimators registered through ``add_model``, in insertion order.
    validation_scores : dict
        Class name -> dict with "fscore_mean", "fscore_std" and "fscore_train".
        Two registered models of the same class share a key, so the later one
        overwrites the earlier one.
    fbeta_scores : dict
        Model instance -> F-beta score on the test split.
    best_model
        Model instance with the highest test-split score (set by ``find_best_fit``).
    """

    def __init__(self, data_, k=5):
        self.data = data_
        self.num_folds = k
        self.best_model = None
        self.models = list()
        self.report = dict()
        self.validation_scores = dict()
        self.fbeta_scores = dict()

    def add_model(self, model_):
        """Register an estimator to be evaluated."""
        self.models.append(model_)

    def cross_validate(self, k=None):
        """Cross-validate, fit and score every registered model.

        Parameters
        ----------
        k : int, optional
            Number of folds; defaults to ``self.num_folds``.

        For each model this stores the mean/std of the k-fold F-beta scores and
        the training-split score in ``validation_scores``, and the test-split
        score in ``fbeta_scores``. Each model is left fitted on the training split.
        """
        if k is None:
            k = self.num_folds
        if not self.models:
            # Nothing to evaluate; do not touch ``self.data`` at all.
            return

        scorer = make_scorer(fbeta_score, beta=0.5)

        # The combined data set is the same for every model, so build it once
        # rather than once per loop iteration.
        # NOTE(review): cross-validation runs over train + test rows, so the
        # held-out test split also takes part in these validation scores.
        cross_val_X = pd.concat([self.data.X_train, self.data.X_test])
        cross_val_y = pd.concat([self.data.y_train, self.data.y_test])

        for model in self.models:
            val_score = dict()
            cross_val_results = cross_val_score(
                model, cross_val_X, cross_val_y, cv=k, scoring=scorer
            )
            val_score["fscore_mean"] = np.mean(cross_val_results)
            val_score["fscore_std"] = np.std(cross_val_results)

            model.fit(self.data.X_train, self.data.y_train)
            train_predictions = model.predict(self.data.X_train)
            test_predictions = model.predict(self.data.X_test)
            val_score["fscore_train"] = fbeta_score(
                self.data.y_train, train_predictions, beta=0.5
            )

            self.fbeta_scores[model] = fbeta_score(
                self.data.y_test, test_predictions, beta=0.5
            )
            self.validation_scores[model.__class__.__name__] = val_score

    def summarize_validation(self):
        """Print the stored scores for every registered model."""
        for model_ in self.models:
            model_name = model_.__class__.__name__
            print("{}".format(model_name))
            model_validation_results = self.validation_scores[model_name]
            print("Mean F-beta score: {}".format(model_validation_results["fscore_mean"]))
            print("Standard Deviation of F-beta score: {}".format(model_validation_results["fscore_std"]))
            print("Training Set F-beta score: {}".format(model_validation_results["fscore_train"]))
            print("Test Set F-beta score: {}\n".format(self.fbeta_scores[model_]))

    def run(self):
        """Cross-validate with ``num_folds`` folds, print a summary, select the best model."""
        self.cross_validate(self.num_folds)
        self.summarize_validation()
        self.find_best_fit()

    def find_best_fit(self):
        """Set ``best_model`` to the model with the highest test-split F-beta score.

        Raises
        ------
        ValueError
            If no scores have been recorded yet.
        """
        # NOTE(review): the winner is chosen by its test-split score, so that
        # score is no longer an unbiased estimate for the selected model.
        if not self.fbeta_scores:
            raise ValueError(
                "No test scores recorded; add models and call cross_validate() first."
            )
        self.best_model = max(self.fbeta_scores, key=self.fbeta_scores.get)

In [5]:
if __name__ == "__main__":
    # Seed the stochastic estimators so a fresh "Restart & Run All" reproduces
    # the same scores; 0 matches the default seed of the train/test split.
    RANDOM_STATE = 0

    logistic_regression_model = LogisticRegression()
    gradient_boosting_model = GradientBoostingClassifier(random_state=RANDOM_STATE)
    random_forest_model = RandomForestClassifier(random_state=RANDOM_STATE)

    # Load and preprocess the data set, then evaluate every candidate on it.
    data_ = Data("census.csv", cat_feat_names, num_feat_names)

    model_selector = Model_Selection(data_)
    model_selector.add_model(logistic_regression_model)
    model_selector.add_model(gradient_boosting_model)
    model_selector.add_model(random_forest_model)
    model_selector.run()

LogisticRegression
Mean F-beta score: 0.6889350945662823
Standard Deviation of F-beta score: 0.00355836285111309
Training Set F-beta score: 0.6944049985799489
Test Set F-beta score: 0.6831652282416554

GradientBoostingClassifier
Mean F-beta score: 0.7455970908414289
Standard Deviation of F-beta score: 0.0055289268803982985
Training Set F-beta score: 0.7530470502391698
Test Set F-beta score: 0.7395338561802719

RandomForestClassifier
Mean F-beta score: 0.672867765310291
Standard Deviation of F-beta score: 0.0061673040160426865
Training Set F-beta score: 0.935004513575446
Test Set F-beta score: 0.678261736270308



In [6]:
# Select the best-scoring model again and print the name of its class.
model_selector.find_best_fit()
print(type(model_selector.best_model).__name__)

GradientBoostingClassifier
