In [1]:
##############################  variables ####################################  

# wage               : wage, lwage
# gender             : sex
# race               : white, black, hisp
# education          : shs, hsg, scl, clg
# region             : mw, so, we
# union membership   : union
# veteran status     : vet
# city               : cent, ncent
# family size        : fam1, fam2, fam3
# having children    : child
# foreign born       : fborn
# citizenship        : cit
# school attandence  : school
# pension            : pens
# firm size          : fsize10, fsize100
# health status      : health
# age                : age
# experience         : exp1, exp2, exp3, exp4
# occupation         : occ(factor with 456 levels), occ2(factor with 22 levels-aggregated)
# industry           : ind(factor with 257 levels), ind2(factor with 23 levels-aggregated)

###############################################################################    


In [2]:
# Importing Libraries
import numpy as np
import pandas as pd
from time import time
from IPython.display import display # Allows the use of display() for DataFrames

# Pretty display for notebooks
%matplotlib inline



## Getting Data

In [3]:
# Load the wholesale customers dataset
try:
    data = pd.read_csv("data.csv", index_col=0)

    print("Dataset has {} samples with {} features each.".format(*data.shape))
except:
    print("Dataset could not be loaded. Is the dataset missing?")

Dataset has 12697 samples with 37 features each.


In [4]:
data_features = data.drop(['wage', 'lwage'],  axis=1)

data_features.head(5)

Unnamed: 0,sex,white,black,hisp,shs,hsg,scl,clg,mw,so,...,health,age,exp1,exp2,exp3,exp4,occ,occ2,ind,ind2
1,0,1,0,0,0,0,0,1,0,0,...,1,31,7.0,0.49,0.343,0.2401,3600.0,11,8370.0,18
2,1,1,0,0,0,0,0,1,0,0,...,1,55,31.0,9.61,29.791,92.3521,3050.0,10,5070.0,9
3,1,1,0,0,0,1,0,0,0,0,...,1,38,18.0,3.24,5.832,10.4976,6260.0,19,770.0,4
4,1,1,0,0,0,0,1,0,0,0,...,1,35,13.0,1.69,2.197,2.8561,6050.0,18,170.0,1
5,0,1,0,0,0,0,0,0,0,0,...,0,51,25.0,6.25,15.625,39.0625,420.0,1,6990.0,12


## Data Exploration

In [5]:
# Display a description of the dataset
display(data.describe())

Unnamed: 0,wage,lwage,sex,white,black,hisp,shs,hsg,scl,clg,...,health,age,exp1,exp2,exp3,exp4,occ,occ2,ind,ind2
count,12697.0,12697.0,12697.0,12697.0,12697.0,12697.0,12697.0,12697.0,12697.0,12697.0,...,12697.0,12697.0,12697.0,12697.0,12697.0,12697.0,12697.0,12697.0,12697.0,12697.0
mean,18.145961,2.730945,0.542805,0.69607,0.189179,0.222887,0.064661,0.308734,0.301567,0.236513,...,0.71765,33.703158,12.064897,2.76238,8.145416,27.794773,8440.043554,12.966449,7771.903836,13.266598
std,10.466483,0.625985,0.498184,0.459971,0.391666,0.4162,0.245936,0.461989,0.458956,0.424957,...,0.450161,11.34714,11.431822,4.490614,19.186165,91.651419,19443.325548,6.647292,11658.013997,5.902888
min,0.001923,-6.253829,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,16.0,0.0,0.0,0.0,0.0,10.0,1.0,170.0,1.0
25%,10.576923,2.358675,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,25.0,3.0,0.09,0.027,0.0081,2340.0,8.0,4970.0,9.0
50%,15.384615,2.733368,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,30.0,8.0,0.64,0.512,0.4096,4700.0,15.0,7380.0,14.0
75%,23.237179,3.145754,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,...,1.0,40.0,19.0,3.61,6.859,13.0321,6220.0,17.0,8290.0,18.0
max,58.119658,4.062504,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,85.0,65.5,42.9025,281.011375,1840.624506,100000.0,22.0,100000.0,23.0


## Preprocessing

In [6]:
# Import sklearn.preprocessing.StandardScaler
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
# Normalizing Numerical Features
data_features_scaled = pd.DataFrame(scaler.fit_transform(data_features), index=data_features.index, columns=data_features.columns)
data_features_scaled.head(5)

Unnamed: 0,sex,white,black,hisp,shs,hsg,scl,clg,mw,so,...,health,age,exp1,exp2,exp3,exp4,occ,occ2,ind,ind2
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.217391,0.10687,0.011421,0.001221,0.00013,0.035904,0.47619,0.08214,0.772727
2,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.565217,0.473282,0.223996,0.106014,0.050174,0.030403,0.428571,0.049083,0.363636
3,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.318841,0.274809,0.07552,0.020754,0.005703,0.062506,0.857143,0.00601,0.136364
4,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.275362,0.198473,0.039392,0.007818,0.001552,0.060406,0.809524,0.0,0.0
5,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.507246,0.381679,0.145679,0.055603,0.021222,0.0041,0.0,0.068316,0.5


In [7]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import LabelEncoder

#Getting features and target
wages = LabelEncoder().fit_transform(data['lwage'])
features = data_features_scaled


#Quadratic - polynomial features
polynomial_features = PolynomialFeatures(degree=2)
features_poly = polynomial_features.fit_transform(features)

features.head(5)

Unnamed: 0,sex,white,black,hisp,shs,hsg,scl,clg,mw,so,...,health,age,exp1,exp2,exp3,exp4,occ,occ2,ind,ind2
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.217391,0.10687,0.011421,0.001221,0.00013,0.035904,0.47619,0.08214,0.772727
2,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.565217,0.473282,0.223996,0.106014,0.050174,0.030403,0.428571,0.049083,0.363636
3,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.318841,0.274809,0.07552,0.020754,0.005703,0.062506,0.857143,0.00601,0.136364
4,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.275362,0.198473,0.039392,0.007818,0.001552,0.060406,0.809524,0.0,0.0
5,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.507246,0.381679,0.145679,0.055603,0.021222,0.0041,0.0,0.068316,0.5


In [8]:
from sklearn.model_selection import train_test_split

# Split the 'features' and 'wage' data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, 
                                                    wages, 
                                                    test_size = 0.2, 
                                                    random_state = 0)

# Split the 'features' and 'wage' polynomial
X_train_poly, X_test_poly, y_train_poly, y_test_poly = train_test_split(features_poly, 
                                                    wages, 
                                                    test_size = 0.2, 
                                                    random_state = 0)

In [9]:
from sklearn.metrics import accuracy_score, log_loss, mean_squared_error
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import LinearRegression, LogisticRegression, Lasso, Ridge, LassoLars, ElasticNet, LassoLars
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import cross_validate


from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

 
classifiers =   {
                    'LinearRegression': LinearRegression(),
                    'LinearRegressionPoly': LinearRegression(),
                    'Lasso': Lasso(alpha=1),
                    'LassoLars': LassoLars(),
                    'Ridge': Ridge(alpha=0),
#                     'ElasticNet': ElasticNet(alpha=.5),
                    'RandomForest': RandomForestClassifier(n_estimators=2000, max_depth=1000, min_samples_leaf=5),
#                     'BoostingTree': GradientBoostingClassifier(n_estimators=1000, max_depth=2),
                    'MLPClassifier': MLPClassifier(hidden_layer_sizes=(5,), max_iter=1000)
                    
                }
results = {}

for classifier in classifiers:
    pipe = Pipeline(steps=[('classifier', classifiers[classifier])])
    results[classifier] = {}
    if (classifier == 'LinearRegressionPoly'):
        pipe.fit(X_train_poly, y_train_poly)
        y_train_pred_poly = pipe.predict(X_train_poly)
        y_test_pred_poly = pipe.predict(X_test_poly)
        
        results[classifier]['model score'] = pipe.score(X_test_poly, y_test_poly)
        results[classifier]['mean square'] = cross_validate(pipe, X_train_poly, y_train_poly)['test_score'].mean()
        results[classifier]['Train RMSE'] = mean_squared_error(y_train_poly, y_train_pred_poly, squared=False)
        results[classifier]['Test RMSE'] = mean_squared_error(y_test_poly, y_test_pred_poly, squared=False)
    
    else:
        pipe.fit(X_train, y_train)
        y_train_pred = pipe.predict(X_train)
        y_test_pred = pipe.predict(X_test)
        results[classifier]['model score'] = pipe.score(X_test, y_test)
        results[classifier]['mean square'] = cross_validate(pipe, X_train, y_train)['test_score'].mean()
        results[classifier]['Train RMSE'] = mean_squared_error(y_train, y_train_pred, squared=False)
        results[classifier]['Test RMSE'] = mean_squared_error(y_test, y_test_pred, squared=False)
  




In [10]:
df_results = pd.DataFrame.from_dict(results, orient='index').sort_values(['model score'], ascending=False)
df_results

Unnamed: 0,model score,mean square,Train RMSE,Test RMSE
LinearRegression,0.326885,0.374849,344.06683,350.631394
Ridge,0.326885,0.374849,344.06683,350.631394
Lasso,0.323927,0.367522,346.362801,351.400973
LinearRegressionPoly,0.297578,0.23736,325.528692,358.183136
LassoLars,0.100353,0.138193,413.152868,405.361373
RandomForest,0.094882,0.082209,108.455552,401.047029
MLPClassifier,0.057087,0.060746,391.099034,398.165118
