Bringing scaled & imputed data as per Brad's feedback!!

In [8]:
%config IPCompleter.greedy=True
import pandas as pd
import numpy as np
import seaborn as sns

#Changing the location of this to maintain portability
#kick_data = pd.read_csv('/Users/chandlervaughn/Dropbox/4. Chandler/Development/Machine Learning/Lab1/kick.csv')
kick_data=pd.read_csv("https://www.dropbox.com/s/q2h8uypvc0et5vj/kick.csv?dl=1")

### 1.3.2 Data Cleanup and Imputation

In [3]:
#drop columns that have too little information to be useful
kick_data.columns = kick_data.columns.str.replace(' ', '')

kick_data = kick_data.drop(['PRIMEUNIT',
                            'AUCGUART', 
                            'PurchDate', 
                            'VehYear',
                            'VNZIP1', 
                            'WheelTypeID', 
                            'Nationality', 
                            #'model_short', 
                            'TopThreeAmericanName', 
                            'Trim', 
                            #'Transmission', 
                            #'Auction', 
                            #'Luxury', 
                            #'Make', 
                            'SubModel',
                            'MMRAcquisitionAuctionCleanPrice',
                            'MMRAcquisitionRetailCleanPrice',
                            'MMRCurrentAuctionAveragePrice',
                            'MMRCurrentAuctionCleanPrice',
                            'MMRCurrentRetailAveragePrice',
                            'MMRCurrentRetailCleanPrice',],axis=1)

# Feature Engineering

In [4]:
# Add Cylinder 
temp=kick_data.Model.fillna("0")
kick_data['Cylinder'] = pd.np.where(temp.str.contains("V6"),"V6",
                   pd.np.where(temp.str.contains("6C"), "V6",
                   pd.np.where(temp.str.contains("4C"), "V4", 
                   pd.np.where(temp.str.contains("8C"), "V8",            
                   pd.np.where(temp.str.contains("V4"), "V4",
                   pd.np.where(temp.str.contains("V8"), "V8", "Unknown"))))))  

# Add Axle
temp1=kick_data.Model.fillna("0")
kick_data['Axle'] = pd.np.where(temp1.str.contains("2WD"),"2WD",
                     pd.np.where(temp1.str.contains("4WD"), "4WD",
                     pd.np.where(temp1.str.contains("FWD"), "FWD",
                     pd.np.where(temp1.str.contains("AWD"), "AWD", "Unknown"))))

# Add Mileage
kick_data.VehOdo=kick_data.VehOdo.fillna("0")
kick_data.VehicleAge=kick_data.VehicleAge.fillna("1")
col1 = 'VehOdo'
col2 = 'VehicleAge'
conditions  = [ (kick_data[col1]/kick_data[col2]) < 10000, ((kick_data[col1]/kick_data[col2]) < 13867) & ((kick_data[col1]/kick_data[col2]) > 10000), (kick_data[col1]/kick_data[col2]) > 13867 ]
choices     = [ "LOW", 'GOOD', 'HIGH' ]
kick_data["Mileage"] = np.select(conditions, choices, default=np.nan)

In [5]:
#should be floats
continuous_features = ['VehOdo', 
                       'MMRAcquisitionAuctionAveragePrice',
                       'MMRAcquisitionRetailAveragePrice',
                       'VehBCost',
                       'WarrantyCost']

#should be int
ordinal_features = ['IsBadBuy', 
                    'VehicleAge', 
                    'BYRNO', 
                    'IsOnlineSale']


#should be categorical - Add Luxury, Axle, and Mileage
categorical_features = ['Auction',
                        'Make',
                        'Model',
                        'Color',
                        'Transmission',
                        'WheelType', 
                        'Size',
                        'VNST',
                        'Axle',
                        'Mileage',
                        'Cylinder']

#replace ? data with Nan so we can properly aggregate
kick_data[continuous_features] = kick_data[continuous_features].replace(to_replace='?', value = -9999999)
kick_data[ordinal_features] = kick_data[ordinal_features].replace(to_replace='?', value = -9999999)
kick_data[categorical_features] = kick_data[categorical_features].replace(to_replace='?', value = 'UNKNOWN')

kick_data[continuous_features] = kick_data[continuous_features].astype(np.float64)
kick_data[ordinal_features] = kick_data[ordinal_features].astype(np.int64)

kick_data[continuous_features] = kick_data[continuous_features].replace(to_replace=-9999999, value = np.nan)
kick_data[ordinal_features] = kick_data[ordinal_features].replace(to_replace=-9999999, value = np.nan)

In [6]:
#Review of overall data quality
def nullcounts(ser):
    return ser.isnull().sum()

def detail_describe(frame, func=['count', nullcounts, 'mean', 'std', 'min', 'median', 'max'],
                    numeric_only=False, **kwargs):
    return frame.agg(func, **kwargs)

detail_describe(kick_data).transpose()

Unnamed: 0,count,nullcounts,mean,std,min,median,max
IsBadBuy,72983,0,0.122988,0.328425,0,0.0,1
Auction,72983,0,,,ADESA,,OTHER
VehicleAge,72983,0,4.17664,1.71221,0,4.0,9
Make,72983,0,,,ACURA,,VOLVO
Model,72983,0,,,1500 RAM PICKUP 2WD,,ZEPHYR 3.0L V6 SFI
Color,72983,0,,,BEIGE,,YELLOW
Transmission,72983,0,,,AUTO,,UNKNOWN
WheelType,72983,0,,,Alloy,,UNKNOWN
VehOdo,72983,0,71500.0,14578.9,4825,73361.0,115717
Size,72983,0,,,COMPACT,,VAN


In [17]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer

#create a max/min scaler
kick_scaled_imputed = kick_data

# we want to predict the X and y data as follows:
if 'IsBadBuy' in kick_scaled_imputed:
    y = kick_scaled_imputed['IsBadBuy'].values # get the labels we want
    del kick_scaled_imputed['IsBadBuy'] # get rid of the class label
    X = kick_scaled_imputed.values # use everything else to predict!

#split with stratification on y
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2, 
                                                    random_state=99, 
                                                    stratify=y)

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])


#numeric_features = kick_scaled_imputed.select_dtypes(include=['int64', 'float64']).columns
numeric_features = [kick_scaled_imputed.columns.get_loc(c) 
                    for c in kick_scaled_imputed.select_dtypes(include=['int64', 'float64']).columns 
                    if c in kick_scaled_imputed]

#categorical_features = kick_scaled_imputed.select_dtypes(include=['object']).columns

categorical_features = [kick_scaled_imputed.columns.get_loc(c) 
                        for c in kick_scaled_imputed.select_dtypes(include=['object']).columns
                        if c in kick_scaled_imputed]

preprocessor = ColumnTransformer(
    transformers=[('num', numeric_transformer, numeric_features),
                  ('cat', categorical_transformer, categorical_features)])

In [27]:
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import fbeta_score, matthews_corrcoef, make_scorer
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

def f_half_score(y_true, y_pred):
    return fbeta_score(y_true, y_pred, beta=0.5, average='weighted', pos_label=1)
                       
param_grid = {'classifier__n_estimators':range(10,500,10),
                    'classifier__max_samples':np.arange(0.4, 0.7, 0.01),
                    'classifier__max_features':np.arange(0.7, 0.95, 0.01), 
                    'classifier__bootstrap':[True],
                    'classifier__bootstrap_features':[True]}

scoring = {'AUC': 'roc_auc', 'MCC': make_scorer(matthews_corrcoef, greater_is_better=True)}

classifiers = [BaggingClassifier()]

#print baseline performance
for classifier in classifiers:
    pipe = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', classifier)])
    pipe.fit(X_train, y_train) 
    predic = pipe.predict(X_test)
    print(classifier)
    print("F0.5 model score: %.3f" % f_half_score(y_test, predic))
    print("MCC model score: %.3f" % matthews_corrcoef(y_test, predic))

bagging = Pipeline(steps=[('preprocessor', preprocessor),
                          ('classifier', classifier)])

grid = RandomizedSearchCV(bagging, param_grid, scoring=scoring, refit='MCC', 
                    return_train_score=True, verbose=2, n_jobs = -1, cv = 5, n_iter = 10)

grid.fit(X_train,y_train)
results = grid.cv_results_
pd.DataFrame(results)

BaggingClassifier(base_estimator=None, bootstrap=True, bootstrap_features=False,
                  max_features=1.0, max_samples=1.0, n_estimators=10,
                  n_jobs=None, oob_score=False, random_state=None, verbose=0,
                  warm_start=False)
F0.5 model score: 0.870
MCC model score: 0.385
Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


KeyboardInterrupt: 

In [25]:
import modin.pandas as pd

ImportError: Please `pip install modin[dask] to install compatible Dask version.

In [26]:
pd.DataFrame(grid.cv_results_)

AttributeError: 'GridSearchCV' object has no attribute 'cv_results_'