In [1]:
import pandas as pd
data = pd.read_csv(r'/home/awantik/Downloads/allstate_train.csv/train.csv', index_col='id')

In [2]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 188318 entries, 1 to 587633
Columns: 131 entries, cat1 to loss
dtypes: float64(15), object(116)
memory usage: 189.7+ MB


In [3]:
data.head()

Unnamed: 0_level_0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9,cat10,...,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,cont14,loss
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,A,B,A,B,A,A,A,A,B,A,...,0.718367,0.33506,0.3026,0.67135,0.8351,0.569745,0.594646,0.822493,0.714843,2213.18
2,A,B,A,A,A,A,A,A,B,B,...,0.438917,0.436585,0.60087,0.35127,0.43919,0.338312,0.366307,0.611431,0.304496,1283.6
5,A,B,A,A,B,A,A,A,B,B,...,0.289648,0.315545,0.2732,0.26076,0.32446,0.381398,0.373424,0.195709,0.774425,3005.09
10,B,B,A,B,A,A,A,A,B,A,...,0.440945,0.391128,0.31796,0.32128,0.44467,0.327915,0.32157,0.605077,0.602642,939.85
11,A,B,A,B,A,A,A,A,B,B,...,0.178193,0.247408,0.24564,0.22089,0.2123,0.204687,0.202213,0.246011,0.432606,2763.85


In [4]:
sample_data = data.sample(10000)

In [5]:
target_col = sample_data.loss

In [6]:
feature_data = sample_data.drop(columns=['loss'])

In [7]:
cat_cols = list(filter(lambda x:x.startswith('cat'), feature_data.columns.tolist()))

In [8]:
cat_feature_data = feature_data[cat_cols]

In [9]:
cont_cols = list(filter(lambda x:x.startswith('cont'), feature_data.columns.tolist()))

In [10]:
cont_feature_data = feature_data[cont_cols]

In [11]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler, LabelEncoder, OrdinalEncoder
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import GridSearchCV

In [57]:
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from joblib import dump, load

class BuildMlPipeline:
    
    def __init__(self):
        pass
        
    def set_estimators(self, *args):
        estimator_db = {
            'randomForestRegressor': RandomForestRegressor(),
            'linearRegressor': LinearRegression(),
        }
        self.estimators = list(map( lambda algo: estimator_db[algo],args))
        
    def set_scalers(self, *args):
        scaler_db = {
            'standardscaler':StandardScaler(),
            'minmaxscaler':MinMaxScaler(),
        }
        self.scalers = list(map( lambda scaler: scaler_db[scaler],args))
        
    def set_samplers(self, *args):
        sampler_db = {
            'smote':SMOTE(),
            'smoteenn':SMOTEENN(),
        }
        self.samplers = list(map( lambda sampler: sampler_db[sampler],args))
        
    def set_encoders(self, *args):
        encoders_db = {
            'ohe':OneHotEncoder(handle_unknown='ignore'),
            'oe':OrdinalEncoder(),
        }
        self.encoders = list(map( lambda encoder: encoders_db[encoder],args))
        
    def set_hyperparameters(self, params):
        self.hyperparameters = params

    
    def create_pipelines(self, cat_cols, cont_cols):
        self.model_pipelines = []
        for scaler in self.scalers:
            pipeline_num = Pipeline(steps=[('imputer', SimpleImputer(strategy='median')),
                                           ('scaling',scaler)])
            for encoder in self.encoders:
                pipeline_cat = Pipeline(steps=[('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
                                               ('encoder',encoder)])
                preprocessor = make_column_transformer((pipeline_num, cont_cols),(pipeline_cat, cat_cols))
                
                for estimator in self.estimators:
                    pipeline  = make_pipeline(preprocessor, estimator)
                    self.model_pipelines.append(pipeline)
        
    
    def fit(self, trainX, trainY):
        self.gs_pipelines = []
        for idx,pipeline in enumerate(self.model_pipelines):
            elems = list(map(lambda x:x[0] ,pipeline.steps))
            param_grid = {}

            for elem in elems:
                if elem.lower() in self.hyperparameters:
                    param_grid.update(self.hyperparameters[elem])
            
            gs = GridSearchCV(pipeline, param_grid= param_grid, n_jobs=6, cv=5)
            gs.fit(trainX, trainY)
            print (gs.score(testX,testY),  list(map(lambda x:x[0] , gs.best_estimator_.steps)), gs.best_params_)
            
            dump(gs, 'model'+str(idx)+'.pipeline') 
            self.gs_pipelines.append(gs)
      
        
    def score(self, testX, testY):
        for idx,model in enumerate(self.gs_pipelines):
            y_pred = model.best_estimator_.predict(testX)
            print (model.best_estimator_)
            print (idx,confusion_matrix(y_true=testY,y_pred=y_pred))

In [58]:
from sklearn.model_selection import train_test_split

In [59]:
feature_data.shape

(10000, 130)

In [60]:
target_col.shape

(10000,)

In [61]:
trainX,testX, trainY, testY = train_test_split(feature_data,target_col, test_size=.10)

In [62]:
trainX.shape

(9000, 130)

In [63]:
ml_pipeline = BuildMlPipeline()

In [64]:
ml_pipeline.set_estimators('randomForestRegressor', 'linearRegressor')

In [65]:
ml_pipeline.set_encoders('oe')

In [66]:
ml_pipeline.set_scalers('standardscaler','minmaxscaler')

In [67]:
params_dict = {}
params_dict['randomforestregressor'] = {'randomforestregressor__n_estimators':[100]}
params_dict['linearregression'] = {'linearregression__normalize':[True,False]}
ml_pipeline.set_hyperparameters(params_dict)

In [69]:
ml_pipeline.create_pipelines(cat_cols=cat_cols, cont_cols=cont_cols)

In [70]:
ml_pipeline.fit(feature_data, target_col)

ValueError: Found unknown categories ['G'] in column 101 during transform