In [1]:
#!/usr/bin/env python3

import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator,TransformerMixin
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.externals import joblib
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV

import warnings
warnings.filterwarnings('ignore')

In [2]:
class Data():
    def __init__(self, train_csv, train_target_csv, cat_features, num_features, target, scaled=False):
        self.cat_features = cat_features
        self.num_features = num_features
        self.target_label = target
        self.data_frame = self.read_data(train_csv, train_target_csv)
        self.scaling = scaled
        self.target_data = self.data_frame.salary.copy()
        self.Xtrain = None
        self.ytrain = None
        self.Xtest = None
        self.ytest = None
        self.pre_process()
        self.split_data()

    def convert_to_cat_dtype(self):
        for feat in self.cat_features:
            self.data_frame[feat] = self.data_frame[feat].astype("category")
    
    def scale_numeric_features(self):
        num_data = self.data_frame[self.num_features].values
        std_scaler = StandardScaler()
        scaled_values = std_scaler.fit_transform(num_data)
        num_df = pd.DataFrame(scaled_values, columns=self.num_features)
        test= pd.concat([num_df, self.data_frame[self.cat_features]], axis=1)
        
    def encode_cat_features(self):
        new_df = self.data_frame[self.num_features].copy()
        encoded_output = pd.get_dummies(self.data_frame[self.cat_features])
        new_df = pd.concat([new_df, encoded_output], axis=1)
        self.data_frame = new_df
        
    def get_target(self):
        return self.data_frame.loc["salary"].copy()
            
    def clean_data(self, df):
        total_df = df.copy()
        total_df.drop_duplicates(inplace=True)
        clean_df = total_df[total_df.salary >0].copy()
        return clean_df

    def pre_process(self):
        self.convert_to_cat_dtype()
        if(self.scaling == True):
            self.scale_numeric_features()
        self.encode_cat_features()
        
    def read_data(self, csv1, csv2):
        df = pd.read_csv(csv1)
        df2 = pd.read_csv(csv2)
        total_df = pd.merge(df, df2, how="inner", on="jobId")
        clean_df = self.clean_data(total_df)
        return clean_df
    
    def split_data(self):
        consolidated_df = pd.concat([self.data_frame, self.target_data],axis=1)
        train, test = train_test_split(consolidated_df, test_size=0.2, random_state = 42)
        self.Xtrain = train.iloc[:, :-1]
        self.ytrain = train.loc[:,"salary"]#.values.reshape(-1,1)#, columns=["salary"])
        self.Xtest = test.iloc[:, :-1]
        self.ytest = test.loc[:,"salary"]#.values.reshape(-1,1), columns=["salary"])


class Models:
    def __init__(self, data, scaled_data=None):
        if scaled_data != None:
            self.Xtrain_scaled = scaled_data.Xtrain
            self.Xtest_scaled = scaled_data.Xtest
        self.Xtrain = data.data_frame
        self.ytrain = data.target_data
        self.Xtest = data.Xtest
        self.ytest = data.ytest
        self.models = []
        self.mse = {}
        self.cross_val_scores = {}
        self.predictions = None
        self.best_model = None    
    
    def add_model(self, model):
        self.models.append(model)
            
    def cross_validate(self, k):
        for model in self.models:
#             model.fit(self.Xtrain, self.ytrain)
#             self.mse[model.__class__.__name__] = mean_squared_error(self.ytest, model.predict(self.Xtest))
            cross_evaluation = cross_val_score(model, self.Xtrain, self.ytrain, cv=k, scoring="neg_mean_squared_error")
            score = np.mean(np.negative(cross_evaluation))
            self.cross_val_scores[model.__class__.__name__] = score
        
    def select_best_model(self):
        self.best_model = min(self.cross_val_scores, key=self.cross_val_scores.get)
        
    def summarize(self):
        for model in self.models:
            print(model.__class__.__name__)
            print("Mean Squared Error before cross validation: {}".format(self.mse[model.__class__.__name__]))
            print("Mean Squared Error after cross validation: {}".format(self.cross_val_scores[model.__class__.__name__]))
            print("\n")
        print("Best Model is: {}".format(self.best_model))
            
    def hyper_param_tune(self, model, params):
        grid_search = GridSearchCV(model, params, cv=5, scoring = "neg_mean_squared_error")
        grid_search.fit(self.Xtrain, self.ytrain)
        return grid_search      
            
    def run(self):
        self.cross_validate(5)
        self.select_best_model()
        self.summarize()       

In [3]:
dataset_features = "data/train_features.csv"
dataset_target = "data/train_salaries.csv"
cat_features = ["companyId", "jobType", "degree", "major", "industry"]
num_features = ["milesFromMetropolis", "yearsExperience"]

lin_alg = LinearRegression()
# dec_tree = DecisionTreeRegressor(max_leaf_nodes=6, max_depth, max_features = 40)
rfr = RandomForestRegressor(n_estimators=100, max_leaf_nodes=10)
gbr = GradientBoostingRegressor(max_depth=2, n_estimators=3, learning_rate=1.0)

param_grid = [{'n_estimators':[3,10,30], 'max_features':[10,30,40,50,60]},
             {"bootstrap":[False], 'n_estimators':[3,10], 'max_features':[5,20, 50,80]}]

if __name__ == '__main__':
    data = Data(dataset_features, dataset_target,cat_features, num_features, "salary", scaled=False)
    models = Models(data)
    models.add_model(lin_alg)
    models.add_model(ridge_reg)
    models.add_model(rfr)
    models.add_model(gbr)
    models.run()


In [4]:
rfr = RandomForestRegressor(n_estimators=100, max_features=25, max_depth=30, min_samples_split=70)
gbr = GradientBoostingRegressor(max_depth=5, n_estimators=150)
data_two = Data(dataset_features, dataset_target,cat_features, num_features, "salary")
models = Models(data_two)
# models.add_model(lin_alg)
#     models.add_model(ridge_reg)
models.add_model(rfr)
models.add_model(gbr)
models.run()

RandomForestRegressor


KeyError: 'RandomForestRegressor'

In [9]:
print(models.cross_val_scores["RandomForestRegressor"])

366.1981194984496


In [None]:
for mod