In [1]:
#!/usr/bin/env python3

import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator,TransformerMixin
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.externals import joblib
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import Ridge

In [11]:
class Data():
    """Tabular dataset read from a CSV file, plus small preprocessing helpers.

    Attributes:
        data_frame: DataFrame holding the current state of the data.
        feature_names: list of the column names of ``data_frame``.
        cat_features: names of the categorical columns (supplied by the caller).
        num_features: names of the numeric columns (supplied by the caller).
        Xtrain, ytrain, Xtest, ytest: placeholders, initialised to ``None``.
    """

    def __init__(self, csv_file, cat_features, num_features):
        """Load ``csv_file`` with ``pd.read_csv`` and remember the column groups.

        Args:
            csv_file: path (or anything ``pd.read_csv`` accepts) of the CSV file.
            cat_features: list of categorical column names.
            num_features: list of numeric column names.
        """
        self.data_frame = pd.read_csv(csv_file)
        self.feature_names = list(self.data_frame.columns.values)
        self.cat_features = cat_features
        self.num_features = num_features
        self.Xtrain = None
        self.ytrain = None
        self.Xtest = None
        self.ytest = None

    def standardize_column_names(self):
        """Lower-case every column name of ``data_frame`` in place.

        ``feature_names`` is refreshed to the new names.
        """
        # Derive the names from the frame itself rather than from
        # self.feature_names: the cached list can be out of date if the frame's
        # columns were changed, and assigning a list of the wrong length to
        # ``columns`` raises.
        new_column_names = [column_name.lower() for column_name in self.data_frame.columns]
        self.feature_names = new_column_names
        self.data_frame.columns = self.feature_names

    def convert_to_cat_dtype(self):
        """Cast every column listed in ``cat_features`` to pandas ``category`` dtype."""
        for feat in self.cat_features:
            self.data_frame[feat] = self.data_frame[feat].astype("category")

    def scale_numeric_features(self):
        """Standardise the numeric columns with ``StandardScaler``.

        ``data_frame`` is replaced by a frame that holds the scaled
        ``num_features`` columns followed by the untouched ``cat_features``
        columns. Columns in neither list are dropped. ``feature_names`` is
        refreshed to match the new frame.
        """
        numeric_values = self.data_frame[self.num_features].values
        std_scaler = StandardScaler()
        scaled_values = std_scaler.fit_transform(numeric_values)
        # Reuse the frame's own index: pd.concat(axis=1) aligns on index labels,
        # so a fresh RangeIndex would misalign rows (and introduce NaNs) whenever
        # the frame does not carry the default 0..n-1 index.
        num_df = pd.DataFrame(
            scaled_values,
            columns=self.num_features,
            index=self.data_frame.index,
        )
        self.data_frame = pd.concat([num_df, self.data_frame[self.cat_features]], axis=1)
        # Keep the cached column list in step with the rebuilt frame.
        self.feature_names = list(self.data_frame.columns.values)

    def encode_cat_features(self):
        """Return a one-hot encoded copy of the data; ``data_frame`` is not modified.

        Returns:
            A new DataFrame with the ``num_features`` columns followed by the
            indicator columns of each entry of ``cat_features``. Indicator
            columns are named ``<feature>_<value>``.
        """
        frames = [self.data_frame[self.num_features].copy()]
        for cat_feature in self.cat_features:
            # The prefix keeps column names unique when two categorical
            # features share a value.
            frames.append(pd.get_dummies(self.data_frame[cat_feature], prefix=cat_feature))
        # One concat at the end instead of re-concatenating inside the loop.
        return pd.concat(frames, axis=1)
        
            

In [10]:
# Input file and the two column groups handed to Data. All names are lowercase;
# the driver below lower-cases the frame's columns before they are used.
dataset_features = "data/train_features.csv"
cat_features = [
    "companyid",
    "jobtype",
    "degree",
    "major",
    "industry",
]
num_features = [
    "milesfrommetropolis",
    "yearsexperience",
]

if __name__ == "__main__":
    # Load the CSV, normalise the column names, then standardise the numeric columns.
    data = Data(dataset_features, cat_features, num_features)
    data.standardize_column_names()
    data.scale_numeric_features()
    # NOTE(review): the next line only looks up the bound method. There are no
    # call parentheses, so no encoding is performed and nothing is stored.
    data.encode_cat_features
    print(data.data_frame)
    

        milesfrommetropolis  yearsexperience companyid         jobtype  \
0                  1.159051        -0.276245    COMP37             CFO   
1                  0.812763        -1.246797    COMP19             CEO   
2                 -0.399244        -0.276245    COMP52  VICE_PRESIDENT   
3                 -1.126448        -0.553546    COMP38         MANAGER   
4                 -1.161077        -0.553546     COMP7  VICE_PRESIDENT   
5                 -0.641646        -1.385448    COMP15         MANAGER   
6                 -0.884047         1.526210    COMP15             CFO   
7                  0.708877        -0.414895    COMP24          JUNIOR   
8                  0.154816        -1.524098    COMP20         JANITOR   
9                  0.639619         0.694307    COMP41  VICE_PRESIDENT   
10                -0.676274         1.664860    COMP56         JANITOR   
11                 1.020536        -0.692196     COMP7             CEO   
12                -0.710903        -0.

