In [1]:
import pandas as pd
import numpy as np

In [2]:
from sample_ml.utils.utility import get_datasets

In [3]:
data = get_datasets().get_train_test_dataset()

In [4]:
train_df, test_df = data[0]

In [5]:
train_df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-121.46,38.52,29.0,3873.0,797.0,2237.0,706.0,2.1736,72100.0,INLAND
1,-117.23,33.09,7.0,5320.0,855.0,2015.0,768.0,6.3373,279600.0,NEAR OCEAN
2,-119.04,35.37,44.0,1618.0,310.0,667.0,300.0,2.875,82700.0,INLAND
3,-117.13,32.75,24.0,1877.0,519.0,898.0,483.0,2.2264,112500.0,NEAR OCEAN
4,-118.7,34.28,27.0,3536.0,646.0,1837.0,580.0,4.4964,238300.0,<1H OCEAN


In [6]:
train_df.shape

(16512, 10)

In [7]:
from sample_ml.config.configuration import Configuration
from sample_ml.utils.utility import read_yaml
config = Configuration().get_data_validation_config()

In [8]:
schema = read_yaml(config.schema_file_path)

In [9]:
target_column = schema['target_column']

In [10]:
X, y = train_df.drop(columns=[target_column]), train_df[target_column]

In [11]:
X.shape

(16512, 9)

In [12]:
y.shape

(16512,)

In [33]:
X.isna().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        158
population              0
households              0
median_income           0
ocean_proximity         0
dtype: int64

In [14]:
from sklearn.impute import SimpleImputer

Apply simple imputer on all columns because in future we can get missing values in other columns also.

In [15]:
simple_imputer = SimpleImputer(strategy='median')

In [34]:
numerical_cols = X.select_dtypes(exclude=['object']).columns
categorical_cols = X.select_dtypes(include=['object']).columns

In [35]:
numerical_cols

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income'],
      dtype='object')

In [37]:
numerical_cols_transformed = simple_imputer.fit_transform(X[numerical_cols])

In [38]:
simple_imputer.feature_names_in_

array(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income'],
      dtype=object)

In [39]:
# statistics values
simple_imputer.statistics_

array([-118.51   ,   34.26   ,   29.     , 2119.     ,  433.     ,
       1164.     ,  408.     ,    3.54155])

In [21]:
np.median(X.longitude)

-118.51

In [22]:
cat_impute = SimpleImputer(strategy='most_frequent')

In [40]:
categorical_cols_transformed = cat_impute.fit_transform(X[categorical_cols])

In [41]:
categorical_cols_transformed

array([['INLAND'],
       ['NEAR OCEAN'],
       ['INLAND'],
       ...,
       ['<1H OCEAN'],
       ['<1H OCEAN'],
       ['INLAND']], dtype=object)

## Custom Transformer

Custom Transformer basic structure

In [24]:
# from sklearn.base import BaseEstimator, TransformerMixin
# class FreatureGenerator(BaseEstimator, TransformerMixin):
#     def __init__(self, strategy:str='mean') -> None:
#         pass

#     def fit(self, X):
#         self.features_ = X.columns
#         self.statistics_ = []
#         for column in X.columns:
#             self.statistics_.append(X[column].median())

#     def transform(self, X):
#         pass

#     def fit_transform(self, X):
#         pass


In [42]:
X.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'ocean_proximity'],
      dtype='object')

In [62]:
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd
import numpy as np



COLUMN_TOTAL_ROOMS = "total_rooms"
COLUMN_POPULATION = "population"
COLUMN_HOUSEHOLDS = "households"
COLUMN_TOTAL_BEDROOM = "total_bedrooms"

class FeatureGenerator(BaseEstimator, TransformerMixin):
    def __init__(self,
                 columns: pd.Index = None,
                 add_bedrooms_per_rooms: bool = True,
                 total_bedrooms_idx: int = 3,
                 total_rooms_idx: int = 4,
                 population_idx: int = 5,
                 households_idx: int=6
                 ):
        
        self.add_bedrooms_per_rooms = add_bedrooms_per_rooms
        self.columns = columns
        if self.columns is not None:
            total_bedrooms_idx = self.columns.get_loc(COLUMN_TOTAL_BEDROOM)
            total_rooms_idx = self.columns.get_loc(COLUMN_TOTAL_ROOMS)
            population_idx = self.columns.get_loc(COLUMN_POPULATION)
            households_idx = self.columns.get_loc(COLUMN_HOUSEHOLDS)
        
        self.total_bedrooms_idx = total_bedrooms_idx
        self.total_rooms_idx = total_rooms_idx
        self.population_idx = population_idx
        self.households_idx = households_idx


    def fit(self, X):
        return self

    def transform(self, X):
        
        X = np.array(X)
        room_per_household = X[:, self.total_rooms_idx] / X[:, self.households_idx]
        print('second step pass')
        population_per_household = X[:, self.population_idx] / X[:, self.households_idx]
        print('checking if statement')
        if self.add_bedrooms_per_rooms:
            print('inside if statement , third step')
            bedrooms_per_rooms = X[:, self.total_bedrooms_idx] / X[:, self.total_rooms_idx]
            print('Creating features')
            generated_feature = np.c_[X, room_per_household, population_per_household, bedrooms_per_rooms]
        else:
            print('inside else, creating features')
            generated_feature = np.c_[X, room_per_household, population_per_household]
        
        print('saving extran cols to added columns')
        self.added_columns = pd.Index(['room_per_household', 'population_per_household', 'bedroom_per_room'])

        print('returning')

        return generated_feature


In [63]:
feature = FeatureGenerator(X.columns)

In [46]:
obj = feature.fit(X)

In [64]:
transformed = feature.transform(X)

second step pass
checking if statement
inside if statement , third step
Creating features
saving extran cols to added columns
returning


In [65]:
transformed

array([[-121.46, 38.52, 29.0, ..., 5.485835694050992, 3.168555240793201,
        0.20578363026077975],
       [-117.23, 33.09, 7.0, ..., 6.927083333333333, 2.6236979166666665,
        0.16071428571428573],
       [-119.04, 35.37, 44.0, ..., 5.3933333333333335, 2.223333333333333,
        0.1915945611866502],
       ...,
       [-122.72, 38.44, 48.0, ..., 4.1104651162790695,
        2.6627906976744184, 0.2347949080622348],
       [-122.7, 38.31, 14.0, ..., 6.297405189620759, 2.411177644710579,
        0.1838351822503962],
       [-122.14, 39.97, 27.0, ..., 5.477157360406092, 3.1725888324873095,
        0.2057460611677479]], dtype=object)

In [None]:
a = np.arange(10)
b = np.arange(10, 20)

In [None]:
np.c_[np.array([[1,2,3]]), 0, 0, np.array([[4,5,6]])]

array([[1, 2, 3, 0, 0, 4, 5, 6]])

In [None]:
cols = X.columns

In [None]:
cols.append(pd.Index(['a', 'b']))

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'ocean_proximity', 'a', 'b'],
      dtype='object')

# Pipeline

In [66]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

In [67]:
numerical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('features', FeatureGenerator()),
    ('scale', StandardScaler())
])

In [68]:
categorical_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder())
])

In [69]:
numerical_columns = X.select_dtypes(exclude = ['object']).columns
categorical_columns = X.select_dtypes(include = ['object']).columns

In [70]:
preprocessor = ColumnTransformer(
    transformers=[
        ('numerical-pipeline', numerical_pipeline, numerical_columns),
        ('categorical-pipeline', categorical_pipeline, categorical_columns)
    ]
)

In [71]:
preprocessor.fit(X)

second step pass
checking if statement
inside if statement , third step
Creating features
saving extran cols to added columns
returning


In [72]:
transformed = preprocessor.transform(X)

second step pass
checking if statement
inside if statement , third step
Creating features
saving extran cols to added columns
returning


In [None]:
pd.DataFrame(transformed)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,-0.941350,1.347438,0.027564,0.584777,0.640371,0.732602,0.556286,-0.893647,0.046298,0.006223,-0.109813,0.0,1.0,0.0,0.0,0.0
1,1.171782,-1.192440,-1.722018,1.261467,0.781561,0.533612,0.721318,1.292168,0.018269,-0.040811,0.993655,0.0,0.0,0.0,0.0,1.0
2,0.267581,-0.125972,1.220460,-0.469773,-0.545138,-0.674675,-0.524407,-0.525434,-0.125247,-0.075371,0.181596,0.0,1.0,0.0,0.0,0.0
3,1.221738,-1.351474,-0.370069,-0.348652,-0.036367,-0.467617,-0.037297,-0.865929,-0.051287,-0.106803,-1.116237,0.0,0.0,0.0,0.0,1.0
4,0.437431,-0.635818,-0.131489,0.427179,0.272790,0.374060,0.220898,0.325752,0.019188,0.006109,0.387536,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16507,1.251711,-1.220505,-1.165333,1.890456,1.696862,0.543471,1.341519,0.637374,0.227380,-0.092580,0.339448,1.0,0.0,0.0,0.0,0.0
16508,-0.921368,1.342761,-1.085806,2.468471,2.161816,3.002174,2.451492,-0.557509,-0.180020,0.023024,0.454400,0.0,1.0,0.0,0.0,0.0
16509,-1.570794,1.310018,1.538566,-0.895802,-0.895679,-0.862013,-0.865118,-0.365475,-0.247704,-0.037436,-0.596009,1.0,0.0,0.0,0.0,0.0
16510,-1.560803,1.249211,-1.165333,0.249005,0.112126,-0.189747,0.010616,0.168261,0.097978,-0.059156,0.359982,1.0,0.0,0.0,0.0,0.0


In [None]:
import joblib

In [None]:
# joblib.dump(preprocessor, 'compressed-preprocessor.joblib', compress=True)
# joblib.dump(preprocessor, 'not-compressed-preprocessor.joblib', compress=False)

['not-compressed-preprocessor.joblib']

In [None]:
# compressed_preprocessor = joblib.load('compressed-preprocessor.joblib')
# not_compressed_preprocessor = joblib.load('not-compressed-preprocessor.joblib')

In [None]:
# compressed_preprocessor == preprocessor

False

In [None]:
# compressed_preprocessor is preprocessor

False

In [None]:
# not_compressed_preprocessor == preprocessor

False

In [None]:
# not_compressed_preprocessor is preprocessor

False

In [None]:
# import pickle

# with open('preprocessor.pkl', 'wb') as pre_file:
#     pickle.dump(preprocessor, pre_file)

In [None]:
# with open('preprocessor.pkl', 'rb') as pre_file:
#     pickle_preprocessor = pickle.load(pre_file)

In [None]:
# pickle_preprocessor == preprocessor

False

In [None]:
# os.system('du -h not-compressed-preprocessor.joblib')

8.0K	not-compressed-preprocessor.joblib


0

In [None]:
# os.system('du -h preprocessor.pkl')

4.0K	preprocessor.pkl


0

In [None]:
from sklearn.linear_model import LinearRegression

lr = LinearRegression()

lr.fit

In [73]:
X

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
0,-121.46,38.52,29.0,3873.0,797.0,2237.0,706.0,2.1736,INLAND
1,-117.23,33.09,7.0,5320.0,855.0,2015.0,768.0,6.3373,NEAR OCEAN
2,-119.04,35.37,44.0,1618.0,310.0,667.0,300.0,2.8750,INLAND
3,-117.13,32.75,24.0,1877.0,519.0,898.0,483.0,2.2264,NEAR OCEAN
4,-118.70,34.28,27.0,3536.0,646.0,1837.0,580.0,4.4964,<1H OCEAN
...,...,...,...,...,...,...,...,...,...
16507,-117.07,33.03,14.0,6665.0,1231.0,2026.0,1001.0,5.0900,<1H OCEAN
16508,-121.42,38.51,15.0,7901.0,1422.0,4769.0,1418.0,2.8139,INLAND
16509,-122.72,38.44,48.0,707.0,166.0,458.0,172.0,3.1797,<1H OCEAN
16510,-122.70,38.31,14.0,3155.0,580.0,1208.0,501.0,4.1964,<1H OCEAN
