In [1]:
import numpy as np 
import pandas as pd 
import matplotlib as plt 
import seaborn as sns
import sklearn 

In [2]:
# Load the housing dataset; the path is relative to the notebook's working directory.
df = pd.read_csv("housing.csv")

In [3]:
# Display the loaded frame (pandas shows only the first and last rows of a long frame).
df

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,INLAND
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND


In [4]:
# Bucket the continuous median_income column into five ordinal strata so the
# train/test split can be stratified on income level. The last bin is
# open-ended (upper edge is +inf), so arbitrarily high incomes land in label 5.
income_bin_edges = [0.0, 1.5, 3.0, 4.5, 6.0, np.inf]
income_bin_labels = [1, 2, 3, 4, 5]
df["median_income_categories"] = pd.cut(
    df["median_income"],
    bins=income_bin_edges,
    labels=income_bin_labels,
)

In [5]:
from sklearn.model_selection import StratifiedShuffleSplit

In [6]:
# Prepare data: stratified train/test split

In [7]:
# Generate 10 stratified shuffles (20% of rows held out as the test set in each),
# stratified on the binned income column, and keep every [train, test] pair.
sss = StratifiedShuffleSplit(n_splits=10, test_size=0.2, random_state=40)
stratified_split = []
for train_i, test_i in sss.split(df, df["median_income_categories"]):
    stratified_split.append([df.iloc[train_i], df.iloc[test_i]])

# Later cells work with the pair produced by the final iteration, so bind
# those names explicitly to the last stored split.
stratified_train_set_n, stratified_test_set_n = stratified_split[-1]

In [8]:
# The income-category column only existed to drive stratification; remove it
# from both halves of the split. drop() returns new frames, so the stored
# splits themselves are left untouched.
str_train_set = stratified_train_set_n.drop(columns=["median_income_categories"])
str_test_set = stratified_test_set_n.drop(columns=["median_income_categories"])

In [9]:
# Work with the training split from here on. This binds a second name to the
# same DataFrame object; it does not copy it.
train = str_train_set

In [10]:
# Separate the prediction target (median_house_value) from the predictors.
train_target = train["median_house_value"]
train_features = train.drop(columns=["median_house_value"])

In [11]:
# Data cleaning: inspect and impute missing values

In [12]:
# Summarize dtypes and non-null counts to spot columns with missing values.
train_features.info()

<class 'pandas.core.frame.DataFrame'>
Index: 16512 entries, 7805 to 18666
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           16512 non-null  float64
 1   latitude            16512 non-null  float64
 2   housing_median_age  16512 non-null  float64
 3   total_rooms         16512 non-null  float64
 4   total_bedrooms      16351 non-null  float64
 5   population          16512 non-null  float64
 6   households          16512 non-null  float64
 7   median_income       16512 non-null  float64
 8   ocean_proximity     16512 non-null  object 
dtypes: float64(8), object(1)
memory usage: 1.3+ MB


In [13]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler

In [14]:
# Standalone imputer used to explore median imputation: missing entries are
# replaced with the median of their column.
simple_imputer = SimpleImputer(strategy="median")

In [15]:
# Median imputation is only defined for numbers, so keep just the numeric
# columns (this leaves out the text column ocean_proximity), then check
# which of them still contain missing values.
train_features_numeric = train_features.select_dtypes(include="number")
train_features_numeric.info()

<class 'pandas.core.frame.DataFrame'>
Index: 16512 entries, 7805 to 18666
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           16512 non-null  float64
 1   latitude            16512 non-null  float64
 2   housing_median_age  16512 non-null  float64
 3   total_rooms         16512 non-null  float64
 4   total_bedrooms      16351 non-null  float64
 5   population          16512 non-null  float64
 6   households          16512 non-null  float64
 7   median_income       16512 non-null  float64
dtypes: float64(8)
memory usage: 1.1 MB


In [16]:
# Learn the per-column medians from the numeric training features
# (fit only; nothing is transformed yet).
simple_imputer.fit(train_features_numeric)

In [17]:
# Inspect the fitted fill values: one median per column, in the column order
# of train_features_numeric.
simple_imputer.statistics_

array([-118.49   ,   34.25   ,   29.     , 2127.     ,  435.     ,
       1167.     ,  409.     ,    3.53245])

In [18]:
# Sanity check: imputation fills values in place of NaNs, so the row and
# column counts should match the input.
simple_imputer.transform(train_features_numeric).shape

(16512, 8)

In [19]:
# transform() returns a bare NumPy array, so wrap the imputed values back
# into a DataFrame that carries the original column names and row index.
imputed_values = simple_imputer.transform(train_features_numeric)
train_features_numeric = pd.DataFrame(
    imputed_values,
    index=train_features_numeric.index,
    columns=train_features_numeric.columns,
)

In [20]:
# Confirm that every numeric column is fully non-null after imputation.
train_features_numeric.info()

<class 'pandas.core.frame.DataFrame'>
Index: 16512 entries, 7805 to 18666
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           16512 non-null  float64
 1   latitude            16512 non-null  float64
 2   housing_median_age  16512 non-null  float64
 3   total_rooms         16512 non-null  float64
 4   total_bedrooms      16512 non-null  float64
 5   population          16512 non-null  float64
 6   households          16512 non-null  float64
 7   median_income       16512 non-null  float64
dtypes: float64(8)
memory usage: 1.1 MB


In [21]:
# Pipelines

In [22]:
# Numeric preprocessing: fill missing values with the column median, then
# rescale each column with MinMaxScaler (default settings).
numeric_steps = [
    ("medianImputer", SimpleImputer(strategy="median")),
    ("minMaxscaler", MinMaxScaler()),
]
numerical_features_pipeline = Pipeline(steps=numeric_steps)

In [23]:
# Fit the numeric pipeline on the training features and show the scaled result.
# train_features_numeric was already imputed in an earlier cell, so the
# pipeline's own imputer has nothing left to fill on this input.
numerical_features_pipeline.fit_transform(train_features_numeric)

array([[0.62350598, 0.1434644 , 0.80392157, ..., 0.01975952, 0.03322368,
        0.21983145],
       [0.62948207, 0.12964931, 0.47058824, ..., 0.07006923, 0.12960526,
        0.36151915],
       [0.24900398, 0.49734325, 0.39215686, ..., 0.06981698, 0.13125   ,
        0.49793796],
       ...,
       [0.2250996 , 0.54941552, 0.66666667, ..., 0.03035399, 0.06003289,
        0.32067834],
       [0.49800797, 0.40170032, 0.17647059, ..., 0.0471706 , 0.11710526,
        0.22921063],
       [0.24003984, 0.47290117, 0.19607843, ..., 0.05608341, 0.13898026,
        0.26136881]])

In [24]:
from sklearn.compose import ColumnTransformer

In [25]:
# Column names routed to the numeric branch: every feature column except the
# text column ocean_proximity. remove() raises ValueError if that column is
# missing, which guards against a renamed or dropped column.
numerical_features = train_features.columns.tolist()
numerical_features.remove("ocean_proximity")
numerical_features

['longitude',
 'latitude',
 'housing_median_age',
 'total_rooms',
 'total_bedrooms',
 'population',
 'households',
 'median_income']

In [26]:
# Full feature column list, for comparison with numerical_features
# (this one still includes ocean_proximity).
list(train_features.columns)

['longitude',
 'latitude',
 'housing_median_age',
 'total_rooms',
 'total_bedrooms',
 'population',
 'households',
 'median_income',
 'ocean_proximity']

In [27]:
# Column names routed to the categorical branch of the preprocessing.
categorical_features = ["ocean_proximity"]

In [30]:
from sklearn.preprocessing import OneHotEncoder

In [31]:
# Categorical preprocessing: fill missing values with the most frequent
# category, then one-hot encode.
#
# handle_unknown="ignore" makes the encoder emit an all-zero row for a category
# that was not present when the pipeline was fitted, instead of raising at
# transform time. Without it, transforming held-out data fails as soon as it
# contains a category absent from the training split. Output on the data the
# encoder was fitted on is unchanged.
categorical_features_pipeline = Pipeline([
    ("mfImputer", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(handle_unknown="ignore")),
])

In [28]:
# NOTE(review): this repeats the numerical_features_pipeline definition from an
# earlier cell. Re-running it rebinds the name to a fresh, unfitted pipeline
# with the same two steps (median imputation, then min-max scaling).
numeric_steps = [
    ("medianImputer", SimpleImputer(strategy="median")),
    ("minMaxscaler", MinMaxScaler()),
]
numerical_features_pipeline = Pipeline(steps=numeric_steps)

In [32]:
# Combine both branches: each column group goes through its own sub-pipeline
# and the results are concatenated column-wise, numeric block first.
# Columns named in neither group are dropped (ColumnTransformer's default).
column_routing = [
    ("numerical", numerical_features_pipeline, numerical_features),
    ("categorical", categorical_features_pipeline, categorical_features),
]
total_transformation = ColumnTransformer(transformers=column_routing)

In [33]:
# Fit the combined preprocessing on the training features only and show the
# transformed matrix (scaled numeric columns followed by the one-hot columns
# for ocean_proximity).
total_transformation.fit_transform(train_features)

array([[0.62350598, 0.1434644 , 0.80392157, ..., 0.        , 0.        ,
        0.        ],
       [0.62948207, 0.12964931, 0.47058824, ..., 0.        , 0.        ,
        0.        ],
       [0.24900398, 0.49734325, 0.39215686, ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.2250996 , 0.54941552, 0.66666667, ..., 0.        , 1.        ,
        0.        ],
       [0.49800797, 0.40170032, 0.17647059, ..., 0.        , 0.        ,
        0.        ],
       [0.24003984, 0.47290117, 0.19607843, ..., 0.        , 0.        ,
        1.        ]])