## Importing modules

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder



## 1 - load the dataset

In [3]:
# 1 - Load the dataset.
# The path is relative, so 'housing.csv' must be in the notebook's working directory.
housing = pd.read_csv(filepath_or_buffer='housing.csv')



## 2 - create a stratified test set

In [4]:
# 2 - Create a stratified test set.
# Bucket median_income into five income categories. The new column is used only
# as the stratification key, so that the train and test sets keep the same
# income-category proportions.
housing['income_cat'] = pd.cut(
    housing['median_income'],
    bins=[0.0, 1.5, 3.0, 4.5, 6.0, np.inf],
    labels=[1, 2, 3, 4, 5],
)
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# split() yields *positional* row indices, so rows must be selected with .iloc.
# Label-based .loc only gives the same rows while the frame still has its
# default 0..n-1 index, and silently picks wrong rows (or raises) otherwise.
for train_index, test_index in split.split(housing, housing['income_cat']):
    # income_cat was only needed for stratification; drop it from both subsets.
    strat_train_set = housing.iloc[train_index].drop('income_cat', axis=1)
    strat_test_set = housing.iloc[test_index].drop('income_cat', axis=1)


In [5]:
# Inspect the stratified training split (income_cat has already been dropped).
strat_train_set

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
12655,-121.46,38.52,29.0,3873.0,797.0,2237.0,706.0,2.1736,72100.0,INLAND
15502,-117.23,33.09,7.0,5320.0,855.0,2015.0,768.0,6.3373,279600.0,NEAR OCEAN
2908,-119.04,35.37,44.0,1618.0,310.0,667.0,300.0,2.8750,82700.0,INLAND
14053,-117.13,32.75,24.0,1877.0,519.0,898.0,483.0,2.2264,112500.0,NEAR OCEAN
20496,-118.70,34.28,27.0,3536.0,646.0,1837.0,580.0,4.4964,238300.0,<1H OCEAN
...,...,...,...,...,...,...,...,...,...,...
15174,-117.07,33.03,14.0,6665.0,1231.0,2026.0,1001.0,5.0900,268500.0,<1H OCEAN
12661,-121.42,38.51,15.0,7901.0,1422.0,4769.0,1418.0,2.8139,90400.0,INLAND
19263,-122.72,38.44,48.0,707.0,166.0,458.0,172.0,3.1797,140400.0,<1H OCEAN
19140,-122.70,38.31,14.0,3155.0,580.0,1208.0,501.0,4.1964,258100.0,<1H OCEAN


In [6]:
# Inspect the stratified test split (income_cat has already been dropped).
strat_test_set

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
5241,-118.39,34.12,29.0,6447.0,1012.0,2184.0,960.0,8.2816,500001.0,<1H OCEAN
17352,-120.42,34.89,24.0,2020.0,307.0,855.0,283.0,5.0099,162500.0,<1H OCEAN
3505,-118.45,34.25,36.0,1453.0,270.0,808.0,275.0,4.3839,204600.0,<1H OCEAN
7777,-118.10,33.91,35.0,1653.0,325.0,1072.0,301.0,3.2708,159700.0,<1H OCEAN
14155,-117.07,32.77,38.0,3779.0,614.0,1495.0,614.0,4.3529,184000.0,NEAR OCEAN
...,...,...,...,...,...,...,...,...,...,...
12182,-117.29,33.72,19.0,2248.0,427.0,1207.0,368.0,2.8170,110000.0,<1H OCEAN
7275,-118.24,33.99,33.0,885.0,294.0,1270.0,282.0,2.1615,118800.0,<1H OCEAN
17223,-119.72,34.44,43.0,1781.0,342.0,663.0,358.0,4.7000,293800.0,<1H OCEAN
10786,-117.91,33.63,30.0,2071.0,412.0,1081.0,412.0,4.9125,335700.0,<1H OCEAN


## 3 - Work on a copy of the training set

In [7]:
# 3 - Take an independent (deep) copy of the training split, so later work on
# `housing` cannot modify strat_train_set.
housing = strat_train_set.copy(deep=True)


## 4 - Separate features and labels

In [8]:
# 4 - Separate features and labels.
# Features: every training column except the target, median_house_value.
housing = strat_train_set.drop(columns='median_house_value')
# Labels: the target column, copied so it is independent of strat_train_set.
housing_labels = strat_train_set.loc[:, 'median_house_value'].copy()


In [9]:
# Inspect the feature frame (training rows without the median_house_value target).
housing

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
12655,-121.46,38.52,29.0,3873.0,797.0,2237.0,706.0,2.1736,INLAND
15502,-117.23,33.09,7.0,5320.0,855.0,2015.0,768.0,6.3373,NEAR OCEAN
2908,-119.04,35.37,44.0,1618.0,310.0,667.0,300.0,2.8750,INLAND
14053,-117.13,32.75,24.0,1877.0,519.0,898.0,483.0,2.2264,NEAR OCEAN
20496,-118.70,34.28,27.0,3536.0,646.0,1837.0,580.0,4.4964,<1H OCEAN
...,...,...,...,...,...,...,...,...,...
15174,-117.07,33.03,14.0,6665.0,1231.0,2026.0,1001.0,5.0900,<1H OCEAN
12661,-121.42,38.51,15.0,7901.0,1422.0,4769.0,1418.0,2.8139,INLAND
19263,-122.72,38.44,48.0,707.0,166.0,458.0,172.0,3.1797,<1H OCEAN
19140,-122.70,38.31,14.0,3155.0,580.0,1208.0,501.0,4.1964,<1H OCEAN


In [10]:
# Inspect the training labels (the median_house_value column).
housing_labels

12655     72100.0
15502    279600.0
2908      82700.0
14053    112500.0
20496    238300.0
           ...   
15174    268500.0
12661     90400.0
19263    140400.0
19140    258100.0
19773     62700.0
Name: median_house_value, Length: 16512, dtype: float64

## 5 - Separate numerical and categorical columns

In [11]:
# Numerical attributes: every feature column except the one categorical
# column, ocean_proximity.
num_attribs = list(housing.drop(columns='ocean_proximity').columns)

In [12]:
# Categorical attributes: the single text column, which is one-hot encoded
# by the categorical pipeline.
cat_attribs = ["ocean_proximity"]

## 6 - Let's build the pipelines

For numerical columns

In [13]:
# Numerical branch: fill missing values with the column median, then
# standardize each column with StandardScaler.
num_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
)

For categorical columns

In [14]:
# Categorical branch: one-hot encode the categories. handle_unknown='ignore'
# means a category not seen during fit does not raise at transform time.
cat_pipeline = Pipeline(
    steps=[
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

### Construct the full pipeline

In [15]:
# Route each group of columns through its own branch:
# num_attribs -> num_pipeline, cat_attribs -> cat_pipeline.
full_pipeline = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, num_attribs),
        ('cat', cat_pipeline, cat_attribs),
    ]
)

## 7 - Transform the data

In [16]:
# Fit the preprocessing steps on the training features and transform those
# same features in a single call.
housing_prepared = full_pipeline.fit_transform(X=housing)

In [17]:
# Inspect the transformed feature matrix produced by full_pipeline.
housing_prepared

array([[-0.94135046,  1.34743822,  0.02756357, ...,  0.        ,
         0.        ,  0.        ],
       [ 1.17178212, -1.19243966, -1.72201763, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.26758118, -0.1259716 ,  1.22045984, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [-1.5707942 ,  1.31001828,  1.53856552, ...,  0.        ,
         0.        ,  0.        ],
       [-1.56080303,  1.2492109 , -1.1653327 , ...,  0.        ,
         0.        ,  0.        ],
       [-1.28105026,  2.02567448, -0.13148926, ...,  0.        ,
         0.        ,  0.        ]])

## 8 - Training ML algorithms on preprocessed data


In [27]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error

In [28]:

# The main aim is to find the best of the algorithms imported above.
# We train a model with each algorithm in turn and see which one gives the lowest (best) root mean squared error (RMSE).
# The RMSE should be estimated with the cross-validation method.
# Without cross-validation the RMSE is measured on the training data itself, which makes the error look misleadingly low.


In [29]:
# 8 - Train the model: linear regression.
# The RMSE below is measured on the same rows the model was fitted on, so it
# is a training error, not an estimate of performance on unseen data.
lin_reg = LinearRegression()
lin_reg.fit(X=housing_prepared, y=housing_labels)
lin_preds = lin_reg.predict(X=housing_prepared)
lin_rmses = root_mean_squared_error(y_true=housing_labels, y_pred=lin_preds)
print(f'The root mean squared error for Linear Regression is {lin_rmses}')



The root mean squared error for Linear Regression is 69050.56219504567


In [31]:
# Decision tree model.
# random_state is fixed (same seed as the train/test split) so that
# Restart & Run All rebuilds the same tree every time.
dec_reg = DecisionTreeRegressor(random_state=42)
dec_reg.fit(housing_prepared, housing_labels)
# Predictions are made on the training rows themselves, so this RMSE is a
# training error: a value near 0 indicates memorization, not a good model.
dec_preds = dec_reg.predict(housing_prepared)
dec_rmses = root_mean_squared_error(housing_labels, dec_preds)
print(f'The root mean squared error for DecsionTree is {dec_rmses}')


The root mean squared error for DecsionTree is 0.0


In [32]:
# Random forest model.
# The target is continuous, so a regressor (not a classifier) is the right
# estimator. random_state is fixed (same seed as the train/test split) because
# forest training is randomized and would otherwise give a different RMSE on
# every run.
random_forest_reg = RandomForestRegressor(random_state=42)
random_forest_reg.fit(housing_prepared, housing_labels)
# Predictions are made on the training rows themselves, so this is a training error.
random_forest_preds = random_forest_reg.predict(housing_prepared)
random_forest_rmse = root_mean_squared_error(housing_labels, random_forest_preds)
print(f"The root mean squared error for Random Forest is {random_forest_rmse}")

The root mean squared error for Random Forest is 18441.505420681042


## Evaluating the models using cross-validation


In [23]:
from sklearn.model_selection import cross_val_score

In [24]:
# Linear regression, evaluated with 10-fold cross-validation.
lin_reg = LinearRegression()
lin_reg.fit(X=housing_prepared, y=housing_labels)
lin_preds = lin_reg.predict(X=housing_prepared)
# The 'neg_root_mean_squared_error' scorer reports negated RMSE values, so the
# sign is flipped back to get ordinary (positive) RMSEs, one per fold.
lin_rmses = -cross_val_score(
    estimator=lin_reg,
    X=housing_prepared,
    y=housing_labels,
    scoring='neg_root_mean_squared_error',
    cv=10,
)
# Summary statistics (count, mean, std, quartiles) over the per-fold RMSEs.
print(pd.Series(data=lin_rmses).describe())




count       10.000000
mean     69204.322755
std       2500.382157
min      65318.224029
25%      67124.346106
50%      69404.658178
75%      70697.800632
max      73003.752739
dtype: float64


In [33]:
# Decision tree model, evaluated with 10-fold cross-validation.
# random_state is fixed (same seed as the train/test split) so the per-fold
# scores are reproducible across runs.
dec_reg = DecisionTreeRegressor(random_state=42)
# The fit/predict on the full training set is kept so dec_reg and dec_preds
# remain available to later cells; the scores below come from cross_val_score.
dec_reg.fit(housing_prepared, housing_labels)
dec_preds = dec_reg.predict(housing_prepared)
# The 'neg_root_mean_squared_error' scorer reports negated RMSE values, so the
# sign is flipped back to get ordinary (positive) RMSEs, one per fold.
dec_rmses = -cross_val_score(
    dec_reg,
    housing_prepared,
    housing_labels,
    scoring="neg_root_mean_squared_error",
    cv=10,
)
# Summary statistics over the per-fold RMSEs.
print(pd.Series(dec_rmses).describe())



count       10.000000
mean     69260.367375
std       2537.699900
min      64497.202040
25%      67820.211193
50%      69257.137355
75%      70285.665540
max      73317.384156
dtype: float64


In [35]:
# Random forest model, evaluated with cross-validation.
# random_state is fixed (same seed as the train/test split) because forest
# training is randomized and the fold scores would otherwise change on every run.
random_forest_reg = RandomForestRegressor(random_state=42)
# The fit/predict on the full training set is kept so random_forest_reg and
# random_forest_preds remain available to later cells; the scores below come
# from cross_val_score.
random_forest_reg.fit(housing_prepared, housing_labels)
random_forest_preds = random_forest_reg.predict(housing_prepared)

# NOTE(review): this uses cv=3 while the linear and decision-tree cells use
# cv=10, so the summaries are not computed over the same folds. Presumably 3
# was chosen to keep the run time down; confirm before comparing the numbers.
random_forest_rmses = -cross_val_score(
    random_forest_reg,
    housing_prepared,
    housing_labels,
    scoring='neg_root_mean_squared_error',
    cv=3,
)
# Summary statistics over the per-fold RMSEs.
print(pd.Series(random_forest_rmses).describe())

count        3.000000
mean     50555.960875
std        621.568027
min      49937.543867
25%      50243.624956
50%      50549.706046
75%      50865.169379
max      51180.632713
dtype: float64
