In [10]:
import pandas as pd
import numpy as np 
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import cross_val_score


In [11]:
# training land price predictoion dataset
data = pd.read_csv('/media/prince/5A4E832F4E83034D/testing /new_train.csv')

In [12]:
# seprating features and labels
land_prices = data['land_value']
data = data.drop('land_value', axis=1)

In [13]:
# creating pipline
num_attribs = data.drop('ocean_proximity', axis=1).columns.to_list()
cat_attribs = ['ocean_proximity']
num_pipline = Pipeline([
    ("imputer" , SimpleImputer(strategy="median")),
    ("scaler" , StandardScaler())
])

cat_pipline = Pipeline([
    ("onehot" , OneHotEncoder(handle_unknown="ignore"))
])

# constructing full pipline 
full_pipline = ColumnTransformer([
    ("num" , num_pipline, num_attribs),
    ("cat", cat_pipline, cat_attribs)    
    ])

In [15]:
# fitting the pipeline
data_prepared = full_pipline.fit_transform(data)

In [19]:
# trying to check which is the best model
# linear regresion 
lin_reg = LinearRegression()
lin_reg.fit(data_prepared , land_prices)
lin_preds = lin_reg.predict(data_prepared)
lin_rmse = root_mean_squared_error(land_prices , lin_preds)
print(f"linearregresion = {lin_rmse}")
lin_rmses = -cross_val_score(
    lin_reg,
    data_prepared,
    land_prices,
    scoring="neg_root_mean_squared_error",
    cv=10
)

print(pd.Series(lin_rmses).describe())


linearregresion = 90714.8069584771
count       10.000000
mean     90716.396640
std       3131.604147
min      85658.591427
25%      88586.334770
50%      91100.346106
75%      93045.353286
max      95417.047367
dtype: float64


In [20]:
# decison tree  regresion 
dec_reg = DecisionTreeRegressor()
dec_reg.fit(data_prepared , land_prices)
dec_preds = dec_reg.predict(data_prepared)
dec_rmse = root_mean_squared_error(land_prices , dec_preds)
print(f"decisiontree = {dec_rmse}")
dec_rmses = -cross_val_score(
    dec_reg,
    data_prepared,
    land_prices,
    scoring="neg_root_mean_squared_error",
    cv=10
)
print(pd.Series(dec_rmses).describe())


decisiontree = 6.29562579033786e-11
count       10.000000
mean     82926.429172
std       2549.531454
min      79687.285186
25%      81239.409912
50%      82417.382588
75%      84349.169029
max      87414.604774
dtype: float64


In [21]:
# random forest regresion 
ran_reg = RandomForestRegressor()
ran_reg.fit(data_prepared , land_prices)
ran_preds = ran_reg.predict(data_prepared)
ran_rmse = root_mean_squared_error(land_prices , ran_preds)
print(f"randomforest = {ran_rmse}")
ran_rmses = -cross_val_score(
    ran_reg,
    data_prepared,
    land_prices,
    scoring="neg_root_mean_squared_error",
    cv=10
)

print(pd.Series(ran_rmses).describe())


randomforest = 22668.89515089795
count       10.000000
mean     60959.884728
std       2169.484903
min      57769.943027
25%      59369.339228
50%      60928.766251
75%      62442.451340
max      64885.779253
dtype: float64
