In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split

In [3]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder,RobustScaler, Normalizer
from sklearn.model_selection import train_test_split

# Assuming df is your DataFrame and 'price' is the target variable
df = pd.read_csv('Housing.csv')  # replace 'your_data.csv' with your actual file path

# Define categorical and numerical columns
categorical_cols = ['mainroad', 'guestroom', 'basement', 'airconditioning','hotwaterheating', 'prefarea', 'furnishingstatus']
numerical_cols = ['area', 'bedrooms', 'bathrooms', 'stories', 'parking']

# Define the features and the target
X = df.drop('price', axis=1)
y = df['price']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# Define a preprocessor StandardScaler
# preprocessor = ColumnTransformer(
#     transformers=[
#         ('num', StandardScaler(), numerical_cols),
#         ('cat', OneHotEncoder(), categorical_cols)])

# Define a preprocessor RobustScaler
preprocessor = ColumnTransformer(
    transformers=[
        ('num', RobustScaler(), numerical_cols),
        ('cat', OneHotEncoder(), categorical_cols)])

# Define a preprocessor Normalizer
# preprocessor = ColumnTransformer(
#     transformers=[
#         ('num', Normalizer(), numerical_cols),
#         ('cat', OneHotEncoder(), categorical_cols)])

# Fit and transform the training data
X_train = preprocessor.fit_transform(X_train)

# Transform the test data
X_test = preprocessor.transform(X_test)

# Now, X_train and X_test are ready to be used in a machine learning model

In [4]:
# Printing first 5 records of the dataset
print(df.head(5))

   area  bedrooms  bathrooms  stories mainroad guestroom basement  \
0  7420         4          2        3      yes        no       no   
1  8960         4          4        4      yes        no       no   
2  9960         3          2        2      yes        no      yes   
3  7500         4          2        2      yes        no      yes   
4  7420         4          1        2      yes       yes      yes   

  hotwaterheating airconditioning  parking prefarea furnishingstatus     price  
0              no             yes        2      yes        furnished  13300000  
1              no             yes        3       no        furnished  12250000  
2              no              no        2      yes   semi-furnished  12250000  
3              no             yes        3      yes        furnished  12215000  
4              no             yes        2       no        furnished  11410000  


In [5]:


from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error ,r2_score , mean_absolute_percentage_error

# Define the PCA object
pca = PCA(n_components=4)  # n_components specifies the number of dimensions to reduce to

# Fit and transform the training data
X_train_pca = pca.fit_transform(X_train)

# Transform the test data
X_test_pca = pca.transform(X_test)

# Train a Random Forest model
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train_pca, y_train)

# Get feature importances
importances = rf.feature_importances_

# Print feature importances
for i, importance in enumerate(importances):
    print(f"Principal Component {i+1}: {importance}")

# Predict on the test set
y_pred_rf = rf.predict(X_test_pca)

# Compute the root mean squared error of the Random Forest model
rmse_rf = mean_squared_error(y_test, y_pred_rf, squared=False)

# Print the root mean squared error
print(f"Root Mean Squared Error (Random Forest): {rmse_rf}")
print(f"mean_absolute_percentage_error (Random Forest): {mean_absolute_percentage_error(y_test,y_pred_rf)}")
print(f"r2 score: {r2_score(y_test,y_pred_rf)}")


Principal Component 1: 0.6723815409702203
Principal Component 2: 0.09192148615734294
Principal Component 3: 0.1323230499768433
Principal Component 4: 0.10337392289559344
Root Mean Squared Error (Random Forest): 1222773.9023985271
mean_absolute_percentage_error (Random Forest): 0.23869092130239986
r2 score: 0.6782010331957385


In [6]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error,r2_score , mean_absolute_percentage_error

# Train a Random Forest model
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Get feature importances
importances = rf.feature_importances_

# Get the indices of the features sorted by importance
indices = np.argsort(importances)[::-1]

# Select a subset of the most important features
n_features = 16  # You can adjust this value
X_train_reduced = X_train[:, indices[:n_features]]
X_test_reduced = X_test[:, indices[:n_features]]

# Define the PCA object
pca = PCA(n_components=4)  # n_components specifies the number of dimensions to reduce to

# Fit and transform the training data
X_train_pca = pca.fit_transform(X_train_reduced)

# Transform the test data
X_test_pca = pca.transform(X_test_reduced)

# Train a new Random Forest model on the reduced and transformed data
rf_pca = RandomForestRegressor(n_estimators=100, random_state=42)
rf_pca.fit(X_train_pca, y_train)

# Predict on the test set
y_pred_rf_pca = rf_pca.predict(X_test_pca)

# Compute the root mean squared error of the Random Forest model
rmse_rf_pca = mean_squared_error(y_test, y_pred_rf_pca, squared=False)

# Print the root mean squared error
print(f"Root Mean Squared Error (Random Forest with feature selection and PCA): {rmse_rf_pca}")
print(f"mean_absolute_percentage_error (Random Forest): {mean_absolute_percentage_error(y_test,y_pred_rf_pca)}")
print(f"r2 score: {r2_score(y_test,y_pred_rf_pca)}")

Root Mean Squared Error (Random Forest with feature selection and PCA): 1161664.6320026845
mean_absolute_percentage_error (Random Forest): 0.23259319389811708
r2 score: 0.7095617191627231


In [7]:
indices

array([ 0,  2,  4,  3,  1, 19, 12, 11, 10,  9, 15, 16, 17,  8,  7, 18, 13,
       14,  5,  6])

In [8]:
importances.shape

(20,)

In [9]:
X_test.shape

(55, 20)

In [10]:
from sklearn.linear_model import LinearRegression

# Train a Linear Regression model
lr = LinearRegression()
lr.fit(X_train_pca, y_train)

# Predict on the test set
y_pred_lr = lr.predict(X_test_pca)

# Compute the R^2 score
r2_lr = r2_score(y_test, y_pred_lr)

# Compute the root mean squared error
rmse_lr = mean_squared_error(y_test, y_pred_lr, squared=False)

# Compute the absolute percentage error
ape_lr = np.abs((y_test - y_pred_lr) / y_test) * 100

# Compute the mean absolute percentage error
mape_lr = np.mean(ape_lr)

# Print the metrics
print(f"R^2 Score (Linear Regression with feature selection and PCA): {r2_lr}")
print(f"Root Mean Squared Error (Linear Regression with feature selection and PCA): {rmse_lr}")
print(f"Mean Absolute Percentage Error (Linear Regression with feature selection and PCA): {mape_lr}%")

R^2 Score (Linear Regression with feature selection and PCA): 0.6754357703787055
Root Mean Squared Error (Linear Regression with feature selection and PCA): 1228016.3963502755
Mean Absolute Percentage Error (Linear Regression with feature selection and PCA): 22.853512186879463%


In [11]:
from sklearn.linear_model import SGDRegressor

# Train a SGD Regressor
sgd = SGDRegressor(max_iter=1000, tol=1e-3)
sgd.fit(X_train_pca, y_train)

# Predict on the test set
y_pred_sgd = sgd.predict(X_test_pca)

# Compute the R^2 score
r2_sgd = r2_score(y_test, y_pred_sgd)

# Compute the root mean squared error
rmse_sgd = mean_squared_error(y_test, y_pred_sgd, squared=False)

# Compute the absolute percentage error
ape_sgd = np.abs((y_test - y_pred_sgd) / y_test) * 100

# Compute the mean absolute percentage error
mape_sgd = np.mean(ape_sgd)

# Print the metrics
print(f"R^2 Score (SGD Regressor with feature selection and PCA): {r2_sgd}")
print(f"Root Mean Squared Error (SGD Regressor with feature selection and PCA): {rmse_sgd}")
print(f"Mean Absolute Percentage Error (SGD Regressor with feature selection and PCA): {mape_sgd}%")

R^2 Score (SGD Regressor with feature selection and PCA): 0.6742858317059178
Root Mean Squared Error (SGD Regressor with feature selection and PCA): 1230189.9182801952
Mean Absolute Percentage Error (SGD Regressor with feature selection and PCA): 22.91668675887083%


In [12]:
import numpy as np

# Initialize parameters
theta = np.zeros(X_train_pca.shape[1])
alpha = 0.01  # learning rate
m = y_train.size  # number of samples
epsilon = 0.0001  # convergence threshold
old_cost = np.inf

# Batch gradient descent
for _ in range(10000):  # maximum number of iterations
    # Compute predictions
    pred = np.dot(X_train_pca, theta)

    # Compute cost
    error = pred - y_train
    cost = np.sum(error ** 2) / (2 * m)

    # Check for convergence
    if np.abs(old_cost - cost) < epsilon:
        break

    old_cost = cost

    # Update parameters
    gradient = np.dot(X_train_pca.T, error) / m
    theta = theta - alpha * gradient

# Now, theta contains the parameters of the model trained using batch gradient descent


# Predict on the test set
y_pred_bgd = np.dot(X_test_pca, theta)

# Compute the R^2 score
r2_bgd = r2_score(y_test, y_pred_bgd)

# Compute the root mean squared error
rmse_bgd = mean_squared_error(y_test, y_pred_bgd, squared=False)

# Compute the absolute percentage error
ape_bgd = np.abs((y_test - y_pred_bgd) / y_test) * 100

# Compute the mean absolute percentage error
mape_bgd = np.mean(ape_bgd)

# Print the metrics
print(f"R^2 Score (Batch Gradient Descent): {r2_bgd}")
print(f"Root Mean Squared Error (Batch Gradient Descent): {rmse_bgd}")
print(f"Mean Absolute Percentage Error (Batch Gradient Descent): {mape_bgd}%")

R^2 Score (Batch Gradient Descent): -4.030371358320194
Root Mean Squared Error (Batch Gradient Descent): 4834525.574396
Mean Absolute Percentage Error (Batch Gradient Descent): 115.29191141480251%


In [13]:
from sklearn.linear_model import SGDRegressor

# Train a SGD Regressor with parameters to mimic BGD
sgd_bgd = SGDRegressor(max_iter=len(y_train), learning_rate='constant', eta0=0.01, tol=1e-3)
sgd_bgd.fit(X_train_pca, y_train)

# Predict on the test set
y_pred_sgd_bgd = sgd_bgd.predict(X_test_pca)

# Compute the R^2 score
r2_sgd_bgd = r2_score(y_test, y_pred_sgd_bgd)

# Compute the root mean squared error
rmse_sgd_bgd = mean_squared_error(y_test, y_pred_sgd_bgd, squared=False)

# Compute the absolute percentage error
ape_sgd_bgd = np.abs((y_test - y_pred_sgd_bgd) / y_test) * 100

# Compute the mean absolute percentage error
mape_sgd_bgd = np.mean(ape_sgd_bgd)

# Print the metrics
print(f"R^2 Score (SGD as BGD): {r2_sgd_bgd}")
print(f"Root Mean Squared Error (SGD as BGD): {rmse_sgd_bgd}")
print(f"Mean Absolute Percentage Error (SGD as BGD): {mape_sgd_bgd}%")

R^2 Score (SGD as BGD): 0.6810250286742008
Root Mean Squared Error (SGD as BGD): 1217396.7608115159
Mean Absolute Percentage Error (SGD as BGD): 22.54534802014643%


In [14]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, RobustScaler, Normalizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import IsolationForest
from sklearn.base import BaseEstimator, TransformerMixin

# Define the OutlierRemover class
class OutlierRemover(BaseEstimator, TransformerMixin):
    def __init__(self, contamination=0.1):
        self.contamination = contamination

    def fit(self, X, y=None):
        self.iforest = IsolationForest(contamination=self.contamination)
        self.iforest.fit(X)
        return self

    def transform(self, X):
        is_inlier = self.iforest.predict(X)
        return X[is_inlier == 1]

# Assuming df is your DataFrame and 'price' is the target variable
df = pd.read_csv('Housing.csv')  # replace 'your_data.csv' with your actual file path

# Define categorical and numerical columns
categorical_cols = ['mainroad', 'guestroom', 'basement', 'airconditioning','hotwaterheating', 'prefarea', 'furnishingstatus']
numerical_cols = ['area', 'bedrooms', 'bathrooms', 'stories', 'parking']

# Define the features and the target
X = df.drop('price', axis=1)
y = df['price']
# Remove outliers from the numerical columns of the training data
outlier_remover = OutlierRemover()
X_train_numerical = X_train[numerical_cols]

# Get the inlier indices before transforming X_train_numerical
inlier_indices = outlier_remover.iforest.fit_predict(X_train_numerical) == 1

# Transform X_train_numerical
X_train_numerical = outlier_remover.transform(X_train_numerical)

# Update y_train to match the inliers in X_train_numerical
y_train = y_train[inlier_indices]

# Update X_train to only include the inliers
X_train = X_train.loc[y_train.index, :]

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)


# Define a preprocessor RobustScaler
preprocessor = ColumnTransformer(
    transformers=[
        ('num', RobustScaler(), numerical_cols),
        ('cat', OneHotEncoder(), categorical_cols)])

# Fit and transform the training data
X_train = preprocessor.fit_transform(X_train)

# Transform the test data
X_test = preprocessor.transform(X_test)

# Now, X_train and X_test are ready to be used in a machine learning model


IndexError: ignored

In [41]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder,RobustScaler, Normalizer
from sklearn.model_selection import train_test_split

# Assuming df is your DataFrame and 'price' is the target variable
df = pd.read_csv('Housing.csv')  # replace 'your_data.csv' with your actual file path
# Calculate the IQR of each column
Q1 = df.quantile(0.08)
Q3 = df.quantile(0.92)
IQR = Q3 - Q1

# Define a mask for values that are NOT outliers
mask = ~((df < (Q1 - 1.5 * IQR)) | (df > (Q3 + 1.5 * IQR))).any(axis=1)


# Remove outliers
df_clean = df[mask]
# Define categorical and numerical columns
categorical_cols = ['mainroad', 'guestroom', 'basement', 'airconditioning','hotwaterheating', 'prefarea', 'furnishingstatus']
numerical_cols = ['area', 'bedrooms', 'bathrooms', 'stories', 'parking']

# Define the features and the target
X = df_clean.drop('price', axis=1)
y = df_clean['price']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# Define a preprocessor StandardScaler
# preprocessor = ColumnTransformer(
#     transformers=[
#         ('num', StandardScaler(), numerical_cols),
#         ('cat', OneHotEncoder(), categorical_cols)])

# Define a preprocessor RobustScaler
preprocessor = ColumnTransformer(
    transformers=[
        ('num', RobustScaler(), numerical_cols),
        ('cat', OneHotEncoder(), categorical_cols)])

# Define a preprocessor Normalizer
# preprocessor = ColumnTransformer(
#     transformers=[
#         ('num', Normalizer(), numerical_cols),
#         ('cat', OneHotEncoder(), categorical_cols)])

# Fit and transform the training data
X_train = preprocessor.fit_transform(X_train)

# Transform the test data
X_test = preprocessor.transform(X_test)

# Now, X_train and X_test are ready to be used in a machine learning model

  Q1 = df.quantile(0.08)
  Q3 = df.quantile(0.92)
  mask = ~((df < (Q1 - 1.5 * IQR)) | (df > (Q3 + 1.5 * IQR))).any(axis=1)


In [36]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error,r2_score , mean_absolute_percentage_error

# Train a Random Forest model
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Get feature importances
importances = rf.feature_importances_

# Get the indices of the features sorted by importance
indices = np.argsort(importances)[::-1]

# Select a subset of the most important features
n_features = 16  # You can adjust this value
X_train_reduced = X_train[:, indices[:n_features]]
X_test_reduced = X_test[:, indices[:n_features]]

# Define the PCA object
pca = PCA(n_components=15)  # n_components specifies the number of dimensions to reduce to

# Fit and transform the training data
X_train_pca = pca.fit_transform(X_train_reduced)

# Transform the test data
X_test_pca = pca.transform(X_test_reduced)

# Train a new Random Forest model on the reduced and transformed data
rf_pca = RandomForestRegressor(n_estimators=100, random_state=42)
rf_pca.fit(X_train_pca, y_train)

# Predict on the test set
y_pred_rf_pca = rf_pca.predict(X_test_pca)

# Compute the root mean squared error of the Random Forest model
rmse_rf_pca = mean_squared_error(y_test, y_pred_rf_pca, squared=False)

# Print the root mean squared error
print(f"Root Mean Squared Error (Random Forest with feature selection and PCA): {rmse_rf_pca}")
print(f"mean_absolute_percentage_error (Random Forest): {mean_absolute_percentage_error(y_test,y_pred_rf_pca)}")
print(f"r2 score: {r2_score(y_test,y_pred_rf_pca)}")

Root Mean Squared Error (Random Forest with feature selection and PCA): 1105258.5838240976
mean_absolute_percentage_error (Random Forest): 0.21765849254245626
r2 score: 0.7348951875266124


In [43]:
X_train[0]

array([-0.57971014,  0.        ,  0.        , -1.        ,  0.        ,
        1.        ,  0.        ,  1.        ,  0.        ,  1.        ,
        0.        ,  0.        ,  1.        ,  1.        ,  0.        ,
        1.        ,  0.        ,  0.        ,  0.        ,  1.        ])

In [17]:
df_clean.shape

(544, 13)

In [18]:
df.shape

(545, 13)

In [21]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error,r2_score , mean_absolute_percentage_error

# Train a Random Forest model
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Get feature importances
importances = rf.feature_importances_

# Get the indices of the features sorted by importance
indices = np.argsort(importances)[::-1]

# Select a subset of the most important features
n_features = 16  # You can adjust this value
X_train_reduced = X_train[:, indices[:n_features]]
X_test_reduced = X_test[:, indices[:n_features]]

# Define the PCA object
pca = PCA(n_components=4)  # n_components specifies the number of dimensions to reduce to

# Fit and transform the training data
X_train_pca = pca.fit_transform(X_train_reduced)

# Transform the test data
X_test_pca = pca.transform(X_test_reduced)

# Train a new Random Forest model on the reduced and transformed data
rf_pca = RandomForestRegressor(n_estimators=100, random_state=42)
rf_pca.fit(X_train_pca, y_train)

# Predict on the test set
y_pred_rf_pca = rf_pca.predict(X_test_pca)

# Compute the root mean squared error of the Random Forest model
rmse_rf_pca = mean_squared_error(y_test, y_pred_rf_pca, squared=False)

# Print the root mean squared error
print(f"Root Mean Squared Error (Random Forest with feature selection and PCA): {rmse_rf_pca}")
print(f"mean_absolute_percentage_error (Random Forest): {mean_absolute_percentage_error(y_test,y_pred_rf_pca)}")
print(f"r2 score: {r2_score(y_test,y_pred_rf_pca)}")

Root Mean Squared Error (Random Forest with feature selection and PCA): 1165893.877728246
mean_absolute_percentage_error (Random Forest): 0.2223870229047006
r2 score: 0.7050096139900867


In [37]:
X_train_reduced

array([[-0.57971014,  0.        ,  0.        , ...,  0.        ,
         0.        ,  1.        ],
       [ 3.11594203,  0.        ,  2.        , ...,  0.        ,
         0.        ,  1.        ],
       [-0.2173913 ,  0.        ,  1.        , ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [-0.97644928,  0.        ,  0.        , ...,  1.        ,
         0.        ,  1.        ],
       [-0.88949275,  0.        ,  0.        , ...,  0.        ,
         0.        ,  1.        ],
       [ 0.63405797,  1.        ,  0.        , ...,  0.        ,
         1.        ,  0.        ]])

In [38]:
importance

0.10337392289559344

In [39]:
indices

array([ 0,  2,  4,  3, 19,  1, 11, 12, 15, 10, 16,  9, 17, 18,  8,  7, 13,
       14,  5,  6])

In [40]:
X

Unnamed: 0,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
2,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished
5,7500,3,3,1,yes,no,yes,no,yes,2,yes,semi-furnished
...,...,...,...,...,...,...,...,...,...,...,...,...
540,3000,2,1,1,yes,no,yes,no,no,2,no,unfurnished
541,2400,3,1,1,no,no,no,no,no,0,no,semi-furnished
542,3620,2,1,1,yes,no,no,no,no,0,no,unfurnished
543,2910,3,1,1,no,no,no,no,no,0,no,furnished
