## Name :- Darpan Gaur
## Roll No :- CO21BTECH11004

Used the following regression techniques
- Random Forest Regressor
- Linear regression
- SVM regressor
- XGBoost
- LightGBM
- Catboost

In [15]:
# imoprt libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import lightgbm as lgb
from catboost import CatBoostRegressor
from xgboost import XGBRegressor


In [16]:
# load data from csv file
train_path = './house-prices-data/train.csv'
test_path = './house-prices-data/test.csv'

train_data = pd.read_csv(train_path)
test_data = pd.read_csv(test_path)

In [17]:
# print columns in decreasing order of % of missing values if null values are present
missing_values = train_data.isnull().sum()
missing_values = missing_values[missing_values > 0]
missing_values = missing_values.sort_values(ascending=False)

# % of missing values
missing_values = missing_values / len(train_data) * 100
print(missing_values)


PoolQC          99.520548
MiscFeature     96.301370
Alley           93.767123
Fence           80.753425
MasVnrType      59.726027
FireplaceQu     47.260274
LotFrontage     17.739726
GarageType       5.547945
GarageYrBlt      5.547945
GarageFinish     5.547945
GarageQual       5.547945
GarageCond       5.547945
BsmtFinType2     2.602740
BsmtExposure     2.602740
BsmtFinType1     2.534247
BsmtCond         2.534247
BsmtQual         2.534247
MasVnrArea       0.547945
Electrical       0.068493
dtype: float64


In [18]:
# drop columns with null values
train_data = train_data.drop(missing_values.index, axis=1)
test_data = test_data.drop(missing_values.index, axis=1)

In [19]:
# inplace id column
train_data.set_index('Id', inplace=True)
test_data.set_index('Id', inplace=True)

# remove columns with object data type
train_data = train_data.select_dtypes(exclude=['object'])
test_data = test_data.select_dtypes(exclude=['object'])

# fill missing values with mean
train_data = train_data.fillna(train_data.mean())
test_data = test_data.fillna(test_data.mean())

# X, y split
X = train_data.drop('SalePrice', axis=1)
y = train_data['SalePrice']

In [20]:
# feartue importance by random forest regressor
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor()
model.fit(X, y)

# plot feature importance
features = X.columns
importances = model.feature_importances_

# sort feature importance
indices = np.argsort(importances)[::-1]

# print feature importance in decreasing order
for i in range(len(features)):
    print(f'{features[indices[i]]}: {importances[indices[i]]}')

OverallQual: 0.6033049704010017
GrLivArea: 0.10637060384091533
2ndFlrSF: 0.044979388270151306
TotalBsmtSF: 0.03952580482285996
BsmtFinSF1: 0.0299181811172081
1stFlrSF: 0.02534935617161585
GarageCars: 0.022718381813762317
LotArea: 0.01702706393051751
GarageArea: 0.01674710552037375
YearBuilt: 0.015642552750868302
YearRemodAdd: 0.011112679908303589
TotRmsAbvGrd: 0.00789539899497932
OpenPorchSF: 0.0076153713480284525
BsmtUnfSF: 0.007532024931130557
OverallCond: 0.006876823961562263
WoodDeckSF: 0.006783389588093046
FullBath: 0.004933196004732867
MoSold: 0.004609349039834162
Fireplaces: 0.004410526692874096
MSSubClass: 0.0026642099452237568
YrSold: 0.002064358173495031
BedroomAbvGr: 0.0019363518924055791
BsmtFullBath: 0.001625397865387403
ScreenPorch: 0.0015632439551529309
KitchenAbvGr: 0.0013435188720101188
HalfBath: 0.001291336090203125
EnclosedPorch: 0.001236407867669201
BsmtFinSF2: 0.00099537635528207
BsmtHalfBath: 0.0005540159748370602
3SsnPorch: 0.0005193763121406833
PoolArea: 0.00046

In [21]:
# drop columns with importance less than thresh
thresh = 0.001
X = X.drop(features[importances < thresh], axis=1)
test_data = test_data.drop(features[importances < thresh], axis=1)


## Random Forest Regressor

In [22]:
# use random forest regressor to predict
model = RandomForestRegressor()
model.fit(X, y)
predictions = model.predict(test_data)

# save predictions to csv file
output = pd.DataFrame({'Id': test_data.index, 'SalePrice': predictions})
output.to_csv('rf_regressor_submission.csv', index=False)

## Linear Regression

In [23]:
# logisitic regression
from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit(X, y)
predictions = model.predict(test_data)

# save predictions to csv file
output = pd.DataFrame({'Id': test_data.index, 'SalePrice': predictions})
output.to_csv('linear_regression_submission.csv', index=False)

## SVM

In [24]:
# svm
from sklearn.svm import SVR

model = SVR()
model.fit(X, y)
predictions = model.predict(test_data)

# save predictions to csv file
output = pd.DataFrame({'Id': test_data.index, 'SalePrice': predictions})
output.to_csv('svm_submission.csv', index=False)

## XGBoost

In [25]:
# xgboost

model = XGBRegressor()
model.fit(X, y)
predictions = model.predict(test_data)

# save predictions to csv file
output = pd.DataFrame({'Id': test_data.index, 'SalePrice': predictions})
output.to_csv('xgboost_submission.csv', index=False)

## LightGBM

In [26]:
# lightgbm

model = lgb.LGBMRegressor()
model.fit(X, y)
predictions = model.predict(test_data)

# save predictions to csv file
output = pd.DataFrame({'Id': test_data.index, 'SalePrice': predictions})
output.to_csv('lightgbm_submission.csv', index=False)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000721 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2696
[LightGBM] [Info] Number of data points in the train set: 1460, number of used features: 27
[LightGBM] [Info] Start training from score 180921.195890


## CatBoost

In [27]:
# catboost

model = CatBoostRegressor()
model.fit(X, y)
predictions = model.predict(test_data)

# save predictions to csv file
output = pd.DataFrame({'Id': test_data.index, 'SalePrice': predictions})
output.to_csv('catboost_submission.csv', index=False)

Learning rate set to 0.043466
0:	learn: 77183.0525934	total: 938us	remaining: 937ms
1:	learn: 75077.0887075	total: 1.75ms	remaining: 872ms
2:	learn: 73143.2228189	total: 2.5ms	remaining: 833ms
3:	learn: 71097.9455452	total: 3.14ms	remaining: 782ms
4:	learn: 69166.8425022	total: 3.76ms	remaining: 748ms
5:	learn: 67309.3653616	total: 4.42ms	remaining: 733ms
6:	learn: 65669.7269223	total: 5.05ms	remaining: 717ms
7:	learn: 64074.6824768	total: 5.63ms	remaining: 698ms
8:	learn: 62449.2620361	total: 6.17ms	remaining: 679ms
9:	learn: 60970.2904535	total: 6.85ms	remaining: 679ms
10:	learn: 59579.4171573	total: 7.36ms	remaining: 662ms
11:	learn: 58203.6699452	total: 7.86ms	remaining: 647ms
12:	learn: 56803.2379829	total: 8.37ms	remaining: 635ms
13:	learn: 55590.2483933	total: 8.92ms	remaining: 628ms
14:	learn: 54367.4389310	total: 9.5ms	remaining: 624ms
15:	learn: 53161.8248055	total: 10ms	remaining: 617ms
16:	learn: 51920.3153387	total: 10.6ms	remaining: 613ms
17:	learn: 50825.9382483	total: 1

## Scores
- Linear Regression: 0.35652
- Random Forest: 0.14851
- SVM Regressor: 0.41645
- XGBoost: 0.15374
- LightGBM: 0.14359
- CatBoost: 0.13617