In [85]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from tqdm import tqdm

check_variants = False

# Load data
df = pd.read_csv('train.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [86]:
# if data is not over 70% complete, drop it
df = df.dropna(thresh=0.7*len(df), axis=1)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 75 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   LotShape       1460 non-null   object 
 7   LandContour    1460 non-null   object 
 8   Utilities      1460 non-null   object 
 9   LotConfig      1460 non-null   object 
 10  LandSlope      1460 non-null   object 
 11  Neighborhood   1460 non-null   object 
 12  Condition1     1460 non-null   object 
 13  Condition2     1460 non-null   object 
 14  BldgType       1460 non-null   object 
 15  HouseStyle     1460 non-null   object 
 16  OverallQual    1460 non-null   int64  
 17  OverallCond    1460 non-null   int64  
 18  YearBuil

In [87]:
# use label encoder to convert categorical data to numerical
le = preprocessing.LabelEncoder()
# if column is not numerical, apply label encoder
for col in df.columns:
    if df[col].dtype == 'object':
        df[col] = le.fit_transform(df[col])
# drop id column
df = df.drop(columns=['Id'])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 74 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     1460 non-null   int64  
 1   MSZoning       1460 non-null   int64  
 2   LotFrontage    1201 non-null   float64
 3   LotArea        1460 non-null   int64  
 4   Street         1460 non-null   int64  
 5   LotShape       1460 non-null   int64  
 6   LandContour    1460 non-null   int64  
 7   Utilities      1460 non-null   int64  
 8   LotConfig      1460 non-null   int64  
 9   LandSlope      1460 non-null   int64  
 10  Neighborhood   1460 non-null   int64  
 11  Condition1     1460 non-null   int64  
 12  Condition2     1460 non-null   int64  
 13  BldgType       1460 non-null   int64  
 14  HouseStyle     1460 non-null   int64  
 15  OverallQual    1460 non-null   int64  
 16  OverallCond    1460 non-null   int64  
 17  YearBuilt      1460 non-null   int64  
 18  YearRemo

In [88]:
# check for NaN values
df.isna().sum().sort_values(ascending=False)

# fill NaN values with mean
df = df.fillna(df.mean())
df.isna().sum().sort_values(ascending=False)

MSSubClass      0
GarageYrBlt     0
Fireplaces      0
Functional      0
TotRmsAbvGrd    0
               ..
ExterQual       0
MasVnrArea      0
Exterior2nd     0
Exterior1st     0
SalePrice       0
Length: 74, dtype: int64

In [89]:
# split data into X and y
X = df.drop(columns=['SalePrice'])
y = df['SalePrice']

y = np.log(y)


from sklearn.model_selection import train_test_split

# create test and train sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [90]:
# create models and return rmse
def get_rmse(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return np.sqrt(np.mean((y_test - y_pred)**2))

def knn(df):
    from sklearn.neighbors import KNeighborsRegressor
    model = KNeighborsRegressor()
    return get_rmse(model, X_train, X_test, y_train, y_test)

def dt(df):
    from sklearn.ensemble import RandomForestRegressor
    model = RandomForestRegressor(random_state=42)
    return get_rmse(model, X_train, X_test, y_train, y_test)

def rf(dt):
    from sklearn.tree import DecisionTreeRegressor
    model = DecisionTreeRegressor()
    return get_rmse(model, X_train, X_test, y_train, y_test)

def lr(df):
    from sklearn.linear_model import LinearRegression
    model = LinearRegression()
    return get_rmse(model, X_train, X_test, y_train, y_test)

def svm(df):
    from sklearn.svm import SVR
    model = SVR()
    return get_rmse(model, X_train, X_test, y_train, y_test)

def adaboost(df):
    from sklearn.ensemble import AdaBoostRegressor
    model = AdaBoostRegressor()
    return get_rmse(model, X_train, X_test, y_train, y_test)

def gradient_boosting(df):
    from sklearn.ensemble import GradientBoostingRegressor
    model = GradientBoostingRegressor(random_state=42, n_estimators=1000, learning_rate=0.1, max_leaf_nodes=7)
    return get_rmse(model, X_train, X_test, y_train, y_test)

In [91]:
# print scores
print('KNN: ', knn(df))
print('RF: ', rf(df))
print('DT: ', dt(df))
print('LR: ', lr(df))
print('SVM: ', svm(df))
print('AdaBoost: ', adaboost(df))
print('Gradient Boosting: ', gradient_boosting(df))

KNN:  0.22266356364752063
RF:  0.20520366283452965


DT:  0.13494661337749
LR:  0.13606320392475466
SVM:  0.19925603972455727
AdaBoost:  0.16392802856040262
Gradient Boosting:  0.12871957309527615


In [92]:
# # find best parameters for gradient boosting
# from sklearn.ensemble import GradientBoostingRegressor

# def find_best_params():
#     param_grid = {'n_estimators': [100, 500, 1000],
#                     'learning_rate': [0.1],
#                     'max_depth': [3, 5, 7],
#                     'max_leaf_nodes': [3, 5, 7]
#                     }
#     model = GradientBoostingRegressor(random_state=42)
#     grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_root_mean_squared_error', return_train_score=True)
#     grid_search.fit(X_train, y_train)
#     print(grid_search.best_params_)
#     print(grid_search.best_estimator_)
#     print(grid_search.best_score_)
#     return grid_search.best_params_



In [93]:
# print(find_best_params())

In [94]:
# import test data
df_test = pd.read_csv('test.csv')
df_test = df_test.dropna(thresh=0.7*len(df_test), axis=1)

# use label encoder to convert categorical data to numerical
le = preprocessing.LabelEncoder()
# if column is not numerical, apply label encoder
for col in df_test.columns:
    if df_test[col].dtype == 'object':
        df_test[col] = le.fit_transform(df_test[col])
# drop id column
df_test = df_test.drop(columns=['Id'])
df_test.info()

# check for NaN values
df_test.isna().sum().sort_values(ascending=False)

# fill NaN values with mean
df_test = df_test.fillna(df_test.mean())
df_test.isna().sum().sort_values(ascending=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 73 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     1459 non-null   int64  
 1   MSZoning       1459 non-null   int64  
 2   LotFrontage    1232 non-null   float64
 3   LotArea        1459 non-null   int64  
 4   Street         1459 non-null   int64  
 5   LotShape       1459 non-null   int64  
 6   LandContour    1459 non-null   int64  
 7   Utilities      1459 non-null   int64  
 8   LotConfig      1459 non-null   int64  
 9   LandSlope      1459 non-null   int64  
 10  Neighborhood   1459 non-null   int64  
 11  Condition1     1459 non-null   int64  
 12  Condition2     1459 non-null   int64  
 13  BldgType       1459 non-null   int64  
 14  HouseStyle     1459 non-null   int64  
 15  OverallQual    1459 non-null   int64  
 16  OverallCond    1459 non-null   int64  
 17  YearBuilt      1459 non-null   int64  
 18  YearRemo

MSSubClass       0
HeatingQC        0
Fireplaces       0
Functional       0
TotRmsAbvGrd     0
                ..
ExterQual        0
MasVnrArea       0
Exterior2nd      0
Exterior1st      0
SaleCondition    0
Length: 73, dtype: int64

In [95]:
# use gradientboost to predict saleprice in test data
from sklearn.ensemble import GradientBoostingRegressor

model = GradientBoostingRegressor(random_state=42, n_estimators=1000, learning_rate=0.1, max_depth=3, max_leaf_nodes=5)

model.fit(X, y)
y_pred = model.predict(df_test)
y_pred = np.exp(y_pred)
y_pred = pd.DataFrame(y_pred)

y_pred

Unnamed: 0,0
0,116995.064558
1,157423.240273
2,185818.520925
3,185070.135805
4,177216.992446
...,...
1454,77160.442207
1455,79738.930410
1456,157247.094975
1457,107486.799680


In [96]:
# rename column 0 to SalePrice
y_pred = y_pred.rename(columns={0: 'SalePrice'})
y_pred

Unnamed: 0,SalePrice
0,116995.064558
1,157423.240273
2,185818.520925
3,185070.135805
4,177216.992446
...,...
1454,77160.442207
1455,79738.930410
1456,157247.094975
1457,107486.799680


In [99]:
# swap columns
cols = y_pred.columns.tolist()
cols = cols[-1:] + cols[:-1]
y_pred = y_pred[cols]
y_pred

# add id column
df_test = pd.read_csv('test.csv')
y_pred['Id'] = df_test['Id']
y_pred

Unnamed: 0,SalePrice,Id
0,116995.064558,1461
1,157423.240273,1462
2,185818.520925,1463
3,185070.135805,1464
4,177216.992446,1465
...,...,...
1454,77160.442207,2915
1455,79738.930410,2916
1456,157247.094975,2917
1457,107486.799680,2918


In [101]:
# change order of columns
cols = y_pred.columns.tolist()
cols = cols[-1:] + cols[:-1]
y_pred = y_pred[cols]
y_pred

Unnamed: 0,Id,SalePrice
0,1461,116995.064558
1,1462,157423.240273
2,1463,185818.520925
3,1464,185070.135805
4,1465,177216.992446
...,...,...
1454,2915,77160.442207
1455,2916,79738.930410
1456,2917,157247.094975
1457,2918,107486.799680


In [102]:
# save to csv
y_pred.to_csv('submission2.csv', index=False)

In [116]:
# get best dt by for loop in for loop in for loop
rmse = []

from sklearn.tree import DecisionTreeRegressor
for i in range(2, 100, 3):
    print(i)
    for j in range(2, 100,3):
            model = DecisionTreeRegressor(random_state=42, max_depth=j, max_leaf_nodes=i)
            # add [rmse, i, j] to list
            rmse.append([get_rmse(model, X_train, X_test, y_train, y_test), i, j])
            


2
5
8
11
14
17


20
23
26
29
32
35
38
41
44
47
50
53
56
59
62
65
68
71
74
77
80
83
86
89
92
95
98


In [114]:
rmse

[0.19338282592415695,
 0.19406842233558347,
 0.1942201415675976,
 0.19511594185318087,
 0.1954029420766858,
 0.19576804148787216,
 0.19593401422689657,
 0.1959861833207094,
 0.1963576286567603,
 0.19657552558815392,
 0.1970243076235702,
 0.198552353561114,
 0.19858015121640898,
 0.19858015121640898,
 0.19858015121640898,
 0.19858015121640898,
 0.19858015121640898,
 0.19858015121640898,
 0.19858015121640898,
 0.19858015121640898,
 0.19858015121640898,
 0.19858015121640898,
 0.19858015121640898,
 0.19858015121640898,
 0.19858015121640898,
 0.19858015121640898,
 0.19858015121640898,
 0.19858015121640898,
 0.19858015121640898,
 0.19858015121640898,
 0.19858015121640898,
 0.19858015121640898,
 0.19858015121640898,
 0.19858015121640898,
 0.19858015121640898,
 0.19858015121640898,
 0.19858015121640898,
 0.19858015121640898,
 0.19858015121640898,
 0.19858015121640898,
 0.19858015121640898,
 0.19858015121640898,
 0.19870798995335687,
 0.19870798995335687,
 0.19870798995335687,
 0.19870798995335