In [20]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import researchpy as rp
import statsmodels.api as smf
from statsmodels.formula.api import ols

In [21]:
df_train = pd.read_csv('train.csv')
df_test=pd.read_csv('test.csv')
Y = df_train[["SalePrice"]]
df_train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [22]:
print(df_test.shape)
#Check how many NaN values have each feature
counting_nan = pd.DataFrame({"Counting nan": df_test.isna().sum().sort_values(ascending=False)})
counting_nan.head(10)

(1459, 80)


Unnamed: 0,Counting nan
PoolQC,1456
MiscFeature,1408
Alley,1352
Fence,1169
FireplaceQu,730
LotFrontage,227
GarageCond,78
GarageQual,78
GarageYrBlt,78
GarageFinish,78


In [23]:
print(df_train.shape)

#Check how many NaN values have each feature
counting_nan = pd.DataFrame({"Counting nan": df_train.isna().sum().sort_values(ascending=False)})
counting_nan.head(10)

(1460, 81)


Unnamed: 0,Counting nan
PoolQC,1453
MiscFeature,1406
Alley,1369
Fence,1179
FireplaceQu,690
LotFrontage,259
GarageCond,81
GarageType,81
GarageYrBlt,81
GarageFinish,81


In [24]:
def preprocess(df):
  #df.drop('PoolQC', axis=1, inplace=True)
  #df.drop('MiscFeature', axis=1, inplace=True)
  #df.drop('Alley', axis=1, inplace=True)
  #df.drop('Fence', axis=1, inplace=True)
  #df.drop('FireplaceQu', axis=1, inplace=True)
  #df.drop('LotFrontage', axis=1, inplace=True)
    
  # because a very high percentage of data in the above mentioned columns are null , it is very obvious to drop the columns for better
  # understanding however it can also be understood that null is a characteristic of certain coumn so for this instance we are taking it
    
  numeric_variables = list(df.select_dtypes(include=['int64','float']).columns.values)
  df[numeric_variables]=df[numeric_variables].apply(lambda x: x.fillna(x.median()),axis=0)
  categorical_variables = list(df.select_dtypes(exclude=['int64','float','bool']).columns.values)
  df[categorical_variables]=df[categorical_variables].apply(lambda x: x.fillna("None"),axis=0)
  
  df = pd.get_dummies(df, prefix_sep="__",columns=categorical_variables)  
  
  return df

In [25]:
df_train =  preprocess(df_train.drop("SalePrice",axis=1))
df_test =  preprocess(df_test)
df_train.shape

(1460, 304)

In [26]:
# Get missing columns in the training test
missing_cols = set( df_train.columns ) - set( df_test.columns )
# Add a missing column in test set with default value equal to 0
for c in missing_cols:
    df_test[c] = 0
# Ensure the order of column in the test set is in the same order than in train set
df_test = df_test[df_train.columns]

In [27]:
df_test.shape

(1459, 304)

Since the data shape was about 81 columns we should not at the first stage remove a column with many null values , however a null feature can be seen as a characteristic of a category variable thus we alot a not available in for null.

In [None]:
df_train.head(5)

In [None]:
df_train.describe()

# Splitting The Train set 

In [28]:
from sklearn.model_selection import train_test_split 
X_train, X_test, y_train, y_test = train_test_split(df_train, Y, random_state=42)
(X_train.shape, y_test.shape)

((1095, 304), (365, 1))

In [None]:
from sklearn import preprocessing
scaler = preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)   
X_test = scaler.transform(X_test) 

# Lasso Regression 

In [29]:
from sklearn.linear_model import Lasso
lasso = Lasso(random_state=42)


In [30]:
from sklearn.model_selection import GridSearchCV
alphas = [0.03,0,0.02,0.01]

parameters_lasso = {
    'alpha':alphas,
}
n_folds = 2

clf = GridSearchCV(lasso, parameters_lasso, cv=n_folds, refit=False)
clf.fit(X_train, y_train)
scores = clf.cv_results_['mean_test_score']
scores_std = clf.cv_results_['std_test_score']

  estimator.fit(X_train, y_train, **fit_params)
  positive)
  estimator.fit(X_train, y_train, **fit_params)
  positive)


In [31]:
clf.best_params_

{'alpha': 0.03}

In [32]:
scores

array([0.69604112, 0.69504297, 0.69596071, 0.69575354])

In [33]:
scores_std

array([0.01743296, 0.01520781, 0.01680881, 0.01605765])

In [38]:
from sklearn import linear_model
reg = linear_model.Lasso(alpha = clf.best_params_['alpha'] , max_iter = 149898)
reg.fit(X_train,y_train)
clf_predict = reg.predict(X_test)


In [42]:
from sklearn.metrics import explained_variance_score
print("explained Variance Score  --->   " + str(explained_variance_score(y_test, clf_predict)) ) 
from sklearn.metrics import mean_absolute_error
print("mean_absolute_error  --->   " + str(mean_absolute_error(y_test, clf_predict)) ) 
from sklearn.metrics import mean_squared_log_error
#print("mean_squared_log_error  --->   " + str(mean_squared_log_error(y_test, clf_predict)) ) 
from sklearn.metrics import r2_score
print("r2_score  --->   " + str(r2_score(y_test, clf_predict)) ) 

explained Variance Score  --->   0.5014894579601672
mean_absolute_error  --->   19433.024751932204
r2_score  --->   0.49908469618294893


# Ridge Regression

In [43]:
from sklearn.linear_model import Ridge
ridgeReg = Ridge(alpha=0.05, normalize=True)
ridge_params = [{'alpha': [0.05,0.005,0.0005,0.5]
          }]

In [44]:
clf_ridge = GridSearchCV(estimator=ridgeReg,
                   param_grid = ridge_params, 
                   cv=5, 
                   refit=True)

clf_ridge.fit(X_train, y_train)

ridgeReg_opt = clf_ridge.best_estimator_ 
ridgeReg_opt_predict = ridgeReg_opt.predict(X_test)

In [45]:
ridgeReg_opt_predict_test = ridgeReg_opt.predict(df_test)

In [46]:
ridgeReg_test = ridgeReg_opt.predict(df_test)

In [87]:
Solution = pd.DataFrame(df_test[["Id"]])
Solution['SalesPrice'] = ridgeReg_test


In [89]:
Solution.to_csv('SolutionFinal.csv', index = None, header=True)

In [47]:
clf_ridge.best_params_

{'alpha': 0.5}

In [48]:
from sklearn.metrics import explained_variance_score
print("explained Variance Score  --->   " + str(explained_variance_score(y_test, ridgeReg_opt_predict)) ) 
from sklearn.metrics import mean_absolute_error
print("mean_absolute_error  --->   " + str(mean_absolute_error(y_test, ridgeReg_opt_predict)) ) 
from sklearn.metrics import mean_squared_log_error
print("mean_squared_log_error  --->   " + str(mean_squared_log_error(y_test, ridgeReg_opt_predict)) ) 
from sklearn.metrics import r2_score
print("r2_score  --->   " + str(r2_score(y_test, ridgeReg_opt_predict)) ) 

explained Variance Score  --->   0.8789833786170925
mean_absolute_error  --->   17924.493718276877
mean_squared_log_error  --->   0.02140870225939272
r2_score  --->   0.8786980476061791


# Decission Tree Regressor

In [None]:
from sklearn.tree import DecisionTreeRegressor
dtree = DecisionTreeRegressor(max_depth=8,random_state=51)
params = [{'max_depth': np.linspace(1,10,10,dtype=int),
           'random_state':np.linspace(0,100,10,dtype=int) 
          }]

In [None]:
clf_tree = GridSearchCV(estimator=dtree,
                   param_grid = params, 
                   cv=5, 
                   refit=True)

clf_tree.fit(X_train, y_train)

dtree_opt = clf_tree.best_estimator_ 
dtree_opt_predict = dtree_opt.predict(X_test)

In [None]:
clf_tree.best_params_

In [None]:
from sklearn.metrics import explained_variance_score
print("explained Variance Score  --->   " + str(explained_variance_score(y_test, dtree_opt_predict)) ) 
from sklearn.metrics import mean_absolute_error
print("mean_absolute_error  --->   " + str(mean_absolute_error(y_test, dtree_opt_predict)) ) 
from sklearn.metrics import mean_squared_log_error
print("mean_squared_log_error  --->   " + str(mean_squared_log_error(y_test, dtree_opt_predict)) ) 
from sklearn.metrics import r2_score
print("r2_score  --->   " + str(r2_score(y_test, dtree_opt_predict)) ) 