In [73]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error, r2_score
from scipy.stats import skew
from scipy.special import boxcox1p
from scipy.stats import boxcox_normmax


In [5]:
import warnings
warnings.filterwarnings("ignore")

In [14]:
def binary_encode(column:str,df1):
  if df1[column].isna().sum() > 0:
    df1[column] = df1[column].fillna(0)
    df1.loc[df1[column].notna(), column] = 1
    return 'Col modificada'
  return 'Col no modificada'

def one_hot_encode(df,lst_categories:list,column:str):

  # OneHotEncoder with specified categories
  encoder = OneHotEncoder(categories=[lst_categories], handle_unknown='ignore', sparse_output=False)

  # Fit on df MSZoning and transform
  encoded_array = encoder.fit_transform(df[[column]])
  encoded_df = pd.DataFrame(encoded_array, columns=encoder.get_feature_names_out([column]))
  df = pd.concat([df, encoded_df], axis=1)
  df = df.drop(column, axis=1)
  return df

In [62]:

def pre_processing(df):

    df = df.drop(['Utilities', 'Street', 'PoolQC'], axis=1)

    df['TotalBsmtSF'].fillna(0, inplace=True)
    df['1stFlrSF'].fillna(0, inplace=True)
    df['2ndFlrSF'].fillna(0, inplace=True)
    df['GarageCars'].fillna(0, inplace=True)
    df['GarageArea'].fillna(0, inplace=True)

    '''df1['remodeled'] = ''
    df1['yearbuild_range'] = 1

    df1.loc[df1['YearRemodAdd'] == df1['YearBuilt'], 'remodeled'] = 0
    df1.loc[df1['YearRemodAdd'] != df1['YearBuilt'], 'remodeled'] = 1

    df1.loc[df1['YearBuilt'] > 1954, 'yearbuild_range'] = 2
    df1.loc[df1['YearBuilt'] > 1973, 'yearbuild_range'] = 3
    df1.loc[df1['YearBuilt'] > 2000, 'yearbuild_range'] = 4

    df1.drop(['YearBuilt','YearRemodAdd'], axis=1, inplace=True)'''

    #binary_encode('LotFrontage')
    binary_encode('BsmtQual',df)
    binary_encode('GarageType',df)
    #binary_encode('PoolQC')
    binary_encode('Fence',df)

    df.loc[df['CentralAir'] == 'Y', 'CentralAir'] = 1

    df.rename(columns={'BsmtQual': 'Basement'}, inplace=True)
    df.rename(columns={'GarageType': 'Garage'}, inplace=True)

    '''lst_street = ['Pave','Grvl']
    df1 = one_hot_encode(df1,lst_street,'Street')'''

    lst_lotshape = ['Reg','IR1','IR2','IR3']
    df = one_hot_encode(df,lst_lotshape,'LotShape')

    lst_mssubclass = [20, 30, 40, 45, 50, 60, 70, 75, 80, 85, 90, 120, 150,160,180,190]
    df = one_hot_encode(df,lst_mssubclass,'MSSubClass')

    lst_mszoning = ['RL','RM','FV','RH','C','A','I','RP']
    df = one_hot_encode(df,lst_mszoning,'MSZoning')

    lst_landcontour = ['Lvl','Bnk','Low','HLS']
    df = one_hot_encode(df,lst_landcontour,'LandContour')

    '''lst_utilities = ['AllPub','NoSewr','NoSeWa','ELO']
    df1 = one_hot_encode(df1,lst_utilities,'Utilities')'''

    '''lst_lotconfig = ['Inside','Corner','CulDSac','FR2','FR3']
    df1 = one_hot_encode(df1,lst_lotconfig,'LotConfig')'''

    lst_landslope = ['Gtl','Mod','Sev']
    df = one_hot_encode(df,lst_landslope,'LandSlope')

    lst_neighborhood = ["Blmngtn","Blueste","BrDale","BrkSide","ClearCr","CollgCr","Crawfor","Edwards","Gilbert","IDOTRR","MeadowV","Mitchel","Names","NoRidge","NPkVill","NridgHt","NWAmes","OldTown","SWISU","Sawyer","SawyerW","Somerst","StoneBr","Timber","Veenker"]
    df = one_hot_encode(df,lst_neighborhood,'Neighborhood')

    lst_roofstyle = ['Gable','Hip','Gambrel','Mansard','Flat','Shed']
    df = one_hot_encode(df,lst_roofstyle,'RoofStyle')

    lst_foundation = ['BrkTil','CBlock','PConc','Slab','Stone','Wood']
    df = one_hot_encode(df,lst_foundation,'Foundation')

    

    # Fill missing values in non-numeric columns with 'None'
    non_numeric_columns = df.select_dtypes(include=['object']).columns
    df[non_numeric_columns] = df[non_numeric_columns].fillna('None')

    '''lst_heating = ['Floor','GasA','GasW','Grav','OthW','Wall']
    df1 = one_hot_encode(df1,lst_heating,'Heating')'''

    X = df[df.columns.to_list()]
    return X

In [106]:
def pre_processing_v2(train,test):
    print("Train set size:", train.shape)
    print("Test set size:", test.shape)
    print('START data processing')

    train_ID = train['Id']
    test_ID = test['Id']
    # Now drop the  'Id' colum since it's unnecessary for  the prediction process.
    train.drop(['Id'], axis=1, inplace=True)
    test.drop(['Id'], axis=1, inplace=True)

    # Deleting outliers
    train = train[train.GrLivArea < 4500]
    train.reset_index(drop=True, inplace=True)

    # We use the numpy fuction log1p which  applies log(1+x) to all elements of the column
    #train["SalePrice"] = np.log1p(train["SalePrice"])
    y = train.SalePrice.reset_index(drop=True)
    train_features = train.drop(['SalePrice'], axis=1)
    test_features = test

    features = pd.concat([train_features, test_features]).reset_index(drop=True)
    print(features.shape)
    # Some of the non-numeric predictors are stored as numbers; we convert them into strings 
    features['MSSubClass'] = features['MSSubClass'].apply(str)
    features['YrSold'] = features['YrSold'].astype(str)
    features['MoSold'] = features['MoSold'].astype(str)

    features['Functional'] = features['Functional'].fillna('Typ')
    features['Electrical'] = features['Electrical'].fillna("SBrkr")
    features['KitchenQual'] = features['KitchenQual'].fillna("TA")
    features['Exterior1st'] = features['Exterior1st'].fillna(features['Exterior1st'].mode()[0])
    features['Exterior2nd'] = features['Exterior2nd'].fillna(features['Exterior2nd'].mode()[0])
    features['SaleType'] = features['SaleType'].fillna(features['SaleType'].mode()[0])

    features["PoolQC"] = features["PoolQC"].fillna("None")

    for col in ('GarageYrBlt', 'GarageArea', 'GarageCars'):
        features[col] = features[col].fillna(0)
    for col in ['GarageType', 'GarageFinish', 'GarageQual', 'GarageCond']:
        features[col] = features[col].fillna('None')
    for col in ('BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2'):
        features[col] = features[col].fillna('None')

    features['MSZoning'] = features.groupby('MSSubClass')['MSZoning'].transform(lambda x: x.fillna(x.mode()[0]))

    objects = []
    for i in features.columns:
        if features[i].dtype == object:
            objects.append(i)

    features.update(features[objects].fillna('None'))

    features['LotFrontage'] = features.groupby('Neighborhood')['LotFrontage'].transform(lambda x: x.fillna(x.median()))

    # Filling in the rest of the NA's

    numeric_dtypes = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    numerics = []
    for i in features.columns:
        if features[i].dtype in numeric_dtypes:
            numerics.append(i)
    features.update(features[numerics].fillna(0))

    numeric_dtypes = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    numerics2 = []
    for i in features.columns:
        if features[i].dtype in numeric_dtypes:
            numerics2.append(i)

    '''skew_features = features[numerics2].apply(lambda x: skew(x)).sort_values(ascending=False)

    high_skew = skew_features[skew_features > 0.5]
    skew_index = high_skew.index

    for i in skew_index:
        features[i] = boxcox1p(features[i], boxcox_normmax(features[i] + 1))'''

    features = features.drop(['Utilities', 'Street', 'PoolQC',], axis=1)

    features['YrBltAndRemod']=features['YearBuilt']+features['YearRemodAdd']
    features['TotalSF']=features['TotalBsmtSF'] + features['1stFlrSF'] + features['2ndFlrSF']

    features['Total_sqr_footage'] = (features['BsmtFinSF1'] + features['BsmtFinSF2'] +
                                    features['1stFlrSF'] + features['2ndFlrSF'])

    features['Total_Bathrooms'] = (features['FullBath'] + (0.5 * features['HalfBath']) +
                                features['BsmtFullBath'] + (0.5 * features['BsmtHalfBath']))

    features['Total_porch_sf'] = (features['OpenPorchSF'] + features['3SsnPorch'] +
                                features['EnclosedPorch'] + features['ScreenPorch'] +
                                features['WoodDeckSF'])

    # simplified features
    features['haspool'] = features['PoolArea'].apply(lambda x: 1 if x > 0 else 0)
    features['has2ndfloor'] = features['2ndFlrSF'].apply(lambda x: 1 if x > 0 else 0)
    features['hasgarage'] = features['GarageArea'].apply(lambda x: 1 if x > 0 else 0)
    features['hasbsmt'] = features['TotalBsmtSF'].apply(lambda x: 1 if x > 0 else 0)
    features['hasfireplace'] = features['Fireplaces'].apply(lambda x: 1 if x > 0 else 0)

    print(features.shape)
    final_features = pd.get_dummies(features).reset_index(drop=True)
    print(final_features.shape)
    X = final_features.iloc[:len(y), :]
    X_sub = final_features.iloc[len(y):, :]

    outliers = [30, 88, 462, 631, 1322]
    X = X.drop(X.index[outliers])
    y = y.drop(y.index[outliers])

    overfit = []
    for i in X.columns:
        counts = X[i].value_counts()
        zeros = counts.iloc[0]
        if zeros / len(X) * 100 > 99.94:
            overfit.append(i)

    overfit = list(overfit)
    overfit.append('MSZoning_C (all)')

    X = X.drop(overfit, axis=1).copy()
    X_sub = X_sub.drop(overfit, axis=1).copy()

    return X,y,X_sub

In [107]:
test = pd.read_csv("test.csv")
train = pd.read_csv("train.csv")

Train set size: (1460, 81)
Test set size: (1459, 80)
START data processing
(2917, 79)
(2917, 86)
(2917, 333)


In [64]:
quantitative = [f for f in train.columns if train.dtypes[f] != 'object']
quantitative.remove('SalePrice')
quantitative.remove('Id')
qualitative = [f for f in train.columns if train.dtypes[f] == 'object']

print("Qualitatives features: ",len(qualitative))
print("Quantitative features: ",len(quantitative))


Qualitatives features:  43
Quantitative features:  36


In [None]:
X_train,y_train,x_test = pre_processing_v2(train,test)

In [108]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(X_train,y_train)

from sklearn.linear_model import Lasso
from scipy.stats import uniform
from sklearn.model_selection import RandomizedSearchCV
import time
lasso = Lasso()

# Define the hyperparameters distribution
param_dist = {
    'alpha': uniform(0.1, 10.0),  # Uniform distribution between 0.1 and 10.0
    'fit_intercept': [True, False],
    'max_iter': [1000, 5000, 10000],
    'tol': [0.0001, 0.001, 0.01],
    'selection': ['cyclic', 'random']
}

# Set up the randomized search with verbose output
random_search = RandomizedSearchCV(estimator=lasso, param_distributions=param_dist, n_iter=50, cv=5, scoring='neg_mean_squared_error', n_jobs=-1, random_state=42, verbose=2)

# Measure the time taken to fit the randomized search
start_time = time.time()
random_search.fit(X_train, y_train)
end_time = time.time()

# Calculate the duration
duration = end_time - start_time
print(f"Randomized search took {duration:.2f} seconds")

# Get the best parameters
best_params = random_search.best_params_
print("Best parameters found: ", best_params)

# Train the final model with the best parameters
best_lasso = Lasso(**best_params)
best_lasso.fit(X_train, y_train)

# Evaluate the model


from sklearn.linear_model import Ridge
ridge = Ridge(alpha=4.0)
ridge.fit(X_train,y_train)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
Randomized search took 77.42 seconds
Best parameters found:  {'alpha': 9.71172024349349, 'fit_intercept': True, 'max_iter': 1000, 'selection': 'random', 'tol': 0.01}


In [109]:
y_pred_lr = model.predict(x_test)
y_pred_ridge = ridge.predict(x_test)
y_pred = best_lasso.predict(x_test)

In [110]:
df = pd.read_csv('sample_submission.csv')
df['SalePrice'] = y_pred

df.to_csv('submission.csv', index=False)

In [84]:
mse = mean_squared_error(y_train, y_pred_lr)
r2 = r2_score(y_train, y_pred_lr)

print(f"linear Mean Squared Error: {mse}")
print(f"R-squared: {r2}")

mse = mean_squared_error(y_train, y_pred_laso)
r2 = r2_score(y_train, y_pred_laso)

print(f"laso Mean Squared Error: {mse}")
print(f"R-squared: {r2}")

mse = mean_squared_error(y_train, y_pred_ridge)
r2 = r2_score(y_train, y_pred_ridge)

print(f"ridge Mean Squared Error: {mse}")
print(f"R-squared: {r2}")

linear Mean Squared Error: 0.00794170551648995
R-squared: 0.9502588777462642
laso Mean Squared Error: 0.03029921331426972
R-squared: 0.810227554959351
ridge Mean Squared Error: 0.008592161794979184
R-squared: 0.9461848881980667
