In [1]:
#Import Dependecies
from sklearn.feature_extraction import FeatureHasher
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import pandas as pd

In [2]:
#Import Data
training_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')
training_data.head(10)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
5,6,50,RL,85.0,14115,Pave,,IR1,Lvl,AllPub,...,0,,MnPrv,Shed,700,10,2009,WD,Normal,143000
6,7,20,RL,75.0,10084,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,307000
7,8,60,RL,,10382,Pave,,IR1,Lvl,AllPub,...,0,,,Shed,350,11,2009,WD,Normal,200000
8,9,50,RM,51.0,6120,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2008,WD,Abnorml,129900
9,10,190,RL,50.0,7420,Pave,,Reg,Lvl,AllPub,...,0,,,,0,1,2008,WD,Normal,118000


In [3]:

# Check for null values in each column
columns_with_null = training_data.columns[training_data.isnull().any()].tolist()

#Clean out null columns with proper replacements
training_data['LotFrontage'].fillna(0, inplace=True)
training_data['Alley'].fillna('NoAlley', inplace=True)
training_data['MasVnrType'].fillna('None', inplace=True)
training_data['MasVnrArea'].fillna(0, inplace=True)
training_data['GarageYrBlt'].fillna(0, inplace=True)

columns_to_convert = ['BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 
                      'Electrical', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual','GarageCond','PoolQC','Fence','MiscFeature']

for column in columns_to_convert:
    training_data[column].fillna('NA', inplace=True)



# Print columns with null values
for column in columns_with_null:
    unique_values = training_data[column].unique()
    print(f'Column "{column}" contains null values. Unique values: {unique_values}')



Column "LotFrontage" contains null values. Unique values: [ 65.  80.  68.  60.  84.  85.  75.   0.  51.  50.  70.  91.  72.  66.
 101.  57.  44. 110.  98.  47. 108. 112.  74. 115.  61.  48.  33.  52.
 100.  24.  89.  63.  76.  81.  95.  69.  21.  32.  78. 121. 122.  40.
 105.  73.  77.  64.  94.  34.  90.  55.  88.  82.  71. 120. 107.  92.
 134.  62.  86. 141.  97.  54.  41.  79. 174.  99.  67.  83.  43. 103.
  93.  30. 129. 140.  35.  37. 118.  87. 116. 150. 111.  49.  96.  59.
  36.  56. 102.  58.  38. 109. 130.  53. 137.  45. 106. 104.  42.  39.
 144. 114. 128. 149. 313. 168. 182. 138. 160. 152. 124. 153.  46.]
Column "Alley" contains null values. Unique values: ['NoAlley' 'Grvl' 'Pave']
Column "MasVnrType" contains null values. Unique values: ['BrkFace' 'None' 'Stone' 'BrkCmn']
Column "MasVnrArea" contains null values. Unique values: [1.960e+02 0.000e+00 1.620e+02 3.500e+02 1.860e+02 2.400e+02 2.860e+02
 3.060e+02 2.120e+02 1.800e+02 3.800e+02 2.810e+02 6.400e+02 2.000e+02
 2.460e+

In [4]:
numerical_features = training_data.select_dtypes(include=['int', 'float']).columns.tolist()
categorical_features = training_data.select_dtypes(include=['object']).columns.tolist()

In [18]:
# Create a copy of the DataFrame
df_hashed = training_data.copy()

# Apply feature hashing to each categorical column
for col in categorical_features: 
    num_unique_values = training_data[col].nunique()
    num_hashed_features = max(1, num_unique_values // 2)  # Ensure at least 1 feature
    fh = FeatureHasher(n_features=num_hashed_features, input_type='string')
    hashed_features = fh.fit_transform(training_data[[col]].astype(str).values)
    hashed_df = pd.DataFrame(hashed_features.toarray(), columns=[f"{col}_hash_{i}" for i in range(num_hashed_features)])
    df_hashed = pd.concat([df_hashed, hashed_df], axis=1)

# Drop the original categorical columns
df_hashed = df_hashed.drop(categorical_features, axis=1)

# Display the resulting DataFrame
print(df_hashed)



        Id  MSSubClass  LotFrontage  LotArea  OverallQual  OverallCond  \
0        1          60         65.0     8450            7            5   
1        2          20         80.0     9600            6            8   
2        3          60         68.0    11250            7            5   
3        4          70         60.0     9550            7            5   
4        5          60         84.0    14260            8            5   
...    ...         ...          ...      ...          ...          ...   
1455  1456          60         62.0     7917            6            5   
1456  1457          20         85.0    13175            6            6   
1457  1458          70         66.0     9042            7            9   
1458  1459          20         68.0     9717            5            6   
1459  1460          20         75.0     9937            5            6   

      YearBuilt  YearRemodAdd  MasVnrArea  BsmtFinSF1  ...  Fence_hash_1  \
0          2003          2003      

In [19]:
#Scale the data
scaler = StandardScaler()
df_hashed[numerical_features] = scaler.fit_transform(df_hashed[numerical_features])

In [20]:
X = df_hashed.drop('SalePrice', axis=1)
X = X.drop('Id', axis = 1)
y = df_hashed['SalePrice']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("Mean Squared Error:", mse)
print("R-squared:", r2)

Mean Squared Error: 768459735.1626816
R-squared: 0.8903032358142621


In [21]:
df_hashed.head()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,Fence_hash_1,MiscFeature_hash_0,MiscFeature_hash_1,SaleType_hash_0,SaleType_hash_1,SaleType_hash_2,SaleType_hash_3,SaleCondition_hash_0,SaleCondition_hash_1,SaleCondition_hash_2
0,1,0.073375,0.212877,-0.207142,0.651479,-0.5172,1.050994,0.878668,0.514104,0.575425,...,1.0,0.0,1.0,0.0,0.0,0.0,-1.0,1.0,0.0,0.0
1,2,-0.872563,0.645747,-0.091886,-0.071836,2.179628,0.156734,-0.429577,-0.57075,1.171992,...,1.0,0.0,1.0,0.0,0.0,0.0,-1.0,1.0,0.0,0.0
2,3,0.073375,0.299451,0.07348,0.651479,-0.5172,0.984752,0.830215,0.325915,0.092907,...,1.0,0.0,1.0,0.0,0.0,0.0,-1.0,1.0,0.0,0.0
3,4,0.309859,0.068587,-0.096897,0.651479,-0.5172,-1.863632,-0.720298,-0.57075,-0.499274,...,1.0,0.0,1.0,0.0,0.0,0.0,-1.0,1.0,0.0,0.0
4,5,0.073375,0.761179,0.375148,1.374795,-0.5172,0.951632,0.733308,1.366489,0.463568,...,1.0,0.0,1.0,0.0,0.0,0.0,-1.0,1.0,0.0,0.0


In [22]:
from sklearn.ensemble import GradientBoostingRegressor

# Create the Gradient Boosting Regressor model
gb = GradientBoostingRegressor(n_estimators=100, random_state=42)
gb.fit(X_train, y_train)

# Make predictions and evaluate the model
y_pred_gb = gb.predict(X_test)
mse_gb = mean_squared_error(y_test, y_pred_gb)
r2_gb = r2_score(y_test, y_pred_gb)
print("Gradient Boosting Mean Squared Error:", mse_gb)
print("Gradient Boosting R-squared:", r2_gb)

Gradient Boosting Mean Squared Error: 747635858.2101732
Gradient Boosting R-squared: 0.8932758208632479


In [25]:
feature_importance = gb.feature_importances_

# Create a DataFrame to display the feature importances
fi_df = pd.DataFrame({'Feature': X_train.columns, 'Importance': feature_importance})
fi_df = fi_df.sort_values(by='Importance', ascending=False).reset_index(drop=True)

# Display the feature importances
print(fi_df)

                  Feature  Importance
0             OverallQual    0.492594
1               GrLivArea    0.142112
2              GarageCars    0.048483
3              BsmtFinSF1    0.040046
4             TotalBsmtSF    0.039714
..                    ...         ...
154     HouseStyle_hash_3    0.000000
155     HouseStyle_hash_1    0.000000
156     HouseStyle_hash_0    0.000000
157       BldgType_hash_1    0.000000
158  SaleCondition_hash_2    0.000000

[159 rows x 2 columns]


In [27]:
#Testing data predictions 
# Check for null values in each column
columns_with_null = test_data.columns[test_data.isnull().any()].tolist()

# Clean out null columns with proper replacements
test_data['LotFrontage'].fillna(0, inplace=True)
test_data['Alley'].fillna('NoAlley', inplace=True)
test_data['MasVnrType'].fillna('None', inplace=True)
test_data['MasVnrArea'].fillna(0, inplace=True)
test_data['GarageYrBlt'].fillna(0, inplace=True)

columns_to_convert = ['BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 
                      'Electrical', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual','GarageCond','PoolQC','Fence','MiscFeature']

for column in columns_to_convert:
    test_data[column].fillna('NA', inplace=True)

# Print columns with null values
for column in columns_with_null:
    unique_values = test_data[column].unique()
    print(f'Column "{column}" contains null values. Unique values: {unique_values}')

Column "MSZoning" contains null values. Unique values: ['RH' 'RL' 'RM' 'FV' 'C (all)' nan]
Column "Utilities" contains null values. Unique values: ['AllPub' nan]
Column "Exterior1st" contains null values. Unique values: ['VinylSd' 'Wd Sdng' 'HdBoard' 'Plywood' 'MetalSd' 'CemntBd' 'WdShing'
 'BrkFace' 'AsbShng' 'BrkComm' 'Stucco' 'AsphShn' nan 'CBlock']
Column "Exterior2nd" contains null values. Unique values: ['VinylSd' 'Wd Sdng' 'HdBoard' 'Plywood' 'MetalSd' 'Brk Cmn' 'CmentBd'
 'ImStucc' 'Wd Shng' 'AsbShng' 'Stucco' 'CBlock' 'BrkFace' 'AsphShn' nan
 'Stone']
Column "BsmtFinSF1" contains null values. Unique values: [ 468.  923.  791.  602.  263.    0.  935.  637.  804. 1051.  156.  300.
  514.  110.   28. 1373.  578.   24.   16.  326. 1414.  126.  250. 1129.
 1298.  280.  368.  376.  466.  244. 1032.  484.  833.  506. 1137.  687.
  329.  698. 1059. 1010. 1500.  670.  944. 1188.  856.  936.  734.  339.
  648.  532.  481.  588.  717.   48.  579.  274.  780.  176.  283.  788.
  474.  188

In [12]:
test_data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,NoAlley,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,NoAlley,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,NoAlley,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,NoAlley,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,NoAlley,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


In [13]:
numerical_features = test_data.select_dtypes(include=['int', 'float']).columns.tolist()
categorical_features = test_data.select_dtypes(include=['object']).columns.tolist()
numerical_features.remove('Id')

In [14]:
# Create a copy of the DataFrame
test_data_hashed = test_data.copy()

# Apply feature hashing to each categorical column
for col in categorical_features: 
    fh = FeatureHasher(n_features=10, input_type='string')
    hashed_features = fh.fit_transform(test_data[[col]].astype(str).values)
    hashed_df = pd.DataFrame(hashed_features.toarray(), columns=[f"{col}_hash_{i}" for i in range(10)])
    test_data_hashed = pd.concat([test_data_hashed, hashed_df], axis=1)

# Drop the original categorical columns
test_data_hashed = test_data_hashed.drop(categorical_features, axis=1)

In [15]:
test_data_hashed.head()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,SaleCondition_hash_0,SaleCondition_hash_1,SaleCondition_hash_2,SaleCondition_hash_3,SaleCondition_hash_4,SaleCondition_hash_5,SaleCondition_hash_6,SaleCondition_hash_7,SaleCondition_hash_8,SaleCondition_hash_9
0,1461,20,80.0,11622,5,6,1961,1961,0.0,468.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1462,20,81.0,14267,6,6,1958,1958,108.0,923.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,1463,60,74.0,13830,5,5,1997,1998,0.0,791.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,1464,60,78.0,9978,6,6,1998,1998,20.0,602.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,1465,120,43.0,5005,8,5,1992,1992,0.0,263.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [16]:
# Scale the data
scaler = StandardScaler()
test_data_hashed[numerical_features] = scaler.fit_transform(test_data_hashed[numerical_features])
test_data_hashed.head()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,SaleCondition_hash_0,SaleCondition_hash_1,SaleCondition_hash_2,SaleCondition_hash_3,SaleCondition_hash_4,SaleCondition_hash_5,SaleCondition_hash_6,SaleCondition_hash_7,SaleCondition_hash_8,SaleCondition_hash_9
0,1461,-0.874711,0.684849,0.363929,-0.751101,0.400766,-0.340945,-1.072885,-0.563316,0.063273,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1462,-0.874711,0.715852,0.897861,-0.054877,0.400766,-0.439695,-1.214908,0.047057,1.063027,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,1463,0.061351,0.498831,0.809646,-0.751101,-0.497418,0.844059,0.678742,-0.563316,0.772989,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,1464,0.061351,0.622843,0.032064,-0.054877,0.400766,0.876976,0.678742,-0.450284,0.357706,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,1465,1.465443,-0.462261,-0.971808,1.337571,-0.497418,0.679475,0.394694,-0.563316,-0.387166,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [17]:
# Assuming 'Id' column is present and should be dropped
X_second = test_data_hashed.drop(['Id'], axis=1)

# Make predictions using the trained model
y_pred_second = gb.predict(X_second)

# Add the predicted sale prices to the second data DataFrame
test_data_hashed['PredictedSalePrice'] = y_pred_second

# Display the DataFrame with the predicted sale prices
print(test_data_hashed[['Id', 'PredictedSalePrice']])

ValueError: Input X contains NaN.
GradientBoostingRegressor does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values