### Add Imports

In [112]:
import numpy as np
import copy
import os
import matplotlib.pyplot as plt
import sklearn
import sklearn.datasets
import sklearn.linear_model
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense


%matplotlib inline

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Read dataset and print the shapes

In [85]:
df = pd.read_csv('train.csv')

feature_names = df.columns
X = df.iloc[:, :-1].values
Y = df.iloc[:, -1].values

In [51]:
print("Feature Names: ")
print(feature_names)
print("Features Matrix (X):")
print(X[:2])
print("\nTarget Vector (Y):")
print(Y[:2])

Feature Names: 
Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCo

### See features

##### See Unique values per column

In [None]:
def create_ranges(series, num_bins=20):
    min_val, max_val = series.min(), series.max()
    range_step = (max_val - min_val) / num_bins
    ranges = [f"{min_val + range_step * i:.2f} - {min_val + range_step * (i+1):.2f}" for i in range(num_bins)]
    return ranges

def print_ranges(df):
  for column in df.columns:
      unique_values = df[column].unique()

      if unique_values.size > 20 and np.issubdtype(df[column].dtype, np.number):
            print(f"Column '{column}' has more than 20 unique numeric values. Creating ranges:")
            print(create_ranges(df[column]))
      else:
        print(f"Unique values in '{column}':")
        print(unique_values)
      print()

print_ranges(df)

##### Check for Nans

In [86]:
def replaceNans(df):
  df_numeric = df.select_dtypes(include=[np.number])
  df[df_numeric.columns] = df_numeric.fillna(0)
  return df

df = replaceNans(df)

### Feature Engineering

In [96]:
def preprocessDataset(df):
  numerical_data = df.select_dtypes(include=['int64', 'float64'])
  categorical_data = df.select_dtypes(include=['object'])

  # One hot encoding for categories
  categorical_data_encoded = pd.get_dummies(categorical_data)

  # Apply Min-Max Scaling to numerical data
  scaler = MinMaxScaler()
  numerical_data_scaled = scaler.fit_transform(numerical_data)
  numerical_data_scaled_df = pd.DataFrame(numerical_data_scaled, columns=numerical_data.columns)

  preprocessed_df = pd.concat([numerical_data_scaled_df, categorical_data_encoded], axis=1)
  preprocessed_df.drop('Id', axis=1, inplace=True)
  return preprocessed_df

preprocessed_df = preprocessDataset(df)
#print(preprocessed_df)

In [97]:
preprocessed_X = preprocessed_df.iloc[:, :-1].values
Y = preprocessed_df.iloc[:, -1].values

In [98]:
print(preprocessed_X.shape)

(1460, 288)


#### Train

In [90]:
input_shape = (preprocessed_X.shape[1],)
model = Sequential([
    Dense(512, input_shape=input_shape),
    Dense(256),
    Dense(100),
    Dense(1)
])

model.compile(optimizer='adam',
              loss='mean_squared_error',
              metrics=['mae'])

In [None]:
model.fit(preprocessed_X, Y, epochs=1000, batch_size=32)


# Predicton

In [114]:
# Read train data

test_df = pd.read_csv('test.csv')

ids = test_df['Id'].values.astype(int)
test_df = replaceNans(test_df)
preprocessed_test_df = preprocessDataset(test_df)



In [107]:
print(preprocessed_test_df.shape)

unique_to_preprocessed_df = preprocessed_df.columns.difference(preprocessed_test_df.columns)
print(unique_to_preprocessed_df)

# Populate difference of columns in test_Df
for column in unique_to_preprocessed_df:
    preprocessed_test_df[column] = 0

# maintain order of the columns too:
ordered_columns = [col for col in preprocessed_df.columns if col in preprocessed_test_df.columns]

preprocessed_test_df = preprocessed_test_df[ordered_columns]
preprocessed_test_df.drop('SalePrice', axis=1, inplace=True)

# check difference again:
unique_to_preprocessed_df = preprocessed_df.columns.difference(preprocessed_test_df.columns)
print(unique_to_preprocessed_df)

print("Shape of train: " + str(preprocessed_df.shape) + " Shape of test: " + str(preprocessed_test_df.shape))



(1459, 270)
Index(['Condition2_RRAe', 'Condition2_RRAn', 'Condition2_RRNn',
       'Electrical_Mix', 'Exterior1st_ImStucc', 'Exterior1st_Stone',
       'Exterior2nd_Other', 'GarageQual_Ex', 'Heating_Floor', 'Heating_OthW',
       'HouseStyle_2.5Fin', 'MiscFeature_TenC', 'PoolQC_Fa',
       'RoofMatl_ClyTile', 'RoofMatl_Membran', 'RoofMatl_Metal',
       'RoofMatl_Roll', 'SalePrice', 'Utilities_NoSeWa'],
      dtype='object')
Index(['SalePrice'], dtype='object')
Shape of train: (1460, 289) Shape of test: (1459, 288)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  preprocessed_test_df.drop('SalePrice', axis=1, inplace=True)


In [108]:
predictions = model.predict(preprocessed_test_df)



In [115]:
# concatenate predictions with ids and store to file.

result = np.column_stack((ids, predictions))

print("Predictions shape: " + str(predictions.shape))
print("ids shape: " + str(ids.shape))
print("result shape: " + str(result.shape))

Predictions shape: (1459, 1)
ids shape: (1459,)
result shape: (1459, 2)


In [117]:

result_df = pd.DataFrame(result, columns=['Id', 'SalePrice'])
result_df['Id'] = result_df['Id'].astype(int)


# Save the DataFrame to a CSV file
result_df.to_csv('predictions.csv', index=False)


In [113]:
print("Current Working Directory:", os.getcwd())


Current Working Directory: /content
