### Add Imports

In [1]:
import numpy as np
import copy
import os
import matplotlib.pyplot as plt
import sklearn
import sklearn.datasets
import sklearn.linear_model
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense


%matplotlib inline

%load_ext autoreload
%autoreload 2

### Read dataset and print the shapes

In [2]:
df = pd.read_csv('train.csv')

feature_names = df.columns
X = df.iloc[:, :-1].values
Y = df.iloc[:, -1].values

In [124]:
#print("Feature Names: ")
#print(feature_names)
#print("Features Matrix (X):")
#print(X[:2])
#print("\nTarget Vector (Y):")
#print(Y[:2])

### See features

##### See Unique values per column

In [125]:
def create_ranges(series, num_bins=20):
    min_val, max_val = series.min(), series.max()
    range_step = (max_val - min_val) / num_bins
    ranges = [f"{min_val + range_step * i:.2f} - {min_val + range_step * (i+1):.2f}" for i in range(num_bins)]
    return ranges

def print_ranges(df):
  for column in df.columns:
      unique_values = df[column].unique()

      if unique_values.size > 20 and np.issubdtype(df[column].dtype, np.number):
            print(f"Column '{column}' has more than 20 unique numeric values. Creating ranges:")
            print(create_ranges(df[column]))
      else:
        print(f"Unique values in '{column}':")
        print(unique_values)
      print()

#print_ranges(df)

##### Check for Nans

In [3]:
def replace_nans(df):
  df_numeric = df.select_dtypes(include=[np.number])
  df[df_numeric.columns] = df_numeric.fillna(0)
  return df

df = replace_nans(df)

### Feature Engineering

In [4]:
def preprocessDataset(df):
  numerical_data = df.select_dtypes(include=['int64', 'float64'])
  categorical_data = df.select_dtypes(include=['object'])

  # One hot encoding for categories
  categorical_data_encoded = pd.get_dummies(categorical_data)

  # Apply Min-Max Scaling to numerical data
  scaler = MinMaxScaler()
  numerical_data_scaled = scaler.fit_transform(numerical_data)
  numerical_data_scaled_df = pd.DataFrame(numerical_data_scaled, columns=numerical_data.columns)

  preprocessed_df = pd.concat([numerical_data_scaled_df, categorical_data_encoded], axis=1)
  preprocessed_df.drop('Id', axis=1, inplace=True)
  return preprocessed_df

preprocessed_df = preprocessDataset(df)
#print(preprocessed_df)

In [5]:
preprocessed_X = preprocessed_df.iloc[:, :-1].values
Y = preprocessed_df.iloc[:, -1].values

In [None]:
print(preprocessed_X.shape)

(1460, 288)


#### Train

In [7]:
input_shape = (preprocessed_X.shape[1],)
model = Sequential([
    Dense(2024, input_shape=input_shape),
    Dense(1024),
    Dense(512),
    Dense(256),
    Dense(1)
])

model.compile(optimizer='adam',
              loss='mean_squared_error',
              metrics=[tf.keras.metrics.RootMeanSquaredError(name='rmse')])

In [8]:
model.fit(preprocessed_X, Y, epochs=1000, batch_size=4)


Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 69/1000
Epoch 70/1000
Epoch 71/1000
Epoch 72/1000
E

<keras.src.callbacks.History at 0x7e1cc8c84f40>

# Predicton

In [9]:
# Read train data

test_df = pd.read_csv('test.csv')

ids = test_df['Id'].values.astype(int)
test_df = replace_nans(test_df)
preprocessed_test_df = preprocessDataset(test_df)



In [10]:
#print(preprocessed_test_df.shape)

unique_to_preprocessed_df = preprocessed_df.columns.difference(preprocessed_test_df.columns)
#print(unique_to_preprocessed_df)

# Populate difference of columns in test_Df
for column in unique_to_preprocessed_df:
    preprocessed_test_df[column] = 0

# maintain order of the columns too:
ordered_columns = [col for col in preprocessed_df.columns if col in preprocessed_test_df.columns]

preprocessed_test_df = preprocessed_test_df[ordered_columns]
preprocessed_test_df.drop('SalePrice', axis=1, inplace=True)

# check difference again:
unique_to_preprocessed_df = preprocessed_df.columns.difference(preprocessed_test_df.columns)
#print(unique_to_preprocessed_df)

#print("Shape of train: " + str(preprocessed_df.shape) + " Shape of test: " + str(preprocessed_test_df.shape))



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  preprocessed_test_df.drop('SalePrice', axis=1, inplace=True)


In [11]:
predictions = model.predict(preprocessed_test_df)



In [12]:
# concatenate predictions with ids and store to file.

result = np.column_stack((ids, predictions))

print("Predictions shape: " + str(predictions.shape))
print("ids shape: " + str(ids.shape))
print("result shape: " + str(result.shape))

Predictions shape: (1459, 1)
ids shape: (1459,)
result shape: (1459, 2)


In [13]:

result_df = pd.DataFrame(result, columns=['Id', 'SalePrice'])
result_df['Id'] = result_df['Id'].astype(int)


# Save the DataFrame to a CSV file
result_df.to_csv('predictions.csv', index=False)


In [None]:
print("Current Working Directory:", os.getcwd())


Current Working Directory: /content
