In [41]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from torch.utils.data import DataLoader, TensorDataset


In [51]:
# Load the dataset
data = pd.read_csv("Melbourne_housing_FULL.csv")

numeric_features_index = data.dtypes[data.dtypes != 'object'].index
print("the shape of numeric dtype indexs:",numeric_features_index.shape)

label_features_index = ['Price']
numeric_features_index = numeric_features_index.drop(label_features_index)
object_feature_index = data.dtypes[data.dtypes == 'object'].index

print("the lable name:", label_features_index)
print("the shape of numeric dtype indexs:",numeric_features_index.shape)
print("the shape of object dtype indexs:",object_feature_index.shape)


# Example of feature engineering: let's say we are using the numerical columns
features = data[numeric_features_index].values
labels = data[label_features_index].values
print('hi')

# Standardize the features (important for neural networks)
scaler = StandardScaler()
features = scaler.fit_transform(features)

# Reshape the features into a 2D format like a "image" (mimicking CNN's input format)
# The CNN expects input with shape (batch_size, channels, height, width). We'll reshape to simulate a single-channel "image".
features = features.reshape(features.shape[0], 1, 2, 3)  # Simulating a 1x2x3 "image" from the features

# Convert data to PyTorch tensors
features_tensor = torch.tensor(features, dtype=torch.float32)
labels_tensor = torch.tensor(labels, dtype=torch.float32).view(-1, 1)  # Reshaping labels to (N, 1)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features_tensor, labels_tensor, test_size=0.2, random_state=42)

# Create DataLoader for batch processing
train_data = TensorDataset(X_train, y_train)
test_data = TensorDataset(X_test, y_test)

train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
test_loader = DataLoader(test_data, batch_size=32, shuffle=False)


the shape of numeric dtype indexs: (13,)
the lable name: ['Price']
the shape of numeric dtype indexs: (12,)
the shape of object dtype indexs: (8,)
hi


ValueError: cannot reshape array of size 418284 into shape (34857,1,2,3)

# Define the CNN Architecture

In [None]:
class HousePriceCNN(nn.Module):
    def __init__(self):
        super(HousePriceCNN, self).__init__()
        
        # Define CNN layers (convolution, pooling)
        self.conv1 = nn.Conv2d(1, 32, kernel_size=2)  # Input: (1, 2, 3) -> Output: (32, 1, 2)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=2)  # Output: (64, 1, 1)
        
        # Flatten the output from the CNN layers
        self.flatten = nn.Flatten()
        
        # Fully connected layers
        self.fc1 = nn.Linear(64, 128)  # First fully connected layer
        self.fc2 = nn.Linear(128, 1)  # Output layer

    def forward(self, x):
        x = self.conv1(x)
        x = torch.relu(x)
        x = self.conv2(x)
        x = torch.relu(x)
        x = self.flatten(x)
        x = self.fc1(x)
        x = torch.relu(x)
        x = self.fc2(x)
        return x


# Train the CNN Model
 Write the training loop for the model

In [None]:
# Initialize the model, loss function, and optimizer
model = HousePriceCNN()
criterion = nn.MSELoss()  # Mean Squared Error loss for regression
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
epochs = 50
for epoch in range(epochs):
    model.train()
    running_loss = 0.0
    
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        
        # Forward pass
        outputs = model(inputs)
        
        # Compute loss
        loss = criterion(outputs, labels)
        
        # Backward pass and optimization
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()
    
    print(f"Epoch [{epoch+1}/{epochs}], Loss: {running_loss/len(train_loader)}")

# Save the trained model
torch.save(model.state_dict(), "house_price_cnn.pth")


# Evaluate the model on the test set

After training model can be tested

In [None]:
# Evaluation
model.eval()
predictions = []
true_labels = []

with torch.no_grad():
    for inputs, labels in test_loader:
        outputs = model(inputs)
        predictions.append(outputs.numpy())
        true_labels.append(labels.numpy())

# Convert lists to numpy arrays
predictions = np.concatenate(predictions, axis=0)
true_labels = np.concatenate(true_labels, axis=0)

# Print Mean Squared Error
mse = np.mean((predictions - true_labels) ** 2)
print(f"Mean Squared Error on Test Set: {mse}")


# Prediction Example

In [None]:
# Example prediction
new_data = np.array([[10000, 6, 5, 2500, 8, 1995]])  # Example feature values
new_data = scaler.transform(new_data)  # Apply the same scaler

# Reshape to simulate "image-like" input
new_data = new_data.reshape(new_data.shape[0], 1, 2, 3)

# Convert to tensor
new_data_tensor = torch.tensor(new_data, dtype=torch.float32)

# Make prediction
model.eval()
with torch.no_grad():
    prediction = model(new_data_tensor)
    print(f"Predicted House Price: ${prediction.item():,.2f}")


# Conclusion

- This is a basic CNN model for house price prediction. Although CNNs are typically used for image data, you can experiment with structured data in a similar "image-like" format for feature input.

- For structured data like house prices, fully connected (dense) neural networks or other traditional machine learning models (e.g., decision trees, random forests, etc.) are usually more effective.

In [39]:
melbourne_data =  pd.read_csv("/home/gitika/My_Home/Python_ML_CNN/Regression/Melbourne_housing_FULL.csv")
#print(melbourne_data.describe())

inp_f=['Rooms','Suburb','Distance']
df_all = melbourne_data[inp_f]

new_all = pd.get_dummies(df_all, dummy_na=True)
print("the shape of preprossed train data:",new_all.shape)
print('df_all.shape,new_all.shape',df_all.shape,new_all.shape)
#print(df_all)
print(new_all)

the shape of preprossed train data: (34857, 354)
df_all.shape,new_all.shape (34857, 3) (34857, 354)
       Rooms  Distance  Suburb_Abbotsford  Suburb_Aberfeldie  \
0          2       2.5               True              False   
1          2       2.5               True              False   
2          2       2.5               True              False   
3          3       2.5               True              False   
4          3       2.5               True              False   
...      ...       ...                ...                ...   
34852      4       6.3              False              False   
34853      2       6.3              False              False   
34854      2       6.3              False              False   
34855      3       6.3              False              False   
34856      2       6.3              False              False   

       Suburb_Airport West  Suburb_Albanvale  Suburb_Albert Park  \
0                    False             False               Fals