<a href="https://colab.research.google.com/github/elsaimo/4105-project/blob/Jon/Final_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [107]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [108]:
import pandas as pd

# Load the dataset into a Pandas DataFrame
df = pd.read_csv("house-train_1.csv")

# Display the first few rows of the DataFrame
print(df.head())


   Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0   1          60       RL         65.0     8450   Pave   NaN      Reg   
1   2          20       RL         80.0     9600   Pave   NaN      Reg   
2   3          60       RL         68.0    11250   Pave   NaN      IR1   
3   4          70       RL         60.0     9550   Pave   NaN      IR1   
4   5          60       RL         84.0    14260   Pave   NaN      IR1   

  LandContour Utilities  ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold  \
0         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
1         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      5   
2         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      9   
3         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
4         Lvl    AllPub  ...        0    NaN   NaN         NaN       0     12   

  YrSold  SaleType  SaleCondition  SalePrice  
0   2008        WD   

In [109]:
# Check for missing values in the dataset
missing_values = df.isnull().sum()
print("Missing values:\n", missing_values)

# Fill missing values with appropriate strategies
# For numerical columns, we'll fill missing values with the mean
numerical_columns = df.select_dtypes(include=[np.number]).columns
df[numerical_columns] = df[numerical_columns].fillna(df[numerical_columns].mean())

# For categorical columns, we'll fill missing values with the most frequent value
categorical_columns = df.select_dtypes(exclude=[np.number]).columns
df[categorical_columns] = df[categorical_columns].fillna(df[categorical_columns].mode().iloc[0])

# Verify if there are any missing values remaining
missing_values_after_fill = df.isnull().sum()
print("\nMissing values after filling:\n", missing_values_after_fill)


Missing values:
 Id                 0
MSSubClass         0
MSZoning           0
LotFrontage      259
LotArea            0
                ... 
MoSold             0
YrSold             0
SaleType           0
SaleCondition      0
SalePrice          0
Length: 81, dtype: int64

Missing values after filling:
 Id               0
MSSubClass       0
MSZoning         0
LotFrontage      0
LotArea          0
                ..
MoSold           0
YrSold           0
SaleType         0
SaleCondition    0
SalePrice        0
Length: 81, dtype: int64


In [110]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# One-hot encode categorical variables
encoded_df = pd.get_dummies(df, columns=categorical_columns, drop_first=True)

# Scale numerical features
scaler = StandardScaler()
scaled_numerical = scaler.fit_transform(encoded_df[numerical_columns])
scaled_numerical_df = pd.DataFrame(scaled_numerical, columns=numerical_columns)

# Combine encoded categorical variables and scaled numerical features
preprocessed_df = pd.concat([scaled_numerical_df, encoded_df.drop(columns=numerical_columns)], axis=1)

# Display the preprocessed DataFrame
print(preprocessed_df.head())


         Id  MSSubClass  LotFrontage   LotArea  OverallQual  OverallCond  \
0 -1.730865    0.073375    -0.229372 -0.207142     0.651479    -0.517200   
1 -1.728492   -0.872563     0.451936 -0.091886    -0.071836     2.179628   
2 -1.726120    0.073375    -0.093110  0.073480     0.651479    -0.517200   
3 -1.723747    0.309859    -0.456474 -0.096897     0.651479    -0.517200   
4 -1.721374    0.073375     0.633618  0.375148     1.374795    -0.517200   

   YearBuilt  YearRemodAdd  MasVnrArea  BsmtFinSF1  ...  SaleType_ConLI  \
0   1.050994      0.878668    0.511418    0.575425  ...           False   
1   0.156734     -0.429577   -0.574410    1.171992  ...           False   
2   0.984752      0.830215    0.323060    0.092907  ...           False   
3  -1.863632     -0.720298   -0.574410   -0.499274  ...           False   
4   0.951632      0.733308    1.364570    0.463568  ...           False   

   SaleType_ConLw  SaleType_New  SaleType_Oth  SaleType_WD  \
0           False         Fals

In [111]:
from sklearn.impute import SimpleImputer

# Convert non-numeric values to NaN
X_train_numeric = X_train.apply(pd.to_numeric, errors='coerce')
X_test_numeric = X_test.apply(pd.to_numeric, errors='coerce')

# Impute or drop NaN values
imputer = SimpleImputer(strategy='mean')
X_train_imputed = pd.DataFrame(imputer.fit_transform(X_train_numeric), columns=X_train_numeric.columns)
X_test_imputed = pd.DataFrame(imputer.transform(X_test_numeric), columns=X_test_numeric.columns)

# Convert the data to PyTorch tensors
X_train_tensor = torch.tensor(X_train_imputed.values, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test_imputed.values, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32)  # No need for .values here

# Check the shapes of the tensors
print("Shapes of tensors:")
print("X_train:", X_train_tensor.shape)
print("y_train:", y_train_tensor.shape)
print("X_test:", X_test_tensor.shape)
print("y_test:", y_test_tensor.shape)


Shapes of tensors:
X_train: torch.Size([1168, 245])
y_train: torch.Size([1168])
X_test: torch.Size([292, 245])
y_test: torch.Size([292])


In [112]:
import torch.nn as nn
import torch.optim as optim

# Define the neural network architecture
class SimpleNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(SimpleNN, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)
        return x

# Initialize the model
input_size = X_train_tensor.shape[1]
hidden_size = 64
output_size = 1
model = SimpleNN(input_size, hidden_size, output_size)

# Define the loss function and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)


In [113]:
# Define number of epochs
num_epochs = 100

# Train the model
for epoch in range(num_epochs):
    # Forward pass
    outputs = model(X_train_tensor)
    loss = criterion(outputs, y_train_tensor.view(-1, 1))

    # Backward pass and optimization
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Print progress
    if (epoch+1) % 10 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')


Epoch [10/100], Loss: 0.4596
Epoch [20/100], Loss: 0.2401
Epoch [30/100], Loss: 0.1759
Epoch [40/100], Loss: 0.1463
Epoch [50/100], Loss: 0.1258
Epoch [60/100], Loss: 0.1121
Epoch [70/100], Loss: 0.1003
Epoch [80/100], Loss: 0.0901
Epoch [90/100], Loss: 0.0809
Epoch [100/100], Loss: 0.0729


In [114]:
# Evaluate the model
with torch.no_grad():
    model.eval()
    y_pred = model(X_test_tensor)
    test_loss = criterion(y_pred, y_test_tensor.view(-1, 1))
    print(f'Test Loss: {test_loss.item():.4f}')


Test Loss: 0.1556


In [115]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Convert tensors to numpy arrays
y_pred = y_pred.numpy()
y_test = y_test_tensor.numpy()

# Compute evaluation metrics
mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)
r2 = r2_score(y_test, y_pred)

print(f'Mean Absolute Error (MAE): {mae:.4f}')
print(f'Root Mean Squared Error (RMSE): {rmse:.4f}')
print(f'R-squared (R2): {r2:.4f}')


Mean Absolute Error (MAE): 0.2239
Root Mean Squared Error (RMSE): 0.3945
R-squared (R2): 0.8721


In [116]:
# Save the trained model
torch.save(model.state_dict(), 'house_price_prediction_model.pth')


Loading the model

In [117]:
# Load the test dataset
test_df = pd.read_csv("house-test_1.csv")

# Preprocess the test dataset
# Convert non-numeric values to NaN
test_df_numeric = test_df.apply(pd.to_numeric, errors='coerce')

# Get the intersection of columns between X_train_imputed and test_df_numeric
common_columns = list(set(X_train_imputed.columns) & set(test_df_numeric.columns))

# Select only the common columns in the test dataset
test_df_common = test_df_numeric[common_columns]

# Impute missing values using the same imputer as before
imputer = SimpleImputer(strategy='mean')
imputer.fit(X_train_imputed[common_columns])  # Fit the imputer on the training dataset
test_df_imputed = pd.DataFrame(imputer.transform(test_df_common), columns=common_columns)

# Convert the preprocessed test dataset to PyTorch tensors
test_tensor = torch.tensor(test_df_imputed.values, dtype=torch.float32)

# Make predictions with the trained model
with torch.no_grad():
    model.eval()
    predictions = model(test_tensor)

# Convert predictions to numpy array
predicted_prices = predictions.numpy()

# Output the predicted prices
for i in range(len(test_df)):
    print(f"ID: {test_df['Id'][i]}, Predicted Price: {predicted_prices[i][0]}")


RuntimeError: mat1 and mat2 shapes cannot be multiplied (1459x37 and 245x64)