# Mini projekt 2
- Damian Baraniak
- Michał Kaniecki

In [695]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, WeightedRandomSampler
import numpy as np
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix


In [696]:
RANDOM_SEED = 202
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
torch.cuda.manual_seed(RANDOM_SEED)

torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

## Data processing

In [697]:
data_prices = pd.read_csv("train_data.csv")
data_prices.head(5)

Unnamed: 0,SalePrice,YearBuilt,Size(sqf),Floor,HallwayType,HeatingType,AptManageType,N_Parkinglot(Ground),N_Parkinglot(Basement),TimeToBusStop,TimeToSubway,N_manager,N_elevators,SubwayStation,N_FacilitiesInApt,N_FacilitiesNearBy(Total),N_SchoolNearBy(Total)
0,141592,2006,814,3,terraced,individual_heating,management_in_trust,111.0,184.0,5min~10min,10min~15min,3.0,0.0,Kyungbuk_uni_hospital,5,6.0,9.0
1,51327,1985,587,8,corridor,individual_heating,self_management,80.0,76.0,0~5min,5min~10min,2.0,2.0,Daegu,3,12.0,4.0
2,48672,1985,587,6,corridor,individual_heating,self_management,80.0,76.0,0~5min,5min~10min,2.0,2.0,Daegu,3,12.0,4.0
3,380530,2006,2056,8,terraced,individual_heating,management_in_trust,249.0,536.0,0~5min,0-5min,5.0,11.0,Sin-nam,5,3.0,7.0
4,78318,1992,644,2,mixed,individual_heating,self_management,142.0,79.0,5min~10min,15min~20min,4.0,8.0,Myung-duk,3,9.0,14.0


### Converting columns

In [698]:
def features_preprocess(data: pd.DataFrame | pd.Series) -> pd.DataFrame | pd.Series:
    # Map TimeToBusStop and TimeToSubway
    bus_stop_time_map = {
        "0~5min": 5,
        "5min~10min": 2,
        "10min~15min": 0
    }
    subway_time_map = {
        "0-5min": 5,
        "5min~10min": 3,
        "10min~15min": 2,
        "15min~20min": 1,
        "no_bus_stop_nearby": 0
    }

    data["TimeToBusStop"] = data["TimeToBusStop"].map(bus_stop_time_map)
    data["TimeToSubway"] = data["TimeToSubway"].map(subway_time_map)

    # One hot encoding for "HallwayType", "HeatingType", "AptManageType", "SubwayStation"
    dummy_columns = ["HallwayType", "HeatingType", "AptManageType", "SubwayStation"]
    drop_first = True
    return pd.get_dummies(data, columns=dummy_columns, drop_first=drop_first)


def label_preprocess(data):
    # Classifies SalePrice to appropriate class
    data['class'] = pd.cut(data['SalePrice'], bins=[-float('inf'), 10e4, 35e4, float('inf')], labels=[0, 1, 2])
    return data.drop(columns=['SalePrice'])


processed_data = features_preprocess(data_prices)
processed_data = label_preprocess(processed_data)
processed_data.head(5)

Unnamed: 0,YearBuilt,Size(sqf),Floor,N_Parkinglot(Ground),N_Parkinglot(Basement),TimeToBusStop,TimeToSubway,N_manager,N_elevators,N_FacilitiesInApt,...,HeatingType_individual_heating,AptManageType_self_management,SubwayStation_Banwoldang,SubwayStation_Chil-sung-market,SubwayStation_Daegu,SubwayStation_Kyungbuk_uni_hospital,SubwayStation_Myung-duk,SubwayStation_Sin-nam,SubwayStation_no_subway_nearby,class
0,2006,814,3,111.0,184.0,2,2,3.0,0.0,5,...,True,False,False,False,False,True,False,False,False,1
1,1985,587,8,80.0,76.0,5,3,2.0,2.0,3,...,True,True,False,False,True,False,False,False,False,0
2,1985,587,6,80.0,76.0,5,3,2.0,2.0,3,...,True,True,False,False,True,False,False,False,False,0
3,2006,2056,8,249.0,536.0,5,5,5.0,11.0,5,...,True,False,False,False,False,False,False,True,False,2
4,1992,644,2,142.0,79.0,2,1,4.0,8.0,3,...,True,True,False,False,False,False,True,False,False,0


In [699]:
print(processed_data.shape)
train_data = processed_data.sample(frac=0.8, random_state=RANDOM_SEED)
processed_data = processed_data.drop(train_data.index)
val_data = processed_data.sample(frac=0.6, random_state=RANDOM_SEED)
test_data = processed_data.drop(val_data.index)

print("Dataset sizes")

print(f"Train: {train_data.shape}")
print(f"Val:   {val_data.shape} samples")
print(f"Test:  {test_data.shape} samples")

(4124, 24)
Dataset sizes
Train: (3299, 24)
Val:   (495, 24) samples
Test:  (330, 24) samples


In [700]:
train_X, train_y = train_data.drop(columns='class'), train_data['class']
val_X, val_y = val_data.drop(columns='class'), val_data['class']
test_X, test_y = test_data.drop(columns='class'), test_data['class']


In [701]:
X_scaler = MinMaxScaler()

train_X = X_scaler.fit_transform(train_X)
val_X = X_scaler.transform(val_X)
test_X = X_scaler.transform(test_X)

train_y = train_y.to_numpy().astype(np.int32)
val_y = val_y.to_numpy().astype(np.int32)
test_y = test_y.to_numpy().astype(np.int32)

In [702]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
f"Using device: {device}"

'Using device: cuda'

In [703]:
X_train_tensor = torch.tensor(train_X, dtype=torch.float32).to(device)
y_train_tensor = torch.tensor(train_y, dtype=torch.long).to(device)

X_val_tensor = torch.tensor(val_X, dtype=torch.float32).to(device)
y_val_tensor = torch.tensor(val_y, dtype=torch.long).to(device)

X_test_tensor = torch.tensor(test_X, dtype=torch.float32).to(device)
y_test_tensor = torch.tensor(test_y, dtype=torch.long).to(device)

In [704]:
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

In [705]:
class_weights = [0.91, 0.77, 1.3]
sample_weights = np.array([class_weights[label] for label in train_y])
sample_weights = torch.FloatTensor(sample_weights)

sampler = WeightedRandomSampler(weights=sample_weights, num_samples=len(sample_weights), replacement=True)

In [706]:
BATCH_SIZE = 256
train_data_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, sampler=sampler)
val_data_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_data_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [707]:
class SalePricesModel(nn.Module):
    def __init__(self, input_size: int):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(input_size, 1024),
            nn.BatchNorm1d(1024),
            nn.LeakyReLU(),
            nn.Dropout(0.2),

            nn.Linear(1024, 512),
            nn.BatchNorm1d(512),
            nn.LeakyReLU(),
            nn.Dropout(0.2),

            nn.Linear(512, 128),
            nn.BatchNorm1d(128),
            nn.LeakyReLU(),

            nn.Linear(128, 3)
        )

    def forward(self, x):
        return self.model(x)


input_size = train_X.shape[1]
price_model = SalePricesModel(input_size).to(device)

In [708]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(price_model.parameters(), lr=0.001, weight_decay=1e-5)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=100, gamma=0.8)

In [709]:
def predict_from_logits(logits, y_hat):
    pred_probabilities = nn.Softmax(dim=1)(logits)
    predictions = pred_probabilities.argmax(1)

    correct = (predictions == y_hat).sum().item()
    return predictions, correct

In [710]:
def train(model: nn.Module, epochs: int) -> np.ndarray:
    train_losses = []
    val_losses = []
    train_acc = []
    val_acc = []

    for epoch in range(epochs):
        total_train_loss = 0
        total_val_loss = 0

        train_correct = 0
        train_total = 0

        val_correct = 0
        val_total = 0

        model.train()
        for data_inputs, data_labels in train_data_loader:
            outputs = model(data_inputs)
            loss = criterion(outputs, data_labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_train_loss += loss.item()

            _, correct = predict_from_logits(outputs, data_labels)

            train_correct += correct
            train_total += data_labels.size(0)

        train_accuracy = 100 * train_correct / train_total
        avg_train_loss = total_train_loss / len(train_data_loader)
        train_losses.append(avg_train_loss)
        train_acc.append(train_accuracy)

        model.eval()
        with torch.no_grad():
            for data_inputs, data_labels in val_data_loader:
                data_inputs, data_labels = data_inputs.to(device), data_labels.to(device)

                outputs = model(data_inputs)

                loss = criterion(outputs.squeeze(dim=1), data_labels)
                total_val_loss += loss.item()

                _, correct = predict_from_logits(outputs, data_labels)

                val_correct += correct
                val_total += data_labels.size(0)

        val_accuracy = 100 * val_correct / val_total
        avg_val_loss = total_val_loss / len(val_data_loader)
        val_losses.append(avg_val_loss)
        val_acc.append(val_accuracy)

        scheduler.step()
        if (epoch + 1) % 10 == 0:
            print(f"Epoch {epoch + 1}| Train Loss = {avg_train_loss:.4f} Train acc = {train_accuracy:.4f}"
                  f"| Val Loss = {avg_val_loss:.4f}| Val acc = {val_accuracy:.4f}")

    result = [train_losses, train_acc, val_losses, val_acc]
    return np.array(result)

In [711]:
out = train(price_model, epochs=250)

Epoch 10| Train Loss = 0.3252 Train acc = 84.5408| Val Loss = 0.3672| Val acc = 81.6162
Epoch 20| Train Loss = 0.2914 Train acc = 85.8745| Val Loss = 0.3287| Val acc = 83.4343
Epoch 30| Train Loss = 0.3144 Train acc = 84.9348| Val Loss = 0.3272| Val acc = 82.6263
Epoch 40| Train Loss = 0.2930 Train acc = 85.6017| Val Loss = 0.3294| Val acc = 82.8283
Epoch 50| Train Loss = 0.2890 Train acc = 85.5411| Val Loss = 0.3227| Val acc = 81.8182
Epoch 60| Train Loss = 0.2821 Train acc = 85.6320| Val Loss = 0.3302| Val acc = 85.0505
Epoch 70| Train Loss = 0.2627 Train acc = 86.8445| Val Loss = 0.3253| Val acc = 82.6263
Epoch 80| Train Loss = 0.2694 Train acc = 87.2689| Val Loss = 0.3269| Val acc = 83.8384
Epoch 90| Train Loss = 0.2921 Train acc = 85.6623| Val Loss = 0.3172| Val acc = 84.8485
Epoch 100| Train Loss = 0.2673 Train acc = 86.5717| Val Loss = 0.3243| Val acc = 84.6465
Epoch 110| Train Loss = 0.2711 Train acc = 86.0867| Val Loss = 0.3182| Val acc = 85.0505
Epoch 120| Train Loss = 0.2829

In [712]:
price_model.eval()
true_preds, num_preds = 0., 0.

test_y_hat = []
test_y = []
with torch.no_grad():
    for data_inputs, data_labels in test_data_loader:
        data_inputs, data_labels = data_inputs.to(device), data_labels.to(device)

        logits = price_model(data_inputs)
        predictions, correct = predict_from_logits(logits, data_labels)

        test_y_hat.append(predictions.to("cpu").numpy())
        test_y.append(data_labels.to("cpu").numpy())

        true_preds += correct
        num_preds += data_labels.shape[0]

test_accuracy = (true_preds / num_preds) * 100.0
print(f"Accuracy of the model: {test_accuracy:.2f}%")

Accuracy of the model: 87.81%


In [713]:
def accuracies_for_classes(cm: np.array):
    accs = []
    for class_idx in range(cm.shape[0]):
        acc = (cm[class_idx, class_idx] / np.sum(cm[class_idx, :])) * 100
        accs.append(acc)

    return accs

In [714]:
test_y = np.concatenate(test_y)
test_y_hat = np.concatenate(test_y_hat)
cm = confusion_matrix(test_y, test_y_hat)

accuracies = accuracies_for_classes(cm)
labels = ["Cheap", "Average", "Expensive"]
for i, acc in enumerate(accuracies):
    print(f"Accuracy for class {labels[i]:<10} ({i}): {acc:.2f} %")
print()
print(f"Test accuracy: {test_accuracy:.2f} %")

Accuracy for class Cheap      (0): 77.06 %
Accuracy for class Average    (1): 90.94 %
Accuracy for class Expensive  (2): 81.94 %

Test accuracy: 87.81 %


## Predicting

In [715]:
data_test = pd.read_csv("test_data.csv")
print(data_test.shape)

X_test = features_preprocess(data_test)

price_model.eval()
X_test = X_scaler.transform(X_test)  # Use transform, not fit_transform (avoid data leakage)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32).to(device)

with torch.no_grad():
    logits = price_model(X_test_tensor)
    probabilities = torch.softmax(logits, dim=1)  # Convert logits to probabilities
    predictions = probabilities.argmax(dim=1).cpu().numpy()  # Get predicted class indices

# Create and save DataFrame
output_df = pd.DataFrame(predictions, columns=["Prediction"])
output_df.to_csv("pred.csv", index=False, header=False)

(1767, 16)
