In [238]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd

# Step 1: Preprocess the Data
data = pd.read_csv("data_before_forest.csv")  # Replace with your actual file path

# Manually define the categories
owners_categories = [
    "0 .. 20,000", "20,000 .. 200,000", "200,000 .. 2,000,000", 
    "2,000,000 .. 20,000,000", "20,000,000 .. 200,000,000"
]

# Assuming that owners is the column with string ranges.
# Label encoding the target owners into integer categories
label_encoder = LabelEncoder()
label_encoder.fit(owners_categories)

# Create a new column owners_category for encoded labels
data['owners_category'] = label_encoder.transform(data['owners'])

# Now split your data into features (X) and target (y)
X = data.drop(['owners', 'owners_category'], axis=1)  # Drop non-feature columns
y = data['owners_category']  # Target labels (integer encoding)

# Standardize features using StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Convert to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.long)  # Use 'long' for labels
y_test_tensor = torch.tensor(y_test.values, dtype=torch.long)

# Step 2: Define the Model (for multi-class classification)
class ClassificationNN(nn.Module):
    def __init__(self, input_size):
        super().__init__()
        self.fc1 = nn.Linear(input_size, 256)
        self.dropout1 = nn.Dropout(0.2)
        self.fc2 = nn.Linear(256, 128)
        self.dropout2 = nn.Dropout(0.2)
        self.fc3 = nn.Linear(128, 64)
        self.fc4 = nn.Linear(64, 5)  # 5 categories for classification (output layer)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = self.dropout1(x)
        x = torch.relu(self.fc2(x))
        x = self.dropout2(x)
        x = torch.relu(self.fc3(x))
        x = self.fc4(x)  # Output layer: no activation, CrossEntropyLoss takes care of softmax
        return x

# Step 3: Instantiate the Model, Loss Function, and Optimizer
model = ClassificationNN(input_size=X_train_tensor.shape[1])
loss_fn = nn.CrossEntropyLoss()  # For multi-class classification
optimizer = optim.Adam(model.parameters(), lr=0.001)

class_counts = data['owners'].value_counts()
class_weights = 1.0 / class_counts
weights = torch.tensor(class_weights.values, dtype=torch.float32)

# Use the class weights in CrossEntropyLoss
loss_fn = nn.CrossEntropyLoss(weight=weights)

# Step 4: Train the Model
num_epochs = 300

for epoch in range(num_epochs):
    model.train()
    optimizer.zero_grad()

    # Forward pass
    outputs = model(X_train_tensor)
    loss = loss_fn(outputs, y_train_tensor)

    # Backward pass and optimization
    loss.backward()
    optimizer.step()

    if epoch % 50 == 0:
        print(f"Epoch {epoch}, Train Loss: {loss.item()}")

# Step 5: Evaluate the Model
model.eval()
with torch.no_grad():
    # Predict on the test set
    test_outputs = model(X_test_tensor)
    
    # Get the predicted class (the index of the max value)
    _, predicted = torch.max(test_outputs, 1)

    # Calculate the accuracy
    accuracy = accuracy_score(y_test_tensor, predicted)
    print(f"Test Accuracy: {accuracy * 100:.2f}%")

# Step 6: Optionally, get the model's predictions in the original range labels
predicted_categories = label_encoder.inverse_transform(predicted.numpy())
print(predicted_categories)

Epoch 0, Train Loss: 1.6309789419174194
Epoch 50, Train Loss: 1.1819427013397217
Epoch 100, Train Loss: 1.128562092781067
Epoch 150, Train Loss: 1.085033655166626
Epoch 200, Train Loss: 1.044491171836853
Epoch 250, Train Loss: 1.0011794567108154
Test Accuracy: 45.75%
['200,000 .. 2,000,000' '0 .. 20,000' '200,000 .. 2,000,000' ...
 '200,000 .. 2,000,000' '200,000 .. 2,000,000' '200,000 .. 2,000,000']


In [240]:
import pickle

# Save the model as a pickle file
with open("model.pkl", "wb") as f:
    pickle.dump(model, f)
print("Model saved as model.pkl!")

Model saved as model.pkl!
