In [5]:
import pandas as pd
import torch
import torch.nn as nn
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.impute import SimpleImputer
import numpy as np
from torch.utils.data import DataLoader, TensorDataset
import torch.cuda.amp as amp
from sklearn.metrics import f1_score

class TorchOneClassSVM(nn.Module):
    """Feed-forward scorer that emits one raw logit per input row.

    Despite the class name, the network built here is a plain MLP
    (Linear/ReLU/Dropout stack ending in a single output unit); no kernel
    or one-class objective is implemented in this class.

    Args:
        input_dim: Number of features in each input row.
    """

    def __init__(self, input_dim):
        super().__init__()
        # The attribute name ``rbf_layer`` is kept so external code and
        # saved state dicts that refer to it keep working.
        self.rbf_layer = nn.Sequential(
            nn.Linear(input_dim, 512),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Linear(128, 1),
        )

    def forward(self, x, chunk_size=128):
        """Score ``x`` in slices of at most ``chunk_size`` rows.

        Args:
            x: Tensor whose first dimension indexes rows and whose last
                dimension holds ``input_dim`` features.
            chunk_size: Maximum number of rows pushed through the network
                per pass. Must be at least 1.

        Returns:
            The per-row logits concatenated in input order; for a 2-D input
            of ``n`` rows the result has shape ``(n, 1)``, including
            ``(0, 1)`` for an empty batch.

        Raises:
            ValueError: If ``chunk_size`` is smaller than 1.
        """
        if chunk_size < 1:
            raise ValueError(f"chunk_size must be >= 1, got {chunk_size}")

        n_rows = x.size(0)
        # torch.cat rejects an empty list, so an empty batch is still sent
        # through the network once to produce a correctly shaped result.
        starts = range(0, n_rows, chunk_size) if n_rows else (0,)

        # Mixed precision only applies to CUDA tensors; for anything else
        # the context is created disabled and is a no-op.
        with torch.autocast(device_type="cuda", enabled=x.is_cuda):
            outputs = [
                self.rbf_layer(x[start:start + chunk_size]) for start in starts
            ]

        return torch.cat(outputs, dim=0)

# Set up GPU and memory optimization
torch.backends.cudnn.benchmark = True
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
grad_scaler = amp.GradScaler()  # Renamed from scaler to grad_scaler

print(f"Using device: {device}")
torch.cuda.empty_cache()

# Load and preprocess data
df = pd.read_csv('water_main_data.csv')
# Strip the literal " ft" unit suffix from 'Length' before parsing as float.
df['Length'] = (
    df['Length'].astype(str).str.replace(' ft', '', regex=False).astype(float)
)
df = df[df['Material'] != 'UNKNOWN']

# One-hot encode the Material column
material_encoder = OneHotEncoder(sparse_output=False)
material_encoded = material_encoder.fit_transform(df['Material'].values.reshape(-1, 1))
material_columns = [f'Material_{i}' for i in range(material_encoded.shape[1])]
# Reuse df's index here. Filtering out 'UNKNOWN' rows leaves gaps in df's
# index, and pd.concat(axis=1) aligns on index labels, so a fresh 0..n-1 index
# would attach encodings to the wrong rows and introduce all-NaN rows.
df_encoded = pd.DataFrame(
    material_encoded,
    columns=material_columns,
    index=df.index,
)

df = pd.concat([df, df_encoded], axis=1)
df['Installed'] = pd.to_datetime(df['Installed'])
df['Year_Installed'] = df['Installed'].dt.year
df = df.drop(columns=['Material', 'Installed'])

# Prepare features and target
X = df.drop(columns=['ID', 'Leak Occurred'])
y = df['Leak Occurred']

# Fill missing feature values with the column mean
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X)

# Rescale every feature with MinMaxScaler
min_max_scaler = MinMaxScaler()
X_scaled = min_max_scaler.fit_transform(X_imputed)

# Convert to float32 PyTorch tensors on the selected device
X_tensor = torch.tensor(X_scaled, dtype=torch.float32, device=device)
y_tensor = torch.tensor(y.values, dtype=torch.float32, device=device)

# Mini-batch loader over the full (features, target) tensor pair
batch_size = 256
dataset = TensorDataset(X_tensor, y_tensor)
# Batches are reshuffled every epoch; loading stays in the main process
# (num_workers=0) and without pinned memory.
train_loader = DataLoader(
    dataset,
    shuffle=True,
    batch_size=batch_size,
    num_workers=0,
    pin_memory=False,
)

# Build the network on the selected device and attach an Adam optimizer
model = TorchOneClassSVM(input_dim=X_tensor.size(1))
model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# The model outputs raw logits, so use the logit-based BCE loss; the positive
# class is weighted 5x.
positive_weight = torch.tensor([5.0]).to(device)
criterion = nn.BCEWithLogitsLoss(pos_weight=positive_weight)
del positive_weight

# Training loop (mixed precision when the batches live on a CUDA device)
num_epochs = 50
best_f1 = 0
best_threshold = 0.5

print("Starting training...")
for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for batch_x, batch_y in train_loader:
        optimizer.zero_grad()

        # torch.autocast replaces the deprecated torch.cuda.amp.autocast; it
        # is created disabled for non-CUDA batches.
        with torch.autocast(device_type='cuda', enabled=batch_x.is_cuda):
            logits = model(batch_x)
            # Drop only the trailing unit dimension. A bare squeeze() would
            # also collapse a final batch of size 1 to a 0-d tensor, which no
            # longer matches batch_y's shape and makes the loss raise.
            loss = criterion(logits.squeeze(-1), batch_y)

        grad_scaler.scale(loss).backward()
        grad_scaler.step(optimizer)
        grad_scaler.update()

        total_loss += loss.item()
        # No per-batch torch.cuda.empty_cache(): it does not give PyTorch more
        # usable memory and only slows the loop down.

    # Every 5 epochs, sweep decision thresholds and keep the best F1 so far.
    if (epoch + 1) % 5 == 0:
        model.eval()
        with torch.no_grad():
            # NOTE(review): this scores the same X_tensor/y the model is
            # trained on, so the F1 and the chosen threshold are in-sample.
            val_logits = model(X_tensor)
            val_predictions = torch.sigmoid(val_logits)

        # Move the probabilities to NumPy once instead of once per threshold.
        val_probs = val_predictions.squeeze(-1).cpu().numpy()

        for threshold in np.arange(0.3, 0.7, 0.05):
            pred_binary = (val_probs > threshold).astype(int)
            f1 = f1_score(y, pred_binary)

            if f1 > best_f1:
                best_f1 = f1
                best_threshold = threshold

        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss/len(train_loader):.4f}, Best F1: {best_f1:.4f}')

# Final evaluation
from sklearn.metrics import classification_report, confusion_matrix

print("\nFinal Evaluation...")
model.eval()
with torch.no_grad():
    # NOTE(review): X_tensor is the same data the model was trained on, so the
    # metrics below are in-sample.
    final_logits = model(X_tensor)
    final_predictions = torch.sigmoid(final_logits)
    # Drop only the trailing unit dimension so the result stays 1-D even for a
    # single-row dataset (a bare squeeze() would give a 0-d array there), then
    # binarize with the best threshold found during training.
    final_predictions = (
        final_predictions.squeeze(-1).cpu().numpy() > best_threshold
    ).astype(int)

# Report metrics against the true labels
print("\nModel Performance:")
print("\nClassification Report:")
print(classification_report(y, final_predictions))
print("\nConfusion Matrix:")
print(confusion_matrix(y, final_predictions))

# Save the preprocessed frame together with the predicted label per row
df['Predicted_Leak'] = final_predictions
df.to_csv('predicted_leaks_gpu.csv', index=False)

if torch.cuda.is_available():
    torch.cuda.empty_cache()


  grad_scaler = amp.GradScaler()  # Renamed from scaler to grad_scaler
  with torch.cuda.amp.autocast():
  with torch.cuda.amp.autocast():


Using device: cuda
Starting training...
Epoch [5/50], Loss: 0.0967, Best F1: 0.2355


  with torch.cuda.amp.autocast():
  with torch.cuda.amp.autocast():


Epoch [10/50], Loss: 0.0880, Best F1: 0.2366


  with torch.cuda.amp.autocast():
  with torch.cuda.amp.autocast():


Epoch [15/50], Loss: 0.0874, Best F1: 0.2405


  with torch.cuda.amp.autocast():
  with torch.cuda.amp.autocast():


Epoch [20/50], Loss: 0.0874, Best F1: 0.2417


  with torch.cuda.amp.autocast():
  with torch.cuda.amp.autocast():


Epoch [25/50], Loss: 0.0877, Best F1: 0.2452


  with torch.cuda.amp.autocast():
  with torch.cuda.amp.autocast():


Epoch [30/50], Loss: 0.0864, Best F1: 0.2452


  with torch.cuda.amp.autocast():
  with torch.cuda.amp.autocast():


Epoch [35/50], Loss: 0.0856, Best F1: 0.2470


  with torch.cuda.amp.autocast():
  with torch.cuda.amp.autocast():


Epoch [40/50], Loss: 0.0852, Best F1: 0.2539


  with torch.cuda.amp.autocast():
  with torch.cuda.amp.autocast():


Epoch [45/50], Loss: 0.0856, Best F1: 0.2539


  with torch.cuda.amp.autocast():
  with torch.cuda.amp.autocast():


Epoch [50/50], Loss: 0.0853, Best F1: 0.2539

Final Evaluation...


  with torch.cuda.amp.autocast():



Model Performance:

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.99      0.99    119117
           1       0.22      0.30      0.25       742

    accuracy                           0.99    119859
   macro avg       0.61      0.65      0.62    119859
weighted avg       0.99      0.99      0.99    119859


Confusion Matrix:
[[118331    786]
 [   520    222]]
