In [1]:
# Layer widths handed to MyMLP(input_dim, hidden_dim, output_dim) further down.
# output_dim is also the number of classes scored by the CrossEntropyLoss.
input_dim, hidden_dim, output_dim = 152, 128, 34

# Mini-batch size for the DataLoader and number of passes over the data.
batch_size, num_epochs = 32, 10

# Accuracy (in percent) that has to be reached before the training cell
# writes a checkpoint; the training cell raises it as better results appear.
best_accuracy = 50

# Not referenced by any of the cells visible in this notebook; kept in case
# other code still relies on them.
window_size, num_layers = 20, 2

In [2]:
import torch
import torch.nn as nn
import numpy as np
import os
import pandas as pd
import dask.dataframe as dd
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import torch.nn.functional as F
from IPython.display import clear_output
import time

In [3]:
class MyDataset(Dataset):
    """Map-style dataset that serves the rows of a large CSV file chunk by chunk.

    Every column except 'Discard' is returned as the feature vector and the
    'Discard' column is returned as the label. Only one chunk of
    ``chunk_size`` rows is held in memory at a time; the most recently read
    chunk is cached so that consecutive indices (e.g. a DataLoader with
    ``shuffle=False``) parse each chunk once instead of once per sample.

    Args:
        file_path: Path of a UTF-8 encoded CSV file with a header row.
        chunk_size: Number of data rows parsed from disk in one read.
    """

    def __init__(self, file_path, chunk_size):
        self.file_path = file_path
        self.chunk_size = chunk_size

        # Read only the header; strip padding and non-breaking spaces from the names.
        self.columns = pd.read_csv(self.file_path, nrows=0, encoding='utf-8').columns.str.strip().str.replace('\xa0', ' ').tolist()
        # Every column except 'Discard' is a model input.
        self.train_columns = [col for col in self.columns if col != 'Discard']

        # Count the data rows (all lines minus the header). The context manager
        # makes sure the file handle is closed again.
        with open(self.file_path, encoding='utf-8') as csv_file:
            self.num_samples = sum(1 for _ in csv_file) - 1

        # Cache of the most recently parsed chunk.
        self._cached_chunk_start = None
        self._cached_features = None
        self._cached_labels = None

    def _load_chunk(self, chunk_start):
        """Parse the chunk whose first data row is ``chunk_start`` and cache it."""
        # Skip the header plus all data rows that precede this chunk.
        df = pd.read_csv(self.file_path, skiprows=chunk_start + 1, nrows=self.chunk_size, header=None, encoding='utf-8')
        df.columns = self.columns

        if 'Discard' not in df.columns:
            raise KeyError(f"Chunk starting at row {chunk_start + 1} does not contain 'Discard' column.")

        self._cached_features = df[self.train_columns]
        self._cached_labels = df['Discard']
        self._cached_chunk_start = chunk_start

    def __getitem__(self, idx):
        """Return ``(features, label)`` for row ``idx`` as two float32 tensors.

        Raises:
            IndexError: If ``idx`` lies outside ``[-len(self), len(self))``.
            KeyError: If the file has no 'Discard' column.
        """
        if idx < 0:
            idx += self.num_samples
        if not 0 <= idx < self.num_samples:
            raise IndexError(f"index {idx} is out of range for a dataset of {self.num_samples} samples")

        chunk_start = idx // self.chunk_size * self.chunk_size
        if chunk_start != self._cached_chunk_start:
            self._load_chunk(chunk_start)

        sample_idx = idx % self.chunk_size
        train_data = self._cached_features.iloc[sample_idx].values
        value_data = self._cached_labels.iloc[sample_idx]

        # The label stays float32 for backward compatibility; callers that need
        # class indices convert it themselves.
        return torch.tensor(train_data, dtype=torch.float32), torch.tensor(value_data, dtype=torch.float32)

    def __len__(self):
        """Return the number of data rows counted at construction time."""
        return self.num_samples

In [4]:
"""
讀取csv
"""
# Source CSV; MyDataset parses it lazily, 50000 rows per read, instead of
# loading the whole file at once.
file_path = 'E:/專題/data/2021/DiscardData.csv'
dataset = MyDataset(file_path=file_path, chunk_size=50000)

# Samples are served in file order (no shuffling).
train_dataloader = DataLoader(
    dataset=dataset,
    batch_size=batch_size,
    shuffle=False,
)

In [5]:
class MyMLP(nn.Module):
    """Fully connected network: four ReLU-activated layers followed by a linear output layer.

    Args:
        input_dim: Size of each input vector.
        hidden_dim: Width of every hidden layer.
        output_dim: Size of the output vector (raw scores, no activation).
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        # Attribute names and creation order are kept as fc1..fc5 so that the
        # state_dict keys and the weight initialisation sequence do not change.
        self.fc1 = nn.Linear(in_features=input_dim, out_features=hidden_dim)
        self.fc2 = nn.Linear(in_features=hidden_dim, out_features=hidden_dim)
        self.fc3 = nn.Linear(in_features=hidden_dim, out_features=hidden_dim)
        self.fc4 = nn.Linear(in_features=hidden_dim, out_features=hidden_dim)
        self.fc5 = nn.Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self, x):
        """Map a batch of inputs to unnormalised scores of size ``output_dim``."""
        hidden = x
        for layer in (self.fc1, self.fc2, self.fc3, self.fc4):
            hidden = torch.relu(layer(hidden))
        # The last layer is left linear; no activation is applied to its output.
        return self.fc5(hidden)

In [6]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

Using device: cuda


In [7]:
# Build the network and move its parameters to the selected device before
# the optimizer is created over them.
model = MyMLP(input_dim=input_dim, hidden_dim=hidden_dim, output_dim=output_dim)
model = model.to(device)

# Cross-entropy over the raw network outputs; AdamW with a small weight decay.
loss_criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(params=model.parameters(), lr=0.001, weight_decay=0.001)

In [8]:
# Train the model for num_epochs passes over train_dataloader and write a
# checkpoint whenever the accuracy accumulated over a whole epoch reaches or
# exceeds best_accuracy. Deciding on a single mini-batch would let one lucky
# batch trigger a save.
log_interval = 100  # refresh the live read-out every N batches, not every batch
checkpoint_dir = 'E:/專題/MLP_discard_model'
current_loss = 0

model.train()  # the loop has no evaluation phase, so the mode is set once
for epoch in range(num_epochs):
    epoch_correct = 0
    epoch_total = 0

    for step, (train_data, value_data) in enumerate(train_dataloader, start=1):
        train_data = train_data.to(device)
        # The dataset yields float labels; CrossEntropyLoss needs int64 class indices.
        value_data = value_data.to(device).long()

        outputs = model(train_data)
        loss = loss_criterion(outputs, value_data)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Keep a plain float so the autograd graph of the batch can be freed.
        current_loss = loss.item()

        # softmax is monotonic, so the argmax of the raw outputs is the same class.
        predicted_labels = outputs.detach().argmax(dim=1)
        correct_predictions = (predicted_labels == value_data).sum().item()
        accuracy = (correct_predictions / value_data.size(0))*100

        epoch_correct += correct_predictions
        epoch_total += value_data.size(0)

        if step == 1 or step % log_interval == 0:
            # wait=True replaces the old output only when the new one arrives.
            clear_output(wait=True)
            print(predicted_labels)
            print(value_data)
            print(f"Correct Predictions: {correct_predictions}")
            print(f"Total Predictions: {value_data.size(0)}")
            print(f"Accuracy: {accuracy:.4f}","%")
            print(f"Epoch accuracy so far: {epoch_correct / epoch_total * 100:.4f} %")
            print(f"At epoch: {epoch}, loss: {current_loss}")
            # Gradients of the step just taken are still stored on the parameters.
            for name, param in model.named_parameters():
                if param.grad is not None:
                    print(f'{name} gradient norm: {param.grad.norm()}')

    if epoch_total == 0:
        # Empty dataloader: nothing was measured, so nothing can be saved.
        continue

    epoch_accuracy = epoch_correct / epoch_total * 100
    if epoch_accuracy >= best_accuracy:
        best_accuracy = epoch_accuracy
        os.makedirs(checkpoint_dir, exist_ok=True)
        # NOTE: the whole module is pickled, as before, so existing code that
        # loads these files keeps working; saving model.state_dict() is the
        # more portable format if the loaders can be changed as well.
        torch.save(model, f'{checkpoint_dir}/MLP_discard_model_{best_accuracy}.pth')
        print(f'模型在 epoch {epoch} 之後被儲存，正確率: {best_accuracy}')

tensor([28, 26, 17, 32, 26, 26, 17, 32, 26, 26, 17, 32, 26, 17, 10, 32, 26, 17,
        10, 33, 17, 10, 33, 26, 10, 10, 26, 17, 10, 10, 26, 17],
       device='cuda:0')
tensor([18, 13,  2, 17, 10, 17, 12, 17, 12,  9, 16, 26, 10,  8,  5, 18, 12,  3,
         8,  3,  4, 30, 27, 10,  4, 30, 30,  3,  8,  3, 30, 13],
       device='cuda:0')
Correct Predictions: 0
Total Predictions: 32
Accuracy: 0.0000 %
At epoch: 0, loss: 3.5429558753967285
fc1.weight gradient norm: 0.5454714894294739
fc1.bias gradient norm: 0.0007996627246029675
fc2.weight gradient norm: 0.22389709949493408
fc2.bias gradient norm: 0.0034081502817571163
fc3.weight gradient norm: 0.13116875290870667
fc3.bias gradient norm: 0.012965921312570572
fc4.weight gradient norm: 0.09728036820888519
fc4.bias gradient norm: 0.05934687331318855
fc5.weight gradient norm: 0.11459355056285858
fc5.bias gradient norm: 0.23598270118236542


KeyboardInterrupt: 