In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# NOTE(review): the data-loading and checkpoint cells of this notebook use
# /kaggle/... paths, so this mount looks Colab-specific — confirm it is still needed.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import numpy as np
import csv
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Model, GPT2Tokenizer, AdamW

In [None]:
def _read_csv_rows(path):
  """Read the CSV at ``path`` and return its data rows as a NumPy string array.

  The first line is treated as a header and skipped. An empty file yields an
  empty array instead of raising ``StopIteration``.

  Args:
    path: location of the CSV file.

  Returns:
    ``np.ndarray`` of ``str`` built from the remaining rows (one entry per CSV
    row; two-dimensional when every row has the same number of fields).
  """
  with open(path, newline='') as csvfile:
    reader = csv.reader(csvfile)
    next(reader, None)  # drop the header line; the default avoids StopIteration on an empty file
    return np.array(list(reader)).astype(str)


# Both yearly exports go through the same reader, so the two stay in sync.
data_2021 = _read_csv_rows('/kaggle/input/project/2021_residential_description.csv')
data_2022 = _read_csv_rows('/kaggle/input/project/2022_residential_description.csv')

In [None]:
# Build one text prompt per row: the first two CSV fields followed by an
# open-ended "The sold price is " stub.
concat_col_2021, concat_col_2022 = (
    np.array([f"{row[0]} {row[1]} The sold price is " for row in yearly_rows])
    for yearly_rows in (data_2021, data_2022)
)
data_all = np.concatenate((concat_col_2021, concat_col_2022))

# Labels: third CSV field divided by 100000 and passed through np.round,
# in the same 2021-then-2022 order as data_all.
target_all = np.concatenate([
    np.round(yearly_rows[:, 2].astype(float) / 100000)
    for yearly_rows in (data_2021, data_2022)
])

In [None]:
# Load the pretrained "gpt2" tokenizer and register '[PAD]' as its padding
# token so items can later be padded to a fixed length.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.add_special_tokens(
    {
        'pad_token': '[PAD]',
    }
)

class ResiDataset(Dataset):
  """Map-style dataset that pairs text prompts with integer class labels.

  Items are tokenized lazily on access and padded/truncated to a fixed length.

  Args:
    sentences: indexable collection of prompt strings.
    target: indexable collection of numeric labels, parallel to ``sentences``.
    tokenizer: callable Hugging Face-style tokenizer with a pad token set.
    max_length: token length every item is padded/truncated to (default 256).
  """

  def __init__(self, sentences, target, tokenizer, max_length=256):
    self.sentences = sentences
    self.target = target
    self.tokenizer = tokenizer
    self.max_length = max_length

  def __len__(self):
    """Number of prompts in the dataset."""
    return len(self.sentences)

  def __getitem__(self, idx):
    """Tokenize item ``idx``.

    Returns:
      dict with ``'input_ids'`` and ``'attention_mask'`` (1-D tensors of
      length ``max_length``) and ``'target'`` (scalar ``torch.long`` tensor).
    """
    # padding='max_length' already pads to max_length; the deprecated
    # pad_to_max_length flag is therefore not passed.
    encoding = self.tokenizer(
        str(self.sentences[idx]),
        add_special_tokens=True,
        max_length=self.max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    # Drop only the leading batch axis; a bare squeeze() would also collapse
    # the sequence axis when max_length == 1.
    input_ids = encoding['input_ids'].squeeze(0)
    attention_mask = encoding['attention_mask'].squeeze(0)
    target = torch.tensor(self.target[idx], dtype=torch.long)
    return {'input_ids': input_ids, 'attention_mask': attention_mask, 'target': target}

In [None]:
# Sequential 70 % / 20 % / 10 % split into train / validation / test.
# NOTE(review): data_all is sliced without shuffling, so the splits follow the
# stored row order (2021 rows first, then 2022) — confirm this is intended.
dataset_len = len(data_all)
train_len = int(0.7 * dataset_len)
val_len = int(0.9 * dataset_len)  # end index of the validation slice, not its length
train_data = ResiDataset(data_all[:train_len], target_all[:train_len], tokenizer)
val_data = ResiDataset(data_all[train_len:val_len], target_all[train_len:val_len], tokenizer)
test_data = ResiDataset(data_all[val_len:], target_all[val_len:], tokenizer)

# One class per integer label in [0, max label].
num_classes = int(target_all.max()) + 1

train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
# Evaluation loaders keep a fixed order: shuffling adds nothing to evaluation
# and makes the batch-averaged loss vary from run to run.
val_loader = DataLoader(val_data, batch_size=32, shuffle=False)
test_loader = DataLoader(test_data, batch_size=32, shuffle=False)

In [None]:
gpt2 = GPT2Model.from_pretrained("gpt2")
# Grow the token-embedding table so it covers every id the tokenizer can emit,
# including the added pad token. resize_token_embeddings keeps the pretrained
# rows and also updates gpt2.config.vocab_size, which a hand-built replacement
# embedding would leave stale.
new_vocab_size = len(tokenizer)
gpt2.resize_token_embeddings(new_vocab_size)
new_embed = gpt2.get_input_embeddings()  # kept so later cells can still refer to it
class GPTModel(nn.Module):
  """Sequence classifier: a GPT-2 backbone, masked mean pooling, linear head.

  Args:
    num_classes: number of output logits.
    gpt2: backbone module exposing ``config.hidden_size`` and returning an
      object with ``last_hidden_state`` when called with ``input_ids`` and
      ``attention_mask``.
  """

  def __init__(self, num_classes, gpt2):
    super(GPTModel, self).__init__()
    self.gpt_model = gpt2
    self.fc = nn.Linear(gpt2.config.hidden_size, num_classes)

  def forward(self, input_ids, attention_mask):
    """Return class logits of shape ``(batch, num_classes)``.

    Hidden states are averaged over the positions where ``attention_mask`` is
    non-zero, so padding does not dilute the pooled representation. When
    ``attention_mask`` is ``None`` every position is averaged.
    """
    gpt_out = self.gpt_model(input_ids=input_ids, attention_mask=attention_mask)
    hidden = gpt_out.last_hidden_state
    if attention_mask is None:
      pooled_output_state = hidden.mean(dim=1)
    else:
      # Broadcast the mask over the hidden dimension and average real tokens only.
      mask = attention_mask.unsqueeze(-1).to(hidden.dtype)
      token_sum = (hidden * mask).sum(dim=1)
      # clamp keeps an all-padding row from dividing by zero.
      token_count = mask.sum(dim=1).clamp(min=1.0)
      pooled_output_state = token_sum / token_count
    return self.fc(pooled_output_state)

In [None]:
def train(model, train_loader, creterion, device, optimizer):
  """Run one optimisation pass over ``train_loader``.

  Args:
    model: module called as ``model(input_ids=..., attention_mask=...)`` that
      returns per-class logits.
    train_loader: iterable of batches, each a dict with ``'input_ids'``,
      ``'attention_mask'`` and ``'target'`` tensors.
    creterion: loss callable taking ``(logits, target)``.
    device: device the batch tensors are moved to.
    optimizer: optimizer stepping the model parameters.

  Returns:
    ``(avg_loss, acc)``: mean of the per-batch loss values and the fraction of
    samples whose argmax prediction equals the target. Both are ``nan`` when
    the loader yields no batches.
  """
  model.train()
  loss_sum = 0.0
  correct = 0
  total = 0
  num_batches = 0
  for batch in train_loader:
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    target = batch['target'].to(device)

    optimizer.zero_grad()
    output = model(input_ids=input_ids, attention_mask=attention_mask)
    loss = creterion(output, target)
    loss.backward()
    optimizer.step()
    loss_sum += loss.item()
    num_batches += 1

    pred = output.argmax(dim=1)
    total += target.size(0)
    correct += (pred == target).sum().item()

  # An empty loader has nothing to average; report NaN instead of dividing by zero.
  if num_batches == 0 or total == 0:
    return float('nan'), float('nan')
  avg_loss = loss_sum / num_batches
  acc = correct / total
  return avg_loss, acc

def evaluate(model, data_loader, creterion, device):
  """Score ``model`` on ``data_loader`` without updating any weights.

  Args:
    model: module called as ``model(input_ids=..., attention_mask=...)`` that
      returns per-class logits.
    data_loader: iterable of batches, each a dict with ``'input_ids'``,
      ``'attention_mask'`` and ``'target'`` tensors.
    creterion: loss callable taking ``(logits, target)``.
    device: device the batch tensors are moved to.

  Returns:
    ``(avg_loss, acc)``: mean of the per-batch loss values and the fraction of
    samples whose argmax prediction equals the target. Both are ``nan`` when
    the loader yields no batches.
  """
  model.eval()
  loss_sum = 0.0
  correct = 0
  total = 0
  num_batches = 0
  with torch.no_grad():
    for batch in data_loader:
      input_ids = batch["input_ids"].to(device)
      attention_mask = batch["attention_mask"].to(device)
      target = batch["target"].to(device)

      output = model(input_ids=input_ids, attention_mask=attention_mask)
      loss = creterion(output, target)
      loss_sum += loss.item()
      num_batches += 1

      pred = output.argmax(dim=1)
      total += target.size(0)
      correct += (pred == target).sum().item()

  # An empty loader has nothing to average; report NaN instead of dividing by zero.
  if num_batches == 0 or total == 0:
    return float('nan'), float('nan')
  avg_loss = loss_sum / num_batches
  acc = correct / total
  return avg_loss, acc

In [None]:
# Wrap the GPT-2 backbone prepared above with a num_classes-way linear head.
model = GPTModel(gpt2=gpt2, num_classes=num_classes)

In [None]:
# Optimisation hyper-parameters.
# NOTE(review): lr = 1e-3 is kept from the original run; it is far larger than
# the rates usually used when fine-tuning all weights of a pretrained
# transformer — consider revisiting.
lr = 1e-3
epoch = 5
creterion = nn.CrossEntropyLoss()

# Move the model to its device before building the optimizer, as the
# torch.optim documentation recommends.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# torch.optim.AdamW replaces the deprecated transformers.AdamW. eps and
# weight_decay are pinned to the defaults that class used (1e-6 and 0.0) so the
# update rule stays the same.
optimizer = torch.optim.AdamW(model.parameters(), lr=lr, eps=1e-6, weight_decay=0.0)

for i in range(epoch):
  train_loss, train_acc = train(model, train_loader, creterion, device, optimizer)
  val_loss, val_acc = evaluate(model, val_loader, creterion, device)
  print(f"Epoch {i+1}/{epoch}: Train Loss: {train_loss:.4f}, Train Accuracy: {train_acc:.4f}, Val Loss: {val_loss:.4f}, Val Accuracy: {val_acc:.4f}")

In [None]:
# Score the trained model once on the held-out test loader; only the accuracy
# is printed, the loss is kept in test_loss.
test_loss, test_acc = evaluate(
    model=model,
    data_loader=test_loader,
    creterion=creterion,
    device=device,
)
print(f"Test Accuracy: {test_acc:.4f}")

In [None]:
# Persist the model's state dict (parameters and buffers) to the Kaggle
# working directory.
torch.save(obj=model.state_dict(), f='/kaggle/working/GPTModel2')