In [None]:
# Fetch the project repository; a later cell reads baseline.csv from /content/csc542-gp-team164,
# so this assumes the working directory is /content (Colab default) — TODO confirm elsewhere.
!git clone https://github.com/drryodino246/csc542-gp-team164.git

In [None]:
!pip install datasets

In [None]:
from datasets import load_dataset
# Read the baseline CSV from the cloned repository; the csv loader puts all rows
# under the 'train' key, which the next cell splits.
emotions = load_dataset(
    "csv",
    data_files="/content/csc542-gp-team164/baseline.csv",
)

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
emotions_split = emotions['train'].train_test_split(seed=42, test_size=0.2)
train_dataset, test_dataset = emotions_split['train'], emotions_split['test']

# Sanity check on the split sizes.
print(f"Training data size: {len(train_dataset)}")
print(f"Test data size: {len(test_dataset)}")

In [None]:
### Build-From-Scratch version ###

from datasets import load_dataset
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset
import torch
from transformers import AutoTokenizer

# Encode the string emotion labels as integer class ids.
# The encoder is fitted on the labels of BOTH splits: the train/test split is
# not stratified, so a rare class can end up only in the test set, and
# `transform` raises ValueError for labels it did not see during `fit`.
train_emotions = list(train_dataset['updated_emotion'])
test_emotions = list(test_dataset['updated_emotion'])
label_encoder = LabelEncoder()
label_encoder.fit(train_emotions + test_emotions)
train_labels = label_encoder.transform(train_emotions)
test_labels = label_encoder.transform(test_emotions)

# Tokenize with the pretrained "bert-base-uncased" tokenizer.
# Only the tokenizer is used here; no BERT model weights are loaded.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

MAX_LEN = 64  # every text is padded/truncated to this many tokens; enough for short texts

def tokenize(texts):
    """Tokenize a batch of texts into fixed-length PyTorch tensors.

    Args:
        texts: a single string, or any iterable of strings (list, tuple,
            generator, dataset column, ...).

    Returns:
        The output of the module-level ``tokenizer``, with every sequence
        padded or truncated to ``MAX_LEN`` tokens and returned as PyTorch
        tensors (``return_tensors='pt'``).
    """
    # A bare str is a valid single example and must not be split into
    # characters; any other iterable is materialized into a plain list so
    # that lazy or non-list containers are accepted as a batch.
    if not isinstance(texts, str):
        texts = list(texts)
    return tokenizer(texts, padding='max_length', truncation=True, max_length=MAX_LEN, return_tensors='pt')

class EmotionDataset(Dataset):
    """Map-style dataset pairing tokenized texts with their labels.

    All texts are tokenized once, at construction time, through the
    module-level ``tokenize`` helper; samples are then served by index.
    """

    def __init__(self, texts, labels):
        """Tokenize ``texts`` up front and keep ``labels`` alongside them."""
        self.labels = labels
        self.encodings = tokenize(texts)

    def __len__(self):
        """Return the number of samples (one per label)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the sample at ``idx`` as a dict with 'input_ids' and 'label'."""
        token_ids = self.encodings['input_ids'][idx]
        # Labels are exposed as int64 tensors.
        target = torch.tensor(self.labels[idx], dtype=torch.long)
        return {'input_ids': token_ids, 'label': target}

# Wrap each split as a torch Dataset (texts are tokenized inside the constructor).
train_data = EmotionDataset(texts=train_dataset['text'], labels=train_labels)
test_data = EmotionDataset(texts=test_dataset['text'], labels=test_labels)

###################################

In [None]:
### Build-From-Scratch version ###

import torch.nn as nn
import torch.nn.functional as F

class TextCNN(nn.Module):
    """Convolutional text classifier whose embeddings are learned from scratch.

    Token ids are embedded, passed through parallel convolutions of several
    window heights, max-pooled over the sequence axis, concatenated,
    regularized with dropout and projected to class logits.

    Note: a later cell of this notebook defines another class that is also
    named ``TextCNN`` (built from a pretrained embedding matrix); once that
    cell has run, it shadows this definition.

    Args:
        vocab_size: number of rows of the embedding table; every input id
            must be smaller than this.
        embed_dim: size of each embedding vector.
        num_classes: number of output logits.
        kernel_sizes: iterable of convolution window heights (in tokens).
        num_filters: number of output channels of each convolution.
        dropout: dropout probability applied before the final linear layer.

    Raises:
        ValueError: if ``kernel_sizes`` is empty.
    """

    def __init__(self, vocab_size, embed_dim, num_classes, kernel_sizes=(3, 4, 5), num_filters=100, dropout=0.5):
        # Zero-argument super(): does not depend on what the global name
        # ``TextCNN`` is bound to when an instance is created.
        super().__init__()
        kernel_sizes = tuple(kernel_sizes)
        if not kernel_sizes:
            raise ValueError("kernel_sizes must contain at least one window size")
        # Shortest sequence every convolution can process.
        self.min_seq_len = max(kernel_sizes)
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.convs = nn.ModuleList([
            nn.Conv2d(1, num_filters, (k, embed_dim)) for k in kernel_sizes
        ])
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(num_filters * len(kernel_sizes), num_classes)

    def forward(self, x):
        """Map token ids of shape (batch_size, seq_len) to logits of shape (batch_size, num_classes)."""
        x = self.embedding(x)  # (batch_size, seq_len, embed_dim)
        missing = self.min_seq_len - x.size(1)
        if missing > 0:
            # A convolution window taller than the sequence raises an error,
            # so short inputs are right-padded with zero vectors.
            x = F.pad(x, (0, 0, 0, missing))
        x = x.unsqueeze(1)  # (batch_size, 1, seq_len, embed_dim)
        # Each conv spans the full embedding width, leaving a size-1 last axis to squeeze.
        conv_x = [F.relu(conv(x)).squeeze(3) for conv in self.convs]
        # Max over all positions: one value per filter.
        pooled_x = [F.max_pool1d(c, c.size(2)).squeeze(2) for c in conv_x]
        out = torch.cat(pooled_x, dim=1)
        out = self.dropout(out)
        return self.fc(out)
    
###################################    

In [None]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip

In [None]:
import numpy as np

def load_glove_embeddings(file_path):
    """Load GloVe-style text embeddings into a dictionary.

    Each non-blank line is expected to hold a word followed by its vector
    components, separated by whitespace.

    Args:
        file_path: path of the UTF-8 encoded embeddings text file.

    Returns:
        dict mapping each word to a 1-D ``np.float32`` array.
    """
    embeddings = {}
    with open(file_path, 'r', encoding='utf8') as f:
        for line in f:
            parts = line.strip().split()
            if not parts:
                # Blank line (e.g. a stray newline at the end of the file):
                # there is no word to index, so skip it instead of failing.
                continue
            embeddings[parts[0]] = np.array(parts[1:], dtype=np.float32)
    return embeddings

# Path of the 100d GloVe 6B file; assumes the archive was unpacked in /content — TODO confirm outside Colab.
glove_path = "/content/glove.6B.100d.txt"
# Dictionary mapping each word to its float32 vector.
glove_embeddings = load_glove_embeddings(glove_path)

In [None]:
from torchtext.vocab import build_vocab_from_iterator
from collections import Counter
import torch

# Build the vocabulary from the dataset texts (run on Colab)
def yield_tokens(texts):
    """Lazily yield, for each text, its lower-cased whitespace-separated tokens as a list."""
    yield from (sentence.lower().split() for sentence in texts)

vocab = build_vocab_from_iterator(yield_tokens(train_dataset['text']), specials=["<pad>", "<unk>"])
vocab.set_default_index(vocab["<unk>"])  # tokens outside the vocabulary map to <unk>
vocab_size = len(vocab)

# Arrange the GloVe vectors so that row i belongs to the token with vocab index i.
# The width is read from the loaded vectors instead of being hard-coded, so it
# cannot drift from the GloVe file that was actually loaded.
embedding_dim = len(next(iter(glove_embeddings.values())))
# Seeded generator: tokens missing from GloVe get the same random vector on every run.
embedding_rng = np.random.default_rng(42)
embedding_matrix = embedding_rng.normal(scale=0.6, size=(vocab_size, embedding_dim))
embedding_matrix[vocab["<pad>"]] = 0.0  # start the padding row at zero rather than random noise

for i, token in enumerate(vocab.get_itos()):
    vector = glove_embeddings.get(token)
    if vector is not None:
        embedding_matrix[i] = vector

embedding_matrix = torch.tensor(embedding_matrix, dtype=torch.float32)

In [None]:
class TextCNN(nn.Module):
    """Convolutional text classifier initialized from a pretrained embedding matrix.

    Token ids are embedded, passed through parallel convolutions of several
    window heights, max-pooled over the sequence axis, concatenated,
    regularized with dropout and projected to class logits.

    Input ids index rows of ``embedding_matrix``, so they must come from the
    same vocabulary that was used to build that matrix.

    Args:
        embedding_matrix: 2-D float tensor of shape (vocab_size, embed_dim).
        num_classes: number of output logits.
        kernel_sizes: iterable of convolution window heights (in tokens).
        num_filters: number of output channels of each convolution.
        freeze: if True the embedding weights are not updated in training.
            The default (False) fine-tunes them — this is the important part
            of using the pretrained vectors here.
        padding_idx: optional row index to treat as padding; forwarded to
            ``nn.Embedding.from_pretrained``.
        dropout: dropout probability applied before the final linear layer.

    Raises:
        ValueError: if ``kernel_sizes`` is empty.
    """

    def __init__(self, embedding_matrix, num_classes, kernel_sizes=(3, 4, 5), num_filters=100,
                 freeze=False, padding_idx=None, dropout=0.5):
        super().__init__()
        kernel_sizes = tuple(kernel_sizes)
        if not kernel_sizes:
            raise ValueError("kernel_sizes must contain at least one window size")
        # Shortest sequence every convolution can process.
        self.min_seq_len = max(kernel_sizes)
        _, embed_dim = embedding_matrix.shape
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=freeze, padding_idx=padding_idx)
        self.convs = nn.ModuleList([
            nn.Conv2d(1, num_filters, (k, embed_dim)) for k in kernel_sizes
        ])
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(num_filters * len(kernel_sizes), num_classes)

    def forward(self, x):
        """Map token ids of shape (batch, seq_len) to logits of shape (batch, num_classes)."""
        x = self.embedding(x)  # (batch, seq_len, embed_dim)
        missing = self.min_seq_len - x.size(1)
        if missing > 0:
            # A convolution window taller than the sequence raises an error,
            # so short inputs are right-padded with zero vectors.
            x = F.pad(x, (0, 0, 0, missing))
        x = x.unsqueeze(1)     # (batch, 1, seq_len, embed_dim)
        # Each conv spans the full embedding width, leaving a size-1 last axis to squeeze.
        conv_x = [F.relu(conv(x)).squeeze(3) for conv in self.convs]
        # Max over all positions: one value per filter.
        pooled_x = [F.max_pool1d(c, c.size(2)).squeeze(2) for c in conv_x]
        out = torch.cat(pooled_x, dim=1)
        out = self.dropout(out)
        return self.fc(out)