In [14]:
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2TokenizerFast
import os
import sys

sys.path.append(os.path.abspath("../src"))
from GPTModel import GPTModel
from Generate import generate
from EncodeDecode import text_to_token_ids, token_ids_to_text

In [2]:
df = pd.read_csv('../data/SMSSpamCollection.tsv',sep = '\t') 
df

Unnamed: 0,type,msg
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [3]:
valeurs_uniques = df['type'].unique()
print("Possible choices in column 'type':")
print(valeurs_uniques)

print("Number of 'ham':", df[df['type'] == 'ham'].shape[0])
print("Number of 'spam':", df[df['type'] == 'spam'].shape[0])

Possible choices in column 'type':
['ham' 'spam']
Number of 'ham': 4825
Number of 'spam': 747


### Undersampling

In [4]:
ham = df[df['type'] == 'ham']
spam = df[df['type'] == 'spam']

ham_undersampled = ham.sample(n=len(spam), random_state=42)

df_balanced = pd.concat([ham_undersampled, spam])

print(df_balanced['type'].value_counts())

type
ham     747
spam    747
Name: count, dtype: int64


### Create the datasets

In [5]:
df_balanced = df_balanced.sample(frac=1, random_state=123).reset_index(drop=True)

# Calculate split indices
n = len(df_balanced)
train_end = int(0.7 * n)
val_end = train_end + int(0.1 * n)

# Split
train = df_balanced.iloc[:train_end]
val = df_balanced.iloc[train_end:val_end]
test = df_balanced.iloc[val_end:]

train.to_csv('../data/fine-tuning/train.csv', index=False)
val.to_csv('../data/fine-tuning/validation.csv', index=False)
test.to_csv('../data/fine-tuning/test.csv', index=False)

In [6]:
import torch
from torch.utils.data import Dataset
import pandas as pd

class SpamDataset(Dataset):
    def __init__(self, csv_file, max_length, tokenizer, pad_token_id):
        """
        Args:
            csv_file (str): Path to the CSV file containing the dataset.
            max_length (int or None): Maximum length of the tokenized vector.
                If None, all vectors are resized to the longest vector in the dataset.
            tokenizer: The tokenizer used to encode the strings.
            pad_token_id (int): The token ID used for padding.
        """
        self.data = pd.read_csv(csv_file)
        self.max_length = max_length
        self.tokenizer = tokenizer
        self.pad_token_id = pad_token_id
        self.input_ids = []
        self.labels = []

        # Tokenize all texts and determine the max length if not provided
        self._tokenize_and_preprocess()

    def _tokenize_and_preprocess(self):
        # Tokenize all texts
        texts = self.data['msg'].tolist()
        labels = self.data['type'].tolist()

        # Tokenize each text
        tokenized_texts = [self.tokenizer.encode(text, return_tensors='pt').squeeze(0) for text in texts]

        # Determine the max length if not provided
        if self.max_length is None:
            self.max_length = max(len(t) for t in tokenized_texts)

        # Pad or truncate each sequence
        for tokens in tokenized_texts:
            if len(tokens) < self.max_length:
                # Pad with pad_token_id
                padded_tokens = torch.cat([
                    tokens,
                    torch.full((self.max_length - len(tokens),), self.pad_token_id)
                ])
                self.input_ids.append(padded_tokens)
            else:
                # Truncate
                self.input_ids.append(tokens[:self.max_length])

        # Convert labels to numerical values (e.g., 'ham' -> 0, 'spam' -> 1)
        label_map = {'ham': 0, 'spam': 1}
        self.labels = [label_map[label] for label in labels]

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
            self.input_ids[idx], torch.tensor(self.labels[idx], dtype=torch.long)

In [7]:
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
eos_token_id = tokenizer.eos_token_id

trainDataSet = SpamDataset('../data/fine-tuning/train.csv', None, tokenizer, eos_token_id)
valDataSet = SpamDataset('../data/fine-tuning/validation.csv', 256, tokenizer, eos_token_id)
testDataSet = SpamDataset('../data/fine-tuning/test.csv', 256, tokenizer, eos_token_id)

### Create dataloaders

In [8]:
trainDataLoader = DataLoader(trainDataSet, batch_size=8, shuffle=True, drop_last=True, num_workers=0)
valDataLoader = DataLoader(valDataSet, batch_size=8, shuffle=False, drop_last=False, num_workers=0)
testDataLoader = DataLoader(testDataSet, batch_size=8, shuffle=False, drop_last=False, num_workers=0)

print("Number of training samples:", len(trainDataSet))
print("Number of validation samples:", len(valDataSet))
print("Number of test samples:", len(testDataSet))

Number of training samples: 1045
Number of validation samples: 149
Number of test samples: 300


In [9]:
def assign(target_tensor: torch.Tensor, source_array) -> torch.Tensor:
    source_tensor = torch.tensor(source_array, dtype=target_tensor.dtype)

    if target_tensor.shape != source_tensor.shape:
        raise ValueError(
            f"Incompatible shapes: target {tuple(target_tensor.shape)} "
            f"vs source {tuple(source_tensor.shape)}"
        )

    return source_tensor

def load_weights_into_gpt(model, params):
    emb_dim = model.emb_dim
    n_heads = model.n_heads
    
    with torch.no_grad():
        # --- Embeddings ---
        model.token_embedding.weight.copy_(
            assign(model.token_embedding.weight, params["wte"])
        )
        model.pos_embedding_layer.weight.copy_(
            assign(model.pos_embedding_layer.weight, params["wpe"])
        )
        
        # --- Blocs Transformer ---
        for i, block in enumerate(model.transformer_blocks):
            p_block = params["blocks"][i]
            
            # LayerNorm 1
            block.layerNorm1.scale.copy_(
                assign(block.layerNorm1.scale, p_block["ln_1"]["g"])
            )
            block.layerNorm1.shift.copy_(
                assign(block.layerNorm1.shift, p_block["ln_1"]["b"])
            )
            
            # --- ATTENTION : Gestion Correcte de c_attn ---
            W_qkv = p_block["attn"]["c_attn"]["w"]  # [768, 2304]
            b_qkv = p_block["attn"]["c_attn"]["b"]  # [2304]
            
            # Découpage sur la dimension de sortie (colonnes)
            W_q = W_qkv[:, 0*emb_dim:1*emb_dim]  # [768, 768]
            W_k = W_qkv[:, 1*emb_dim:2*emb_dim]
            W_v = W_qkv[:, 2*emb_dim:3*emb_dim]
            
            b_q = b_qkv[0*emb_dim:1*emb_dim]  # [768]
            b_k = b_qkv[1*emb_dim:2*emb_dim]
            b_v = b_qkv[2*emb_dim:3*emb_dim]
            
            # Vérification de shape (CRITIQUE)
            assert W_q.shape == (emb_dim, emb_dim), f"❌ W_q shape: {W_q.shape}"
            assert b_q.shape == (emb_dim,), f"❌ b_q shape: {b_q.shape}"
            
            # Copie dans les Linear (bias=False donc on ignore les biais)
            # PyTorch: Linear.weight est (out_features, in_features)
            block.mha.W_query.weight.copy_(assign(block.mha.W_query.weight, W_q.T))
            block.mha.W_query.bias.copy_(assign(block.mha.W_query.bias, b_q))

            block.mha.W_key.weight.copy_(assign(block.mha.W_key.weight, W_k.T))
            block.mha.W_key.bias.copy_(assign(block.mha.W_key.bias, b_k))

            block.mha.W_value.weight.copy_(assign(block.mha.W_value.weight, W_v.T))
            block.mha.W_value.bias.copy_(assign(block.mha.W_value.bias, b_v))
            
            # Projection de sortie
            W_o = p_block["attn"]["c_proj"]["w"]  # [768, 768]
            b_o = p_block["attn"]["c_proj"]["b"]
            block.mha.out_proj.weight.copy_(assign(block.mha.out_proj.weight, W_o.T))
            block.mha.out_proj.bias.copy_(assign(block.mha.out_proj.bias, b_o))
            
            # LayerNorm 2
            block.layerNorm2.scale.copy_(
                assign(block.layerNorm2.scale, p_block["ln_2"]["g"])
            )
            block.layerNorm2.shift.copy_(
                assign(block.layerNorm2.shift, p_block["ln_2"]["b"])
            )
            
            # --- MLP ---
            W1 = p_block["mlp"]["c_fc"]["w"]  # [768, 3072]
            b1 = p_block["mlp"]["c_fc"]["b"]  # [3072]
            
            assert W1.shape == (emb_dim, 4*emb_dim), f"❌ W1 shape: {W1.shape}"
            
            block.feedForward.layer1.weight.copy_(assign(block.feedForward.layer1.weight, W1.T))
            block.feedForward.layer1.bias.copy_(assign(block.feedForward.layer1.bias, b1))
            
            W2 = p_block["mlp"]["c_proj"]["w"]  # [3072, 768]
            b2 = p_block["mlp"]["c_proj"]["b"]  # [768]
            
            block.feedForward.layer2.weight.copy_(assign(block.feedForward.layer2.weight, W2.T))
            block.feedForward.layer2.bias.copy_(assign(block.feedForward.layer2.bias, b2))
        
        # LayerNorm final (stored at root level as 'g' and 'b')
        model.layerNorm.scale.copy_(
            assign(model.layerNorm.scale, params["g"])
        )
        model.layerNorm.shift.copy_(
            assign(model.layerNorm.shift, params["b"])
        )
        
        # Weight tying (inverseEmbedding == token_embedding)
        if hasattr(model, "inverseEmbedding"):
            model.inverseEmbedding.weight.copy_(
                assign(model.inverseEmbedding.weight, params["wte"])
            )

In [10]:
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print(device)

cpu


In [11]:
from gpt_download import download_and_load_gpt2

settings, params = download_and_load_gpt2("124M", "../model/gpt")

File already exists and is up-to-date: ../model/gpt\124M\checkpoint
File already exists and is up-to-date: ../model/gpt\124M\encoder.json
File already exists and is up-to-date: ../model/gpt\124M\hparams.json
File already exists and is up-to-date: ../model/gpt\124M\model.ckpt.data-00000-of-00001
File already exists and is up-to-date: ../model/gpt\124M\model.ckpt.index
File already exists and is up-to-date: ../model/gpt\124M\model.ckpt.meta
File already exists and is up-to-date: ../model/gpt\124M\vocab.bpe


In [12]:
config = {
    'vocab_size': 50257,
    'context_length': 1024,
    'emb_dim': 768,
    'n_heads': 12,
    'n_layers': 12,
    'drop_rate': 0.0,
    'qkv_bias': True,
    "out_bias": True
}

model = GPTModel(config)
model.to(device)
load_weights_into_gpt(model, params)

### Can the base GPT2 model perform the classification using only a prompt? 

In [15]:
input = (
    "Is the following text 'spam'? Answer with 'yes' or 'no':"
    " 'You are a winner you have been specially"
    " selected to receive $1000 cash or a $2000 award.'"
)

batch = text_to_token_ids(input)
batch = batch.to(device)

model.eval()

with torch.no_grad():
    generated_tokens = generate(model, batch, max_new_tokens=20, context_size=1024, temperature=1, top_k=30)

generated_text = token_ids_to_text(generated_tokens)

print(input + generated_text)

Is the following text 'spam'? Answer with 'yes' or 'no': 'You are a winner you have been specially selected to receive $1000 cash or a $2000 award.'

Click on any of the questions below to go directly to your winning submission. The links below
