In [39]:
# tokenization library
!pip install tiktoken==0.8.0



In [40]:
import tiktoken
import torch
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
import pickle
import pandas as pd
import time
import matplotlib.pyplot as plt
# user llm library functions
from gpt import GPTModel, generate_text_simple
from dataloader import create_dataloader_v1
from ft_classification import (random_split,calc_accuracy_loader, calc_loss_batch, 
        evaluate_model, train_classifier_simple, plot_values, classify_review )
from pretraining import load_weights_into_gpt, text_to_token_ids, token_ids_to_text

### Initialize Model and Load Weights

In [41]:
GPT_CONFIG_355M = {
    "vocab_size": 50257,     # Vocabulary Size
    "context_length": 256,  # Context length
    "emb_dim": 1024,          # Embedding dimension
    "n_heads": 16,           # Number of attention heads
    "n_layers": 24,
    "drop_rate": 0.1,
    "qkv_bias": False
}

In [42]:
#initialize model
torch.manual_seed(123)
model = GPTModel(GPT_CONFIG_355M)
model.eval()

#load_weights in pytorch format
param_keys = [
    "blocks",
    "b",
    "g",
    "wpe",
    "wte"
]

params = {}
# Load (deserialize)
for weight in param_keys:
    with open("weights/"+weight+".pkl", "rb") as f:
        params[weight] = pickle.load(f)

In [43]:
#load pretrained weights
model_configs = {
    "gpt2-small (124M)": {"emb_dim": 768, "n_layers": 12, "n_heads": 12},
    "gpt2-medium (355M)": {"emb_dim": 1024, "n_layers": 24, "n_heads": 16},
    "gpt2-large (774M)": {"emb_dim": 1280, "n_layers": 36, "n_heads": 20},
    "gpt2-small (1558M)": {"emb_dim": 1600, "n_layers": 48, "n_heads": 25},
}

model_name = "gpt2-medium (355M)"
NEW_CONFIG = GPT_CONFIG_355M.copy()
NEW_CONFIG.update(model_configs[model_name])
NEW_CONFIG.update({"context_length":1024})
NEW_CONFIG.update({"qkv_bias": True})

In [44]:
tokenizer = tiktoken.get_encoding("gpt2")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
gpt = GPTModel(NEW_CONFIG)
gpt.eval()

load_weights_into_gpt(gpt, params)
gpt.to(device)

GPTModel(
  (tok_emb): Embedding(50257, 1024)
  (pos_emb): Embedding(1024, 1024)
  (drop_emb): Dropout(p=0.1, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=1024, out_features=1024, bias=True)
        (W_key): Linear(in_features=1024, out_features=1024, bias=True)
        (W_value): Linear(in_features=1024, out_features=1024, bias=True)
        (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=1024, out_features=4096, bias=True)
          (1): GELU()
          (2): Linear(in_features=4096, out_features=1024, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_shortcut): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(i

### Data Loading and Preparation

In [45]:
with open("../data/full_feature_data.csv","r") as f:
    datalist = f.readlines()

In [46]:
featurelist = (datalist[0].replace('_', ' ').replace('num', 'number of')
               .replace('avg', 'average').replace('7d', ' 7 days,')
               .replace('30d', ' 30 days,').replace('90d', ' 90 days,')
               .replace('events week', 'events per week').split(','))
print(featurelist)

['userId', 'user churned', 'average events per weekend', 'average events per weekday', 'number of songs played  7 days', '', 'number of ads  7 days', '', 'number of error  7 days', '', 'number of songs played  30 days', '', 'number of songs played  90 days', '', 'number of sessions', 'average time per session', 'average events per session', 'average gap between session', 'number of events', 'number of songs', 'number of artists', 'number of thumbs down', 'number of thumbs up', 'number of add to playlist', 'number of ads', 'number of add friend', 'number of downgrade', 'number of upgrade', 'number of error', 'percentage ad', 'days since active', 'repeats ratio\n']


In [47]:
input_dict = {}
output_dict = {}
pos_label = "the user churned."
neg_label = "the user did not churn."
for datarow in datalist[1:]:
    features = datarow.split(',')
    input_value = ''
    xs = list(range(len(features[2:]))
    random.shuffle(xs)         # xs is now shuffled
    for i in xs:
    #for i in range(len(features[2:])):
        input_value += featurelist[2 + i] + ' is ' + features[2+i] + ". "
    input_dict[features[0]] = input_value
    output_dict[features[0]] = (pos_label if features[1] == "1"
                                else neg_label
                                )
df = pd.DataFrame(list(zip(input_dict.values(), output_dict.values())), 
                  columns=['input_text', 'Label'])

SyntaxError: '(' was never closed (4112105260.py, line 8)

In [None]:
df

In [None]:
print(df["Label"].value_counts())

In [None]:
def create_balanced_dataset(df):
    # todo: take out hard-coding of target names and values
    num_pos = df[df["Label"] == pos_label].shape[0]
    neg_set = df[df["Label"] == neg_label].sample(
        num_pos, random_state=123
    )
    balanced_df = pd.concat([
        neg_set, df[df["Label"] == pos_label]
    ])
    return balanced_df

In [None]:
balanced_df = create_balanced_dataset(df)
print(balanced_df["Label"].value_counts())

In [None]:
balanced_df["Label"] = balanced_df["Label"].map({neg_label: 0, pos_label: 1})

In [None]:
balanced_df

In [None]:
def random_split(df, train_frac, validation_frac):

    df = df.sample(
        frac=1, random_state=123
    ).reset_index(drop=True)
    train_end = int(len(df) * train_frac)
    validation_end = train_end + int(len(df) * validation_frac)

    train_df = df[:train_end]
    validation_df = df[train_end:validation_end]
    test_df = df[validation_end:]

    return train_df, validation_df, test_df


In [None]:
train_df, validation_df, test_df= random_split(
    balanced_df, 0.7, 0.1)

In [None]:
train_df.to_csv("train.csv", index=None)
validation_df.to_csv("validation.csv", index=None)
test_df.to_csv("test.csv", index=None)

In [None]:
class ChurnDataset(Dataset):
    def __init__(self, csv_file, tokenizer, max_length=None,
                 pad_token_id=50256):
        self.data = pd.read_csv(csv_file)

        self.encoded_texts = [
            tokenizer.encode(text) for text in self.data["input_text"]
        ]

        if max_length is None:
            self.max_length = self._longest_encoded_length()
        else:
            self.max_length = max_length

            self.encoded_texts = [
                encoded_text[:self.max_length]
                for encoded_text in self.encoded_texts
            ]

        self.encoded_texts = [
            encoded_text + [pad_token_id] *
            (self.max_length - len(encoded_text))
            for encoded_text in self.encoded_texts
        ]

    def __getitem__(self, index):
        encoded = self.encoded_texts[index]
        label = self.data.iloc[index]["Label"]
        return(
                torch.tensor(encoded, dtype=torch.long),
                torch.tensor(label, dtype=torch.long)
            )

    def __len__(self):
        return len(self.data)

    def _longest_encoded_length(self):
        max_length = 0
        for encoded_text in self.encoded_texts:
            encoded_length = len(encoded_text)
            if encoded_length > max_length:
                max_length = encoded_length
        return max_length

In [None]:
train_dataset = ChurnDataset(
    csv_file="train.csv",
    max_length=None,
    tokenizer=tokenizer
)
print("train length:", train_dataset.max_length)
val_dataset = ChurnDataset(
    csv_file="validation.csv",
    max_length=train_dataset.max_length,
    tokenizer=tokenizer
)
test_dataset = ChurnDataset(
    csv_file="test.csv",
    max_length=train_dataset.max_length,
    tokenizer=tokenizer
)

In [None]:
num_workers = 0
batch_size = 8
torch.manual_seed(123)

train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=num_workers,
    drop_last=True
)
val_loader = DataLoader(
    dataset=val_dataset,
    batch_size=batch_size,
    num_workers=num_workers,
    drop_last=True
)
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=batch_size,
    num_workers=num_workers,
    drop_last=True
)
for input_batch, target_batch in train_loader:
    pass
print("Input batch dimensions:", input_batch.shape)
print("Label batch dimensions:", target_batch.shape)

In [None]:
for param in model.parameters():
    param.requires_grad = False

In [None]:
CHOOSE_MODEL = "gpt2-medium (355M)"
INPUT_PROMPT = "Every effort moves"
BASE_CONFIG = {
    "vocab_size": 50257,
    "context_length" : 1024,
    "drop_rate" : 0.0,
    "qkv_bias" : True
}
model_configs = {
    "gpt2-small (124M)" : {"emb_dim": 768, "n_layers": 12, "n_heads": 12},
    "gpt2-medium (355M)" : {"emb_dim": 1024, "n_layers": 24, "n_heads": 16},
    "gpt2-large (774M)" : {"emb_dim": 1280, "n_layers": 36, "n_heads": 20},
    "gpt2-xl (1558M)" : {"emb_dim": 1600, "n_layers": 48, "n_heads": 25},
}
BASE_CONFIG.update(model_configs[CHOOSE_MODEL])

In [None]:
for param in model.parameters():
    param.requires_grad = False

In [None]:
torch.manual_seed(123)
num_classes = 2
model.out_head= torch.nn.Linear(
    in_features = BASE_CONFIG["emb_dim"],
    out_features = num_classes
)

In [None]:
for param in model.trf_blocks[-1].parameters():
    param.requires_grad = True
for param in model.final_norm.parameters():
    param.requires_grad = True

In [None]:
inputs = tokenizer.encode("Do you have time")
inputs = torch.tensor(inputs).unsqueeze(0)
print("Inputs:", inputs)
print("Inputs dimensions:", inputs.shape)

In [None]:
with torch.no_grad():
    outputs = model(inputs)
print("Outputs\n", outputs)
print("Outputs dimensions:", outputs.shape)

In [None]:
print("Last output token:",  outputs[:, -1, :])

In [None]:
probas = torch.softmax(outputs[:, -1, :], dim=-1)
print(probas)
label = torch.argmax(probas)
print(label)
#print("Class label:", label)

In [None]:
logits = outputs[:, -1, :]
print(logits)
label = torch.argmax(logits)
print("Class label:", label.item())

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else cpu())
model.to(device)

In [None]:
torch.manual_seed(123)
train_accuracy = calc_accuracy_loader(
    train_loader, model, device, num_batches=10)
val_accuracy = calc_accuracy_loader(
    val_loader, model, device, num_batches=10)
test_accuracy = calc_accuracy_loader(
    test_loader, model, device, num_batches=10)

print(f"Training accuracy: {train_accuracy*100:.2f}%")
print(f"Validation accuracy: {val_accuracy*100:.2f}%")
print(f"Test accuracy: {test_accuracy*100:.2f}%")

In [None]:
def calc_loss_batch2(input_batch, target_batch, model, device):
    input_batch = input_batch.to(device)
    target_batch = target_batch.to(device).long()

    logits = model(input_batch)
    # If the model outputs [B, L, C], take per-example logits
    if logits.dim() == 3:
        logits = logits[:, -1, :]  # or use CLS position if applicable

    return torch.nn.functional.cross_entropy(logits, target_batch.view(-1))

In [None]:
start_time = time.time()
torch.manual_seed(123)
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, weight_decay=0.1)
num_epochs = 8
train_losses, val_losses, train_accs, val_accs, examples_seen = \
    train_classifier_simple(
        model, train_loader, val_loader, optimizer, device,
        num_epochs=num_epochs, eval_freq=50,
        eval_iter=5
    )
end_time = time.time()
execution_time_minutes = (end_time - start_time) / 60
print(f"Training completed n {execution_time_minutes:.2f} minutes.")

In [None]:
epochs_tensor = torch.linspace(0, num_epochs, len(train_losses))
examples_seen_tensor = torch.linspace(0, examples_seen, len(train_losses))

plot_values(epochs_tensor, examples_seen_tensor, train_losses, val_losses)

In [None]:
epochs_tensor = torch.linspace(0, num_epochs, len(train_accs))
examples_seen_tensor = torch.linspace(0, examples_seen, len(train_accs))

plot_values(
    epochs_tensor, examples_seen_tensor, train_accs, val_accs,
    label="accuracy"
)

In [None]:
train_accuracy = calc_accuracy_loader(
    train_loader, model, device)
val_accuracy = calc_accuracy_loader(
    val_loader, model, device)
test_accuracy = calc_accuracy_loader(
    test_loader, model, device)

print(f"Training accuracy: {train_accuracy*100:.2f}%")
print(f"Validation accuracy: {val_accuracy*100:.2f}%")
print(f"Test accuracy: {test_accuracy*100:.2f}%")