In [4]:
#tokenization library
!pip install tiktoken==0.8.0

Collecting tiktoken==0.8.0
  Downloading tiktoken-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Downloading tiktoken-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m41.4 MB/s[0m  [33m0:00:00[0m
[?25hInstalling collected packages: tiktoken
Successfully installed tiktoken-0.8.0


In [5]:
import tiktoken
import torch
from gpt import GPTModel, generate_text_simple
from dataloader import create_dataloader_v1
from pretraining import load_weights_into_gpt, text_to_token_ids, token_ids_to_text
from ft_classification import (random_split,calc_accuracy_loader, calc_loss_batch, 
        evaluate_model, train_classifier_simple, plot_values, classify_review )

  import pynvml  # type: ignore[import]


ModuleNotFoundError: No module named 'gpt'

### Initialize Model and Load Weights

In [7]:
GPT_CONFIG_355M = {
    "vocab_size": 50257,     # Vocabulary Size
    "context_length": 256,  # Context length
    "emb_dim": 1024,          # Embedding dimension
    "n_heads": 16,           # Number of attention heads
    "n_layers": 24,
    "drop_rate": 0.1,
    "qkv_bias": False
}

In [11]:
#initialize model
torch.manual_seed(123)
model = GPTModel(GPT_CONFIG_355M)
model.eval()

#load_weights in pytorch format
param_keys = [
    "blocks",
    "b",
    "g",
    "wpe",
    "wte"
]
import pickle
params = {}
# Load (deserialize)
for weight in param_keys:
    with open(weight+".pkl", "rb") as f:
        params[weight] = pickle.load(f)

In [12]:
#load pretrained weights
model_configs = {
    "gpt2-small (124M)": {"emb_dim": 768, "n_layers": 12, "n_heads": 12},
    "gpt2-medium (355M)": {"emb_dim": 1024, "n_layers": 24, "n_heads": 16},
    "gpt2-large (774M)": {"emb_dim": 1280, "n_layers": 36, "n_heads": 20},
    "gpt2-small (1558M)": {"emb_dim": 1600, "n_layers": 48, "n_heads": 25},
}

model_name = "gpt2-medium (355M)"
NEW_CONFIG = GPT_CONFIG_355M.copy()
NEW_CONFIG.update(model_configs[model_name])
NEW_CONFIG.update({"context_length":1024})
NEW_CONFIG.update({"qkv_bias": True})

In [13]:
tokenizer = tiktoken.get_encoding("gpt2")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
gpt = GPTModel(NEW_CONFIG)
gpt.eval()

load_weights_into_gpt(gpt, params)
gpt.to(device)

GPTModel(
  (tok_emb): Embedding(50257, 1024)
  (pos_emb): Embedding(1024, 1024)
  (drop_emb): Dropout(p=0.1, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=1024, out_features=1024, bias=True)
        (W_key): Linear(in_features=1024, out_features=1024, bias=True)
        (W_value): Linear(in_features=1024, out_features=1024, bias=True)
        (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=1024, out_features=4096, bias=True)
          (1): GELU()
          (2): Linear(in_features=4096, out_features=1024, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_shortcut): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(i

### Data Loading and Preparation

In [15]:
with open("data/full_feature_data.csv","r") as f:
    datalist = f.readlines()

In [16]:
featurelist = (datalist[0].replace('_', ' ').replace('num', 'number of')
               .replace('avg', 'average').replace('7d', ' 7 days,')
               .replace('30d', ' 30 days,').replace('90d', ' 90 days,')
               .replace('events week', 'events per week').split(','))
print(featurelist)

['userId', 'user churned', 'average events per weekend', 'average events per weekday', 'number of songs played  7 days', '', 'number of ads  7 days', '', 'number of error  7 days', '', 'number of songs played  30 days', '', 'number of songs played  90 days', '', 'number of sessions', 'average time per session', 'average events per session', 'average gap between session', 'number of events', 'number of songs', 'number of artists', 'number of thumbs down', 'number of thumbs up', 'number of add to playlist', 'number of ads', 'number of add friend', 'number of downgrade', 'number of upgrade', 'number of error', 'percentage ad', 'days since active', 'repeats ratio\n']


In [17]:
input_dict = {}
output_dict = {}
for datarow in datalist[1:]:
    features = datarow.split(',')
    input_value = ''
    for i in range(len(features[2:])):
        input_value += featurelist[2 + i] + ' is ' + features[2+i] + ". "
    input_dict[features[0]] = input_value
    output_dict[features[0]] = "the probability the user churned is "+features[1]
#for key in output_dict:
#    print(key, output_dict[key])
aws_churn_dataset = list(zip(input_dict.values(), output_dict.values()))

In [30]:
with open ("data/aws_churn_features.csv","w") as g:
    g.write("input")
    for key in input_dict:
        g.write(input_dict[key])
with open ("data/aws_churn_targets.csv","w") as g:
    g.write("target")
    for key in output_dict:
        g.write(output_dict[key])

In [38]:
import pandas as pd
df = pd.DataFrame(output_dict.values())
print(df.value_counts())

0                                    
the probability the user churned is 0    670
the probability the user churned is 1    344
Name: count, dtype: int64


In [None]:
with open("data/aws_churn_dataet.csv", "w") as g:
        g.write("input,target\n")
        for key in input_dict:
            g.write(input_dict[key][:-3] + ", " + output_dict[key]+"\n") 