## Testing data splits

In [5]:
import os
import json
import numpy as np

# Root data directory, relative to the notebook's working directory. Kept as a
# plain string with a trailing slash because later cells build file paths by
# string concatenation (e.g. data_path + "splits/train.jsonl").
data_path = "../../data/"

# Load IPython's autoreload extension and switch it to mode 2 before the project
# import below, so edits to llm.llm_utils are (presumably) picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

from llm.llm_utils import (
    split_and_save_data,
    load_data_as_lists
)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# save data splits
# raw_data_path = data_path + "raw/generator=base~dataset=p1.00.jsonl"
# split_and_save_data(raw_data_path)

In [3]:
# Load the train/val/test splits from their JSONL files.
split_paths = dict(
    train=data_path + "splits/train.jsonl",
    val=data_path + "splits/val.jsonl",
    test=data_path + "splits/test.jsonl",
)
train_data, val_data, test_data = (
    load_data_as_lists(split_paths[name]) for name in ("train", "val", "test")
)

# Show the first two entries of each of the first two lists the loader returned.
# Original note: 0 corresponds to human, 1 corresponds to machine
# (presumably these are the label values -- TODO confirm against load_data_as_lists).
for list_idx in (0, 1):
    print(train_data[list_idx][:2])

["Patrons crowd the platform at the Washington Metropolitan Area Transit Authority's (WMATA) Metro Center stop in Washington, D.C. on Dec. 20, 2004. Thousands use the public transit system daily to get them in and around the D.C. area. (Karen Bleier/AFP/Getty Images)\nUS Senators Threaten Metro Funding Over Chinese Manufacturer\nWASHINGTON —Federal lawmakers say they’ll approve badly needed funding for Washington’s transit system, but only if it avoids buying new rail cars from China.\nThe Washington Post reported on April 13 that U.S. Senators from Virginia and Maryland proposed the idea in new legislation. It reflects growing concerns that China’s state-owned rail company could hurt American manufacturers and make the system vulnerable to cyber espionage.\nDave Smolensky, spokesman for the China Railway Rolling Stock Corp, dismissed the espionage concerns. The company also said the United States should be promoting competition.\nThe company has won four major U.S. rail car contracts.

In [4]:
# Report the number of samples per split (length of the first list returned by
# the loader), printed in the order train, test, val.
for split_name, split_lists in (
    ("train", train_data),
    ("test", test_data),
    ("val", val_data),
):
    print(f"Num of {split_name} samples: {len(split_lists[0])}")
# NOTE: the test split (12k) is larger than the train split (10k) -- unexpected;
# worth re-checking how the splits were generated.

Num of train samples: 10000
Num of test samples: 12000
Num of val samples: 3000


## Testing Dataset functionality

In [5]:
from datasets import load_dataset
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [6]:
# Map each split name to its JSONL file and let `datasets` load all three at once.
data_files = dict(
    train=data_path + "splits/train.jsonl",
    val=data_path + "splits/val.jsonl",
    test=data_path + "splits/test.jsonl",
)
grover_dataset = load_dataset("json", data_files=data_files)

In [7]:
# Peek at the first training record to see which fields each row carries.
grover_dataset["train"][0]

{'article': "Patrons crowd the platform at the Washington Metropolitan Area Transit Authority's (WMATA) Metro Center stop in Washington, D.C. on Dec. 20, 2004. Thousands use the public transit system daily to get them in and around the D.C. area. (Karen Bleier/AFP/Getty Images)\nUS Senators Threaten Metro Funding Over Chinese Manufacturer\nWASHINGTON —Federal lawmakers say they’ll approve badly needed funding for Washington’s transit system, but only if it avoids buying new rail cars from China.\nThe Washington Post reported on April 13 that U.S. Senators from Virginia and Maryland proposed the idea in new legislation. It reflects growing concerns that China’s state-owned rail company could hurt American manufacturers and make the system vulnerable to cyber espionage.\nDave Smolensky, spokesman for the China Railway Rolling Stock Corp, dismissed the espionage concerns. The company also said the United States should be promoting competition.\nThe company has won four major U.S. rail car

In [None]:
# Article length analysis: word count (whitespace-separated tokens) of every
# article, collected per split.
# The 'article' column is read once per split instead of indexing row by row:
# grover_dataset[split][i] builds the complete row (all columns) on every
# access, which is needlessly slow for tens of thousands of rows.
lens = {"train": [], "val": [], "test": []}

for split in grover_dataset.keys():
    lens[split].extend(
        len(text.split()) for text in grover_dataset[split]["article"]
    )

# Ten longest articles (in words) for each split; the commented-out histograms
# can be enabled to look at the full distributions.
print(sorted(lens["train"], reverse=True)[:10])
# plt.hist(lens['train'], bins=50)
print(sorted(lens["val"], reverse=True)[:10])
# plt.hist(lens['val'], bins=50)
print(sorted(lens["test"], reverse=True)[:10])
# plt.hist(lens['test'], bins=50)

#### The longest article seems to be 13k words!


In [9]:
# Class-balance check: count how many rows carry each label in every split.
# Dataset.to_pandas() is the library's own conversion to a DataFrame and avoids
# handing the Dataset object to the generic pd.DataFrame constructor, which has
# to walk it row by row.
label_counts = {}
for split in grover_dataset.keys():
    df = grover_dataset[split].to_pandas()
    label_counts[split] = df['label'].value_counts()
print(label_counts)

# Observation: train is balanced (5000/5000), but val and test are imbalanced,
# each holding twice as many human as machine samples.

{'train': label
human      5000
machine    5000
Name: count, dtype: int64, 'val': label
human      2000
machine    1000
Name: count, dtype: int64, 'test': label
human      8000
machine    4000
Name: count, dtype: int64}


## Exploring the model (?)

In [10]:
import os

# Point the Hugging Face cache at the project-local directory *before* importing
# transformers, consistent with the preprocessing section further down. The
# Hugging Face libraries read HF_HOME when they are imported, so setting it
# afterwards does not change their default cache location.
# NOTE(review): if another Hugging Face library (e.g. datasets) was already
# imported in this kernel the variable may have been read earlier still; the
# explicit cache_dir passed to from_pretrained() covers that case.
os.environ['HF_HOME'] = '../../models/hf_cache/'

import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
)
import bitsandbytes as bnb
import accelerate
from transformers.utils import is_accelerate_available, is_bitsandbytes_available

# Environment check: is accelerate usable, is bitsandbytes usable, is a CUDA
# device visible to torch?
print(is_accelerate_available())
print(is_bitsandbytes_available())
print(torch.cuda.is_available())
# is_bitsandbytes_available() only returns True if GPU is available

The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.


True
False
False


In [11]:
# Show the working directory that the relative paths used in this notebook
# (data_path, HF_HOME) are resolved against.
os.getcwd()

'/storage/ice1/4/1/dmishra45/CS-7641-Project/notebooks/llm'

In [12]:
# Checkpoint to work with, and the hub cache folder located under HF_HOME.
MODEL_NAME = "meta-llama/Llama-2-7b-hf"
hub_cache_dir = os.environ['HF_HOME'] + "hub/"

# 4-bit NF4 quantization settings with double quantization and bfloat16 compute.
# The config is only constructed here; the model load that would consume it is
# commented out below.
quant_options = {
    "load_in_4bit": True,
    "bnb_4bit_use_double_quant": True,
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_compute_dtype": torch.bfloat16,
}
bnb_config = BitsAndBytesConfig(**quant_options)

# Model loading is currently disabled:
# model = AutoModelForCausalLM.from_pretrained(
#     MODEL_NAME,
#     # quantization_config=bnb_config,
#     cache_dir = os.environ['HF_HOME'] + "hub/",
# )

# Fetch the tokenizer that belongs to the checkpoint and reuse its end-of-sequence
# token as the padding token.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, cache_dir=hub_cache_dir)
tokenizer.pad_token = tokenizer.eos_token

tokenizer_config.json:   0%|          | 0.00/776 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [13]:
# Reference only: the module tree that is printed for the model after it has
# been successfully loaded with bitsandbytes (bnb). Kept here as a plain string
# literal; nothing below reads it.
"""
LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
  )
  (lm_head): Linear(in_features=4096, out_features=32000, bias=False)
)
"""

## Preprocessing the dataset

In [3]:
import os
os.environ['HF_HOME'] = '../../models/hf_cache/'

from functools import partial
from datasets import load_dataset
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from llm.llm_utils import (
    create_prompt,
    tokenize_text
)
from transformers import (
    AutoTokenizer,
)

# Root data directory, relative to the notebook's working directory; later cells
# append "splits/<name>.jsonl" to it.
data_path = "../../data/"
# Hugging Face checkpoint whose tokenizer is loaded below.
MODEL_NAME = "meta-llama/Llama-2-7b-hf"
# Token limit that is handed to the tokenization step as max_length.
MAX_TOKENS = 4096

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
# Split name -> JSONL file; all three splits are loaded into one DatasetDict.
data_files = dict(
    train=data_path + "splits/train.jsonl",
    val=data_path + "splits/val.jsonl",
    test=data_path + "splits/test.jsonl",
)
dataset = load_dataset("json", data_files=data_files)

In [5]:
# load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    cache_dir = os.environ['HF_HOME'] + "hub/",
)
tokenizer.pad_token = tokenizer.eos_token

In [6]:
# Inspect the loaded DatasetDict: split names, column names and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['article', 'domain', 'title', 'date', 'authors', 'ind30k', 'url', 'label', 'orig_split', 'split', 'random_score', 'top_p'],
        num_rows: 10000
    })
    val: Dataset({
        features: ['article', 'domain', 'title', 'date', 'authors', 'ind30k', 'url', 'label', 'orig_split', 'split', 'random_score', 'top_p'],
        num_rows: 3000
    })
    test: Dataset({
        features: ['article', 'domain', 'title', 'date', 'authors', 'ind30k', 'url', 'label', 'orig_split', 'split', 'random_score', 'top_p'],
        num_rows: 12000
    })
})

In [7]:
# Apply create_prompt to every training row; judging by the columns present
# afterwards, it adds a 'text' field holding the prompt.
# NOTE(review): this overwrites dataset["train"] in place, so re-running the cell
# after the raw columns have been dropped further down will presumably fail --
# re-run from the load cell in that case.
dataset["train"] = dataset["train"].map(create_prompt)
# doesn't take long, ~15s

In [8]:
# Print the prompt text of the second training example to check the template.
print(dataset["train"][1]['text'])

### Instructions:
Your task is to classify an excerpt from a news article as being human-generated or machine-generated. If it was machine-generated, respond with 'machine', else respond with 'human'.

### Input:
Classify the following news article excerpt as being human-generated or machine-generated:

On April 12, I wrote "With lows of nearly $45 a barrel, historically undisturbed deep discounting," as to why companies were trading off a window of opportunity with the lowest valuation they had seen in years.
High valuations
I am not an expert, but I would like to illustrate that many valuation metrics are insufficient to evaluate oil's current $45+ per barrel valuation.
The supply story is probably over-hanging at first, as it appears the black hole in cap market equity is only around $70 #2. This means if we were to seek weekly inventory data, that number would spiral downward to $70 #4. This is gradual, and if this $60 to $70 levels is not expected to be sustained as a matter of co

In [9]:
# Keep only the 'label' and 'text' columns and drop every other column.
# The columns to drop are derived from what is currently present instead of
# being hard-coded, so re-running this cell is a no-op rather than an error
# about columns that no longer exist.
KEEP_COLUMNS = ("label", "text")
extra_columns = [
    column for column in dataset["train"].column_names if column not in KEEP_COLUMNS
]
if extra_columns:
    dataset["train"] = dataset["train"].remove_columns(extra_columns)

In [10]:
# Confirm that only the 'label' and 'text' columns remain in the training split.
dataset["train"]

Dataset({
    features: ['label', 'text'],
    num_rows: 10000
})

In [11]:
# what happens when we tokenize stuff?

In [12]:
# Tokenize a single prompt to inspect the raw tokenizer output. The length limit
# uses the MAX_TOKENS constant defined with the other settings of this section
# instead of repeating the literal 4096; truncation=True cuts longer inputs.
sample_tokens = tokenizer(dataset["train"][1]["text"], max_length=MAX_TOKENS, truncation=True)
print(sample_tokens['input_ids'])
print(len(sample_tokens['input_ids']))
# Tail of the prompt: do the last ids correspond to "\nmachine" rather than "\n machine"?
print(sample_tokens['input_ids'][-10:])

[1, 835, 2799, 582, 1953, 29901, 13, 10858, 3414, 338, 304, 770, 1598, 385, 429, 2265, 415, 515, 263, 9763, 4274, 408, 1641, 5199, 29899, 13525, 470, 4933, 29899, 13525, 29889, 960, 372, 471, 4933, 29899, 13525, 29892, 10049, 411, 525, 23523, 742, 1683, 10049, 411, 525, 26029, 4286, 13, 13, 2277, 29937, 10567, 29901, 13, 2385, 1598, 278, 1494, 9763, 4274, 429, 2265, 415, 408, 1641, 5199, 29899, 13525, 470, 4933, 29899, 13525, 29901, 13, 13, 2951, 3786, 29871, 29896, 29906, 29892, 306, 5456, 376, 3047, 301, 1242, 310, 8886, 395, 29946, 29945, 263, 2594, 2674, 29892, 3603, 1711, 563, 391, 28179, 6483, 2313, 792, 292, 1699, 408, 304, 2020, 14582, 892, 3534, 292, 1283, 263, 3474, 310, 15130, 411, 278, 19604, 17134, 362, 896, 750, 3595, 297, 2440, 29889, 13, 16382, 17134, 800, 13, 29902, 626, 451, 385, 17924, 29892, 541, 306, 723, 763, 304, 28475, 393, 1784, 17134, 362, 21556, 526, 1663, 29884, 4543, 304, 14707, 17182, 29915, 29879, 1857, 395, 29946, 29945, 29974, 639, 2594, 2674, 17134, 36

In [13]:
# Probe how the tokenizer encodes the label words and the "### End" marker in
# different surrounding contexts (leading newline, leading space, trailing newline).
probe_strings = [
    "###",
    "### End",
    "\n### End",
    "\n### End\n",
    "\nmachine\n### End\n",
    "machine",
    "\nmachine",
    "\nhuman\n### End\n",
    "human",
    "\n machine\n### End\n",
]
for probe in probe_strings:
    print(tokenizer(probe))

{'input_ids': [1, 835], 'attention_mask': [1, 1]}
{'input_ids': [1, 835, 2796], 'attention_mask': [1, 1, 1]}
{'input_ids': [1, 29871, 13, 2277, 29937, 2796], 'attention_mask': [1, 1, 1, 1, 1, 1]}
{'input_ids': [1, 29871, 13, 2277, 29937, 2796, 13], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}
{'input_ids': [1, 29871, 13, 23523, 13, 2277, 29937, 2796, 13], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}
{'input_ids': [1, 4933], 'attention_mask': [1, 1]}
{'input_ids': [1, 29871, 13, 23523], 'attention_mask': [1, 1, 1, 1]}
{'input_ids': [1, 29871, 13, 26029, 13, 2277, 29937, 2796, 13], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}
{'input_ids': [1, 5199], 'attention_mask': [1, 1]}
{'input_ids': [1, 29871, 13, 4933, 13, 2277, 29937, 2796, 13], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [14]:
# So the tokenizer output is not padded up to 4096 tokens (max_length only truncates here); the remainder will perhaps be padded during training.

In [15]:
# tokenize the entire dataset
# NOTE(review): MAX_TOKENS is re-assigned here with the same value it was given
# in the constants cell at the top of this section.
MAX_TOKENS = 4096
# Bind the tokenizer and the length limit so that map() can call the helper with
# the example as its only remaining argument (presumably one row dict --
# confirm against tokenize_text).
_preproc_func = partial(tokenize_text, tokenizer=tokenizer, max_length = MAX_TOKENS)
dataset["train"] = dataset["train"].map(_preproc_func)

# takes a while... ~1-1.5 min for dataset["train"]
# same if run with batched=True (batching does happen though)

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [16]:
# Check which columns the tokenization step added to the training split.
dataset["train"]

Dataset({
    features: ['label', 'text', 'input_ids', 'attention_mask'],
    num_rows: 10000
})