## Testing data splits

In [43]:
import os
import json
import numpy as np

data_path = "../../data/"

%load_ext autoreload
%autoreload 2

from llm.llm_utils import (
    split_and_save_data,
    load_data_as_lists
)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [46]:
# Split the raw data for one (generator, p) configuration and save the splits.
generator = "base"
p = "1.00"
raw_data_path = f"{data_path}raw/"
split_and_save_data(raw_data_path, generator, p)

In [3]:
# Read the three split files (train, val, test — in that order) back in.
train_data, val_data, test_data = (
    load_data_as_lists(f"{data_path}splits/{split_name}.jsonl")
    for split_name in ("train", "val", "test")
)

# Peek at the first two entries of each of the first two lists.
print(train_data[0][:2])
print(train_data[1][:2])  # 0 corresponds to human, 1 corresponds to machine

["Patrons crowd the platform at the Washington Metropolitan Area Transit Authority's (WMATA) Metro Center stop in Washington, D.C. on Dec. 20, 2004. Thousands use the public transit system daily to get them in and around the D.C. area. (Karen Bleier/AFP/Getty Images)\nUS Senators Threaten Metro Funding Over Chinese Manufacturer\nWASHINGTON —Federal lawmakers say they’ll approve badly needed funding for Washington’s transit system, but only if it avoids buying new rail cars from China.\nThe Washington Post reported on April 13 that U.S. Senators from Virginia and Maryland proposed the idea in new legislation. It reflects growing concerns that China’s state-owned rail company could hurt American manufacturers and make the system vulnerable to cyber espionage.\nDave Smolensky, spokesman for the China Railway Rolling Stock Corp, dismissed the espionage concerns. The company also said the United States should be promoting competition.\nThe company has won four major U.S. rail car contracts.

In [4]:
# Report the number of samples per split (length of the first list of each).
for split_name, split_data in (
    ("train", train_data),
    ("test", test_data),
    ("val", val_data),
):
    print(f"Num of {split_name} samples: {len(split_data[0])}")
# TODO: the test split has 12k samples, more than train — find out why.

Num of train samples: 10000
Num of test samples: 12000
Num of val samples: 3000


## Testing Dataset functionality

In [3]:
from datasets import load_dataset
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

data_path = "../../data/"

%load_ext autoreload
%autoreload 2

In [4]:
# Choose the (generator, p) configuration; its split files live in a
# directory named after that pair.
generator = "mega"
p = "0.94"
partial_path = f"{data_path}splits/generator={generator}~dataset=p{p}/"
data_files = dict(
    train=partial_path + "train.jsonl",  # 10k
    val=partial_path + "val.jsonl",  # 3k
    test=partial_path + "test.jsonl",  # 12k, won't need test data
)
grover_dataset = load_dataset("json", data_files=data_files)

In [5]:
# Inspect the first record of the training split.
grover_dataset["train"][0]

{'article': "Patrons crowd the platform at the Washington Metropolitan Area Transit Authority's (WMATA) Metro Center stop in Washington, D.C. on Dec. 20, 2004. Thousands use the public transit system daily to get them in and around the D.C. area. (Karen Bleier/AFP/Getty Images)\nUS Senators Threaten Metro Funding Over Chinese Manufacturer\nWASHINGTON —Federal lawmakers say they’ll approve badly needed funding for Washington’s transit system, but only if it avoids buying new rail cars from China.\nThe Washington Post reported on April 13 that U.S. Senators from Virginia and Maryland proposed the idea in new legislation. It reflects growing concerns that China’s state-owned rail company could hurt American manufacturers and make the system vulnerable to cyber espionage.\nDave Smolensky, spokesman for the China Railway Rolling Stock Corp, dismissed the espionage concerns. The company also said the United States should be promoting competition.\nThe company has won four major U.S. rail car

In [9]:
# Print the labels of the first three training rows.
# Slicing a Dataset returns a dict of column lists, so the manual counter and
# early `break` are not needed.
for label in grover_dataset["train"][:3]["label"]:
    print(label)

human
machine
human


In [None]:
# Article length analysis: whitespace-separated word count of every article,
# grouped by split.
# The "article" column is read directly; indexing row by row would fetch every
# column of every row just to look at one field. The keys come from the dataset
# itself, so a differently named split cannot cause a KeyError.
lens = {
    split: [len(text.split()) for text in grover_dataset[split]["article"]]
    for split in grover_dataset.keys()
}

# Show the ten largest word counts of each split.
for split, word_counts in lens.items():
    print(sorted(word_counts, reverse=True)[:10])
    # plt.hist(word_counts, bins=50)

#### The longest article seems to be 13k words!


In [None]:
# Class-balance check: count the labels in each split.
# Only the "label" column is read; converting a whole split (articles included)
# into a DataFrame just to count one column is unnecessary work.
label_counts = {
    split: pd.Series(list(grover_dataset[split]["label"]), name="label").value_counts()
    for split in grover_dataset.keys()
}
print(label_counts)

# Observation: train is balanced, but val and test are imbalanced
# (twice as many human as machine samples).

{'train': label
human      5000
machine    5000
Name: count, dtype: int64, 'val': label
human      2000
machine    1000
Name: count, dtype: int64, 'test': label
human      8000
machine    4000
Name: count, dtype: int64}


## Exploring the model (?)

In [10]:
# HF_HOME must be set before any Hugging Face library is imported: those
# libraries resolve their cache locations at import time, so assigning the
# variable afterwards does not move the default cache.
# NOTE(review): if a Hugging Face library was already imported earlier in this
# kernel, this is still too late — the explicit cache_dir arguments used when
# loading remain the reliable route.
import os
os.environ['HF_HOME'] = '../../models/hf_cache/'

import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
)
import bitsandbytes as bnb
import accelerate

from transformers.utils import is_accelerate_available, is_bitsandbytes_available

# Environment sanity checks: accelerate, bitsandbytes and CUDA availability.
print(is_accelerate_available())
print(is_bitsandbytes_available())
print(torch.cuda.is_available())
# is_bitsandbytes_available() only returns True if GPU is available

The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.


True
False
False


In [11]:
# Show the current working directory; the relative paths used in this notebook
# ("../../data/", "../../models/hf_cache/") are resolved against it.
os.getcwd()

'/storage/ice1/4/1/dmishra45/CS-7641-Project/notebooks/llm'

In [12]:
# Checkpoint used for both the (currently disabled) model and the tokenizer.
MODEL_NAME = "meta-llama/Llama-2-7b-hf"

# 4-bit NF4 quantization settings with double quantization and bfloat16 compute.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

# Model loading is disabled for now:
# model = AutoModelForCausalLM.from_pretrained(
#     MODEL_NAME,
#     # quantization_config=bnb_config,
#     cache_dir = os.environ['HF_HOME'] + "hub/",
# )

# Tokenizer for the same checkpoint, cached under the "hub/" folder of HF_HOME.
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    cache_dir=f"{os.environ['HF_HOME']}hub/",
)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

tokenizer_config.json:   0%|          | 0.00/776 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [13]:
# For reference: this is the output of printing the model
# after it has been successfully loaded with bitsandbytes (bnb).
"""
LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
  )
  (lm_head): Linear(in_features=4096, out_features=32000, bias=False)
)
"""

## Preprocessing the dataset

In [57]:
import os
os.environ['HF_HOME'] = '../../models/hf_cache/'

from functools import partial
from datasets import load_dataset
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from llm.llm_utils import (
    create_prompt,
    tokenize_text
)
from transformers import (
    AutoTokenizer,
)

data_path = "../../data/"
MODEL_NAME = "meta-llama/Llama-2-7b-hf"
MAX_TOKENS = 4096

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [58]:
# Choose the (generator, p) configuration to preprocess; its split files live
# in a directory named after that pair.
generator = "base"
p = "1.00"
partial_path = f"{data_path}splits/generator={generator}~dataset=p{p}/"
data_files = dict(
    train=partial_path + "train.jsonl",  # 10k
    val=partial_path + "val.jsonl",  # 3k
    test=partial_path + "test.jsonl",  # 12k, won't need test data
)
# NOTE(review): the same files are loaded twice under two names; only `dataset`
# is used in the cells that follow — confirm whether `grover_dataset` is still
# needed here.
grover_dataset = load_dataset("json", data_files=data_files)
dataset = load_dataset("json", data_files=data_files)

In [59]:
# Load the tokenizer for MODEL_NAME, caching it under the "hub/" folder of HF_HOME.
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    cache_dir=f"{os.environ['HF_HOME']}hub/",
)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

In [60]:
# Inspect the loaded splits: column names and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['article', 'domain', 'title', 'date', 'authors', 'ind30k', 'url', 'label', 'orig_split', 'split', 'random_score', 'top_p'],
        num_rows: 10000
    })
    val: Dataset({
        features: ['article', 'domain', 'title', 'date', 'authors', 'ind30k', 'url', 'label', 'orig_split', 'split', 'random_score', 'top_p'],
        num_rows: 3000
    })
    test: Dataset({
        features: ['article', 'domain', 'title', 'date', 'authors', 'ind30k', 'url', 'label', 'orig_split', 'split', 'random_score', 'top_p'],
        num_rows: 12000
    })
})

In [61]:
# Add the prompt ("text") column to the splits used downstream.
# The following cells read dataset["train"][...]["text"], so mapping only "val"
# made them depend on an earlier, out-of-order run that had processed "train".
for split in ("train", "val"):
    dataset[split] = dataset[split].map(create_prompt)
# the "val" split (3k rows) doesn't take long, ~15s

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

In [62]:
# Show the prompt text generated for one validation example.
print(dataset["val"][1]['text'])

<s>[INST] <<SYS>>
Your task is to classify an excerpt from a news article as being human-generated or machine-generated. If it was machine-generated, respond with 'machine', else respond with 'human'. Respond with exactly one of "machine" or "human". The excerpt has been provided below.. 
<</SYS>>

Screenshot by Katie Conner/CBS Interactive
Knowing your credit score is an important part of your financial picture. You'll need a credit check for loans and major purchases, like down payments on a new home or vehicle, and when opening some new credit cards. Being involved in your credit is another way to verify that your identity hasn't been stolen. Note that looking into your credit score will not affect your credit.
With the new Apple Card coming out this summer, it may be time to do a checkup on your credit score before you apply. You need to be a well-qualified customer to be approved for the new credit card. Apple hasn't exactly explained what that means, but knowing your standing is 

In [20]:
# Keep only the columns needed downstream: the label and the generated prompt.
# The drop list is derived from the columns currently present, so re-running
# the cell is a no-op; the hard-coded list raised once the columns were gone.
keep_columns = {"label", "text"}
drop_columns = [
    col for col in dataset["train"].column_names if col not in keep_columns
]
if drop_columns:
    dataset["train"] = dataset["train"].remove_columns(drop_columns)

In [21]:
# Check which columns remain in the training split.
dataset["train"]

Dataset({
    features: ['label', 'text'],
    num_rows: 10000
})

In [22]:
# What does the tokenizer output look like for a single prompt?

In [23]:
# Tokenize one training prompt, truncating at the shared MAX_TOKENS limit
# (instead of repeating the literal 4096).
sample_tokens = tokenizer(dataset["train"][1]["text"], max_length=MAX_TOKENS, truncation=True)
print(sample_tokens['input_ids'])
print(len(sample_tokens['input_ids']))
# The last ten ids seem to encode "\nmachine" rather than "\n machine" — TODO confirm.
print(sample_tokens['input_ids'][-10:])

[1, 835, 2799, 582, 1953, 29901, 13, 10858, 3414, 338, 304, 770, 1598, 385, 429, 2265, 415, 515, 263, 9763, 4274, 408, 1641, 5199, 29899, 13525, 470, 4933, 29899, 13525, 29889, 960, 372, 471, 4933, 29899, 13525, 29892, 10049, 411, 525, 23523, 742, 1683, 10049, 411, 525, 26029, 4286, 13, 13, 2277, 29937, 10567, 29901, 13, 2385, 1598, 278, 1494, 9763, 4274, 429, 2265, 415, 408, 1641, 5199, 29899, 13525, 470, 4933, 29899, 13525, 29901, 13, 13, 2951, 3786, 29871, 29896, 29906, 29892, 306, 5456, 376, 3047, 301, 1242, 310, 8886, 395, 29946, 29945, 263, 2594, 2674, 29892, 3603, 1711, 563, 391, 28179, 6483, 2313, 792, 292, 1699, 408, 304, 2020, 14582, 892, 3534, 292, 1283, 263, 3474, 310, 15130, 411, 278, 19604, 17134, 362, 896, 750, 3595, 297, 2440, 29889, 13, 16382, 17134, 800, 13, 29902, 626, 451, 385, 17924, 29892, 541, 306, 723, 763, 304, 28475, 393, 1784, 17134, 362, 21556, 526, 1663, 29884, 4543, 304, 14707, 17182, 29915, 29879, 1857, 395, 29946, 29945, 29974, 639, 2594, 2674, 17134, 36

In [24]:
# Probe how the tokenizer encodes the response markers and the label words,
# with and without surrounding newlines/spaces.
for probe in (
    "###",
    "### End",
    "\n### End",
    "\n### End\n",
    "\nmachine\n### End\n",
    "machine",
    "\nmachine",
    "\nhuman\n### End\n",
    "human",
    "\n machine\n### End\n",
):
    print(tokenizer(probe))

{'input_ids': [1, 835], 'attention_mask': [1, 1]}
{'input_ids': [1, 835, 2796], 'attention_mask': [1, 1, 1]}
{'input_ids': [1, 29871, 13, 2277, 29937, 2796], 'attention_mask': [1, 1, 1, 1, 1, 1]}
{'input_ids': [1, 29871, 13, 2277, 29937, 2796, 13], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}
{'input_ids': [1, 29871, 13, 23523, 13, 2277, 29937, 2796, 13], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}
{'input_ids': [1, 4933], 'attention_mask': [1, 1]}
{'input_ids': [1, 29871, 13, 23523], 'attention_mask': [1, 1, 1, 1]}
{'input_ids': [1, 29871, 13, 26029, 13, 2277, 29937, 2796, 13], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}
{'input_ids': [1, 5199], 'attention_mask': [1, 1]}
{'input_ids': [1, 29871, 13, 4933, 13, 2277, 29937, 2796, 13], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [25]:
# So the tokenizer does not pad the output up to 4096 tokens; presumably the rest is padded during training.

In [15]:
# Tokenize the training split (only dataset["train"] is processed here).
# tokenize_text is bound to the tokenizer and the MAX_TOKENS limit via
# functools.partial so that it can be passed to Dataset.map.
MAX_TOKENS = 4096
_preproc_func = partial(tokenize_text, tokenizer=tokenizer, max_length = MAX_TOKENS)
dataset["train"] = dataset["train"].map(_preproc_func)

# Timing note: takes a while, ~1-1.5 min for dataset["train"];
# about the same if run with batched=True (batching does happen though).

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [16]:
# Check that the tokenizer columns were added to the training split.
dataset["train"]

Dataset({
    features: ['label', 'text', 'input_ids', 'attention_mask'],
    num_rows: 10000
})

In [34]:
# Tokenize the first training prompt on its own, returning PyTorch tensors.
prompt = dataset["train"][0]["text"]
# NOTE(review): `input` shadows the Python builtin of the same name; it is kept
# because the next cell reads `input.input_ids`.
input = tokenizer(prompt, return_tensors="pt")

In [37]:
# Size of the second dimension of the encoded prompt — presumably the number of
# tokens, assuming input_ids has shape (1, seq_len); TODO confirm.
input.input_ids.shape[1]

405

In [10]:
test_str = """### Instructions:
Your task is to classify an excerpt from a news article as being human-generated or machine-generated. If it was machine-generated, respond with 'machine', else respond with 'human'.

### Input:
Classify the following news article excerpt as being human-generated or machine-generated:

The weather this winter has certainly been an excusable cause for comment, but why are we so obsessed with it?
By Lizzy Ioannidou
This newspaper offers its readers the full gamut of political scandals, collapse of banks and the tortuous attempts to implement a national health scheme, but there’s nothing like a two-paragraph story on the weather to shake up reactions.
Why does the weather fascinate us so much?
A quick scroll down the Cyprus Mail’s Facebook page reveals that Cyprus problem articles and those on the demise of the Co-op bank, for example, receive a respectful number of likes and some bleak, cynical comments.
But barring the story announcing that Nicholas Cage would be coming to shoot a movie on our Hollywood-star-deprived island – which garnered 935 reactions and 522 shares on Facebook – it’s the weather stories that never fail to perform, even when it’s just the same news all over again.
On April 7, a headline focusing on persisting dust in the atmosphere – hardly news for the island – gathered 83 reactions and 63 shares. On April 5, a small article on rainy days ahead – hardly earth-shattering news this entire winter – accumulated 83 reactions and 63 shares.
We’d be lying if we said that the weather this winter is undeserving of discussion. We’re traversing an extraordinarily rainy and long winter season, the likes of which the island has seen only once since the beginning of the 20th century, and forecasts show that it isn’t going away any time soon.
Plus, if the local legend about the inevitability of dark rainy days during the week of Easter comes through – apparently the planet mourns with us on the days leading up to Christ’s crucifixion – then expect almost identical weather stories to follow until the end of the month.
But reader interactions to news dealing with the consequences of the extraordinary weather, such as flooded roads and homes or damaged crops, pale in comparison to news simply announcing that scattered showers might affect a particular day, indicating that our fascination is located in the phenomenon of weather itself.
According to Yiannos Ioannou, a weather enthusiast equipped with a meteorological observation station for the past ten years and a blog platform for sharing his findings, our weather craze might be heightened this year, but it is nothing new.
“Due to the prominence and impact that the weather had in the lives of grandfathers and great-grandfathers, when there were no back-up plans for bad weather years – when it didn’t rain there was simply no water – perhaps our interest in the weather may even be rooted in our DNA,” he suggests.
Today’s enthusiasm might be an inherited residue of that age-old anxiety, but at the centre of it is curiosity, according to Ioannou.
“When I was a child, I was curious as to why there was sun one moment and clouds the next, but some clouds wouldn’t bring rain and some would, and I then learnt that only with a specific wind direction would it rain,” Ioannou said.
Curiosity was also the main stimulus for Feelix Eer, also a weather enthusiast. “From a young age I remember running to the rooftops of buildings in my neighbourhood to watch the incoming storms…Have you ever been in the middle of an electric storm? Have you ever felt the ground shake under your feet by the shockwave created? Feels like an earthquake. I guess extreme weather has always fascinated me since I was a child.”
Growing up, learning about all kinds of clouds and weather patterns, discovering observation tools for what is going on or what might happen, “things started making more sense,” Feelix said.
While a vast majority of us are receptive to the triviality of small talk, indirectly ascribing to Oscar Wilde’s belief that conversation about the weather is the last refuge of the unimaginative, we still do it religiously.
After all, who would we be as Cypriots if we didn’t complain whenever it rained during years of drought because we had just washed our cars, asking the heavens “to just make their minds up”?
Part of our current enthusiasm with the weather may also be because normally our weather is predictable.
“The mainly dry weather of our island makes any event even more exciting,” said Feelix.
Variability, and the claim that a single day’s weather may contain all four seasons may be what made the Brits renowned for their weather-craze, but the Cypriot love-affair with weather seems to be rooted in its loyalty to sameness, which makes every odd fluctuation word-class news.
Another explanation for our close ties with the weather is that it is one of the few relatively indisputable experiences we share, and one of the items on an extremely small list that we agree to be beyond our grasp, even though we have imposed ourselves in this arena too through global warming, bringing adverse effects to the weather.
Feelix, also a farmer for the past five years, and therefore highly dependent on the mood of the weather, said that what in fact draws him the most to the weather is “the life rain gives”.
“Whatever goes on on the other side of the planet will be affecting you here, [reminding us] how fragile and how interconnected we all are.”
Meaningless as weather conversation may appear, as with most things, there’s more to it than meets the eye. Feelix described the wonders of the unique climate of the Troodos mountains, where it could rain all day even though – or due to – the extreme heat of lower altitudes. Or who’s ever seen a phenomenon called ‘lake-effect snow’ or a ‘sun dog’ or an ‘omega block’, and recognised it? Feelix has.
But beyond the weather being a source of external wonders, it also central to internal and personal processes, such as our mood, our identity and how we live our lives.
“It can affect your day, or night. Some want to know if they can go out tomorrow. Others just want to sit at home and watch the rain with a hot coffee mug in their hands,” Feelix said.
Weather affects how we plan and go about our days, and the mental state in which we do so. Beyond more extreme symptoms associated with seasonal affective disorder (SAD), sunlight has been found to improve memory function and to affect optimism and energy levels for the better; cloudy and rainy weather enhances our ability to focus.
“There’s a general happiness during good weather years, and a general depression during bad years, regardless of our own personal circumstances or jobs,” said Ioannou.
Some even went as far as to explain the European debt crisis of 2009 in terms of the climate, whereby the countries that were hit the hardest – Greece, Portugal, Spain, Cyprus – were hot countries, and therefore were composed of a lazy population, which led to an unmanageable debt crisis. Go figure.
In some ways, despite the attempt to pick apart our fascination with the weather, beneath it all is the positive acknowledgement that above all else, and as much as we try to separate ourselves from it, we are still highly connected to and dependent on nature.
So, if it’s weather updates you want, it’s weather updates you’ll get.
Temperatures are expected to rise to above average for the time of the year during the weekend, with light dust expected to emerge on Saturday clearing away by Sunday.
Be sure to enjoy the good weather today as come Monday forecasts show a return of scattered showers and isolated thunderstorms, with temperatures dropping to below average.
.

### Response:

    human"""

In [12]:
# Keep only the text after the last "### Response:" marker
# (the whole string if the marker is absent).
res = test_str.rpartition("### Response:")[2]

In [13]:
# Check whether the response part mentions "human".
# NOTE(review): this is a substring test, so any response text containing
# "human" matches — confirm that is strict enough for evaluation.
"human" in res

True