# Step 4
In this notebook, we will:

1. Split all the speeches we have cleaned into groups of three sentences
2. Load those groups into a Huggingface Dataset
3. Use that dataset to train a language model on text-generation


**DataFrame --> Sentences**

In [3]:
import pandas as pd

# The cleaned speeches file has to be uploaded to the working directory first
CLEAN_SPEECHES_FILE = "krishnamurti_clean.json"
df = pd.read_json(CLEAN_SPEECHES_FILE)

# Show the first row as a quick sanity check of the load
df.head(1)

Unnamed: 0,Text source,Talk Type,Participants Category,Decade,Date Code,Country,City,text,Participants
0,AV,Talk,Public,60s,660302,India,Bombay (Mumbai),This is the last talk of this year. I think th...,


In [4]:
# Concatenate every entry of the dataframe's `text` column into one long string.
# Each entry is followed by a single space so neighbouring talks do not run together.
super_long_string = "".join(talk + " " for talk in df["text"])


In [5]:
%%capture
# Evaluate the combined string while %%capture swallows the cell output,
# so the very long text is not rendered in the notebook.
super_long_string

In [6]:
# Collapse every run of whitespace into a single space (and trim both ends),
# then report the string length before and after.
length_before = len(super_long_string)
super_long_string = " ".join(super_long_string.split())
length_after = len(super_long_string)

print(f"Length of string with whitespace: {length_before}")
print(f"Length of string without whitespace: {length_after}")

Length of string with whitespace: 9799617
Length of string without whitespace: 9742660


In [7]:
# Split the combined text into a list of individual sentences
# This might take long to execute: ~ 15 minutes
import nltk
from nltk.tokenize import sent_tokenize

nltk.download('punkt')  # tokenizer data fetched before calling sent_tokenize

sentences = sent_tokenize(super_long_string)

# Preview the first four sentences
sentences[:4]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


['This is the last talk of this year.',
 "I think the more one observes the world's condition, the more it becomes clear that there must be a totally different kind of action.",
 'One sees in the world - including in India - the confusion, the great sorrow, the misery, the starvation, the general decline.',
 "One is aware of this, one knows it from newspapers, reading magazines, books, but it remains on the intellectual level because we don't seem to be able to do anything about it."]

In [8]:
# Grouping sentences into triples
def _group_sentences(sentence_list, group_size=3):
    """Join consecutive sentences into groups of ``group_size``.

    Every sentence is followed by a single space, so each group ends with a
    trailing space. When the number of sentences is not a multiple of
    ``group_size`` the final, shorter group is kept.

    Args:
        sentence_list: Sequence of sentence strings.
        group_size: Number of sentences per group.

    Returns:
        List of strings, one per group, in the original order.
    """
    return [
        "".join(sentence + " " for sentence in sentence_list[start:start + group_size])
        for start in range(0, len(sentence_list), group_size)
    ]


# Slicing in fixed steps needs no separate "leftover" bookkeeping. Tracking the
# unfinished group by hand is easy to get wrong: a stale leftover drops the last
# sentence and duplicates the previous triple when len(sentences) % 3 == 1.
triple_sentences = _group_sentences(sentences)

###### **Sentences --> Dataset**

In [9]:
# Split the sentence groups 90/10: the first 90% for training, the rest for validation
import pandas as pd

percent90 = int(.9 * len(triple_sentences))
training_sentences, validation_sentences = triple_sentences[:percent90], triple_sentences[percent90:]

# One single-column ("text") DataFrame per split
tset, vset = (
    pd.DataFrame(split_sentences, columns=["text"])
    for split_sentences in (training_sentences, validation_sentences)
)


In [None]:
# Display the validation DataFrame (the bare last expression is rendered as the cell output)
vset

In [10]:
# Total number of characters across all validation strings.
# len() of a string counts characters, so this measures characters rather than words.
count = sum(len(text) for text in vset["text"])

In [11]:
# Average number of characters per validation string.
# vset has a single column, so vset.size (rows x columns) equals its number of rows.
print(f"The average length of eachs string is: {count/vset.size}")

The average length of eachs string is: 186.41294473883423


**Converting DataFrames into Huggingface Datasets**

In [12]:
%%capture
# Installing Huggingface Transformers Library (ignoring pip dependecy error)
! pip install datasets "transformers[sentencepiece]"

In [13]:
from datasets import Dataset, DatasetDict

# Wrap each pandas split in a `datasets.Dataset`
split1_ds = Dataset.from_pandas(tset)
split2_ds = Dataset.from_pandas(vset)

# Bundle both datasets into one DatasetDict keyed by split name
splits = {"train": split1_ds, "validation": split2_ds}
ddict = DatasetDict(splits)

In [14]:
%%capture
# Here are three random samples from my dataset:
from datasets import ClassLabel
import random
import pandas as pd
from IPython.display import display, HTML
from datasets import load_dataset

datasets = load_dataset("eliwill/Krishnamurti")

picks = []
for _ in range(5):
    pick = random.randint(0, len(datasets["train"])-1)
    while pick in picks:
        pick = random.randint(0, len(datasets["train"])-1)
    picks.append(pick)

df = pd.DataFrame(datasets["train"][picks])
for column, typ in datasets["train"].features.items():
    if isinstance(typ, ClassLabel):
        df[column] = df[column].transform(lambda i: typ.names[i])
display(HTML(df.to_html()))

In [None]:
# Pushing to huggingface
# Log in through the CLI (it prompts for an access token), then upload the
# train/validation splits held in `ddict` to the named Hub repository.
! huggingface-cli login 
ddict.push_to_hub("eliwill/krishnamurti_discussions")


        _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
        _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
        _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
        _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
        _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

        To login, `huggingface_hub` now requires a token generated from https://huggingface.co/settings/tokens .
        
Token: 

###### **Dataset to language model**

In this section I follow the [Huggingface Causal Language Model Tutorial](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling-tf.ipynb#scrollTo=NyZvu_MF3l_P) to create a Krishnamurti text-generation model.

In [None]:
# Logging into Huggingface
from huggingface_hub import notebook_login
notebook_login()

# Installing dependencies
!apt install git-lfs
import transformers

# Let's check out an example of validation dataset
from datasets import load_dataset
datasets = load_dataset("eliwill/krishnamurti_discussions")
datasets["validation"][10]

Login successful
Your token has been saved to /root/.huggingface/token
[1m[31mAuthenticated through git-credential store but this isn't the helper defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub. Run the following command in your terminal in case you want to set this credential helper as the default

git config --global credential.helper store[0m


In [None]:
# Here we define the checkpoint we will be using and set up the trainer
%%capture
model_checkpoint = "distilgpt2"
from transformers import AutoTokenizer
    
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)
def tokenize_function(examples):
    """Run the module-level ``tokenizer`` over the ``"text"`` entry of ``examples``."""
    texts = examples["text"]
    return tokenizer(texts)
    
# Tokenize the dataset in batches on 4 worker processes, dropping the raw "text" column
tokenized_datasets = datasets.map(
    tokenize_function,
    batched=True,
    num_proc=4,
    remove_columns=["text"],
)

# Length of the training blocks, taken from the tokenizer's `model_max_length`
block_size = tokenizer.model_max_length

def group_texts(examples, chunk_size=None):
    """Concatenate tokenized examples and cut them into equal-length blocks.

    Args:
        examples: Mapping from feature name (must include ``"input_ids"``) to a
            list of token lists, one per example in the batch.
        chunk_size: Length of every returned block. Defaults to the
            module-level ``block_size``.

    Returns:
        A dict with the same keys as ``examples``, each holding a list of
        ``chunk_size``-long blocks, plus a ``"labels"`` entry that is a
        (shallow) copy of the ``"input_ids"`` blocks. Tokens left over after
        the last full block are dropped.
    """
    from itertools import chain

    if chunk_size is None:
        chunk_size = block_size

    # Concatenate all texts. chain.from_iterable flattens in linear time, whereas
    # sum(lists, []) re-copies the accumulated list on every addition.
    concatenated_examples = {k: list(chain.from_iterable(examples[k])) for k in examples.keys()}
    # The length of the first feature is used as the total length for every feature.
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
    # customize this part to your needs.
    total_length = (total_length // chunk_size) * chunk_size
    # Split by chunks of chunk_size.
    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_examples.items()
    }
    # For causal language modelling the labels are the inputs themselves.
    result["labels"] = result["input_ids"].copy()
    return result

from transformers import AutoModelForCausalLM, Trainer, TrainingArguments

# Regroup the tokenized splits into fixed-size blocks, 1000 examples per batch on 4 workers
lm_datasets = tokenized_datasets.map(group_texts, batched=True, batch_size=1000, num_proc=4)

# Loading pretrained weights of distilGPT2
model = AutoModelForCausalLM.from_pretrained(model_checkpoint)

# The output directory is named after the last "/"-separated part of the checkpoint name
model_name = model_checkpoint.split("/")[-1]
output_dir = f"{model_name}-a-to-a-1.0"
training_args = TrainingArguments(
    output_dir,
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    push_to_hub=True,
)

# Trainer wired to the grouped train / validation splits
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_datasets["train"],
    eval_dataset=lm_datasets["validation"],
)

In [None]:
# Training model
# Runs training on `trainer`, which must already have been built from the model,
# the training arguments and the grouped train/validation datasets.
trainer.train()

***** Running training *****
  Num examples = 1975
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 741


KeyboardInterrupt: ignored

In [2]:
# Push to Huggingface Hub
# NOTE(review): the recorded run of this cell (execution count 2) ended in a
# NameError, presumably because `trainer` had not been created in that kernel
# session - run the setup and training cells first.
trainer.push_to_hub()

NameError: ignored