# Large Langage Model Automation example

Elsa Bidant

April 2025

## 1. LLM package install and import

In [2]:
!pip install torch transformers accelerate bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cusolver-cu12==11.6.1.9 (from torch)
  Downloading nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cusparse-cu12==12.3.1.170 (from torch)
  Downloading nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_x86_64.whl.metadata (1.6

In [5]:
!pip install --upgrade transformers



In [6]:
from transformers import AutoModelForCausalLM, AutoTokenizer, LlamaTokenizer
import torch

## 2. Model loading

In [None]:
model_name = "unsloth/phi-4-unsloth-bnb-4bit"

tokenizer = AutoTokenizer.from_pretrained(model_name)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto"
)

tokenizer_config.json:   0%|          | 0.00/18.0k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.61M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/917k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.15M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.56k [00:00<?, ?B/s]

2025-05-01 17:46:33.464390: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746121593.722932      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746121593.794178      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


model.safetensors.index.json:   0%|          | 0.00/160k [00:00<?, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model-00001-of-00003.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.39G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/1.03G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/170 [00:00<?, ?B/s]

## 3. Text generation

In [8]:
# Wrap llm generation into a function
def generation(prompt) :
  model.generation_config.pad_token_id = tokenizer.pad_token_id

  messages = [
      {"role": "user", "content": prompt}
  ]
  input_tensor = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt")
  outputs = model.generate(input_tensor.to(model.device), max_new_tokens = 1000)

  result = tokenizer.decode(outputs[0][input_tensor.shape[1]:], skip_special_tokens=True)

  return result

In [9]:
# Test
prompt = "what's up bro?"
response = generation(prompt)
print(response)

Hey there! I'm just here to help with any questions or information you might need. How can I assist you today? üòä


## 4. Data import

In [10]:
import pandas as pd

df_rap = pd.read_csv('/kaggle/input/corpus/corpus.csv')
df_poe = pd.read_csv('/kaggle/input/corpus/corpus_poetica_cleaned.csv')

df_poe_random = pd.read_csv('/kaggle/input/corpus/df_poetry_annotated.csv') #import the dataset I previously did with Google Colab

## 5. Multiple text generation for poetry

## 5.1 Poetry and figures of speech

In [11]:
# Creating a function to extract the responses and the figure of speech found

import ast
import re

def extract_summary_from_python_block(text):
    try:
        # Cherche un bloc Python entre ```python ... ```
        match = re.search(r"```python\s+({.*?})\s+```", text, flags=re.DOTALL)
        if not match:
            print("No Python block found")
            return {}

        python_dict_str = match.group(1)

        # On √©value le dictionnaire Python de mani√®re s√©curis√©e
        data = ast.literal_eval(python_dict_str)

        summary = {}
        figures = data.get("figures_of_speech", {})
        for name, info in figures.items():
            summary[name.strip().lower()] = info.get("occurrences", 0)
        return summary

    except Exception as e:
        print("Parsing error :", e)
        return {}

In [12]:
# Randomly choosing 150 poems to create an annotated dataset for poetry
df_poe_random = df_poe.sample(n = 150, random_state = 42)
display(df_poe_random)

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,title,author,date,text,url
2618,A la Belgique,Emile Verhaeren,,"H√©las, depuis les jours des supr√™mes combats, ...",https://www.poetica.fr/poeme-1945/emile-verhae...
346,Le D√©sespoir de la vieille,Charles Baudelaire,1869.0,La petite vieille ratatin√©e se sentit toute r√©...,https://www.poetica.fr/poeme-1444/charles-baud...
170,La Souris,Guillaume Apollinaire,1911.0,"Belles journ√©es, souris du temps, Vous rongez...",https://www.poetica.fr/poeme-124/guillaume-apo...
354,Le Reniement de saint Pierre,Charles Baudelaire,1857.0,Qu‚Äôest-ce que Dieu fait donc de ce flot d‚Äôanat...,https://www.poetica.fr/poeme-3308/charles-baud...
1558,L‚Äôisolement,Alphonse de Lamartine,,"Souvent sur la montagne, √† l‚Äôombre du vieux ch...",https://www.poetica.fr/poeme-482/alphonse-de-l...
...,...,...,...,...,...
1292,Autre guitare,Victor Hugo,,"Comment, disaient-ils, Avec nos nacelles, Fu...",https://www.poetica.fr/poeme-888/victor-hugo-a...
2168,Age d‚Äôor,Arthur Rimbaud,1930.0,"Ilona Singer, Une fille avec une plante en po...",https://www.poetica.fr/poeme-624/arthur-rimbau...
932,Polaire,Susy Desrosiers,,neige perp√©tuelle horizon blanc sur blanc ...,https://www.poetica.fr/poeme-7937/susy-desrosi...
270,L‚ÄôAutomne,Th√©odore de Banville,,"Sois le bienvenu, rouge Automne, Accours dans...",https://www.poetica.fr/poeme-264/theodore-de-b...


In [13]:
# Annotate the poem for their figure of speech
annotation = list()
for _, poem in df_poe_random.iterrows():
  prompt = f"""Here is the poem '{poem['title']}' : '{poem['text']}
Analyze the following poem and identify all the figures of speech present in the text.
For each figure of speech, do the following:

1. Quote the relevant part of the poem.
2. Name the figure of speech.
3. Briefly explain why it is an example of that figure.

At the end of the analysis, provide a summary listing:

1. Each figure of speech identified (by name),
2. How many times it appears in the poem.

Format the output as a Python dictionary with the following structure:
{{
  "figures_of_speech": {{
    "figure_name_1": {{
      "occurrences": number_of_occurrences,
      "examples": [
        ("relevant_part_of_the_poem_1", "brief_explanation_1"),
        ("relevant_part_of_the_poem_2", "brief_explanation_2"),
        ...
      ]
    }},
    "figure_name_2": {{
      "occurrences": number_of_occurrences,
      "examples": [
        ("relevant_part_of_the_poem_1", "brief_explanation_1"),
        ...
      ]
    }},
    ...
  }},
  "total_figures_of_speech": total_number
}}

Be thorough and precise. Include both common (e.g., metaphor, simile, alliteration) and less common figures (e.g., anaphora, chiasmus, hypallage)."""

  annotation.append(generation(prompt))

In [14]:
df_poe_random['annotation'] = annotation

In [None]:
df_poe_random["summary_dict"] = df_poe_random["annotation"].apply(extract_summary_from_python_block)

# Extract the figures of speech found
all_figures = set()
for summary in df_poe_random["summary_dict"]:
    all_figures.update(summary.keys())

# One column by figures
for figure in all_figures:
    df_poe_random[figure] = df_poe_random["summary_dict"].apply(lambda d: d.get(figure, 0))

# Column "total_figures"
df_poe_random["total_figures"] = df_poe_random[list(all_figures)].sum(axis = 1)

No Python block found
No Python block found
Parsing error : malformed node or string on line 12: <ast.Call object at 0x7d9a95375840>
Parsing error : '{' was never closed (<unknown>, line 1)


In [16]:
df_poe_random.to_csv("/kaggle/working/df_poe_annotated.csv", index = False)
df_poe_random

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,title,author,date,text,url,annotation,summary_dict,chiasmus,contrast,irony,...,imperative,hypallage,juxtaposition,alliteration,hypothetical condition,simile,apostrophe,repetition,ellipsis,total_figures
2618,A la Belgique,Emile Verhaeren,,"H√©las, depuis les jours des supr√™mes combats, ...",https://www.poetica.fr/poeme-1945/emile-verhae...,"Here's an analysis of the poem ""A la Belgique""...","{'metaphor': 4, 'personification': 3, 'alliter...",0,0,0,...,0,0,0,2,0,0,0,0,0,17
346,Le D√©sespoir de la vieille,Charles Baudelaire,1869.0,La petite vieille ratatin√©e se sentit toute r√©...,https://www.poetica.fr/poeme-1444/charles-baud...,"Here's an analysis of the poem ""Le D√©sespoir d...","{'simile': 1, 'metaphor': 1, 'personification'...",0,0,1,...,0,0,0,0,0,1,0,0,0,6
170,La Souris,Guillaume Apollinaire,1911.0,"Belles journ√©es, souris du temps, Vous rongez...",https://www.poetica.fr/poeme-124/guillaume-apo...,"To analyze the poem ""La Souris"" by Guillaume A...","{'metaphor': 1, 'personification': 1, 'hyperbo...",0,0,0,...,0,0,0,1,0,0,1,0,0,6
354,Le Reniement de saint Pierre,Charles Baudelaire,1857.0,Qu‚Äôest-ce que Dieu fait donc de ce flot d‚Äôanat...,https://www.poetica.fr/poeme-3308/charles-baud...,"Here's an analysis of the poem ""Le Reniement d...","{'metaphor': 3, 'simile': 1, 'alliteration': 2...",0,0,1,...,0,0,0,2,0,1,0,0,0,12
1558,L‚Äôisolement,Alphonse de Lamartine,,"Souvent sur la montagne, √† l‚Äôombre du vieux ch...",https://www.poetica.fr/poeme-482/alphonse-de-l...,"```python\n{\n ""figures_of_speech"": {\n ""M...","{'metaphor': 3, 'simile': 2, 'personification'...",1,0,0,...,0,0,0,2,0,2,0,0,0,12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1292,Autre guitare,Victor Hugo,,"Comment, disaient-ils, Avec nos nacelles, Fu...",https://www.poetica.fr/poeme-888/victor-hugo-a...,"To analyze the poem ""Autre guitare"" by Victor ...","{'repetition': 3, 'anaphora': 3, 'antithesis':...",0,0,0,...,0,0,0,0,0,0,0,3,0,12
2168,Age d‚Äôor,Arthur Rimbaud,1930.0,"Ilona Singer, Une fille avec une plante en po...",https://www.poetica.fr/poeme-624/arthur-rimbau...,"Here's an analysis of the poem ""Age d‚Äôor"" by A...","{'metaphor': 3, 'personification': 2, 'alliter...",1,0,0,...,0,0,0,2,0,0,0,0,0,11
932,Polaire,Susy Desrosiers,,neige perp√©tuelle horizon blanc sur blanc ...,https://www.poetica.fr/poeme-7937/susy-desrosi...,"Here's an analysis of the poem ""Polaire"" by Su...","{'metaphor': 2, 'personification': 1, 'alliter...",0,0,0,...,0,0,0,2,0,0,0,0,0,12
270,L‚ÄôAutomne,Th√©odore de Banville,,"Sois le bienvenu, rouge Automne, Accours dans...",https://www.poetica.fr/poeme-264/theodore-de-b...,"Here's an analysis of the poem ""L‚ÄôAutomne"" by ...","{'anaphora': 3, 'personification': 2, 'metapho...",0,0,0,...,0,0,0,2,0,0,0,0,0,11


## 5.2. Rap and figures of speech

In [None]:
# Randomly choosing 150 rap to create an annotated dataset for rap
df_rap_random = df_rap.sample(n = 150, random_state = 42)
display(df_rap_random)

In [None]:
# Annotate the rap for their figure of speech

annotation = list()
for _, rap in df_rap_random.iterrows():
  prompt = f"""Here is the poem '{rap['title']}' : '{rap['lyrics']}
Analyze the following poem and identify all the figures of speech present in the text.
For each figure of speech, do the following:

1. Quote the relevant part of the poem.
2. Name the figure of speech.
3. Briefly explain why it is an example of that figure.

At the end of the analysis, provide a summary listing:

1. Each figure of speech identified (by name),
2. How many times it appears in the poem.

Format the output as a Python dictionary with the following structure:
{{
  "figures_of_speech": {{
    "figure_name_1": {{
      "occurrences": number_of_occurrences,
      "examples": [
        ("relevant_part_of_the_poem_1", "brief_explanation_1"),
        ("relevant_part_of_the_poem_2", "brief_explanation_2"),
        ...
      ]
    }},
    "figure_name_2": {{
      "occurrences": number_of_occurrences,
      "examples": [
        ("relevant_part_of_the_poem_1", "brief_explanation_1"),
        ...
      ]
    }},
    ...
  }},
  "total_figures_of_speech": total_number
}}

Be thorough and precise. Include both common (e.g., metaphor, simile, alliteration) and less common figures (e.g., anaphora, chiasmus, hypallage)."""

  annotation.append(generation(prompt))

In [None]:
df_rap_random['annotation'] = annotation

In [None]:
df_rap_random

In [None]:
df_rap_random["summary_dict"] = df_rap_random["annotation"].apply(extract_summary_from_python_block)

# Extract the figures of speech found
all_figures = set()
for summary in df_rap_random["summary_dict"]:
    all_figures.update(summary.keys())

# One column by figure
for figure in all_figures:
    df_rap_random[figure] = df_rap_random["summary_dict"].apply(lambda d: d.get(figure, 0))

# Column "total_figures"
df_rap_random["total_figures"] = df_rap_random[list(all_figures)].sum(axis = 1)


In [None]:
df_rap_random.to_csv("/kaggle/working/df_rap_annotated.csv", index = False)
df_rap_random