In [1]:
import glob
import os
import time
from os.path import isfile

import pandas as pd
import tiktoken
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_openai import ChatOpenAI
from tqdm.notebook import tqdm

In [2]:
# Prefer the OPENAI_API_KEY environment variable so the real secret never has to be
# typed into (and saved with) the notebook. The placeholder is only a fallback.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "YOUR API KEY GOES HERE")
# Chat model used for the classification calls.
OPENAI_API_MODEL = 'gpt-3.5-turbo-0125'

In [3]:
# Turn off pandas' SettingWithCopyWarning (the option's default is 'warn')
pd.set_option("mode.chained_assignment", None)

In [4]:
# Prompt template for the tagging chain. The only template variable is {text},
# which receives the speech to classify. The instructions ask for one or more
# topic tags plus the language of the source text.
tagging_prompt = ChatPromptTemplate.from_template(
    """
    You are a multilingual specialist that classifies speeches in parliament.

    You will read texts that come in many different languages.
    
    Then you will classify it with a word or short phrase, considering topics that are
    usually covered in the EU Parliament.
    
    Some example topics would be: 'parliament procedures', 'elections', 'climate', 'war',
    'technology', 'innovation', 'finance', 'migration', 'industry', 'democracy'. This list is not
    exhaustive.
    
    Notice that one speech might include more than one topic. If needed, classify it over more than
    one tag as well.
        
    You will also note down in which language the source text was.

    Return the output in the format specified in the Classification class provided.
    
    An example input would be:
    
    ```A União Europeia precisa de mais esforços para combater as mudanças climáticas, que
    estão afetando todo o planeta. O clima está mudando. E a guerra na Ucrânia segue sendo um problema.
    Precisamos parar com a guerra.```
    
    And the example output would be:
    
    topic: climate, war
    language: portuguese

    The text for classification is below:

    ---
    {text}.
    """
)

# Schema for the structured LLM output: one string for the topic tag(s) and one for
# the detected language. Kept as a comment rather than a docstring so nothing extra
# is added to the class definition itself.
class Classification(BaseModel):
    # Topic tag(s) assigned to the speech
    topic: str = Field(description="The main topic of the text")
    # Language of the source text
    language: str = Field(description="The language the text is written in")

# LLM: build the chat model first, then wrap it with with_structured_output so
# responses are shaped by the Classification schema
llm = ChatOpenAI(
    temperature=0.1,
    model=OPENAI_API_MODEL,
    openai_api_key=OPENAI_API_KEY,
)
llm = llm.with_structured_output(Classification)

# Prompt piped into the structured-output model
tagging_chain = tagging_prompt | llm
    
    

In [16]:
# Counts how many tokens we will have on each row
def count_tokens(text, model="gpt-3.5-turbo"):
    """Return how many tokens `text` occupies under the tokenizer of `model`.

    Parameters
    ----------
    text : str
        Text to measure. Empty strings and non-string values (None, or the
        float NaN pandas uses for missing cells) count as 0 tokens.
    model : str, optional
        Model name handed to ``tiktoken.encoding_for_model``.

    Returns
    -------
    int
        Number of tokens in the encoded text.
    """
    # Missing cells reach .apply as NaN/None; encode() only accepts strings
    if not isinstance(text, str) or not text:
        return 0

    # Look up the tokenizer that matches the requested model
    encoding = tiktoken.encoding_for_model(model)

    # disallowed_special=() makes text that happens to spell a special token
    # (e.g. "<|endoftext|>") be encoded as ordinary text instead of raising.
    return len(encoding.encode(text, disallowed_special=()))

In [17]:
# Splits the given DataFrame into chunks of specified size.
def split_dataframe(df, chunk_size=30):
    """Cut `df` into consecutive slices holding at most `chunk_size` rows each.

    Returns a list of DataFrame slices in their original order; the last one
    may be shorter. An empty frame yields an empty list.
    """
    pieces = []
    total_rows = df.shape[0]
    for start in range(0, total_rows, chunk_size):
        stop = start + chunk_size
        pieces.append(df[start:stop])
    return pieces

In [18]:
# Read all the chunks and concat.
# sorted(): glob returns paths in no guaranteed order, and the row order of df decides
# which rows end up in each numbered output chunk later in the notebook, so the order
# must be the same on every run for the on-disk chunk cache to stay valid.
chunk_files = sorted(glob.glob("../output/lang-detected/chunks/*.csv"))
if not chunk_files:
    # pd.concat on an empty list fails with an unhelpful "No objects to concatenate"
    raise FileNotFoundError("No CSV chunks found in ../output/lang-detected/chunks/")
df = pd.concat([pd.read_csv(f) for f in chunk_files])
# Keep only the last legislature entries. This is a plain string comparison, so it
# assumes `date` holds ISO-formatted (YYYY-MM-DD) strings.
df = df[df.date >= "2019-06-02"]

In [19]:
# Adds a per-speech token count column
df['token_count'] = df['speech'].map(count_tokens)

In [20]:
# Two speeches run past 16k tokens and would be truncated, so they are
# set aside in `long_entries` and dropped from `df`.
is_too_long = df["token_count"] > 16000
long_entries = df.loc[is_too_long].copy()
df = df.loc[df["token_count"] <= 16000]

In [21]:
# Show the speeches that exceeded the token limit and were removed from df
display(long_entries)

Unnamed: 0,level_0,index,speech,speaker_id,subject,speaker_span,fname,date,term,year,speech_length_in_characters,token_count
104132,119658,195644,"Νίκος Χριστοδουλίδης, Πρόεδρος της Κυπριακής Δ...","generic photo, parse from text",4. This is Europe - Debate with the President ...,"Νίκος Χριστοδουλίδης,",../output/csvs/parsed-9-2023-06-13.csv,2023-06-13,9,2023,29933.0,18308
171302,197600,341284,"Κυριάκος Μητσοτάκης, Πρωθυπουργός της Ελλάδας....","generic photo, parse from text",4. This is Europe - Debate with the Prime Mini...,"Κυριάκος Μητσοτάκης,",../output/csvs/parsed-9-2022-07-05.csv,2022-07-05,9,2022,22952.0,20313


In [16]:
# Splits the dataframe
chunk_size = 100
dfs = split_dataframe(df, chunk_size=chunk_size)

# Create the output folder up front, so a missing directory cannot make the
# first to_csv fail after the API calls for that chunk were already made.
os.makedirs("../output/classified-llm", exist_ok=True)

for i, subset in enumerate(tqdm(dfs)):

    fname = f"../output/classified-llm/{chunk_size}_{i+1}.csv"
    # Chunks already saved by a previous run are skipped, which makes the loop resumable
    if isfile(fname):
        continue

    # Work on an independent copy so the new columns are not assigned on a slice of df
    subset = subset.copy()

    # could also depend on the amount of tokens in the dataframe.
    results = tagging_chain.batch([{'text': text} for text in subset['speech']],
                          config={"max_concurrency": 70})

    # Saves the topic and language classification back in the dataframe
    subset['topic'] = [result.topic for result in results]
    subset['language'] = [result.language for result in results]

    # Saves it as CSV. Write under a temporary name and rename afterwards, so an
    # interrupted write never leaves a truncated file that isfile() would treat as done.
    tmp_fname = fname + ".tmp"
    subset.to_csv(tmp_fname)
    os.replace(tmp_fname, fname)

    # Pause for 30 s whenever the chunk index is a multiple of 20 (index 0 included).
    # NOTE(review): presumably this is to stay under the API rate limit; confirm.
    if i % 20 == 0:
        time.sleep(30)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=594.0), HTML(value='')))


