In [11]:
import pandas as pd
import sqlite3

# Connect to the SQLite database
conn = sqlite3.connect('gdpr_fines.db')


query = "SELECT * FROM fines WHERE country = 'spain'"  
df_fines = pd.read_sql_query(query, conn)

# Close the connection
conn.close()

# Display the DataFrame
df_fines.head()

Unnamed: 0,id,country,flag_url,authority,date,fine_eur,company,sectors,gdpr_articles,violation,summary,verdict_link,case_url
0,ETid-2737,spain,./flags/flag_spain.png,Spanish Data Protection Authority (aepd),2025-05-23,60000,"AIRE NETWORKS DEL MEDITERRÃNEO, S.L.","Media, Telecoms and Broadcasting",Art. 5 (1) f) GDPR,Insufficient technical and organisational meas...,"The Spanish DPA imposed a fine of EUR 60,000 o...",https://www.aepd.es/documento/ps-00025-2025.pdf,https://www.enforcementtracker.com/ETid-2737
1,ETid-2736,spain,./flags/flag_spain.png,Spanish Data Protection Authority (aepd),2025-05-21,200000,"TELEFÓNICA MÓVILES ESPAÑA, S.A.","Media, Telecoms and Broadcasting",Art. 6 (1) GDPR,Insufficient legal basis for data processing,"The Spanish DPA imposed a fine of EUR 200,000 ...",https://www.aepd.es/documento/ps-00159-2024.pdf,https://www.enforcementtracker.com/ETid-2736
2,ETid-2735,spain,./flags/flag_spain.png,Spanish Data Protection Authority (aepd),2025-05-20,360,RED ESPAÑOLA DE IDENTIFICACIÓN DE ANIMALES DE ...,Individuals and Private Associations,Art. 58 (1) GDPR,Insufficient cooperation with supervisory auth...,The Spanish DPA imposed a fine of EUR 360 on R...,https://www.aepd.es/documento/ps-00212-2025.pdf,https://www.enforcementtracker.com/ETid-2735
3,ETid-2734,spain,./flags/flag_spain.png,Spanish Data Protection Authority (aepd),2025-05-19,200000,"ASNEF-EQUIFAX, SERVICIOS DE INFORMACIÓN SOBRE ...","Finance, Insurance and Consulting","Art. 6 (1) GDPR, Art. 17 GDPR",Insufficient legal basis for data processing,"The Spanish DPA imposed a fine of EUR 200,000 ...",https://www.aepd.es/documento/ps-00157-2024.pdf,https://www.enforcementtracker.com/ETid-2734
4,ETid-2733,spain,./flags/flag_spain.png,Spanish Data Protection Authority (aepd),2025-05-16,30000,ATRESMEDIA CORPORACIÓN DE MEDIOS DE COMUNICACI...,"Media, Telecoms and Broadcasting",Art. 5 (1) c) GDPR,Non-compliance with general data processing pr...,"The Spanish DPA imposed a fine of EUR 30,000 o...",https://www.aepd.es/documento/ps-00175-2024.pdf,https://www.enforcementtracker.com/ETid-2733


In [None]:
from openai import OpenAI

from pydantic import BaseModel

from dotenv import load_dotenv
import os

load_dotenv()

openai = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))


class RevenueInfo(BaseModel):
    annual_revenue_in_euro: int


def get_revenue(row):
    try:
        company_name = row['company']
        sector = row['sectors']
        year = row['date'].split('-')[0]

        # Create a prompt for the AI to search for company revenue
        prompt = f"""
        Please search for the annual revenue of the company "{company_name}" in the {sector} sector for the year {year} or the most recent available data.
        If applicable, please use zoominfo.com as a datasource as they have comprehensive financial data.
        Please provide the annual revenue in Euros. If the revenue is reported in another currency, please convert it to Euros using historical exchange rates for that year.
        """

        # Call OpenAI with web search capabilities
        response = openai.responses.parse(
            model="gpt-4o-2024-08-06",
            tools=[{
                "type": "web_search_preview",
                "search_context_size": "low",
            }],
            input=[
                {"role": "system", "content": "You are a helpful assistant that can search the web for company financial information. Please provide accurate revenue data based on your search results."},
                {"role": "user", "content": prompt}
            ],
            text_format=RevenueInfo,
        )
        revenue_data = response.output_parsed.annual_revenue_in_euro
        return revenue_data

    except Exception as e:
        print(f"Error getting revenue for {row['company']}: {e}")
        return None


print(
    f"Company: {df_fines.iloc[0]['company']}, Revenue: {get_revenue(df_fines.iloc[0])}")

Company: AIRE NETWORKS DEL MEDITERRÃNEO, S.L., Revenue: 107746223


In [35]:
from pandarallel import pandarallel

pandarallel.initialize(progress_bar=True, nb_workers=16)
print("Getting revenue data for companies (this may take a while)...")
df_fines['revenue'] = df_fines.parallel_apply(get_revenue, axis=1)

INFO: Pandarallel will run on 16 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.
Getting revenue data for companies (this may take a while)...


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=62), Label(value='0 / 62'))), HBox…

In [None]:
df_fines.head()


Unnamed: 0,id,country,flag_url,authority,date,fine_eur,company,sectors,gdpr_articles,violation,summary,verdict_link,case_url,revenue
0,ETid-2737,spain,./flags/flag_spain.png,Spanish Data Protection Authority (aepd),2025-05-23,60000,"AIRE NETWORKS DEL MEDITERRÃNEO, S.L.","Media, Telecoms and Broadcasting",Art. 5 (1) f) GDPR,Insufficient technical and organisational meas...,"The Spanish DPA imposed a fine of EUR 60,000 o...",https://www.aepd.es/documento/ps-00025-2025.pdf,https://www.enforcementtracker.com/ETid-2737,107800000
1,ETid-2736,spain,./flags/flag_spain.png,Spanish Data Protection Authority (aepd),2025-05-21,200000,"TELEFÓNICA MÓVILES ESPAÑA, S.A.","Media, Telecoms and Broadcasting",Art. 6 (1) GDPR,Insufficient legal basis for data processing,"The Spanish DPA imposed a fine of EUR 200,000 ...",https://www.aepd.es/documento/ps-00159-2024.pdf,https://www.enforcementtracker.com/ETid-2736,41315000000
2,ETid-2735,spain,./flags/flag_spain.png,Spanish Data Protection Authority (aepd),2025-05-20,360,RED ESPAÑOLA DE IDENTIFICACIÓN DE ANIMALES DE ...,Individuals and Private Associations,Art. 58 (1) GDPR,Insufficient cooperation with supervisory auth...,The Spanish DPA imposed a fine of EUR 360 on R...,https://www.aepd.es/documento/ps-00212-2025.pdf,https://www.enforcementtracker.com/ETid-2735,0
3,ETid-2734,spain,./flags/flag_spain.png,Spanish Data Protection Authority (aepd),2025-05-19,200000,"ASNEF-EQUIFAX, SERVICIOS DE INFORMACIÓN SOBRE ...","Finance, Insurance and Consulting","Art. 6 (1) GDPR, Art. 17 GDPR",Insufficient legal basis for data processing,"The Spanish DPA imposed a fine of EUR 200,000 ...",https://www.aepd.es/documento/ps-00157-2024.pdf,https://www.enforcementtracker.com/ETid-2734,31085753
4,ETid-2733,spain,./flags/flag_spain.png,Spanish Data Protection Authority (aepd),2025-05-16,30000,ATRESMEDIA CORPORACIÓN DE MEDIOS DE COMUNICACI...,"Media, Telecoms and Broadcasting",Art. 5 (1) c) GDPR,Non-compliance with general data processing pr...,"The Spanish DPA imposed a fine of EUR 30,000 o...",https://www.aepd.es/documento/ps-00175-2024.pdf,https://www.enforcementtracker.com/ETid-2733,918950000


In [37]:
df_fines['fine_eur'] = df_fines['fine_eur'].str.replace(',', '').astype(int)

In [38]:
df_fines.head()

Unnamed: 0,id,country,flag_url,authority,date,fine_eur,company,sectors,gdpr_articles,violation,summary,verdict_link,case_url,revenue
0,ETid-2737,spain,./flags/flag_spain.png,Spanish Data Protection Authority (aepd),2025-05-23,60000,"AIRE NETWORKS DEL MEDITERRÃNEO, S.L.","Media, Telecoms and Broadcasting",Art. 5 (1) f) GDPR,Insufficient technical and organisational meas...,"The Spanish DPA imposed a fine of EUR 60,000 o...",https://www.aepd.es/documento/ps-00025-2025.pdf,https://www.enforcementtracker.com/ETid-2737,107800000
1,ETid-2736,spain,./flags/flag_spain.png,Spanish Data Protection Authority (aepd),2025-05-21,200000,"TELEFÓNICA MÓVILES ESPAÑA, S.A.","Media, Telecoms and Broadcasting",Art. 6 (1) GDPR,Insufficient legal basis for data processing,"The Spanish DPA imposed a fine of EUR 200,000 ...",https://www.aepd.es/documento/ps-00159-2024.pdf,https://www.enforcementtracker.com/ETid-2736,41315000000
2,ETid-2735,spain,./flags/flag_spain.png,Spanish Data Protection Authority (aepd),2025-05-20,360,RED ESPAÑOLA DE IDENTIFICACIÓN DE ANIMALES DE ...,Individuals and Private Associations,Art. 58 (1) GDPR,Insufficient cooperation with supervisory auth...,The Spanish DPA imposed a fine of EUR 360 on R...,https://www.aepd.es/documento/ps-00212-2025.pdf,https://www.enforcementtracker.com/ETid-2735,0
3,ETid-2734,spain,./flags/flag_spain.png,Spanish Data Protection Authority (aepd),2025-05-19,200000,"ASNEF-EQUIFAX, SERVICIOS DE INFORMACIÓN SOBRE ...","Finance, Insurance and Consulting","Art. 6 (1) GDPR, Art. 17 GDPR",Insufficient legal basis for data processing,"The Spanish DPA imposed a fine of EUR 200,000 ...",https://www.aepd.es/documento/ps-00157-2024.pdf,https://www.enforcementtracker.com/ETid-2734,31085753
4,ETid-2733,spain,./flags/flag_spain.png,Spanish Data Protection Authority (aepd),2025-05-16,30000,ATRESMEDIA CORPORACIÓN DE MEDIOS DE COMUNICACI...,"Media, Telecoms and Broadcasting",Art. 5 (1) c) GDPR,Non-compliance with general data processing pr...,"The Spanish DPA imposed a fine of EUR 30,000 o...",https://www.aepd.es/documento/ps-00175-2024.pdf,https://www.enforcementtracker.com/ETid-2733,918950000


In [39]:
# Create a connection to the new SQLite database
conn_new = sqlite3.connect('spain_gdpr_fines.db')

# Write the dataframe to the new database
df_fines.to_sql('fines', conn_new, if_exists='replace', index=False)

# Close the connection
conn_new.close()

In [56]:
import pandas as pd
import sqlite3

# Connect to the SQLite database
conn = sqlite3.connect('spain_gdpr_fines.db')


query = "SELECT * FROM fines"  
df_fines_spain = pd.read_sql_query(query, conn)

# Close the connection
conn.close()

# Display the DataFrame
df_fines_spain.head()

Unnamed: 0,id,country,flag_url,authority,date,fine_eur,company,sectors,gdpr_articles,violation,summary,verdict_link,case_url,revenue
0,ETid-2737,spain,./flags/flag_spain.png,Spanish Data Protection Authority (aepd),2025-05-23,60000,"AIRE NETWORKS DEL MEDITERRÃNEO, S.L.","Media, Telecoms and Broadcasting",Art. 5 (1) f) GDPR,Insufficient technical and organisational meas...,"The Spanish DPA imposed a fine of EUR 60,000 o...",https://www.aepd.es/documento/ps-00025-2025.pdf,https://www.enforcementtracker.com/ETid-2737,107800000
1,ETid-2736,spain,./flags/flag_spain.png,Spanish Data Protection Authority (aepd),2025-05-21,200000,"TELEFÓNICA MÓVILES ESPAÑA, S.A.","Media, Telecoms and Broadcasting",Art. 6 (1) GDPR,Insufficient legal basis for data processing,"The Spanish DPA imposed a fine of EUR 200,000 ...",https://www.aepd.es/documento/ps-00159-2024.pdf,https://www.enforcementtracker.com/ETid-2736,41315000000
2,ETid-2735,spain,./flags/flag_spain.png,Spanish Data Protection Authority (aepd),2025-05-20,360,RED ESPAÑOLA DE IDENTIFICACIÓN DE ANIMALES DE ...,Individuals and Private Associations,Art. 58 (1) GDPR,Insufficient cooperation with supervisory auth...,The Spanish DPA imposed a fine of EUR 360 on R...,https://www.aepd.es/documento/ps-00212-2025.pdf,https://www.enforcementtracker.com/ETid-2735,0
3,ETid-2734,spain,./flags/flag_spain.png,Spanish Data Protection Authority (aepd),2025-05-19,200000,"ASNEF-EQUIFAX, SERVICIOS DE INFORMACIÓN SOBRE ...","Finance, Insurance and Consulting","Art. 6 (1) GDPR, Art. 17 GDPR",Insufficient legal basis for data processing,"The Spanish DPA imposed a fine of EUR 200,000 ...",https://www.aepd.es/documento/ps-00157-2024.pdf,https://www.enforcementtracker.com/ETid-2734,31085753
4,ETid-2733,spain,./flags/flag_spain.png,Spanish Data Protection Authority (aepd),2025-05-16,30000,ATRESMEDIA CORPORACIÓN DE MEDIOS DE COMUNICACI...,"Media, Telecoms and Broadcasting",Art. 5 (1) c) GDPR,Non-compliance with general data processing pr...,"The Spanish DPA imposed a fine of EUR 30,000 o...",https://www.aepd.es/documento/ps-00175-2024.pdf,https://www.enforcementtracker.com/ETid-2733,918950000


In [57]:
from openai import OpenAI

from pydantic import BaseModel, Field
from typing import List

from dotenv import load_dotenv
import os

load_dotenv()

openai = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

class GDPR_Article(BaseModel):
    article: int
    paragraph: int | None
    subsection: str | None = Field(None, min_length=1, max_length=1)

class GDPR_Article_List(BaseModel):
    article_list: List[GDPR_Article]

def format_gdpr_articles(articles):
    return ', '.join([
        f"{article.article}-{article.paragraph}-{article.subsection}" if article.paragraph and article.subsection 
        else f"{article.article}-{article.paragraph}" if article.paragraph 
        else f"{article.article}" for article in articles
    ])


def get_gdpr_articles(row):
    try:
        gdpr_article_string = row['gdpr_articles']

        # Create a prompt for the AI to search for company revenue
        prompt = f"""
        Please transform the following GDPR article(s) {gdpr_article_string} into a list of structured data.
        For an input such as Art. 5(1)(a) GDPR, please provide the article number, paragraph number (if applicable), and subsection (if applicable) in a structured format.
        If the input is like Art. 5 (1) c) f) GDPR or Art. 5 (1) c),f) GDPR please create separate entries to the list for each subsection.
        """

        # Call OpenAI with web search capabilities
        response = openai.responses.parse(
            model="gpt-4o-2024-08-06",
            tools=[{
                "type": "web_search_preview",
                "search_context_size": "low",
            }],
            input=[
                {"role": "system", "content": "You are a helpful AI assistant that helps parsing and formatting information into a strictly structured format."},
                {"role": "user", "content": prompt}
            ],
            text_format=GDPR_Article_List,
        )
        articles: GDPR_Article_List = response.output_parsed
        return format_gdpr_articles(articles.article_list)

    except Exception as e:
        print(f"Error getting structured GDPR articles for {row['id']}: {e}")
        return None

print(
    f"ID: {df_fines_spain.iloc[465]['id']}, Articles: {get_gdpr_articles(df_fines_spain.iloc[465])}")

ID: ETid-1486, Articles: 5-1-a, 5-1-b, 5-1-e, 6-1, 8, 12-1, 12-2, 13, 25, 30-1, 22-2


In [58]:
from pandarallel import pandarallel

pandarallel.initialize(progress_bar=True, nb_workers=16)
print("Transforming GDPR article ref (this may take a while)...")
df_fines_spain['gdpr_articles'] = df_fines_spain.parallel_apply(get_gdpr_articles, axis=1)

INFO: Pandarallel will run on 16 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.
Transforming GDPR article ref (this may take a while)...


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=62), Label(value='0 / 62'))), HBox…

In [59]:
df_fines_spain.head()

Unnamed: 0,id,country,flag_url,authority,date,fine_eur,company,sectors,gdpr_articles,violation,summary,verdict_link,case_url,revenue
0,ETid-2737,spain,./flags/flag_spain.png,Spanish Data Protection Authority (aepd),2025-05-23,60000,"AIRE NETWORKS DEL MEDITERRÃNEO, S.L.","Media, Telecoms and Broadcasting",5-1-f,Insufficient technical and organisational meas...,"The Spanish DPA imposed a fine of EUR 60,000 o...",https://www.aepd.es/documento/ps-00025-2025.pdf,https://www.enforcementtracker.com/ETid-2737,107800000
1,ETid-2736,spain,./flags/flag_spain.png,Spanish Data Protection Authority (aepd),2025-05-21,200000,"TELEFÓNICA MÓVILES ESPAÑA, S.A.","Media, Telecoms and Broadcasting",6-1,Insufficient legal basis for data processing,"The Spanish DPA imposed a fine of EUR 200,000 ...",https://www.aepd.es/documento/ps-00159-2024.pdf,https://www.enforcementtracker.com/ETid-2736,41315000000
2,ETid-2735,spain,./flags/flag_spain.png,Spanish Data Protection Authority (aepd),2025-05-20,360,RED ESPAÑOLA DE IDENTIFICACIÓN DE ANIMALES DE ...,Individuals and Private Associations,58-1,Insufficient cooperation with supervisory auth...,The Spanish DPA imposed a fine of EUR 360 on R...,https://www.aepd.es/documento/ps-00212-2025.pdf,https://www.enforcementtracker.com/ETid-2735,0
3,ETid-2734,spain,./flags/flag_spain.png,Spanish Data Protection Authority (aepd),2025-05-19,200000,"ASNEF-EQUIFAX, SERVICIOS DE INFORMACIÓN SOBRE ...","Finance, Insurance and Consulting","6-1, 17",Insufficient legal basis for data processing,"The Spanish DPA imposed a fine of EUR 200,000 ...",https://www.aepd.es/documento/ps-00157-2024.pdf,https://www.enforcementtracker.com/ETid-2734,31085753
4,ETid-2733,spain,./flags/flag_spain.png,Spanish Data Protection Authority (aepd),2025-05-16,30000,ATRESMEDIA CORPORACIÓN DE MEDIOS DE COMUNICACI...,"Media, Telecoms and Broadcasting",5-1-c,Non-compliance with general data processing pr...,"The Spanish DPA imposed a fine of EUR 30,000 o...",https://www.aepd.es/documento/ps-00175-2024.pdf,https://www.enforcementtracker.com/ETid-2733,918950000


In [60]:
# Create a connection to the new SQLite database
conn_new = sqlite3.connect('spain_gdpr_fines_gdpr.db')

# Write the dataframe to the new database
df_fines_spain.to_sql('fines', conn_new, if_exists='replace', index=False)

# Close the connection
conn_new.close()

In [None]:
from augment_dataset import get_gdpr_classifications

print("Transforming GDPR classifications (this may take a while)...")
df_fines_spain['gdpr_classifications'] = df_fines_spain.parallel_apply(get_gdpr_classifications, axis=1)
print("Done, adding separated columns for each classification.")

df_fines_spain['lawfulness_of_processing'] = df_fines_spain['gdpr_classifications'].apply(lambda x: x[0] if x else None)
df_fines_spain['data_subject_rights_compliance'] = df_fines_spain['gdpr_classifications'].apply(lambda x: x[1] if x else None)
df_fines_spain['risk_management_and_safeguards'] = df_fines_spain['gdpr_classifications'].apply(lambda x: x[2] if x else None)
df_fines_spain['accountability_and_governance'] = df_fines_spain['gdpr_classifications'].apply(lambda x: x[3] if x else None)

print("Done, filtering out rows with missing classifications.")
df_fines_spain_filtered = df_fines_spain[
    df_fines_spain[['lawfulness_of_processing', 'data_subject_rights_compliance', 
                    'risk_management_and_safeguards', 'accountability_and_governance']].notnull().all(axis=1)
]

In [None]:
# Create a connection to the new SQLite database
conn_new = sqlite3.connect('spain_gdpr_fines_gdpr_classified.db')

# Write the dataframe to the new database
df_fines_spain.to_sql('fines', conn_new, if_exists='replace', index=False)

# Close the connection
conn_new.close()

In [68]:
import pandas as pd
import sqlite3

# Connect to the SQLite database
conn = sqlite3.connect('spain_gdpr_fines_with_labels.db')


query = "SELECT * FROM fines WHERE country = 'spain'"  
df_fines = pd.read_sql_query(query, conn)

# Close the connection
conn.close()

# Display the DataFrame
df_fines.head()

Unnamed: 0,id,country,flag_url,authority,date,fine_eur,company,sectors,gdpr_articles,violation,summary,verdict_link,case_url,revenue,lawfulness_of_processing,data_subject_rights_compliance,risk_management_and_safeguards,accountability_and_governance
0,ETid-2737,spain,./flags/flag_spain.png,Spanish Data Protection Authority (aepd),2025-05-23,60000,"AIRE NETWORKS DEL MEDITERRÃNEO, S.L.","Media, Telecoms and Broadcasting",Art. 5 (1) f) GDPR,Insufficient technical and organisational meas...,"The Spanish DPA imposed a fine of EUR 60,000 o...",https://www.aepd.es/documento/ps-00025-2025.pdf,https://www.enforcementtracker.com/ETid-2737,107800000,no_valid_basis,non_compliance,insufficient_protection,partially_accountable
1,ETid-2736,spain,./flags/flag_spain.png,Spanish Data Protection Authority (aepd),2025-05-21,200000,"TELEFÓNICA MÓVILES ESPAÑA, S.A.","Media, Telecoms and Broadcasting",Art. 6 (1) GDPR,Insufficient legal basis for data processing,"The Spanish DPA imposed a fine of EUR 200,000 ...",https://www.aepd.es/documento/ps-00159-2024.pdf,https://www.enforcementtracker.com/ETid-2736,41315000000,no_valid_basis,non_compliance,insufficient_protection,partially_accountable
2,ETid-2735,spain,./flags/flag_spain.png,Spanish Data Protection Authority (aepd),2025-05-20,360,RED ESPAÑOLA DE IDENTIFICACIÓN DE ANIMALES DE ...,Individuals and Private Associations,Art. 58 (1) GDPR,Insufficient cooperation with supervisory auth...,The Spanish DPA imposed a fine of EUR 360 on R...,https://www.aepd.es/documento/ps-00212-2025.pdf,https://www.enforcementtracker.com/ETid-2735,0,no_valid_basis,non_compliance,insufficient_protection,not_accountable
3,ETid-2733,spain,./flags/flag_spain.png,Spanish Data Protection Authority (aepd),2025-05-16,30000,ATRESMEDIA CORPORACIÓN DE MEDIOS DE COMUNICACI...,"Media, Telecoms and Broadcasting",Art. 5 (1) c) GDPR,Non-compliance with general data processing pr...,"The Spanish DPA imposed a fine of EUR 30,000 o...",https://www.aepd.es/documento/ps-00175-2024.pdf,https://www.enforcementtracker.com/ETid-2733,918950000,lawful_but_principle_violation,non_compliance,insufficient_protection,partially_accountable
4,ETid-2732,spain,./flags/flag_spain.png,Spanish Data Protection Authority (aepd),2025-05-13,100000,PLATAFORMA CABANILLAS SA.,Employment,Art. 5 (1) c) GDPR,Non-compliance with general data processing pr...,"The Spanish DPA imposed a fine of EUR 100,000 ...",https://www.aepd.es/documento/ps-00162-2024.pdf,https://www.enforcementtracker.com/ETid-2732,31000000,lawful_but_principle_violation,non_compliance,insufficient_protection,not_accountable


In [None]:
import weaviate
from weaviate.classes.init import Auth
import os
from dotenv import load_dotenv
load_dotenv()

# Best practice: store your credentials in environment variables
weaviate_url = os.environ["WEAVIATE_URL"]
weaviate_api_key = os.environ["WEAVIATE_API_KEY"]
openai_api_key = os.environ["OPENAI_API_KEY"]

client = weaviate.connect_to_weaviate_cloud(
    cluster_url=weaviate_url,  # Replace with your Weaviate Cloud URL
    auth_credentials=Auth.api_key(weaviate_api_key),  # Replace with your Weaviate Cloud key
    headers={'X-OpenAI-Api-key': openai_api_key}  # Replace with your OpenAI API key
)

print("Connected to Weaviate:", client.is_ready())

Connected to Weaviate: True


- id
- country
- flag_url
- authority
- date
- fine_eur
- company
- sectors
- gdpr_articles
- violation
- summary
- verdict_link
- case_url
- revenue
- lawfulness_of_processing
- data_subject_rights_compliance
- risk_management_and_safeguards
- accountability_and_governance

In [70]:
from weaviate.classes.config import Configure, Property, DataType

collection = client.collections.create(
        name="Precedent",
        properties=[
            Property(name="precedent_id", data_type=DataType.TEXT),
            Property(name="country", data_type=DataType.TEXT),
            Property(name="flag_url", data_type=DataType.TEXT),
            Property(name="authority", data_type=DataType.TEXT),
            Property(name="date", data_type=DataType.TEXT),
            Property(name="fine_eur", data_type=DataType.INT),
            Property(name="company", data_type=DataType.TEXT),
            Property(name="sectors", data_type=DataType.TEXT),
            Property(name="gdpr_articles", data_type=DataType.TEXT),
            Property(name="violation", data_type=DataType.TEXT),
            Property(name="summary", data_type=DataType.TEXT),
            Property(name="verdict_link", data_type=DataType.TEXT),
            Property(name="case_url", data_type=DataType.TEXT),
            Property(name="revenue", data_type=DataType.INT),
            Property(name="lawfulness_of_processing", data_type=DataType.TEXT),
            Property(name="data_subject_rights_compliance", data_type=DataType.TEXT),
            Property(name="risk_management_and_safeguards", data_type=DataType.TEXT),
            Property(name="accountability_and_governance", data_type=DataType.TEXT),
            Property(name="chunk", data_type=DataType.TEXT),
            Property(name="page", data_type=DataType.INT),
        ],
        vector_config=[
        Configure.Vectors.text2vec_openai(
            name="summary_vector",
            source_properties=["summary"]
        ),
        Configure.Vectors.text2vec_openai(
            name="chunk_vector", 
            source_properties=["chunk"]
        ),
        ]
    
    )
print("Collection 'Precedent' created successfully.")

Collection 'Precedent' created successfully.


In [71]:
import os
import PyPDF2

def extract_text_from_pdf(pdf_file_name) -> list[str]:
    text = []
    pdf_file_name = os.path.join("verdicts", pdf_file_name)
    if os.path.isfile(pdf_file_name): 
        with open(pdf_file_name, "rb") as f:
            reader = PyPDF2.PdfReader(f)
            if len(reader.pages) <= 50:
                for page in reader.pages:
                    extracted_text = page.extract_text()
                    text.append(extracted_text)
    return text

In [75]:
from openai import OpenAI
from dotenv import load_dotenv
import os
load_dotenv()

openai = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

def create_english_translation(text: str) -> str:
    try:
        response = openai.chat.completions.create(
            model="gpt-4o-2024-08-06",
            messages=[
                {"role": "system", "content": "You are a helpful assistant that translates text into English. You only return the translated text without any additional commentary."},
                {"role": "user", "content": f"Please translate the following text into English:\n\n{text}"}
            ]
        )
        return response.choices[0].message.content.strip()
    except Exception as e:
        print(f"Error translating text: {e}")
        return text  # Return original text in case of error

In [78]:
from openai import OpenAI
from dotenv import load_dotenv
import os
load_dotenv()

openai = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

def create_weaviate_objects(row):
    try:
        # Get the collection reference
        collection = client.collections.get("Precedent")
        
        # Create a Weaviate object with the data from the row
        base_obj = {
            "precedent_id": row['id'],
            "country": row['country'],
            "flag_url": row['flag_url'],
            "authority": row['authority'],
            "date": row['date'],
            "fine_eur": row['fine_eur'],
            "company": row['company'],
            "sectors": row['sectors'],
            "gdpr_articles": row['gdpr_articles'],
            "violation": row['violation'],
            "summary": row['summary'],
            "verdict_link": row['verdict_link'],
            "case_url": row['case_url'],
            "revenue": row['revenue'],
            "lawfulness_of_processing": row['lawfulness_of_processing'],
            "data_subject_rights_compliance": row['data_subject_rights_compliance'],
            "risk_management_and_safeguards": row['risk_management_and_safeguards'],
            "accountability_and_governance": row['accountability_and_governance'],
        }
        
        file_name = row['verdict_link'].split('/')[-1]
        if file_name.split('.')[-1] == "pdf":
            spanish_text = extract_text_from_pdf(file_name)
            for i, text in enumerate(spanish_text):
                english_text = create_english_translation(text)
                obj = {
                    **base_obj,
                    "chunk": english_text,
                    "page": i + 1,  # Page numbers start from 1
                }
                # Insert object into Weaviate
                collection.data.insert(obj)
        else:
            english_text = row['summary']
            obj = {
                **base_obj,
                "chunk": english_text,
                "page": 1,  # Single page document
            }
            # Insert object into Weaviate
            collection.data.insert(obj)
        
        return obj
    except Exception as e:
        print(f"Error creating Weaviate object for {row['id']}: {e}")
        return None

In [79]:
from pandarallel import pandarallel

pandarallel.initialize(progress_bar=True, nb_workers=16)
print("Transforming GDPR article ref (this may take a while)...")
df_fines.parallel_apply(create_weaviate_objects, axis=1)

INFO: Pandarallel will run on 16 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.
Transforming GDPR article ref (this may take a while)...


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=53), Label(value='0 / 53'))), HBox…

Overwriting cache for 0 420


Error creating Weaviate object for ETid-754: Object was not added! Unexpected status code: 503, with response body: None.
Error creating Weaviate object for ETid-1554: Object was not added! Unexpected status code: 502, with response body: None.


0      {'precedent_id': 'ETid-2737', 'country': 'spai...
1      {'precedent_id': 'ETid-2736', 'country': 'spai...
2      {'precedent_id': 'ETid-2735', 'country': 'spai...
3      {'precedent_id': 'ETid-2733', 'country': 'spai...
4      {'precedent_id': 'ETid-2732', 'country': 'spai...
                             ...                        
843    {'precedent_id': 'ETid-197', 'country': 'spain...
844    {'precedent_id': 'ETid-191', 'country': 'spain...
845    {'precedent_id': 'ETid-175', 'country': 'spain...
846    {'precedent_id': 'ETid-174', 'country': 'spain...
847    {'precedent_id': 'ETid-147', 'country': 'spain...
Length: 848, dtype: object