**Import libraries**

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

**Wikipedia scraping**

In [2]:
# Download the Wikipedia "List of data breaches" page and collect every
# <table class="wikitable"> element for the cells below.
url = 'https://en.wikipedia.org/wiki/List_of_data_breaches'

# Send a descriptive User-Agent: Wikimedia's User-Agent policy warns that
# generic library defaults may be rejected.
# NOTE(review): replace the placeholder with a project/contact string.
response = requests.get(
    url,
    headers={'User-Agent': 'data-breach-notebook/1.0 (educational scraping notebook)'},
    timeout=30,  # never hang the kernel indefinitely on a stalled connection
)
# Fail loudly on 4xx/5xx instead of silently parsing an error page.
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

tables = soup.find_all('table', {'class': 'wikitable'})

# The following cells index tables[0] and tables[1]; stop here with a clear
# message if the page layout no longer provides them.
if len(tables) < 2:
    raise ValueError(
        f"Expected at least 2 'wikitable' tables at {url}, found {len(tables)}"
    )

**Government table**

In [3]:
# Build the government breach DataFrame from the first wikitable.
# NOTE(review): assumes tables[0] is the government table and that its first
# <td> cell is not needed (fields are read from cells[1] to cells[5]) --
# confirm against the live page layout.
government_table = tables[0]
data_government = []

# The first <tr> is skipped as the header row.
for row in government_table.find_all('tr')[1:]:
    cells = row.find_all('td')
    # Rows with fewer than six <td> cells cannot supply all five fields.
    if len(cells) < 6:
        continue
    # The length guard above already guarantees cells[5] exists, so the former
    # "else 'Unknown'" fallback for `method` could never run and is removed.
    agency, year, records, organization_type, method = (
        cell.text.strip() for cell in cells[1:6]
    )
    # Append in the same order as the DataFrame columns declared below.
    data_government.append([agency, records, year, organization_type, method])

gov_data_breach = pd.DataFrame(data_government, columns=['organization', 'records lost', 'year', 'sector', 'method'])

**Companies table**

In [4]:
# Build the company breach DataFrame from the second wikitable, then stack it
# under the government rows.
# NOTE(review): assumes tables[1] is the companies table with the fields in
# its first five <td> cells -- confirm against the live page layout.
company_table = tables[1]
data_company = []

# The first <tr> is skipped as the header row.
for row in company_table.find_all('tr')[1:]:
    cells = row.find_all('td')
    # Rows with fewer than five <td> cells cannot supply all five fields.
    if len(cells) < 5:
        continue
    # The length guard above already guarantees cells[4] exists, so the former
    # "else 'Unknown'" fallback for `method` could never run and is removed.
    entity, year, records, organization_type, method = (
        cell.text.strip() for cell in cells[:5]
    )
    # Append in the same order as the DataFrame columns declared below.
    data_company.append([entity, records, year, organization_type, method])

company_data_breach = pd.DataFrame(data_company, columns=['organization', 'records lost', 'year', 'sector', 'method'])

# Government rows first, then company rows, with a fresh 0..n-1 index.
merged_data = pd.concat([gov_data_breach, company_data_breach], ignore_index=True)

**Merge with Balloon dataset**

In [5]:
# Read the Balloon Race breach export from the working directory.
balloon_race_df = pd.read_csv('Balloon Race Data Breaches - LATEST - breaches.csv')

# Trim surrounding whitespace from the header names so the column lookup
# below matches.
balloon_race_df.columns = balloon_race_df.columns.str.strip()

# Keep the five columns shared with the scraped data, drop rows missing any of
# them, and rename 'organisation' to match the scraped frames.
balloon_race_df_clean = (
    balloon_race_df
    .loc[:, ['organisation', 'records lost', 'year', 'sector', 'method']]
    .dropna()
    .rename(columns={'organisation': 'organization'})
)

# Balloon Race rows first, then the scraped rows, with a fresh index.
final_merged_data = pd.concat(
    [balloon_race_df_clean, merged_data],
    ignore_index=True,
)

**First dataset cleaning**

In [6]:
def _normalize_text(value):
    """Return `value` stripped and lower-cased if it is a str; otherwise return it unchanged."""
    return value.strip().lower() if isinstance(value, str) else value


# Normalise the text in every object-dtype column.
# The `.str` accessor returns NaN for non-string elements, so using it on a
# mixed column (e.g. numeric values from the CSV concatenated with scraped
# strings) would silently erase the non-string values. Mapping element-wise
# keeps them intact.
for col in final_merged_data.select_dtypes(include=['object']).columns:
    final_merged_data[col] = final_merged_data[col].map(_normalize_text)

# Sort by organisation name and renumber the index 0..n-1.
final_merged_dataset = final_merged_data.sort_values(by='organization', ignore_index=True)

**Save dataset**

In [7]:
# Write the sorted, merged dataset to the working directory, leaving the
# positional index out of the file.
final_merged_dataset.to_csv(
    "final_merged_data_breaches.csv",
    index=False,
)