In [3]:
import requests
import pandas as pd
from rdflib import Graph, Namespace
from rdflib import Graph, Namespace, URIRef

In [5]:
# Download the HENKO given-name graph. The endpoint answers with Turtle
# (the saved file starts with `PREFIX` declarations), not RDF/XML.
url = "https://ldf.fi/henko/data?graph=http://ldf.fi/henko/nimet/"
# (connect, read) timeouts keep the kernel from hanging on a stalled server.
response = requests.get(url, timeout=(10, 120))
# Fail loudly on an HTTP error instead of writing an error page to disk.
response.raise_for_status()

# Save the raw bytes; the file name is kept because later cells open it.
with open("henko_nimet.rdf", "wb") as f:
    f.write(response.content)


In [6]:
# Peek at the first 500 bytes of the saved download to see which
# serialization the server returned.
with open("henko_nimet.rdf", "rb") as rdf_file:
    preview_bytes = rdf_file.read(500)

# Undecodable bytes (e.g. a multi-byte character cut at the 500-byte mark)
# are shown as replacement characters rather than raising.
print(preview_bytes.decode("utf-8", errors="replace"))

PREFIX dcterms: <http://purl.org/dc/terms/>
PREFIX ent:     <http://www.w3.org/ns/entailment/>
PREFIX foaf:    <http://xmlns.com/foaf/0.1/>
PREFIX ldf:     <http://ldf.fi/schema/ldf/>
PREFIX ns1:     <http://ldf.fi/schema/henko/>
PREFIX ns2:     <http://www.w3.org/2004/02/skos/core#>
PREFIX ns3:     <http://purl.org/dc/terms/>
PREFIX ns4:     <http://www.w3.org/2002/07/owl#>
PREFIX ns5:     <https://www.wikidata.org/prop/direct/>
PREFIX rdfs:    <http://www.w3.org/2000/01/rdf-schema#>
PREFIX sd:


In [9]:
from rdflib import Graph, Namespace

# Load the data from the local copy written by the download cell instead of
# fetching the whole graph from the server a second time.
g = Graph()
g.parse("henko_nimet.rdf", format="turtle")

# Define namespaces (the SPARQL text below declares its own PREFIXes, so
# these objects are only needed for programmatic graph access).
NS1 = Namespace("http://ldf.fi/schema/henko/")
SCHEMA = Namespace("http://schema.org/")
SKOS = Namespace("http://www.w3.org/2004/02/skos/core#")

# SPARQL query to extract names and gender:
#   - every ns1:GivenNameUsage contributes one row (duplicates are possible),
#   - only Finnish-tagged preferred labels are kept,
#   - the gender URI is reduced to its local part by stripping the
#     schema.org prefix.
query = """
PREFIX ns1: <http://ldf.fi/schema/henko/>
PREFIX schema: <http://schema.org/>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>

SELECT ?name ?genderLabel WHERE {
  ?usage a ns1:GivenNameUsage ;
         ns1:gender ?genderURI ;
         ns1:hasName ?nameURI .

  ?nameURI skos:prefLabel ?name .

  FILTER(LANG(?name) = "fi")

  BIND(REPLACE(STR(?genderURI), "http://schema.org/", "") AS ?genderLabel)
}
"""


In [10]:
# Report how many triples the parsed graph holds.
triple_count = len(g)
print(f"✅ RDF graph loaded with {triple_count} triples.")


✅ RDF graph loaded with 3252695 triples.


In [11]:

# Execute the SPARQL query against the loaded graph.
results = g.query(query)

# Turn each result row into a plain (name, gender) pair of Python strings.
data = []
for row in results:
    data.append((str(row.name), str(row.genderLabel)))

# Show the first ten pairs as a quick sanity check.
sample_pairs = data[:10]
for name, gender in sample_pairs:
    print(f"{name} - {gender}")


Juhani - Male
Olavi - Male
Petteri - Male
Otto - Male
Niilas - Male
Jeanette - Female
Christel - Female
Josefine - Female
Catharina - Female
Serafia - Female


In [12]:
# Number of (name, gender) rows collected from the query results.
total_rows = len(data)
print(f"Total names extracted: {total_rows}")


Total names extracted: 106524


In [14]:
import pandas as pd

# Tabulate every extracted (name, gender) pair and write the table to CSV
# without the index column.
column_names = ["name", "gender"]
df = pd.DataFrame(data=data, columns=column_names)
df.to_csv("henko_name_gender.csv", index=False)



In [15]:
# Rows per gender label in the full extraction, then the first few rows.
gender_counts = df["gender"].value_counts()
print(gender_counts)
print(df.head())

gender
Female    59096
Male      47428
Name: count, dtype: int64
      name gender
0   Juhani   Male
1    Olavi   Male
2  Petteri   Male
3     Otto   Male
4   Niilas   Male


In [16]:
import pandas as pd

# Rebuild the frame from `data`, the list of (name, gender) tuples produced
# by the SPARQL query cell.
df = pd.DataFrame(data=data, columns=["name", "gender"])

# Collapse repeated (name, gender) rows so each pair occurs once.
df_unique = df.drop_duplicates()

# Write the de-duplicated pairs to CSV, leaving out the index column.
df_unique.to_csv("henko_unique_name_gender.csv", index=False)

unique_pair_count = len(df_unique)
print(f"✅ Saved {unique_pair_count} unique name-gender pairs to 'henko_unique_name_gender.csv'")


✅ Saved 24134 unique name-gender pairs to 'henko_unique_name_gender.csv'


In [18]:
# Rows per gender label after de-duplication, then the first few rows.
unique_gender_counts = df_unique["gender"].value_counts()
print(unique_gender_counts)
print(df_unique.head())

gender
Female    13227
Male      10907
Name: count, dtype: int64
      name gender
0   Juhani   Male
1    Olavi   Male
2  Petteri   Male
3     Otto   Male
4   Niilas   Male
