# 01 - Data Extraction from Wikidata

This notebook loads human rights defender records exported from Wikidata, cleans them, and enriches them with per-person works counts fetched from the Wikidata Query Service.

## Steps
1. Load raw human rights defender data
2. Clean the data: parse dates, derive a productive year, and standardize country names
3. Fetch works counts from the Wikidata SPARQL endpoint
4. Save processed data

In [None]:
import pandas as pd
import requests
import time
from tqdm import tqdm

## 1. Load Raw Data

In [None]:
# Read the raw human rights defender records from the project's data directory.
df = pd.read_csv('../data/human_rights_defender.csv')
print(f"Loaded {len(df)} rows")
# Trailing expression: the notebook renders the first rows as the cell output.
df.head()

## 2. Data Cleaning

### 2.1 Parse dates and calculate productive year

In [None]:
# Parse the date columns; values that cannot be parsed become NaT instead of raising.
df['birthDate'] = pd.to_datetime(df['birthDate'], errors='coerce')
df['deathDate'] = pd.to_datetime(df['deathDate'], errors='coerce')

# Productive year: birth year + 35 when the birth year is known, otherwise
# death year - 35, otherwise missing. Computed column-wise rather than with a
# row-by-row `apply`, which gives the same values without a Python-level loop.
# NOTE(review): this reads the `birthYear` / `deathYear` columns, not the dates
# parsed above - confirm the raw CSV provides both year columns.
df['productive_year'] = (df['birthYear'] + 35).fillna(df['deathYear'] - 35)

print(f"Productive year range: {df['productive_year'].min():.0f} - {df['productive_year'].max():.0f}")

### 2.2 Standardize country names

Map historical states, territories, and alternative country names to their modern-country equivalents.

In [None]:
# Maps citizenship labels as they appear in the raw data (historical states,
# territories, alternative names) to the modern country used for analysis.
# A value of None means "no country": the record gets a missing modern_country.
COUNTRY_MAPPING = {
    'Kingdom of England': 'United Kingdom',
    'Kingdom of France': 'France',
    'West Germany': 'Germany',
    'Soviet Union': 'Russia',
    'Republic of Vietnam': 'Vietnam',
    'Czechoslovakia': 'Czech Republic',
    'Tibet Autonomous Region': 'China',
    "People's Republic of China": 'China',
    'Hong Kong': 'China',
    'British Raj': 'India',
    'British Hong Kong': 'China',
    'Ottoman Empire': 'Turkey',
    'Kingdom of Romania': 'Romania',
    'Russian Empire': 'Russia',
    'Austrian Empire': 'Austria',
    'Austria-Hungary': 'Austria',
    'Weimar Republic': 'Germany',
    'German Democratic Republic': 'Germany',
    'Kingdom of Italy': 'Italy',
    'Empire of Japan': 'Japan',
    'Qing dynasty': 'China',
    'French protectorate of Tunisia': 'Tunisia',
    'Kingdom of Great Britain': 'United Kingdom',
    'United Kingdom of Great Britain and Ireland': 'United Kingdom',
    'statelessness': None,
    'Yugoslavia': 'Serbia',
    'Socialist Federal Republic of Yugoslavia': 'Serbia',
    'Federal Republic of Yugoslavia': 'Serbia',
    'Serbia and Montenegro': 'Serbia',
}

# Labels absent from the mapping pass through unchanged and missing labels stay
# missing. The result is deliberately NOT filled back from `citizenshipLabel`:
# doing so would turn the None mapped for 'statelessness' back into a country
# name and count stateless people as a country.
df['modern_country'] = df['citizenshipLabel'].map(
    lambda label: COUNTRY_MAPPING.get(label, label)
)

print(f"Unique countries: {df['modern_country'].nunique()}")

## 3. Fetch Works Counts from Wikidata

In [None]:
def get_wikidata_works_count_batch(entity_ids):
    """
    Get the number of works for multiple Wikidata entities in a single query.

    A "work" is any item linked to the person as author (P50), creator (P170),
    translator (P655), composer (P86), director (P57), screenwriter (P58) or
    cast member (P161), plus any item listed on the person's own entry as a
    notable work (P800). A work reachable through several of these links is
    counted once per person.

    Args:
        entity_ids: List of Wikidata IDs like ['Q557', 'Q4715', ...]

    Returns:
        Dictionary mapping entity_id to count of works. Entities with no
        matching work map to 0. If the request or the response parsing fails,
        every entity in the batch maps to None, so callers can tell "unknown"
        apart from "zero works". An empty input returns {} without sending a
        request.
    """
    # Materialize once so any iterable (including a generator) can be reused
    # for both the query text and the result dictionary.
    entity_ids = list(entity_ids)
    if not entity_ids:
        return {}

    values_clause = " ".join(f"wd:{eid}" for eid in entity_ids)

    # P800 is stated on the person and points at the work, so its triple runs
    # in the opposite direction from the work -> person properties.
    sparql_query = f"""
    SELECT ?person (COUNT(DISTINCT ?work) as ?count) WHERE {{
      VALUES ?person {{ {values_clause} }}
      {{
        ?work wdt:P50 ?person .  # author
      }} UNION {{
        ?work wdt:P170 ?person . # creator
      }} UNION {{
        ?work wdt:P655 ?person . # translator
      }} UNION {{
        ?work wdt:P86 ?person .  # composer
      }} UNION {{
        ?work wdt:P57 ?person .  # director
      }} UNION {{
        ?work wdt:P58 ?person .  # screenwriter
      }} UNION {{
        ?work wdt:P161 ?person . # cast member
      }} UNION {{
        ?person wdt:P800 ?work . # notable work
      }}
    }}
    GROUP BY ?person
    """

    endpoint_url = "https://query.wikidata.org/sparql"
    headers = {
        'User-Agent': 'Mozilla/5.0 (compatible; HumanRightsResearch/1.0)',
        'Accept': 'application/sparql-results+json'
    }
    params = {'query': sparql_query, 'format': 'json'}

    try:
        response = requests.get(endpoint_url, params=params, headers=headers, timeout=60)
        response.raise_for_status()
        data = response.json()

        # People with no matching work produce no result row, so start
        # everyone at zero and overwrite with the counts that come back.
        results = {eid: 0 for eid in entity_ids}

        if 'results' in data and 'bindings' in data['results']:
            for binding in data['results']['bindings']:
                # The person comes back as a full entity URI; the ID is its last path segment.
                person_uri = binding['person']['value']
                entity_id = person_uri.split('/')[-1]
                results[entity_id] = int(binding['count']['value'])

        return results

    except Exception as e:
        # Deliberately best-effort: one failed batch must not abort a long
        # fetch loop. None marks the whole batch as "count unknown".
        print(f"Error: {e}")
        return {eid: None for eid in entity_ids}

In [None]:
# Pull the trailing Q-identifier out of each person's entity URI.
df['entity_id'] = df['person'].str.extract(r'/(Q\d+)$')[0]
# Query each person only once, skipping rows where no identifier was found.
valid_entities = df['entity_id'].dropna().unique().tolist()

print(f"Processing {len(valid_entities)} unique entities...")

# Slice the identifiers into fixed-size batches, one SPARQL query per batch.
batch_size = 50
all_results = {}
batches = [
    valid_entities[start:start + batch_size]
    for start in range(0, len(valid_entities), batch_size)
]

for batch in tqdm(batches, desc="Fetching works counts"):
    all_results.update(get_wikidata_works_count_batch(batch))
    time.sleep(1)  # Pause between queries to stay polite to the endpoint

# Attach the counts to every row of the corresponding person.
df['works_count'] = df['entity_id'].map(all_results)

## 4. Save Processed Data

In [None]:
# Write the cleaned and enriched frame to CSV, leaving out the pandas index.
df.to_csv('../data/human_rights_defender_clean.csv', index=False)
print(f"Saved {len(df)} rows to data/human_rights_defender_clean.csv")

# Summary statistics as a quick sanity check of what was saved.
print(f"\nSummary:")
print(f"  Total records: {len(df)}")
# Distinct `person` values; this can be lower than the record count when a person has several rows.
print(f"  Unique individuals: {df['person'].nunique()}")
# "Date range" reports the min/max of the derived productive_year column.
print(f"  Date range: {df['productive_year'].min():.0f} - {df['productive_year'].max():.0f}")
print(f"  Countries: {df['modern_country'].nunique()}")