# Country Codes & Continents: A Dataset with ISO 3166-1 Alpha-2

This notebook builds a dataset of countries, their corresponding ISO 3166-1 Alpha-2 codes, their respective continents, and a flag marking whether each country is on a predefined list of IT hubs.

**Key Features:**

* Utilizes the ISO 3166-1 Alpha-2 standard for country codes.
* Includes a comprehensive list of countries and their associated continents.
* Provides a clean and organized dataset for various data analysis and mapping projects.

**Potential Use Cases:**

* Geocoding and mapping applications.
* Data analysis and visualization projects.
* Internationalization and localization tasks.
* Building applications that require country-specific information.

This notebook demonstrates a simple and efficient approach to gathering and organizing country-related data.

**Note:** 

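* Country data is scraped from Wikipedia at run time, and continent codes come from `pycountry_convert` and a heuristic scan of Wikipedia pages, so the accuracy of the output should be verified independently.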

In [1]:
# Fetch a country's continent code from its Wikipedia article (infobox first, page text as fallback)
def fetch_continent_from_wikipedia(country_name):
    """
    Scrape the English Wikipedia article for ``country_name`` and guess its continent.

    The article's infobox is searched for a row whose header contains
    'Continent'; that row's cell text is matched against known continent names.
    If this yields nothing, the whole page text is scanned and the continent
    name that is mentioned most often wins (ties resolve in the order the
    continents are listed below).

    Args:
        country_name: Country name used as the article title, e.g. "New Zealand".

    Returns:
        A two-letter continent code ("AF", "AS", "EU", "NA", "SA", "OC", "AN"),
        or "Unknown" when the name is empty, the request or parsing fails, or
        no continent name appears on the page. Exceptions raised while
        fetching or parsing are swallowed (best-effort lookup).
    """
    from urllib.parse import quote

    # An empty title would resolve to the wiki root page, not a country article.
    if not country_name:
        return "Unknown"

    # Continent names to look for and the codes they map to
    continent_keywords = {
        "Africa": "AF",
        "Asia": "AS",
        "Europe": "EU",
        "North America": "NA",
        "South America": "SA",
        "Oceania": "OC",
        "Antarctica": "AN"
    }

    try:
        title = quote(country_name.replace(' ', '_'))
        search_url = f"https://en.wikipedia.org/wiki/{title}"
        # A timeout keeps one stalled connection from hanging the whole run.
        response = requests.get(search_url, timeout=10)
        # Do not parse error pages (404, 5xx) as if they were the article.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "lxml")

        # Preferred source: an explicit 'Continent' row in the infobox table
        infobox = soup.find("table", class_="infobox")
        if infobox:
            for row in infobox.find_all("tr"):
                header = row.find("th")
                if not (header and "Continent" in header.text):
                    continue
                cell = row.find("td")
                if not cell:
                    continue
                text = cell.get_text(separator=" ", strip=True)
                for key, code in continent_keywords.items():
                    if key in text:
                        return code

        # Fallback: pick the continent mentioned most often on the page.
        # Returning the first keyword found anywhere would favour whichever
        # continent happens to be listed first in the mapping.
        page_text = soup.get_text()
        best_key = max(continent_keywords, key=page_text.count)
        if page_text.count(best_key) > 0:
            return continent_keywords[best_key]
        return "Unknown"
    except Exception:
        # Best-effort lookup: any network or parsing problem means "Unknown".
        return "Unknown"

def get_continent_code(alpha2, country_name):
    """
    Resolve the continent code for a country.

    The ISO alpha-2 code is looked up with pycountry_convert first. Only when
    that lookup fails (or returns nothing) is the country's Wikipedia article
    scraped as a fallback. This keeps the heuristic page scrape from
    overriding a direct code-to-continent mapping and avoids one HTTP request
    per country.

    Args:
        alpha2: ISO 3166-1 alpha-2 country code, e.g. "IN".
        country_name: Country name, used as the Wikipedia article title for
            the fallback scrape.

    Returns:
        A two-letter continent code, or "Unknown" if neither source yields one.
    """
    try:
        continent_code = pc.country_alpha2_to_continent_code(alpha2)
    except Exception:
        # The lookup raised, e.g. for a code pycountry_convert cannot map.
        continent_code = None

    if continent_code:
        return continent_code

    # The scraper itself returns "Unknown" when it cannot determine a continent.
    return fetch_continent_from_wikipedia(country_name)

def ISO_3166_1_Alpha_2(it_hub_regions, continent_name_mapping):
    """
    Build a dictionary of country details from Wikipedia's ISO 3166-1 alpha-2 page.

    The page is fetched through the MediaWiki ``parse`` API. The first table
    after the "Officially assigned code elements" heading is read row by row;
    for each row with a code:
      - the alpha-2 code and the country name are extracted,
      - the continent code is resolved via ``get_continent_code``,
      - the continent code is mapped to its full name,
      - the country is flagged as an IT hub if its code is in ``it_hub_regions``.

    Args:
        it_hub_regions: Collection of alpha-2 codes considered IT hubs.
        continent_name_mapping: Dict mapping continent codes to full names;
            codes missing from it map to "Unknown".

    Returns:
        Dict keyed by alpha-2 code; each value is a dict with the keys
        "country_name", "continent", "continent_code" and "it_hub_country"
        ("Yes"/"No").

    Raises:
        requests.RequestException: If the request times out or returns an
            HTTP error status.
        ValueError: If the page content, the section heading, or the table
            cannot be found.
    """
    url = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "parse",
        "page": "ISO_3166-1_alpha-2",
        "format": "json",
        "prop": "text"
    }

    # Fail fast on a stalled connection or an HTTP error instead of hanging
    # or trying to decode an error page as JSON.
    response = requests.get(url, params=params, timeout=30)
    response.raise_for_status()
    data = response.json()
    page_content = data.get("parse", {}).get("text", {}).get("*")
    if not page_content:
        raise ValueError("Failed to fetch data from Wikipedia.")

    soup = BeautifulSoup(page_content, "lxml")
    iso_data = {}

    # Locate the "Officially assigned code elements" header
    header = soup.find('h3', id='Officially_assigned_code_elements')
    if not header:
        raise ValueError("Section with id 'Officially_assigned_code_elements' not found.")

    table = header.find_next("table")
    if not table:
        raise ValueError("Table not found after the header.")

    for row in table.find_all("tr"):
        cells = row.find_all("td")
        # Header rows have no <td> cells; skip anything without code + name columns.
        if len(cells) < 2:
            continue

        # Extract ISO alpha-2 country code.
        code_span = cells[0].find('span', class_="monospaced")
        key = code_span.text.strip() if code_span else ""

        # Extract country name (text of the first link in the second column).
        link = cells[1].find('a')
        value = link.text.strip() if link else ""

        # Rows without a recognisable code are ignored.
        if not key:
            continue

        continent_cd = get_continent_code(key, value)
        iso_data[key] = {
            "country_name": value,
            "continent": continent_name_mapping.get(continent_cd, "Unknown"),
            "continent_code": continent_cd,
            "it_hub_country": "Yes" if key in it_hub_regions else "No"
        }

    return iso_data

def RawFile(it_hub_regions, continent_name_mapping):
    """
    Build the country details dictionary and write it to a timestamped JSON file.

    The file is created in the current working directory and named
    ``R_<timestamp>_country_details.json``, where the timestamp is taken in
    the module-level ``ist`` timezone.

    Returns:
        True if the data was saved or if there was nothing to save;
        False if any exception occurred (the error is printed).
    """
    try:
        country_details = ISO_3166_1_Alpha_2(it_hub_regions, continent_name_mapping)

        # Nothing to write: report it, but this is not treated as a failure.
        if not country_details:
            print("No data to save since empty dictionary returned.")
            return True

        stamp = datetime.datetime.now(ist).strftime("%Y-%m-%d_%H:%M:%S")
        filename = f"R_{stamp}_country_details.json"
        with open(filename, "w", encoding="utf-8") as out:
            json.dump(country_details, out, indent=4, ensure_ascii=False)
        print(f"Dictionary saved as {filename}")
        return True
    except Exception as e:
        print(f"Error during raw file creation: {e}")
        return False

def PushToGithub(repo_url):
    """
    Copy the newest generated JSON file into the target repository and push it to GitHub.

    Steps:
      1. Pick the most recently created ``R_*_country_details.json`` file in
         the current working directory.
      2. Clone the repository into ``./YouTubeFoodChannelAnalysis``, or pull
         the latest changes if that directory already exists.
      3. Copy the file into the repository's ``Requirement/Daily`` directory.
      4. Commit the file and push to the ``origin`` remote.

    Args:
        repo_url: Clone URL of the target repository (only used when cloning).

    Returns:
        True if the file was committed and the push reported no error;
        False if no JSON file was found, a git/file operation raised, or the
        push was rejected or returned no result.
    """
    try:
        json_files = [
            name for name in os.listdir(os.getcwd())
            if name.startswith("R_") and name.endswith("_country_details.json")
        ]
        if not json_files:
            raise ValueError("No JSON files found!")
        latest_file = max(json_files, key=os.path.getctime)
    except (ValueError, OSError) as e:
        print(f"An error occurred at fetching recent .json file: {e}")
        return False

    # Local clone location; the directory name is fixed regardless of repo_url.
    local_repo_path = os.path.join(os.getcwd(), "YouTubeFoodChannelAnalysis")
    destination_path = os.path.join(local_repo_path, "Requirement", "Daily")

    print(f"Latest JSON file: {latest_file}")
    try:
        if os.path.exists(local_repo_path):
            print("Repository already cloned; pulling latest changes.")
            repo = git.Repo(local_repo_path)
            repo.remote(name='origin').pull()
            print("Successfully pulled the latest changes.")
        else:
            repo = git.Repo.clone_from(repo_url, local_repo_path)
            print("Successfully cloned the repository.")

        os.makedirs(destination_path, exist_ok=True)
        target_file = os.path.join(destination_path, latest_file)
        shutil.copyfile(os.path.join(os.getcwd(), latest_file), target_file)

        repo.index.add([target_file])
        timestamp = datetime.datetime.now(ist).strftime("%Y-%m-%d_%H:%M:%S")
        repo.index.commit(f"{timestamp} - Added {latest_file} from Kaggle notebook")

        push_result = repo.remote(name="origin").push()
        # A non-empty result list does not mean success: each entry carries
        # flags, and a rejected or failed ref has the ERROR bit set.
        push_failed = not push_result or any(
            info.flags & git.PushInfo.ERROR for info in push_result
        )
        if push_failed:
            print("Pushing to GitHub failed.")
            return False

        print("Output files successfully pushed to GitHub!")
        return True
    except Exception as e:
        print(f"An error occurred during git automation: {e}")
        return False

def main(repo_url, it_hub_regions, continent_name_mapping):
    """
    Run the pipeline: build and save the country JSON file, then push it to GitHub.

    Args:
        repo_url: Repository URL passed to ``PushToGithub``.
        it_hub_regions: Alpha-2 codes treated as IT hubs, passed to ``RawFile``.
        continent_name_mapping: Continent code to name mapping, passed to ``RawFile``.

    The boolean results of both steps are not inspected, so the push step runs
    even when file creation reported a failure.
    """
    RawFile(it_hub_regions, continent_name_mapping)
    PushToGithub(repo_url)

In [2]:
if __name__ == "__main__":
    # Entry point of the notebook. The imports below bind module-level names
    # (requests, BeautifulSoup, pc, datetime, os, json, shutil, git) that the
    # functions defined in the previous cell look up when they are called.
    import requests
    from bs4 import BeautifulSoup
    import pycountry_convert as pc
    import datetime, os, json, shutil, git
    from git import Repo
    from pytz import timezone
    # NOTE(review): display, JSON and pd are not referenced in the visible cells.
    from IPython.display import display, JSON
    import pandas as pd
    # Read the target repository URL from Kaggle's secret store
    from kaggle_secrets import UserSecretsClient
    user_secrets = UserSecretsClient()
    secret_value_0 = user_secrets.get_secret("requirementRepoUrl")  # Fetch repository URL
    repo_url = secret_value_0
    
    # Timezone used for file-name and commit timestamps: Indian Standard Time (IST).
    # RawFile and PushToGithub read this as the global name `ist`.
    ist = timezone('Asia/Kolkata')
    
    # Hard-coded ISO 3166-1 alpha-2 codes of the countries/regions flagged as IT hubs
    it_hub_regions = [
        'US', 'IN', 'CN', 'JP', 'KR', 'DE', 'GB', 'FR', 'CA', 'AU',
        'SG', 'SE', 'FI', 'IE', 'IL', 'NL', 'CH', 'ES', 'IT', 'BR', 
        'ZA', 'RU', 'AE', 'TR', 'PL', 'VN', 'MY', 'PH', 'TH', 'ID', 
        'HK', 'TW',
    ]
    
    # Mapping from two-letter continent codes to full continent names
    continent_name_mapping = {
        "AF": "Africa",
        "AS": "Asia",
        "EU": "Europe",
        "NA": "North America",
        "SA": "South America",
        "OC": "Oceania",
        "AN": "Antarctica"
    }
    
    # Generate the JSON file and push it to the repository
    main(repo_url, it_hub_regions, continent_name_mapping)

Dictionary saved as R_2025-02-03_19:36:50_country_details.json
Latest JSON file: R_2025-02-03_19:36:50_country_details.json
Successfully cloned the repository.
Output files successfully pushed to GitHub!
