# Country Codes & Continents: A Dataset with ISO 3166-1 Alpha-2

This notebook builds a dataset that maps each officially assigned ISO 3166-1 alpha-2 code to its country name, its continent (name and code), and a flag marking whether the country is on the notebook's list of IT hub regions.

**Key Features:**

* Utilizes the ISO 3166-1 Alpha-2 standard for country codes.
* Includes a comprehensive list of countries and their associated continents.
* Provides a clean and organized dataset for various data analysis and mapping projects.

**Potential Use Cases:**

* Geocoding and mapping applications.
* Data analysis and visualization projects.
* Internationalization and localization tasks.
* Building applications that require country-specific information.

This notebook demonstrates a simple and efficient approach to gathering and organizing country-related data. 

**Note:** 

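* Country codes and names are scraped from the Wikipedia article on ISO 3166-1 alpha-2, and continents come from `pycountry_convert` with a Wikipedia keyword search as a fallback, so the accuracy of the information should be verified independently.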
<!-- * This is a basic example, and you can further enhance it by adding more details such as country names, currencies, or time zones. -->

# [ISO 3166-1 alpha-2](https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2#Officially_assigned_code_elements)

In [1]:
# Function to fetch continent from Wikipedia if pycountry_convert fails
def fetch_continent_from_wikipedia(country_name):
    """Guess a country's continent code from its English Wikipedia article.

    The article at ``https://en.wikipedia.org/wiki/<country_name>`` (spaces
    replaced by underscores) is downloaded and its text is searched for the
    seven continent names. The continent mentioned most often wins; ties go
    to the continent listed first in ``continent_keywords``.

    Args:
        country_name: Country name used to build the article URL.

    Returns:
        A two-letter continent code ("AF", "AS", "EU", "NA", "SA", "OC" or
        "AN"), or "Unknown" when the name is empty, the page cannot be
        fetched or parsed, or no continent name appears in the page text.
    """
    # An empty name would request the bare /wiki/ URL, not a country article.
    if not country_name:
        return "Unknown"

    # Continent names to look for and the codes returned for them.
    continent_keywords = {
        "Africa": "AF", "Asia": "AS", "Europe": "EU",
        "North America": "NA", "South America": "SA",
        "Oceania": "OC", "Antarctica": "AN"
    }

    try:
        search_url = f"https://en.wikipedia.org/wiki/{country_name.replace(' ', '_')}"
        # A timeout keeps one unresponsive request from stalling the whole run.
        response = requests.get(search_url, timeout=30)
        # Do not scan the body of an error page (e.g. 404) for continent names.
        response.raise_for_status()
        page_text = BeautifulSoup(response.text, "lxml").text
    except Exception:
        # Deliberately best-effort: any network or parsing failure means "Unknown".
        return "Unknown"

    # Returning the first keyword found anywhere on the page would let a single
    # passing mention of "Africa" beat the country's real continent, so rank
    # the continents by how often each one is mentioned instead.
    mentions = {
        code: page_text.count(name) for name, code in continent_keywords.items()
    }
    best_code = max(mentions, key=mentions.get)
    return best_code if mentions[best_code] else "Unknown"

In [2]:
def get_continent_code(alpha2, country_name):
    """Resolve the continent code for a country.

    ``pycountry_convert`` is asked first, using the alpha-2 code. When it
    raises ``KeyError`` for that code, the continent is looked up from
    Wikipedia by country name instead.

    Args:
        alpha2: ISO 3166-1 alpha-2 country code.
        country_name: Country name, used only for the Wikipedia fallback.

    Returns:
        The continent code from ``pycountry_convert``, or whatever
        ``fetch_continent_from_wikipedia`` returns for ``country_name``.
    """
    try:
        resolved = pc.country_alpha2_to_continent_code(alpha2)
    except KeyError:
        # pycountry_convert has no continent for this code; fall back to scraping.
        resolved = fetch_continent_from_wikipedia(country_name)
    return resolved

In [3]:
def ISO_3166_1_Alpha_2(it_hub_regions,continent_name_mapping):
    """Build a per-country lookup from Wikipedia's ISO 3166-1 alpha-2 article.

    The rendered article is fetched through the MediaWiki ``parse`` API, the
    first table after the "Officially assigned code elements" heading is
    located, and one entry is produced per row that carries a code.

    Args:
        it_hub_regions: Container of alpha-2 codes treated as IT hubs; only
            membership (``code in it_hub_regions``) is tested.
        continent_name_mapping: Mapping of continent code to continent name;
            codes missing from it yield the name "Unknown".

    Returns:
        dict: alpha-2 code -> dict with the keys ``country_name``,
        ``continent``, ``continent_code`` and ``it_hub_country``
        ("Yes" or "No").

    Raises:
        ValueError: If the API response holds no page HTML, or the expected
            heading or table is missing from the page.
        requests.RequestException: If the HTTP request fails, times out, or
            returns an error status.
    """
    # Fetch ISO country data from Wikipedia API
    url = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "parse",
        "page": "ISO_3166-1_alpha-2",
        "format": "json",
        "prop": "text"
    }

    # A timeout keeps an unresponsive server from stalling the run, and the
    # status check stops an error response from being parsed as page data.
    response = requests.get(url, params=params, timeout=30)
    response.raise_for_status()
    data = response.json()

    # Walk the payload defensively so any missing level produces the
    # ValueError below instead of a bare KeyError.
    page_content = data.get("parse", {}).get("text", {}).get("*")
    if not page_content:
        raise ValueError("Failed to fetch data from Wikipedia.")

    soup = BeautifulSoup(page_content, "lxml")

    # Find the header and then the table
    class_legal = soup.find('h3', id='Officially_assigned_code_elements')
    if not class_legal:
        raise ValueError("Section with id 'Officially_assigned_code_elements' not found.")

    table = class_legal.find_next("table")
    if not table:
        raise ValueError("Table not found after the header.")

    # Named differently from the function so the function name is not shadowed.
    countries = {}

    # Iterate over each table row to extract country codes and names
    for row in table.find_all("tr"):
        cells = row.find_all("td")
        if len(cells) < 2:
            continue  # Skip rows that don't have enough cells (e.g. the header row)

        # Extract ISO alpha-2 country code
        code_span = cells[0].find('span', class_="monospaced")
        key = code_span.text.strip() if code_span else ""
        if not key:
            continue  # Only rows with a code are recorded

        # Extract country name from the first link in the second cell
        link = cells[1].find('a')
        value = link.text.strip() if link else ""

        # Get continent code (either from pycountry_convert or Wikipedia scraping)
        continent_cd = get_continent_code(key, value)

        # Store the result in the dictionary
        countries[key] = {
            "country_name": value,
            "continent": continent_name_mapping.get(continent_cd, "Unknown"),
            "continent_code": continent_cd,
            "it_hub_country": "Yes" if key in it_hub_regions else "No"
        }

    return countries

In [4]:
def RawFile(it_hub_regions,continent_name_mapping):
    """
    Builds the country dictionary and saves it as a timestamped JSON file.

    The dictionary comes from ``ISO_3166_1_Alpha_2``. When it is non-empty it
    is written to ``R_<timestamp>_country_details.json`` (a relative path, so
    the file lands in the current working directory), with the timestamp
    taken in the timezone held by the module-level ``ist``.

    Args:
        it_hub_regions: Alpha-2 codes flagged as IT hubs; passed through to
            ``ISO_3166_1_Alpha_2``.
        continent_name_mapping: Continent code -> continent name mapping;
            passed through to ``ISO_3166_1_Alpha_2``.

    Returns:
        bool: True if the file is successfully created and saved, False if
        there was no data to save or an error occurred.
    """
    try:
        dictionary = ISO_3166_1_Alpha_2(it_hub_regions,continent_name_mapping)

        # Nothing was written, so do not report success to the caller.
        if not dictionary:
            print("No data to save since empty dictionary returned.")
            return False

        # Timestamp keeps file names unique across runs.
        # NOTE(review): the ':' characters are not valid in Windows file
        # names; confirm the output is only ever used on POSIX systems.
        timestamp = datetime.datetime.now(ist).strftime("%Y-%m-%d_%H:%M:%S")
        filename = f"R_{timestamp}_country_details.json"

        # ensure_ascii=False keeps accented country names readable in the file.
        with open(filename, "w", encoding="utf-8") as file:
            json.dump(dictionary, file, indent=4, ensure_ascii=False)
        print(f"dictionary saved as {filename}")
        return True
    except Exception as e:
        # Best-effort step: report the failure and signal it via the return value.
        print(f"Error during raw file creation: {e}")
        return False

In [5]:
def PushToGithub(repo_url):
    """Commit the newest country-details JSON file to the GitHub repository.

    The most recently created ``R_*_country_details.json`` file in
    ``/kaggle/working`` is copied into ``Requirement/Daily`` inside a local
    clone of the repository (cloned from ``repo_url`` on first use, pulled
    otherwise), then committed and pushed to ``origin``.

    Args:
        repo_url: URL the repository is cloned from when no local clone exists.

    Returns:
        bool: True if the file was committed and the push reported no error;
        False if no matching file was found, the push failed, or any other
        error occurred.
    """
    working_dir = '/kaggle/working'
    # Define repository and destination paths
    kaggle_repo_url = '/kaggle/working/YouTubeFoodChannelAnalysis'
    destination_path = '/kaggle/working/YouTubeFoodChannelAnalysis/Requirement/Daily'

    try:
        # Filter and find the most recent .json file
        json_files = [
            file for file in os.listdir(working_dir)
            if file.startswith("R_") and file.endswith("_country_details.json")
        ]
        if not json_files:
            raise ValueError("No JSON files found!")
        # listdir returns bare names; join them with the directory so the
        # lookup does not depend on the current working directory.
        LatestFiles = max(
            json_files,
            key=lambda name: os.path.getctime(os.path.join(working_dir, name)),
        )
    except (ValueError, OSError) as e:
        print(f"An error occurred at fetching recent .json file: {e}")
        return False  # Exit the function if there's an error in fetching JSON files

    print(LatestFiles)  # Print the latest JSON file name
    try:
        # Reuse an existing clone (after pulling) or clone the repository afresh.
        if os.path.exists(kaggle_repo_url):
            print("Already cloned and the repo file exists")
            repo = git.Repo(kaggle_repo_url)
            repo.remote(name='origin').pull()
            print("Successfully pulled the git repo before push")
        else:
            repo = git.Repo.clone_from(repo_url, kaggle_repo_url)
            print("Successfully cloned the git repo")

        # Create the destination directory if needed, then copy the file in.
        os.makedirs(destination_path, exist_ok=True)
        copied_file = os.path.join(destination_path, LatestFiles)
        shutil.copyfile(os.path.join(working_dir, LatestFiles), copied_file)

        # Stage and commit the copied file.
        repo.index.add([copied_file])
        timestamp = datetime.datetime.now(ist).strftime("%Y-%m-%d_%H:%M:%S")
        repo.index.commit(f"{timestamp} Added files from Kaggle notebook, {LatestFiles}")

        # Push the changes to the remote repository. An empty result means
        # the push failed outright; rejected refs carry the ERROR flag, so a
        # non-empty result alone does not prove success.
        push_result = repo.remote(name="origin").push()
        push_failed = (not push_result) or any(
            info.flags & info.ERROR for info in push_result
        )
        if push_failed:
            print("Output files pushed to GitHub failed:(")
            return False
        print("Output files successfully pushed to GitHub!")
        return True

    except Exception as e:
        # Handle any errors that occur during the git automation process
        print(f"An error occurred at git automation code: {e}")
        return False  # Return False if an error occurs

In [6]:
def main(repo_url,it_hub_regions,continent_name_mapping):
    """Create the country-details JSON file and, if that worked, push it to GitHub.

    Args:
        repo_url: URL of the GitHub repository that receives the file.
        it_hub_regions: Alpha-2 codes flagged as IT hubs; passed to ``RawFile``.
        continent_name_mapping: Continent code -> continent name mapping;
            passed to ``RawFile``.
    """
    # PushToGithub uploads the newest matching file it finds, so pushing after
    # a failed build would re-commit an older file as if it were fresh.
    if RawFile(it_hub_regions,continent_name_mapping):
        PushToGithub(repo_url)
    else:
        print("Skipping GitHub push because the raw file step did not succeed.")

In [7]:
# Dependencies are bound at module level rather than under the __main__ guard:
# the functions in this file look these names up as globals when they run, so
# they must also exist when the file is imported instead of executed.
import requests
from IPython.display import display,JSON
from bs4 import BeautifulSoup
import pycountry_convert as pc
import datetime
import os
import pandas as pd
import git
from git import Repo
import shutil
from pytz import timezone
from datetime import timedelta
import json

# Setting the timezone to Indian Standard Time (IST); RawFile and PushToGithub
# read this global when building timestamps.
ist = timezone('Asia/Kolkata')

# Entry point of the script
if __name__ == "__main__":
    # Fetching secrets from Kaggle's secure environment (only available on
    # Kaggle, so this import stays inside the guard)
    from kaggle_secrets import UserSecretsClient
    user_secrets = UserSecretsClient()
    secret_value_0 = user_secrets.get_secret("requirementRepoUrl")  # Fetch the source repository URL

    # Assigning secrets to variables
    repo_url = secret_value_0

    # List of IT hub regions (ISO 3166-1 alpha-2 codes)
    it_hub_regions = [
        'US',  'IN',  'CN',  'JP',  'KR',  'DE',  'GB',  'FR',  'CA',  'AU',
        'SG',  'SE',  'FI',  'IE',  'IL',  'NL',  'CH',  'ES',  'IT',  'BR',
        'ZA',  'RU',  'AE',  'TR',  'PL',  'VN',  'MY',  'PH',  'TH',  'ID',
        'HK',  'TW',
    ]

    # Continent code to continent name mapping
    continent_name_mapping = {
        "AF": "Africa",
        "AS": "Asia",
        "EU": "Europe",
        "NA": "North America",
        "SA": "South America",
        "OC": "Oceania",
        "AN": "Antarctica"
    }

    main(repo_url,it_hub_regions,continent_name_mapping)



dictionary saved as R_2025-02-03_19:08:45_country_details.json
R_2025-02-03_19:08:45_country_details.json
Successfully cloned the git repo
Output files successfully pushed to GitHub!
