In [2]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
from tqdm import tqdm
import re
import requests
import pandas as pd
from tqdm import tqdm
import numpy as np
import requests
import pandas as pd
import numpy as np
from io import StringIO

## Парсинг Википедии — данные по визовым требованиям для всех стран

In [1]:
def get_wikipedia_page_links(category_url, timeout=30):
    """Return the titles of every page that belongs to a Wikipedia category.

    Calls the English Wikipedia ``list=categorymembers`` API and keeps
    requesting batches for as long as the response carries a ``continue``
    section with a ``cmcontinue`` token.

    Args:
        category_url: Value sent as ``cmtitle``. Despite the parameter name
            this is a category title such as
            ``"Category:Visa_requirements_by_nationality"``, not a URL.
        timeout: Seconds to wait for each HTTP response. Without a timeout
            ``requests`` waits indefinitely, so one stalled connection would
            hang the whole crawl.

    Returns:
        list: Page titles, in the order the API returned them.

    Raises:
        requests.HTTPError: If any batch comes back with an error status.
        requests.Timeout: If a batch does not answer within ``timeout``.
    """
    base_url = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "format": "json",
        "list": "categorymembers",
        "cmtitle": category_url,
        "cmtype": "page",
        "cmlimit": "max",
    }
    pages = []

    # A single session lets consecutive batch requests reuse one connection.
    with requests.Session() as session:
        while True:
            response = session.get(base_url, params=params, timeout=timeout)
            response.raise_for_status()
            data = response.json()

            pages.extend(member['title'] for member in data['query']['categorymembers'])

            # No 'continue' section means this was the last batch.
            if 'continue' not in data:
                break
            params['cmcontinue'] = data['continue']['cmcontinue']

    return pages

def get_visa_requirements_table(country_page, timeout=30):
    """Download one Wikipedia page and return its visa-requirements table.

    The page is rendered through the ``action=parse`` API. The function looks
    for the first ``h2``/``h3`` heading whose text contains
    "Visa requirements" and parses the first ``wikitable`` that follows it.

    Args:
        country_page: Page title sent as the ``page`` parameter.
        timeout: Seconds to wait for the HTTP response, so a stalled
            connection cannot hang the crawl.

    Returns:
        pandas.DataFrame with the parsed table and an extra first column
        ``Country_сitizen`` identifying the source page, or ``None`` when the
        heading or the table is not found (a message is printed in that case).

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.Timeout: If the API does not answer within ``timeout``.
    """
    base_url = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "parse",
        "page": country_page,
        "format": "json",
        "prop": "text"
    }

    response = requests.get(base_url, params=params, timeout=timeout)
    response.raise_for_status()
    data = response.json()

    html_content = data["parse"]["text"]["*"]
    soup = BeautifulSoup(html_content, "html.parser")

    # First section heading that mentions "Visa requirements", if any.
    visa_section = next(
        (
            header
            for header in soup.find_all(['h2', 'h3'])
            if "Visa requirements" in header.get_text()
        ),
        None,
    )
    if not visa_section:
        print(f"Раздел 'Visa requirements' не найден для {country_page}.")
        return None

    table = visa_section.find_next("table", {"class": "wikitable"})
    if not table:
        print(f"Таблица не найдена для {country_page}.")
        return None

    # read_html expects a file-like object; passing the literal HTML string is
    # deprecated and produced the warning visible in the recorded cell output.
    df = pd.read_html(StringIO(str(table)))[0]

    # The titles seen in the recorded output contain spaces rather than
    # underscores, so this split leaves the full page title in place. Later
    # cells match on those full titles, so the behaviour is kept on purpose.
    country_name = country_page.split("Visa_requirements_for_")[-1].replace("_", " ")
    df.insert(0, "Country_сitizen", country_name)

    return df

def collect_visa_data_from_all_pages(category_url):
    """Scrape the visa table of every page in a category into one frame.

    Args:
        category_url: Category title forwarded to
            ``get_wikipedia_page_links``.

    Returns:
        A single DataFrame with all per-page tables stacked under a fresh
        index, or ``None`` (after printing a message) when no page yielded a
        table.
    """
    frames = []
    titles = get_wikipedia_page_links(category_url)

    for title in tqdm(titles, desc="Парсинг страниц", unit="страница"):
        page_table = get_visa_requirements_table(title)
        # Pages without a recognisable table come back as None and are skipped.
        if page_table is None:
            continue
        frames.append(page_table)

    if not frames:
        print("Не удалось собрать данные.")
        return None

    return pd.concat(frames, ignore_index=True)

# Scrape every page of the category and persist the combined table as CSV.
category_url = "Category:Visa_requirements_by_nationality"
visa_data = collect_visa_data_from_all_pages(category_url)

if visa_data is None:
    # Nothing was collected, so there is nothing to preview or save.
    print("Не удалось получить данные.")
else:
    print(visa_data.head())
    visa_data.to_csv("visa_requirements_all_countries.csv", index=False, encoding="utf-8")


Парсинг страниц:   0%|▎                                                          | 1/216 [00:00<03:29,  1.02страница/s]

Раздел 'Visa requirements' не найден для Visa requirements for Abkhazian citizens.


  df = pd.read_html(str(table))[0]
  df = pd.read_html(str(table))[0]
Парсинг страниц:   1%|▌                                                          | 2/216 [00:07<12:37,  3.54s/страница]

KeyboardInterrupt



## Сбор данных по государствам ООН - получение синонимов

In [18]:
## Fetch UN member state data from Wikidata
def fetch_wikidata_countries(timeout=60):
    """Fetch country labels and aliases from the Wikidata SPARQL endpoint.

    The query selects items that are ``P31 Q3624078`` and ``P463 Q1065``
    (excluding ``Q865``) plus the explicitly listed ``Q148`` and ``Q45``.
    NOTE(review): presumably sovereign state / UN member / Taiwan / China /
    Portugal respectively - confirm the ids on Wikidata.

    Args:
        timeout: Seconds to wait for the endpoint, so a stalled query cannot
            hang the notebook.

    Returns:
        pandas.DataFrame with one row per result binding and the columns
        ``Label`` (English label), ``Also known as`` (label, aliases and Scots
        label joined with ", ") and ``Also known as sco`` (Scots label or "").

    Raises:
        requests.HTTPError: If the endpoint answers with an error status.
        requests.Timeout: If the endpoint does not answer within ``timeout``.
    """
    # NOTE(review): both altLabel OPTIONAL blocks filter on "en", so ?aliases
    # and ?aliases_en carry the same values. Left untouched so that the
    # returned strings stay exactly as before.
    query = """
    SELECT
      ?countryLabel
      (GROUP_CONCAT(DISTINCT ?altLabel; SEPARATOR=", ")     AS ?aliases)
      (GROUP_CONCAT(DISTINCT ?altLabel_en; SEPARATOR=", ")  AS ?aliases_en)
      ?countryLabel_sco
    WHERE {
      {
        ?country wdt:P31 wd:Q3624078.
        ?country wdt:P463 wd:Q1065.
        FILTER( ?country != wd:Q865 )
      }
      UNION
      {
        VALUES ?country { wd:Q148 wd:Q45 }
      }

      SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }

      OPTIONAL {
        ?country rdfs:label ?countryLabel_sco.
        FILTER(LANG(?countryLabel_sco) = "sco")
      }

      OPTIONAL {
        ?country skos:altLabel ?altLabel.
        FILTER(LANG(?altLabel) = "en")
      }

      OPTIONAL {
        ?country skos:altLabel ?altLabel_en.
        FILTER(LANG(?altLabel_en) = "en")
      }
    }
    GROUP BY ?countryLabel ?countryLabel_sco
    ORDER BY ?countryLabel
    """

    url = "https://query.wikidata.org/sparql"
    response = requests.get(
        url,
        params={"query": query, "format": "json"},
        timeout=timeout,
    )
    # Fail on HTTP errors here instead of on an unreadable body further down;
    # this matches the other HTTP helpers in the notebook.
    response.raise_for_status()
    data = response.json()

    def optional_value(binding, key):
        """Return the value of an optional SPARQL variable, or "" if unbound."""
        return binding.get(key, {}).get("value", "")

    rows = []
    for item in data["results"]["bindings"]:
        label = item["countryLabel"]["value"]
        aliases = optional_value(item, "aliases")
        aliases_en = optional_value(item, "aliases_en")
        label_sco = optional_value(item, "countryLabel_sco")

        # strip(", ") removes the dangling separators left by empty parts at
        # either end of the joined string.
        combined_aliases = f"{label}, {aliases}, {aliases_en}, {label_sco}".strip(", ")

        rows.append({
            "Label": label,
            "Also known as": combined_aliases,
            "Also known as sco": label_sco
        })

    return pd.DataFrame(rows)
    
## Cleaning and patching the scraped data
def clean_visa_data(visa_data):
    """Clean and normalize the scraped visa table.

    Steps, in order:
      1. Append one hand-written row (Moroccan citizens -> Algeria,
         "Visa required"). NOTE(review): presumably this row is missing from
         the scraped table - confirm.
      2. Fold the alternative columns ``Country / Region`` and
         ``Entry requirement`` into ``Country`` and ``Visa requirement``
         wherever the alternative column holds a non-blank value.
      3. Strip footnote markers (everything from the first "[") and the
         suffixes " and territories" / " and Crown dependencies".
      4. Drop the rows of "Visa requirements for Taiwanese citizens".

    The input frame is not modified; a new frame is returned. Missing
    ``Country`` values stay missing instead of becoming the string "nan".

    Args:
        visa_data: Scraped table. Must contain the columns
            ``Country_сitizen``, ``Country``, ``Visa requirement``,
            ``Country / Region`` and ``Entry requirement``; its index is
            expected to be the default 0..n-1 range, because the extra row is
            stored under the label ``len(visa_data)``.

    Returns:
        pandas.DataFrame: The cleaned copy (original index labels are kept).
    """
    # Work on a copy so the caller's frame does not silently gain a row.
    cleaned = visa_data.copy()

    idx = len(cleaned)
    cleaned.loc[idx] = [np.nan] * len(cleaned.columns)
    cleaned.at[idx, 'Country_сitizen'] = 'Visa requirements for Moroccan citizens'
    cleaned.at[idx, 'Country'] = 'Algeria'
    cleaned.at[idx, 'Visa requirement'] = 'Visa required'

    # Some pages use different column headers; prefer them where filled in.
    cleaned['Country'] = np.where(
        cleaned['Country / Region'].str.strip().replace('', pd.NA).notna(),
        cleaned['Country / Region'],
        cleaned['Country']
    )

    cleaned['Visa requirement'] = np.where(
        cleaned['Entry requirement'].str.strip().replace('', pd.NA).notna(),
        cleaned['Entry requirement'],
        cleaned['Visa requirement']
    )

    # Vectorised footnote removal: unlike re.sub(..., str(x)) it keeps NaN as
    # NaN, and the trailing strip() removes the blank left before a footnote.
    cleaned['Country'] = cleaned['Country'].str.replace(r'\[.*', '', regex=True).str.strip()
    cleaned['Country / Region'] = cleaned['Country / Region'].str.split('[').str[0].str.strip()

    cleaned['Country'] = cleaned['Country'].str.replace(r'\s+and territories\b', '', regex=True)
    cleaned['Country'] = cleaned['Country'].str.replace(r'\s+and Crown dependencies\b', '', regex=True)

    # Explicit copy so later column assignments do not act on a slice.
    keep = cleaned['Country_сitizen'] != "Visa requirements for Taiwanese citizens"
    return cleaned[keep].copy()


## Look up a country by any of its alternative names
def find_country_aliases(country_name, df):
    """Resolve a country name to its label-plus-aliases string.

    An exact match on ``Label`` wins. Otherwise the rows are scanned in order
    and the first one whose ``Also known as`` list (split on ", ") contains
    the name is used.

    Args:
        country_name: Name to look up.
        df: Frame with the columns ``Label`` and ``Also known as``.

    Returns:
        ``"<Label>, <Also known as>"`` of the matching row with leading and
        trailing commas/spaces stripped, or ``None`` when nothing matches.
    """
    exact_match = df['Label'] == country_name
    if exact_match.any():
        label = df.loc[exact_match, 'Label'].iloc[0]
        known_as = df.loc[exact_match, 'Also known as'].iloc[0]
        return f"{label}, {known_as}".strip(", ")

    # Fall back to a linear scan over the alias lists.
    for label, known_as in zip(df['Label'], df['Also known as']):
        if country_name in known_as.split(", "):
            return f"{label}, {known_as}".strip(", ")

    return None


## Mapping
def map_country_aliases(visa_data, wikidata_df):
    """Attach the Wikidata alias string of each destination country.

    Every distinct value of ``Country`` is resolved once through
    ``find_country_aliases``; the result is written to a new
    ``Also known as`` column of ``visa_data`` (the frame is modified in place
    and also returned). Countries without a match get a missing value.
    """
    alias_lookup = {
        name: find_country_aliases(name, wikidata_df)
        for name in tqdm(visa_data['Country'].unique(), desc="Processing unique countries")
    }

    visa_data['Also known as'] = visa_data['Country'].map(alias_lookup)
    return visa_data


## Normalizing country names
def normalize_country_names(visa_data):
    """Replace ``Country`` by the primary name taken from ``Also known as``.

    The primary name is the text before the first comma of the alias string.
    Rows without an alias string keep their current ``Country``. The frame is
    modified in place and returned; the helper column ``normalized_country``
    is removed again before returning.
    """
    visa_data['normalized_country'] = visa_data['Also known as'].apply(
        lambda known_as: known_as.split(",")[0].strip() if pd.notna(known_as) else None
    )

    # Take the primary name where there is one, otherwise keep Country as is.
    has_primary = visa_data['normalized_country'].notna()
    visa_data['Country'] = visa_data['normalized_country'].where(has_primary, visa_data['Country'])

    del visa_data['normalized_country']
    return visa_data

def map_citizen_countries(visa_data):
    """Derive each citizen group's own country from the one it does not list.

    The candidate set is every ``Country`` that has an alias string. For each
    ``Country_сitizen`` group the candidates absent from that group's
    destinations are collected; if exactly one is absent it is taken as the
    group's country, otherwise a diagnostic line is printed and the group is
    mapped to ``None``. The result is stored in a new ``citizen_country``
    column (the frame is modified in place and returned).
    """
    has_aliases = visa_data['Also known as'].notna()
    un_member_countries = visa_data.loc[has_aliases, 'Country'].unique()

    country_mapping = {}
    for citizen_group, group_data in visa_data.groupby('Country_сitizen'):
        destinations = set(group_data['Country'].unique())
        missing = [name for name in un_member_countries if name not in destinations]

        if len(missing) == 1:
            country_mapping[citizen_group] = missing[0]
        else:
            print(f"Group '{citizen_group}': missing {len(missing)} UN countries: {missing}")
            country_mapping[citizen_group] = None

    visa_data['citizen_country'] = visa_data['Country_сitizen'].map(country_mapping)
    return visa_data

## Map visa statuses to categories and citizen countries to regions
def merge_additional_data(visa_data):
    """Attach the requirement category and the citizen's region to each row.

    Rows without ``Also known as`` or ``citizen_country`` are dropped. The
    ``Visa requirement`` text is cut at the first "[", trimmed and
    lower-cased, then left-joined against two lookup files read from the
    current working directory:

      * ``status_map.txt`` - ";"-separated, no header: status; category
        (both lower-cased, exact duplicate lines removed);
      * ``region_map.txt`` - ","-separated, no header: country, region.

    Args:
        visa_data: Frame with at least ``Also known as``,
            ``citizen_country`` and ``Visa requirement``. It is not modified.

    Returns:
        pandas.DataFrame: New frame with the extra columns ``Country_reg``,
        ``Region``, ``status`` and ``requirement_category``.
    """
    # Explicit copy: assigning columns on the frame returned by dropna is what
    # raised SettingWithCopyWarning in the recorded cell output.
    visa_data = visa_data.dropna(subset=['Also known as', 'citizen_country']).copy()

    visa_data['Visa requirement'] = (
        visa_data['Visa requirement']
        .str.split('[').str[0]   # drop footnote markers such as "[3]"
        .str.strip()
        .str.lower()
    )

    status_df = pd.read_csv('status_map.txt', sep=';', header=None, names=['status', 'requirement_category'])
    status_df['status'] = status_df['status'].str.lower()
    status_df['requirement_category'] = status_df['requirement_category'].str.lower()
    # Lines that differ only by case collapse to one row after lower-casing.
    status_df = status_df.drop_duplicates()

    region_df = pd.read_csv('region_map.txt', header=None, names=['Country_reg', 'Region'])

    visa_data = visa_data.merge(region_df, left_on='citizen_country', right_on='Country_reg', how='left')
    visa_data = visa_data.merge(status_df, left_on='Visa requirement', right_on='status', how='left')

    return visa_data

def main():
    """Run the cleaning pipeline on the scraped CSV and return the result.

    Reads ``visa_requirements_all_countries.csv``, then applies
    ``clean_visa_data``, ``map_country_aliases``, ``normalize_country_names``,
    ``map_citizen_countries`` and ``merge_additional_data`` in that order.
    The result is written to ``test_dfsds.csv`` (index included), printed,
    and returned.

    Returns:
        pandas.DataFrame: The fully processed visa table.
    """
    wikidata_df = fetch_wikidata_countries()

    visa_data = pd.read_csv('visa_requirements_all_countries.csv')
    visa_data = clean_visa_data(visa_data)
    visa_data = map_country_aliases(visa_data, wikidata_df)
    visa_data = normalize_country_names(visa_data)
    visa_data = map_citizen_countries(visa_data)
    visa_data = merge_additional_data(visa_data)

    visa_data.to_csv('test_dfsds.csv')
    print(visa_data)

    # Hand the frame back so the notebook can keep working with it.
    return visa_data

if __name__ == "__main__":
    # The cells that follow read visa_data["Also known as"], a column that
    # only exists after this pipeline. Binding the result here lets the
    # notebook run top to bottom on a fresh kernel.
    visa_data = main()

  visa_data = pd.read_csv('visa_requirements_all_countries.csv')
Processing unique countries: 100%|██████████████████████████████████████████████████| 238/238 [00:01<00:00, 125.13it/s]


Group 'Visa requirements for British Nationals (Overseas)': missing 0 UN countries: []
Group 'Visa requirements for British Overseas Territories citizens': missing 0 UN countries: []
Group 'Visa requirements for British Overseas citizens': missing 0 UN countries: []
Group 'Visa requirements for Chinese citizens of Hong Kong': missing 193 UN countries: ['Albania', 'Algeria', 'Andorra', 'Angola', 'Antigua and Barbuda', 'Argentina', 'Armenia', 'Australia', 'Austria', 'Azerbaijan', 'The Bahamas', 'Bahrain', 'Bangladesh', 'Barbados', 'Belarus', 'Belgium', 'Belize', 'Benin', 'Bhutan', 'Bolivia', 'Bosnia and Herzegovina', 'Botswana', 'Brazil', 'Brunei', 'Bulgaria', 'Burkina Faso', 'Burundi', 'Cambodia', 'Cameroon', 'Canada', 'Cape Verde', 'Central African Republic', 'Chad', 'Chile', "People's Republic of China", 'Colombia', 'Comoros', 'Republic of the Congo', 'Democratic Republic of the Congo', 'Costa Rica', 'Ivory Coast', 'Croatia', 'Cuba', 'Cyprus', 'Czech Republic', 'Kingdom of Denmark', '

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  visa_data['Visa requirement'] = visa_data['Visa requirement'].str.split('[').str[0].str.strip()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  visa_data['Visa requirement'] = visa_data['Visa requirement'].str.lower()


                                 Country_сitizen              Country  \
0          Visa requirements for Afghan citizens              Albania   
1          Visa requirements for Afghan citizens              Algeria   
2          Visa requirements for Afghan citizens              Andorra   
3          Visa requirements for Afghan citizens               Angola   
4          Visa requirements for Afghan citizens  Antigua and Barbuda   
...                                          ...                  ...   
37051  Visa requirements for Zimbabwean citizens            Venezuela   
37052  Visa requirements for Zimbabwean citizens              Vietnam   
37053  Visa requirements for Zimbabwean citizens                Yemen   
37054  Visa requirements for Zimbabwean citizens               Zambia   
37055    Visa requirements for Moroccan citizens              Algeria   

        Visa requirement Allowed stay  \
0                  evisa          NaN   
1          visa required          NaN   


## Данные по туристическому трафику по странам за 2024 год

In [24]:
# Download the tourist-arrivals table and build name -> arrivals lookups.
url = "https://datawrapper.dwcdn.net/Y8MiY/2/dataset.csv"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
}

try:
    # Bounded wait: without a timeout a stalled download blocks the notebook.
    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()
    df = pd.read_csv(StringIO(response.text))
    # The file is read as a single column and then split on tabs.
    # NOTE(review): presumably the payload is tab-separated despite the .csv
    # name - confirm; reading with sep="\t" directly would be more robust.
    split_data = df.iloc[:, 0].str.split('\t', expand=True)
    split_data.columns = ["Rank", "Country", "Tourist Arrivals", "% of World Visits"]
    # "2.7M" -> "2.7e6", "384.5K" -> "384.5e3", then parse as float.
    split_data["Tourist Arrivals"] = (
        split_data["Tourist Arrivals"]
        .str.replace('M', 'e6', regex=False)
        .str.replace('K', 'e3', regex=False)
        .astype(float)
    )
    country_to_tourist = split_data.set_index("Country")["Tourist Arrivals"].to_dict()
except Exception as e:
    # Best effort: on any failure continue with an empty lookup, which makes
    # every destination fall back to the default value applied later on.
    print(f"Ошибка загрузки данных: {e}")
    country_to_tourist = {}

# Requires visa_data to already carry the "Also known as" column.
all_aliases = visa_data["Also known as"].dropna().str.split(", ").explode()
# Arrivals keyed by every alias that literally appears in the download.
alias_to_avg = {
    alias: country_to_tourist[alias]
    for alias in all_aliases.unique()
    if alias in country_to_tourist
}

def get_tourist_value(aliases):
    """Return the arrivals figure of the first alias found in ``alias_to_avg``.

    Args:
        aliases: ", "-separated alias string, or a missing value.

    Returns:
        The value stored in the module-level ``alias_to_avg`` for the first
        alias that has one; ``None`` when ``aliases`` is missing or no alias
        is known.
    """
    if pd.isna(aliases):
        return None
    return next(
        (alias_to_avg[name] for name in aliases.split(", ") if name in alias_to_avg),
        None,
    )
visa_data.head()

Unnamed: 0,Country_сitizen,Country,Visa requirement,Allowed stay,Notes (excluding departure fees),Unnamed: 4,Reciprocity,Country / Region,Reciprocity [Note 1],Notes,...,citizen_country,Country_reg,Region,status,requirement_category,Median,Root_Median,Root_Median_Ratio,Коэффициент легкости,passport_power_by_traffic
0,Visa requirements for Afghan citizens,Albania,evisa,,,,,,,,...,Afghanistan,Afghanistan,Asia,evisa,evisa,2700000.0,14.808762,0.665699,0.5,0.33285
1,Visa requirements for Afghan citizens,Algeria,visa required,,,,,,,,...,Afghanistan,Afghanistan,Asia,visa required,visa required,591000.0,13.289571,0.530978,0.2,0.106196
2,Visa requirements for Afghan citizens,Andorra,visa required,,,,,,,,...,Afghanistan,Afghanistan,Asia,visa required,visa required,5200000.0,15.464169,0.72382,0.2,0.144764
3,Visa requirements for Afghan citizens,Angola,visa required,,,,,,,,...,Afghanistan,Afghanistan,Asia,visa required,visa required,218000.0,12.29225,0.442537,0.2,0.088507
4,Visa requirements for Afghan citizens,Antigua and Barbuda,evisa,,,,,,,,...,Afghanistan,Afghanistan,Asia,evisa,evisa,384500.0,12.859699,0.492858,0.5,0.246429


## Расчет силы паспорта

In [26]:
# Arrivals per destination; unknown or zero figures fall back to 1500.
visa_data["Median"] = (
    visa_data["Also known as"]
    .apply(get_tourist_value)
    .fillna(1500)
    .replace(0, 1500)
)

# Despite the column name this is the natural logarithm of the arrivals.
visa_data['Root_Median'] = np.log(visa_data['Median'])

# Min-max scale the log value into [0.001, 1]; if the scaling yields NaN
# (e.g. all values equal) the ratio defaults to 1.0.
min_root = visa_data['Root_Median'].min()
max_root = visa_data['Root_Median'].max()
scaled_root = (visa_data['Root_Median'] - min_root) / (max_root - min_root)
visa_data['Root_Median_Ratio'] = (scaled_root * (1 - 0.001) + 0.001).fillna(1.0)

# Ease-of-entry weight per requirement category; categories that are not
# listed here map to NaN.
visa_data['requirement_category'] = visa_data['requirement_category'].str.lower().str.strip()
difficulty_coefficients = {
    'freedom of movement': 1.0,
    'visa not required': 1.0,
    'evisa': 0.5,
    'visa on arrival': 0.5,
    'visa required': 0.2,
    'admission refused': 0.0
}
# The category column was lower-cased just above, so it is mapped directly.
visa_data['Коэффициент легкости'] = visa_data['requirement_category'].map(difficulty_coefficients)

# Per-row score: traffic weight of the destination times ease of entry.
visa_data['passport_power_by_traffic'] = (
    visa_data['Root_Median_Ratio'] * visa_data['Коэффициент легкости']
)

visa_data.to_csv('visa_data4.csv', index=False)
visa_data.head()

Unnamed: 0,Country_сitizen,Country,Visa requirement,Allowed stay,Notes (excluding departure fees),Unnamed: 4,Reciprocity,Country / Region,Reciprocity [Note 1],Notes,...,citizen_country,Country_reg,Region,status,requirement_category,Median,Root_Median,Root_Median_Ratio,Коэффициент легкости,passport_power_by_traffic
0,Visa requirements for Afghan citizens,Albania,evisa,,,,,,,,...,Afghanistan,Afghanistan,Asia,evisa,evisa,2700000.0,14.808762,0.665699,0.5,0.33285
1,Visa requirements for Afghan citizens,Algeria,visa required,,,,,,,,...,Afghanistan,Afghanistan,Asia,visa required,visa required,591000.0,13.289571,0.530978,0.2,0.106196
2,Visa requirements for Afghan citizens,Andorra,visa required,,,,,,,,...,Afghanistan,Afghanistan,Asia,visa required,visa required,5200000.0,15.464169,0.72382,0.2,0.144764
3,Visa requirements for Afghan citizens,Angola,visa required,,,,,,,,...,Afghanistan,Afghanistan,Asia,visa required,visa required,218000.0,12.29225,0.442537,0.2,0.088507
4,Visa requirements for Afghan citizens,Antigua and Barbuda,evisa,,,,,,,,...,Afghanistan,Afghanistan,Asia,evisa,evisa,384500.0,12.859699,0.492858,0.5,0.246429


In [27]:
# Total score per citizen country (grouped together with its region),
# highest first.
visa_data5 = (
    visa_data
    .groupby(['citizen_country', 'Region'])[['passport_power_by_traffic']]
    .sum()
    .sort_values(by='passport_power_by_traffic', ascending=False)
)

# Dense 1..n ranking; ties are broken by row order (method='first').
power_rank = visa_data5['passport_power_by_traffic'].rank(method='first', ascending=False)
visa_data5['rank'] = power_rank.astype(int)

visa_data5.to_csv('ppi.csv')
visa_data5

Unnamed: 0_level_0,Unnamed: 1_level_0,passport_power_by_traffic,rank
citizen_country,Region,Unnamed: 2_level_1,Unnamed: 3_level_1
Singapore,Asia,90.060109,1
Finland,Europe,88.019647,2
Malta,Europe,87.688419,3
France,Europe,87.590043,4
Kingdom of Denmark,Europe,87.483597,5
...,...,...,...
Afghanistan,Asia,30.933909,189
Sudan,Africa,30.537986,190
Eritrea,Africa,30.297790,191
Syria,Asia,29.633741,192
