In [1]:
import pandas as pd
from pprint import pprint
from item.historical.scripts.util.managers.dataframe import DataframeManager
from item.historical.scripts.util.managers.dataframe import ColumnName
from item.historical.scripts.util.managers.country_code import CountryCodeManager
from item.common import paths

# Variables used across the notebook

In [2]:
# Dataset identifier plus the two project helpers used by the later cells:
# a DataframeManager created for this dataset id and a CountryCodeManager.
DATASET_ID = "T000"
dataframeManager = DataframeManager(DATASET_ID)
countryCodeManager = CountryCodeManager()

# Opening the dataset

In [3]:
# Build the input file name from DATASET_ID (defined in the setup cell) so the
# file that is read always matches the dataset the managers were created for.
# With DATASET_ID = "T000" this resolves to 'T000_input.csv'.
path = paths['data']/'historical'/'input'/(DATASET_ID + '_input.csv')
df = pd.read_csv(path)
df

Unnamed: 0,COUNTRY,Country,VARIABLE,Variable,YEAR,Year,Unit Code,Unit,PowerCode Code,PowerCode,Reference Period Code,Reference Period,Value,Flag Codes,Flags
0,UKR,Ukraine,T-PASS-RD-BUS,Road passenger transport by buses and coaches,1990,1990,PKM,Passenger-kilometres,6,Millions,,,90323.00000,,
1,UKR,Ukraine,T-PASS-RD-BUS,Road passenger transport by buses and coaches,1991,1991,PKM,Passenger-kilometres,6,Millions,,,82691.00000,,
2,UKR,Ukraine,T-PASS-RD-BUS,Road passenger transport by buses and coaches,1992,1992,PKM,Passenger-kilometres,6,Millions,,,69357.00000,,
3,UKR,Ukraine,T-PASS-RD-BUS,Road passenger transport by buses and coaches,1993,1993,PKM,Passenger-kilometres,6,Millions,,,47142.00000,,
4,UKR,Ukraine,T-PASS-RD-BUS,Road passenger transport by buses and coaches,1994,1994,PKM,Passenger-kilometres,6,Millions,,,39952.00000,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9221,ARG,Argentina,T-PASS-TOT-INLD,Total inland passenger transport,2013,2013,PKM,Passenger-kilometres,6,Millions,,,47955.53068,,
9222,ARG,Argentina,T-PASS-TOT-INLD,Total inland passenger transport,2014,2014,PKM,Passenger-kilometres,6,Millions,,,49094.90265,,
9223,ARG,Argentina,T-PASS-TOT-INLD,Total inland passenger transport,2015,2015,PKM,Passenger-kilometres,6,Millions,,,51422.02431,,
9224,ARG,Argentina,T-PASS-TOT-INLD,Total inland passenger transport,2016,2016,PKM,Passenger-kilometres,6,Millions,,,54904.03347,,


# Determining data consistency for the column "Variable"
### Rationale: The variable "Road Passenger Transport" is the sum of "RPT by buses and coaches" + "RPT by passenger car". Therefore, we need to identify, for each country, the years in which the variable "Road Passenger Transport" is present but one of its components is missing.

In [4]:
# Getting the list of countries and grouping the df by country
group_by_country = df.groupby(df["Country"])
list_of_countries = list(set(df["Country"]))

# One sub-dataframe per country, keyed by country name
dic_country_df = {name: group for name, group in group_by_country}

# The aggregate variable and the two components it is the sum of
ROAD_TOTAL_VARIABLE = "Road passenger transport"
ROAD_COMPONENT_VARIABLES = {
    "Road passenger transport by buses and coaches",
    "Road passenger transport by passenger cars",
}

# For each country, store the years in which the aggregate is reported
# while at least one of its components is missing.
# Iterating over the groups themselves (instead of over the raw set of
# country labels) guarantees every key exists in dic_country_df.
result_per_country = {}
for country, df_for_country_x in dic_country_df.items():

    # group the dataframe of the country X based on year
    country_x_grouped_by_year = df_for_country_x.groupby(df_for_country_x.Year)

    # Years are visited in ascending order so the stored lists are sorted
    problematic_years = []
    for year in sorted(set(df_for_country_x.Year)):

        # Variables reported by country X in year Y
        variables_available_in_year_y = set(country_x_grouped_by_year.get_group(year)["Variable"])

        # Problematic: aggregate present but not both components
        aggregate_present = ROAD_TOTAL_VARIABLE in variables_available_in_year_y
        components_present = ROAD_COMPONENT_VARIABLES <= variables_available_in_year_y
        if aggregate_present and not components_present:
            problematic_years.append(year)

    # Only countries with at least one problematic year are recorded
    if problematic_years:
        result_per_country[country] = problematic_years

pprint(result_per_country)

{'Albania': [1970,
             1971,
             1972,
             1973,
             1974,
             1975,
             1976,
             1977,
             1978,
             1979,
             1980,
             1981,
             1982,
             1983,
             1984,
             1985,
             1986,
             1987,
             1988,
             1989,
             1990,
             1991],
 'Armenia': [2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017],
 'Azerbaijan': [1970,
                1971,
                1972,
                1973,
                1974,
                1975,
                1976,
                1977,
                1978,
                1979,
                1980,
                1981,
                1982,
                1983,
                1984],
 'Belarus': [1970,
             1971,
             1972,
             1973,
             1974,
             1975,
             1976,
             1977,
             1978,
           

## The results from the above cell demonstrate that there are 22 countries that have missing data for certain years. Below are the rules on how to handle each country:

###  Rule #1: The following countries are kept as they are: Bosnia-Herzegovina, China, Croatia, Estonia, Georgia, India, Latvia, Mexico, Moldova, "Montenegro, Republic of", Romania, Turkey, Ukraine.

### Rule #2: Perform the following operations for each country:
    - Albania: Remove the "Road passenger transport" & "Road passenger transport by buses and coaches" categories during the problematic time periods.
    - Armenia: Remove the "Road passenger transport" category during the problematic time periods.
    - Azerbaijan: Remove the "Road passenger transport" category during the problematic time periods.
    - Belarus: Remove the "Road passenger transport" category during the problematic time periods.
    - Bulgaria: Remove the "Road passenger transport" category during the problematic time periods.
    - Canada: Remove the "Road passenger transport" category during the problematic time periods.
    - Russian Federation: Remove the "Road passenger transport" category during the problematic time periods.
    - Switzerland: Remove the "Road passenger transport" category during the problematic time periods.
    - United States: Remove the "Road passenger transport" category during the problematic time periods.

## Applying Rule#2 to handle all the problematic countries

In [5]:
# Countries whose problematic years must be cleaned (Rule #2)
list_of_countries_that_need_cleaning = ['Albania', 'Armenia', 'Azerbaijan', 'Belarus', 'Bulgaria', 'Canada', 'Russian Federation', 'Switzerland', 'United States']

# The aggregate variable removed for every listed country
variable_to_erase = "Road passenger transport"

# Collect the row labels to drop, country by country and year by year
list_of_indices_to_erase = []
for country in list_of_countries_that_need_cleaning:

    # Rows of this country and the masks selecting the variables of interest
    rows_for_country = group_by_country.get_group(country)
    is_aggregate_row = rows_for_country.Variable == variable_to_erase
    is_bus_row = rows_for_country.Variable == "Road passenger transport by buses and coaches"

    # Years previously flagged as problematic for this country
    for year in result_per_country[country]:
        is_flagged_year = rows_for_country.Year == year
        list_of_indices_to_erase.extend(rows_for_country[is_aggregate_row & is_flagged_year].index)

        # Albania additionally loses its bus/coach rows for the same years
        if country == 'Albania':
            list_of_indices_to_erase.extend(rows_for_country[is_bus_row & is_flagged_year].index)

In [6]:
# Remove, in place, every row whose index label was collected in
# list_of_indices_to_erase by the previous cell.
df.drop(list_of_indices_to_erase, inplace=True)

In [7]:
# Regroup by country: the earlier grouping was built before rows were erased
group_by_country = df.groupby(df.Country)

# Summarise, per country, which variables remain and how many there are
dic_country = {}
for name, group in group_by_country:
    remaining_variables = set(group.Variable)
    dic_country[name] = {
        "Variables": remaining_variables,
        "Number of Vars": len(remaining_variables),
    }

In [8]:
# A country can have at most five variables, so keep every country
# whose summary reports fewer than five.
dic_special_countries = {
    country: summary
    for country, summary in dic_country.items()
    if summary["Number of Vars"] < 5
}

# Printing the result
pprint(dic_special_countries)

{'Bosnia-Herzegovina': {'Number of Vars': 4,
                        'Variables': {'Rail passenger transport',
                                      'Road passenger transport',
                                      'Road passenger transport by buses and '
                                      'coaches',
                                      'Total inland passenger transport'}},
 'China': {'Number of Vars': 3,
           'Variables': {'Rail passenger transport',
                         'Road passenger transport',
                         'Total inland passenger transport'}},
 'Croatia': {'Number of Vars': 4,
             'Variables': {'Rail passenger transport',
                           'Road passenger transport',
                           'Road passenger transport by buses and coaches',
                           'Total inland passenger transport'}},
 'Estonia': {'Number of Vars': 4,
             'Variables': {'Rail passenger transport',
                           'Road passenger t

### NOTE:  Based on the analysis done above, we discovered that 17 countries are missing variables and thus it is necessary to handle such countries.

### Rule 1: Countries having 1 or 3 variables are left as is. Therefore, the following countries are left as is: 
    China, Georgia, India, Ireland, Israel, Liechtenstein, Luxembourg, Montenegro, Turkey.
    
### Rule 2: For countries containing 4 variables we will erase the following two variables:  <i>Road Passenger Transport</i>  & <i>Total Inland Passenger Transport</i>. Below are the countries from which such variables are erased:
    Bosnia-Herzegovina, Croatia, Estonia, Latvia, Mexico, Moldova, Romania, Ukraine



# Applying the Rules for data consistency

In [9]:
# The two aggregate variables removed from the selected countries
variables_to_erase = ["Road passenger transport", "Total inland passenger transport"]

# Only the special countries that have exactly four variables are affected
countries_to_erase_variables = [
    country
    for country, summary in dic_special_countries.items()
    if summary["Number of Vars"] == 4
]

# Collect the index labels of every row holding one of the variables to erase
index_to_erase = []
for country in countries_to_erase_variables:
    rows_for_country = group_by_country.get_group(country)
    holds_erased_variable = rows_for_country.Variable.isin(variables_to_erase)
    index_to_erase.extend(rows_for_country[holds_erased_variable].index)

# Erasing the selected rows from the dataframe
df.drop(index_to_erase, inplace=True)

# Dropping unnecessary columns
### Rule: To comply with the latest template, we are dropping repeated columns and renaming others.

In [10]:
# Dropping the repeated columns, one name per line for easier review
columns_to_delete = [
    "COUNTRY",
    "YEAR",
    "VARIABLE",
    "Reference Period Code",
    "Unit Code",
    "Reference Period",
    "Flag Codes",
    "Flags",
    "PowerCode Code",
]
df.drop(columns=columns_to_delete, inplace=True)

# Adding the 'Source' column
### Rule: Add the same source to all rows since all data comes from same source

In [11]:
# Add the column named by ColumnName.SOURCE through the project helper; the
# displayed frame shows the same source text on every row.
dataframeManager.simple_column_insert(df,ColumnName.SOURCE.value,"International Transport Forum")
df

Unnamed: 0,Source,Country,Variable,Year,Unit,PowerCode,Value
0,International Transport Forum,Ukraine,Road passenger transport by buses and coaches,1990,Passenger-kilometres,Millions,90323.00000
1,International Transport Forum,Ukraine,Road passenger transport by buses and coaches,1991,Passenger-kilometres,Millions,82691.00000
2,International Transport Forum,Ukraine,Road passenger transport by buses and coaches,1992,Passenger-kilometres,Millions,69357.00000
3,International Transport Forum,Ukraine,Road passenger transport by buses and coaches,1993,Passenger-kilometres,Millions,47142.00000
4,International Transport Forum,Ukraine,Road passenger transport by buses and coaches,1994,Passenger-kilometres,Millions,39952.00000
...,...,...,...,...,...,...,...
9221,International Transport Forum,Argentina,Total inland passenger transport,2013,Passenger-kilometres,Millions,47955.53068
9222,International Transport Forum,Argentina,Total inland passenger transport,2014,Passenger-kilometres,Millions,49094.90265
9223,International Transport Forum,Argentina,Total inland passenger transport,2015,Passenger-kilometres,Millions,51422.02431
9224,International Transport Forum,Argentina,Total inland passenger transport,2016,Passenger-kilometres,Millions,54904.03347


# Adding the 'Service' column
### Rule: Since all the data is associated to passenger data, the service for all rows corresponds to 'Passenger'

In [12]:
# Add the column named by ColumnName.SERVICE through the project helper; the
# displayed frame shows "Passenger" on every row.
dataframeManager.simple_column_insert(df,ColumnName.SERVICE.value,"Passenger")
df

Unnamed: 0,Service,Source,Country,Variable,Year,Unit,PowerCode,Value
0,Passenger,International Transport Forum,Ukraine,Road passenger transport by buses and coaches,1990,Passenger-kilometres,Millions,90323.00000
1,Passenger,International Transport Forum,Ukraine,Road passenger transport by buses and coaches,1991,Passenger-kilometres,Millions,82691.00000
2,Passenger,International Transport Forum,Ukraine,Road passenger transport by buses and coaches,1992,Passenger-kilometres,Millions,69357.00000
3,Passenger,International Transport Forum,Ukraine,Road passenger transport by buses and coaches,1993,Passenger-kilometres,Millions,47142.00000
4,Passenger,International Transport Forum,Ukraine,Road passenger transport by buses and coaches,1994,Passenger-kilometres,Millions,39952.00000
...,...,...,...,...,...,...,...,...
9221,Passenger,International Transport Forum,Argentina,Total inland passenger transport,2013,Passenger-kilometres,Millions,47955.53068
9222,Passenger,International Transport Forum,Argentina,Total inland passenger transport,2014,Passenger-kilometres,Millions,49094.90265
9223,Passenger,International Transport Forum,Argentina,Total inland passenger transport,2015,Passenger-kilometres,Millions,51422.02431
9224,Passenger,International Transport Forum,Argentina,Total inland passenger transport,2016,Passenger-kilometres,Millions,54904.03347


# Adding the 'Mode' and 'Vehicle_type' Columns
### Rule: We use keywords from the values on the "Variable" column to determine the 'Mode' and 'Vehicle' type.

In [13]:
def _mode_and_vehicle_type(variable):
    """Return the (mode, vehicle type) pair for one value of the 'Variable' column.

    The decision is based on keywords contained in the text: "Rail" and "Road"
    select the mode; for road, "by buses" and "by passenger" select the vehicle
    type. Anything else falls back to "All".
    """
    if "Rail" in variable:
        return "Rail", "All"
    if "Road" in variable:
        if "by buses" in variable:
            return "Road", "Bus"
        if "by passenger" in variable:
            return "Road", "LDV"
        return "Road", "All"
    return "All", "All"


# Classify every row, in dataframe order, from its 'Variable' text
list_mode = []
list_vehicle_type = []
for variable in df["Variable"]:
    mode, vehicle_type = _mode_and_vehicle_type(variable)
    list_mode.append(mode)
    list_vehicle_type.append(vehicle_type)

# Both new columns must have exactly one entry per dataframe row
assert len(list_mode) == len(df) and len(list_vehicle_type) == len(df)

# Adding the "Mode" and "Vehicle type" to the dataframe
df[ColumnName.MODE.value] = list_mode
df[ColumnName.VEHICLE_TYPE.value] = list_vehicle_type
df

Unnamed: 0,Service,Source,Country,Variable,Year,Unit,PowerCode,Value,Mode,Vehicle Type
0,Passenger,International Transport Forum,Ukraine,Road passenger transport by buses and coaches,1990,Passenger-kilometres,Millions,90323.00000,Road,Bus
1,Passenger,International Transport Forum,Ukraine,Road passenger transport by buses and coaches,1991,Passenger-kilometres,Millions,82691.00000,Road,Bus
2,Passenger,International Transport Forum,Ukraine,Road passenger transport by buses and coaches,1992,Passenger-kilometres,Millions,69357.00000,Road,Bus
3,Passenger,International Transport Forum,Ukraine,Road passenger transport by buses and coaches,1993,Passenger-kilometres,Millions,47142.00000,Road,Bus
4,Passenger,International Transport Forum,Ukraine,Road passenger transport by buses and coaches,1994,Passenger-kilometres,Millions,39952.00000,Road,Bus
...,...,...,...,...,...,...,...,...,...,...
9221,Passenger,International Transport Forum,Argentina,Total inland passenger transport,2013,Passenger-kilometres,Millions,47955.53068,All,All
9222,Passenger,International Transport Forum,Argentina,Total inland passenger transport,2014,Passenger-kilometres,Millions,49094.90265,All,All
9223,Passenger,International Transport Forum,Argentina,Total inland passenger transport,2015,Passenger-kilometres,Millions,51422.02431,All,All
9224,Passenger,International Transport Forum,Argentina,Total inland passenger transport,2016,Passenger-kilometres,Millions,54904.03347,All,All


# Adding the 'Fuel' and 'Technology' column
### Rule: The dataset does not provide any data about those two columns, so we added the default value in both cases

In [14]:
# Adding the "Technology" and "Fuel" columns through the project helper.
# The dataset carries no such information, so both receive the value "All".
dataframeManager.simple_column_insert(df,ColumnName.TECHNOLOGY.value,"All")
dataframeManager.simple_column_insert(df,ColumnName.FUEL.value,"All")
df

Unnamed: 0,Fuel,Technology,Service,Source,Country,Variable,Year,Unit,PowerCode,Value,Mode,Vehicle Type
0,All,All,Passenger,International Transport Forum,Ukraine,Road passenger transport by buses and coaches,1990,Passenger-kilometres,Millions,90323.00000,Road,Bus
1,All,All,Passenger,International Transport Forum,Ukraine,Road passenger transport by buses and coaches,1991,Passenger-kilometres,Millions,82691.00000,Road,Bus
2,All,All,Passenger,International Transport Forum,Ukraine,Road passenger transport by buses and coaches,1992,Passenger-kilometres,Millions,69357.00000,Road,Bus
3,All,All,Passenger,International Transport Forum,Ukraine,Road passenger transport by buses and coaches,1993,Passenger-kilometres,Millions,47142.00000,Road,Bus
4,All,All,Passenger,International Transport Forum,Ukraine,Road passenger transport by buses and coaches,1994,Passenger-kilometres,Millions,39952.00000,Road,Bus
...,...,...,...,...,...,...,...,...,...,...,...,...
9221,All,All,Passenger,International Transport Forum,Argentina,Total inland passenger transport,2013,Passenger-kilometres,Millions,47955.53068,All,All
9222,All,All,Passenger,International Transport Forum,Argentina,Total inland passenger transport,2014,Passenger-kilometres,Millions,49094.90265,All,All
9223,All,All,Passenger,International Transport Forum,Argentina,Total inland passenger transport,2015,Passenger-kilometres,Millions,51422.02431,All,All
9224,All,All,Passenger,International Transport Forum,Argentina,Total inland passenger transport,2016,Passenger-kilometres,Millions,54904.03347,All,All


# Managing the 'Unit' column
### Rule: The data provides values in Million so we are converting it to Billion. Additionally, we set the correct name for the unit based on the template

In [15]:
# Since there is only one unit, drop the current 'Unit' and 'PowerCode'
# columns and add a new 'Unit' column holding the template unit name
columns_to_delete = ["Unit", "PowerCode"]
df.drop(columns=columns_to_delete, inplace=True)
dataframeManager.simple_column_insert(df, ColumnName.UNIT.value, "10^9 passenger-km / yr")

# Transforming the current value in Million to Billion (1M = 0.001B).
# A single column assignment replaces the former row-by-row
# `df.Value[index] = ...`, which is chained assignment: it raises
# SettingWithCopyWarning and does not update df under pandas copy-on-write.
MILLION_TO_BILLION = 0.001
df["Value"] = df["Value"] * MILLION_TO_BILLION

df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


Unnamed: 0,Unit,Fuel,Technology,Service,Source,Country,Variable,Year,Value,Mode,Vehicle Type
0,10^9 passenger-km / yr,All,All,Passenger,International Transport Forum,Ukraine,Road passenger transport by buses and coaches,1990,90.323000,Road,Bus
1,10^9 passenger-km / yr,All,All,Passenger,International Transport Forum,Ukraine,Road passenger transport by buses and coaches,1991,82.691000,Road,Bus
2,10^9 passenger-km / yr,All,All,Passenger,International Transport Forum,Ukraine,Road passenger transport by buses and coaches,1992,69.357000,Road,Bus
3,10^9 passenger-km / yr,All,All,Passenger,International Transport Forum,Ukraine,Road passenger transport by buses and coaches,1993,47.142000,Road,Bus
4,10^9 passenger-km / yr,All,All,Passenger,International Transport Forum,Ukraine,Road passenger transport by buses and coaches,1994,39.952000,Road,Bus
...,...,...,...,...,...,...,...,...,...,...,...
9221,10^9 passenger-km / yr,All,All,Passenger,International Transport Forum,Argentina,Total inland passenger transport,2013,47.955531,All,All
9222,10^9 passenger-km / yr,All,All,Passenger,International Transport Forum,Argentina,Total inland passenger transport,2014,49.094903,All,All
9223,10^9 passenger-km / yr,All,All,Passenger,International Transport Forum,Argentina,Total inland passenger transport,2015,51.422024,All,All
9224,10^9 passenger-km / yr,All,All,Passenger,International Transport Forum,Argentina,Total inland passenger transport,2016,54.904033,All,All


# Managing the 'Variable' column
### Rule: To comply with the current template we are setting the correct value that pertains to passenger related activities.

In [16]:
# Since all variables available are related to passenger activity, drop the
# current 'Variable' column and add the correct one through the project helper
columns_to_delete = ["Variable"]
df.drop(columns=columns_to_delete, inplace = True)
dataframeManager.simple_column_insert(df,ColumnName.VARIABLE.value,"Passenger Activity")
df

Unnamed: 0,Variable,Unit,Fuel,Technology,Service,Source,Country,Year,Value,Mode,Vehicle Type
0,Passenger Activity,10^9 passenger-km / yr,All,All,Passenger,International Transport Forum,Ukraine,1990,90.323000,Road,Bus
1,Passenger Activity,10^9 passenger-km / yr,All,All,Passenger,International Transport Forum,Ukraine,1991,82.691000,Road,Bus
2,Passenger Activity,10^9 passenger-km / yr,All,All,Passenger,International Transport Forum,Ukraine,1992,69.357000,Road,Bus
3,Passenger Activity,10^9 passenger-km / yr,All,All,Passenger,International Transport Forum,Ukraine,1993,47.142000,Road,Bus
4,Passenger Activity,10^9 passenger-km / yr,All,All,Passenger,International Transport Forum,Ukraine,1994,39.952000,Road,Bus
...,...,...,...,...,...,...,...,...,...,...,...
9221,Passenger Activity,10^9 passenger-km / yr,All,All,Passenger,International Transport Forum,Argentina,2013,47.955531,All,All
9222,Passenger Activity,10^9 passenger-km / yr,All,All,Passenger,International Transport Forum,Argentina,2014,49.094903,All,All
9223,Passenger Activity,10^9 passenger-km / yr,All,All,Passenger,International Transport Forum,Argentina,2015,51.422024,All,All
9224,Passenger Activity,10^9 passenger-km / yr,All,All,Passenger,International Transport Forum,Argentina,2016,54.904033,All,All


# Adding the ISO value to each country
    Rule: For each country we need to assign their respective ISO code

## Determining which countries do not have an ISO code
    As seen from the code below, four countries appear not to have an ISO code. However, this is because their names are written in a format that the lookup does not recognise. The mapping below shows how each of those "missing" countries will be renamed in order to obtain its ISO code
    
    Original Name --> New name
        > Montenegro, Republic of --> Montenegro
        > Bosnia-Herzegovina --> Bosnia and Herzegovina
        > Korea --> Korea, Republic of
        > Serbia, Republic of --> Serbia

In [17]:
# Getting the distinct country names present in the dataframe
list_of_countries = list(set(df["Country"]))

# Asking the CountryCodeManager which of those names have no ISO code
countries_with_no_ISO_code = countryCodeManager.get_list_of_countries_with_no_iso_code(list_of_countries)
        
# Display the countries with no ISO code
countries_with_no_ISO_code

['Bosnia-Herzegovina',
 'Serbia, Republic of',
 'Korea',
 'Montenegro, Republic of']

## Adding the ISO column to the dataset

In [18]:
# Dataset spelling -> spelling for which an ISO code can be obtained
country_name_fixes = {
    "Montenegro, Republic of": "Montenegro",
    "Bosnia-Herzegovina": "Bosnia and Herzegovina",
    "Korea": "Korea, Republic of",
    "Serbia, Republic of": "Serbia",
}

# Translate every row's country name; names without a fix are kept unchanged
dirty_list_of_all_countries = df["Country"]
clean_list_of_all_countries = [
    country_name_fixes.get(country, country)
    for country in dirty_list_of_all_countries
]

# The cleaned list must have one entry per original entry
assert len(dirty_list_of_all_countries) == len(clean_list_of_all_countries)

# After the renaming, no country may be left without an ISO code
assert len(countryCodeManager.get_list_of_countries_with_no_iso_code(clean_list_of_all_countries)) == 0

# Getting the list of iso codes
list_of_iso_codes = countryCodeManager.get_list_of_iso_for_countries(clean_list_of_all_countries)

# Adding the column to the dataframe
df[ColumnName.ISO_CODE.value] = list_of_iso_codes
df

Unnamed: 0,Variable,Unit,Fuel,Technology,Service,Source,Country,Year,Value,Mode,Vehicle Type,ISO Code
0,Passenger Activity,10^9 passenger-km / yr,All,All,Passenger,International Transport Forum,Ukraine,1990,90.323000,Road,Bus,UKR
1,Passenger Activity,10^9 passenger-km / yr,All,All,Passenger,International Transport Forum,Ukraine,1991,82.691000,Road,Bus,UKR
2,Passenger Activity,10^9 passenger-km / yr,All,All,Passenger,International Transport Forum,Ukraine,1992,69.357000,Road,Bus,UKR
3,Passenger Activity,10^9 passenger-km / yr,All,All,Passenger,International Transport Forum,Ukraine,1993,47.142000,Road,Bus,UKR
4,Passenger Activity,10^9 passenger-km / yr,All,All,Passenger,International Transport Forum,Ukraine,1994,39.952000,Road,Bus,UKR
...,...,...,...,...,...,...,...,...,...,...,...,...
9221,Passenger Activity,10^9 passenger-km / yr,All,All,Passenger,International Transport Forum,Argentina,2013,47.955531,All,All,ARG
9222,Passenger Activity,10^9 passenger-km / yr,All,All,Passenger,International Transport Forum,Argentina,2014,49.094903,All,All,ARG
9223,Passenger Activity,10^9 passenger-km / yr,All,All,Passenger,International Transport Forum,Argentina,2015,51.422024,All,All,ARG
9224,Passenger Activity,10^9 passenger-km / yr,All,All,Passenger,International Transport Forum,Argentina,2016,54.904033,All,All,ARG


# Getting the ITEM Region for each country
    Rule: For each country, we need to specify the ITEM region it belongs to

## Determining which countries are missing an ITEM Region
    As seen below, all countries belong to a region. Therefore, no special processing needs to be done.

In [19]:
# Getting the distinct ISO codes present in the dataframe
list_of_iso_codes = list(set(df["ISO Code"]))

# Asking the CountryCodeManager which of those codes have no region
iso_code_with_no_region = countryCodeManager.get_list_of_iso_codes_with_no_region(list_of_iso_codes)

# Display the ISO codes with no region (an empty list means none are missing)
iso_code_with_no_region

[]

## Adding the ITEM region column to the dataset

In [20]:
# Getting the ISO code of every row (one entry per row, duplicates included)
list_of_all_codes = df["ISO Code"]

# Look up the regions for those codes.
# NOTE(review): presumably returns one region per input code, in the same
# order, since the result is assigned directly as a column; confirm against
# CountryCodeManager.
item_region = countryCodeManager.get_list_of_regions_for_iso_codes(list_of_all_codes)

# Adding the column to the dataframe
df[ColumnName.ITEM_REGION.value] = item_region
df

Unnamed: 0,Variable,Unit,Fuel,Technology,Service,Source,Country,Year,Value,Mode,Vehicle Type,ISO Code,Region
0,Passenger Activity,10^9 passenger-km / yr,All,All,Passenger,International Transport Forum,Ukraine,1990,90.323000,Road,Bus,UKR,Non-EU Europe
1,Passenger Activity,10^9 passenger-km / yr,All,All,Passenger,International Transport Forum,Ukraine,1991,82.691000,Road,Bus,UKR,Non-EU Europe
2,Passenger Activity,10^9 passenger-km / yr,All,All,Passenger,International Transport Forum,Ukraine,1992,69.357000,Road,Bus,UKR,Non-EU Europe
3,Passenger Activity,10^9 passenger-km / yr,All,All,Passenger,International Transport Forum,Ukraine,1993,47.142000,Road,Bus,UKR,Non-EU Europe
4,Passenger Activity,10^9 passenger-km / yr,All,All,Passenger,International Transport Forum,Ukraine,1994,39.952000,Road,Bus,UKR,Non-EU Europe
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9221,Passenger Activity,10^9 passenger-km / yr,All,All,Passenger,International Transport Forum,Argentina,2013,47.955531,All,All,ARG,Other Latin America
9222,Passenger Activity,10^9 passenger-km / yr,All,All,Passenger,International Transport Forum,Argentina,2014,49.094903,All,All,ARG,Other Latin America
9223,Passenger Activity,10^9 passenger-km / yr,All,All,Passenger,International Transport Forum,Argentina,2015,51.422024,All,All,ARG,Other Latin America
9224,Passenger Activity,10^9 passenger-km / yr,All,All,Passenger,International Transport Forum,Argentina,2016,54.904033,All,All,ARG,Other Latin America


# Reordering the columns positions
### Rule: The order of the columns is based on the order stated in the current template

In [21]:
# Rebind df to the frame returned by the DataframeManager's column reordering
# and display it.
df = dataframeManager.reorder_columns(df)
df

Unnamed: 0,Source,Country,ISO Code,Region,Variable,Unit,Service,Mode,Vehicle Type,Technology,Fuel,Value,Year
0,International Transport Forum,Ukraine,UKR,Non-EU Europe,Passenger Activity,10^9 passenger-km / yr,Passenger,Road,Bus,All,All,90.323000,1990
1,International Transport Forum,Ukraine,UKR,Non-EU Europe,Passenger Activity,10^9 passenger-km / yr,Passenger,Road,Bus,All,All,82.691000,1991
2,International Transport Forum,Ukraine,UKR,Non-EU Europe,Passenger Activity,10^9 passenger-km / yr,Passenger,Road,Bus,All,All,69.357000,1992
3,International Transport Forum,Ukraine,UKR,Non-EU Europe,Passenger Activity,10^9 passenger-km / yr,Passenger,Road,Bus,All,All,47.142000,1993
4,International Transport Forum,Ukraine,UKR,Non-EU Europe,Passenger Activity,10^9 passenger-km / yr,Passenger,Road,Bus,All,All,39.952000,1994
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9221,International Transport Forum,Argentina,ARG,Other Latin America,Passenger Activity,10^9 passenger-km / yr,Passenger,All,All,All,All,47.955531,2013
9222,International Transport Forum,Argentina,ARG,Other Latin America,Passenger Activity,10^9 passenger-km / yr,Passenger,All,All,All,All,49.094903,2014
9223,International Transport Forum,Argentina,ARG,Other Latin America,Passenger Activity,10^9 passenger-km / yr,Passenger,All,All,All,All,51.422024,2015
9224,International Transport Forum,Argentina,ARG,Other Latin America,Passenger Activity,10^9 passenger-km / yr,Passenger,All,All,All,All,54.904033,2016


# Exporting the results

In [62]:
# Export the final dataframe twice through the DataframeManager; the cell
# output reports where each file was saved.

# Programming Friendly View
dataframeManager.create_programming_friendly_file(df)

# User Friendly View
dataframeManager.create_user_friendly_file(df)

> PF File saved at: /Users/hlinero/Documents/database/item/historical/scripts
> UF File saved at: /Users/hlinero/Documents/database/item/historical/scripts


# Final Note

### After analysing the final data result, we discovered that the values presented for India are not accurate. The reasoning is that the data imply India has the largest PKT of all countries, which is not the case.