In [4]:
import pandas as pd
from pprint import pprint
from item.historical.scripts.util.managers.dataframe import DataframeManager
from item.historical.scripts.util.managers.dataframe import ColumnName
from item.historical.scripts.util.managers.country_code import CountryCodeManager
from item.common import paths

# Variables used all over the notebook

In [5]:
DATASET_ID = "T003"
dataframeManager = DataframeManager(DATASET_ID)
countryCodeManager = CountryCodeManager()

# Opening the dataset

In [6]:
path = paths['data']/'historical'/'input'/'T003_input.csv'
df = pd.read_csv(path)
df

Unnamed: 0,COUNTRY,Country,VARIABLE,Variable,YEAR,Year,Unit Code,Unit,PowerCode Code,PowerCode,Reference Period Code,Reference Period,Value,Flag Codes,Flags
0,SVN,Slovenia,T-GOODS-RD-OWN,Road freight transport on own account,1970,1970,TONNEKM,Tonnes-kilometres,6,Millions,,,534.0,,
1,SVN,Slovenia,T-GOODS-RD-OWN,Road freight transport on own account,1971,1971,TONNEKM,Tonnes-kilometres,6,Millions,,,700.0,,
2,SVN,Slovenia,T-GOODS-RD-OWN,Road freight transport on own account,1972,1972,TONNEKM,Tonnes-kilometres,6,Millions,,,1098.0,,
3,SVN,Slovenia,T-GOODS-RD-OWN,Road freight transport on own account,1973,1973,TONNEKM,Tonnes-kilometres,6,Millions,,,1078.0,,
4,SVN,Slovenia,T-GOODS-RD-OWN,Road freight transport on own account,1974,1974,TONNEKM,Tonnes-kilometres,6,Millions,,,745.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13207,KOR,Korea,T-GOODS-RD-REW,Road freight transport for hire and reward,2013,2013,TONNEKM,Tonnes-kilometres,6,Millions,,,118582.0,,
13208,KOR,Korea,T-GOODS-RD-REW,Road freight transport for hire and reward,2014,2014,TONNEKM,Tonnes-kilometres,6,Millions,,,124650.0,,
13209,KOR,Korea,T-GOODS-RD-REW,Road freight transport for hire and reward,2015,2015,TONNEKM,Tonnes-kilometres,6,Millions,,,132382.0,,
13210,KOR,Korea,T-GOODS-RD-REW,Road freight transport for hire and reward,2016,2016,TONNEKM,Tonnes-kilometres,6,Millions,,,135259.0,,


# Removing all unnecessary columns
    Rule: To comply with the latest template, we will drop all the unnecessary columns and rename others.

In [7]:
# Droping the repeated columns
columns_to_delete = ["COUNTRY", "YEAR", "VARIABLE","Reference Period Code","Unit Code","Reference Period", "Flag Codes", "Flags", "PowerCode Code"]
df.drop(columns=columns_to_delete, inplace = True)
df

Unnamed: 0,Country,Variable,Year,Unit,PowerCode,Value
0,Slovenia,Road freight transport on own account,1970,Tonnes-kilometres,Millions,534.0
1,Slovenia,Road freight transport on own account,1971,Tonnes-kilometres,Millions,700.0
2,Slovenia,Road freight transport on own account,1972,Tonnes-kilometres,Millions,1098.0
3,Slovenia,Road freight transport on own account,1973,Tonnes-kilometres,Millions,1078.0
4,Slovenia,Road freight transport on own account,1974,Tonnes-kilometres,Millions,745.0
...,...,...,...,...,...,...
13207,Korea,Road freight transport for hire and reward,2013,Tonnes-kilometres,Millions,118582.0
13208,Korea,Road freight transport for hire and reward,2014,Tonnes-kilometres,Millions,124650.0
13209,Korea,Road freight transport for hire and reward,2015,Tonnes-kilometres,Millions,132382.0
13210,Korea,Road freight transport for hire and reward,2016,Tonnes-kilometres,Millions,135259.0


# Identifying the countries with missing values
    Rule: We will erase all the countries with missing values

In [8]:
list_of_countries_with_missing_values = list(set(df[df['Value'].isnull()]["Country"]))
print(">> Number of countries missing values: {}".format(len(list_of_countries_with_missing_values)))
print(">> Countries missing values:")
pprint(list_of_countries_with_missing_values)
print(">> Number of rows to erase: {}".format(len(df[df['Value'].isnull()])))

>> Number of countries missing values: 29
>> Countries missing values:
['Korea',
 'North Macedonia',
 'Japan',
 'Turkey',
 'Ireland',
 'Moldova',
 'Greece',
 'Estonia',
 'Luxembourg',
 'Sweden',
 'Bosnia-Herzegovina',
 'Australia',
 'Norway',
 'Portugal',
 'New Zealand',
 'Israel',
 'Liechtenstein',
 'Finland',
 'Armenia',
 'Mexico',
 'Azerbaijan',
 'Slovenia',
 'Iceland',
 'Albania',
 'Georgia',
 'Denmark',
 'Montenegro, Republic of',
 'Malta',
 'Spain']
>> Number of rows to erase: 2067


In [9]:
# Dropping the values
df.dropna(inplace = True)

# Adding the "Source" Column
    Rule: Since all the data comes from the same dataset, all rows have the same source, which is the "International Transport Forum"

In [10]:
dataframeManager.simple_column_insert(df,ColumnName.SOURCE.value,"International Transport Forum")
df

Unnamed: 0,Source,Country,Variable,Year,Unit,PowerCode,Value
0,International Transport Forum,Slovenia,Road freight transport on own account,1970,Tonnes-kilometres,Millions,534.0
1,International Transport Forum,Slovenia,Road freight transport on own account,1971,Tonnes-kilometres,Millions,700.0
2,International Transport Forum,Slovenia,Road freight transport on own account,1972,Tonnes-kilometres,Millions,1098.0
3,International Transport Forum,Slovenia,Road freight transport on own account,1973,Tonnes-kilometres,Millions,1078.0
4,International Transport Forum,Slovenia,Road freight transport on own account,1974,Tonnes-kilometres,Millions,745.0
...,...,...,...,...,...,...,...
13207,International Transport Forum,Korea,Road freight transport for hire and reward,2013,Tonnes-kilometres,Millions,118582.0
13208,International Transport Forum,Korea,Road freight transport for hire and reward,2014,Tonnes-kilometres,Millions,124650.0
13209,International Transport Forum,Korea,Road freight transport for hire and reward,2015,Tonnes-kilometres,Millions,132382.0
13210,International Transport Forum,Korea,Road freight transport for hire and reward,2016,Tonnes-kilometres,Millions,135259.0


# Determing what countries have missing data related to the "Road" mode and Determining what countries have missing data related to "Total Inland FT" mode.

    Rationale 1: Mode "Road" and VT "All" is broken down into two VTs: "Road freight transport for hire and reward" and "Road freight transport on own account." We need to identify for each country, the years in which the the "All" VT is presented but one if its components is missing.
    
    Rationale 2: Just as above with the "Road" mode, the "Inland" mode has "All" as VT. However, the components that make up the "All" include the VT "Pipeline," "Inland Waterway," "Rail," and "Road ALL". Therefore, if one of those components are missing for a country, we need to identify it.

In [11]:
list(set(df["Variable"]))

['Road freight transport on own account',
 'Road freight transport',
 'Rail freight transport',
 'Inland waterways freight transport',
 'Pipelines transport',
 'Road freight transport for hire and reward',
 'Total inland freight transport']

In [12]:
# group the datafram by country
df_by_country = df.groupby(df["Country"])

# Variable holding for each country the list of problematic year
result_problematic_countries_road_mode = {}
result_problematic_countries_inland_mode = {}

# Loop through each country
for name, group in df_by_country:
    
    # For country X, group the df by year
    country_x_by_year = group.groupby(group.Year)
    
    # variable holding the problematic years for country X
    problematic_years_road_analysis = []
    problematic_years_inland_analysis = []
    
    # For each year Y in country X, check what values are missing
    for year_name, year_group in country_x_by_year:
        list_of_variables_for_country_x = set(year_group["Variable"])
        
        # Checking for problems with "Road" mode
        if "Road freight transport" in list_of_variables_for_country_x:
            for_hire_name = "Road freight transport for hire and reward"
            on_account_name = "Road freight transport on own account"
            if (for_hire_name not in list_of_variables_for_country_x and on_account_name in list_of_variables_for_country_x) or (for_hire_name in list_of_variables_for_country_x and on_account_name not in list_of_variables_for_country_x):
                problematic_years_road_analysis.append(year_name)

        # Checking for problem with "Inland" Mode
        if "Total inland freight transport" in list_of_variables_for_country_x:
            variables_of_interest_to_be_present = ['Inland waterways freight transport','Road freight transport','Pipelines transport','Rail freight transport']
            is_at_leat_one_variable_of_interest_present = False
            for variable in variables_of_interest_to_be_present:
                if variable in list_of_variables_for_country_x:
                    is_at_leat_one_variable_of_interest_present = True
            
            if is_at_leat_one_variable_of_interest_present == True:
                problematic_years_inland_analysis.append(year_name)
                    
    # Adding the list of problematic years to the country
    if len(problematic_years_road_analysis) > 0:
        result_problematic_countries_road_mode[name] = problematic_years_road_analysis
        
    
    if len(problematic_years_inland_analysis) > 0:
        result_problematic_countries_inland_mode[name] = problematic_years_inland_analysis

print("These are the countries with the year in which there are problem with the Road mode")
# result_problematic_countries_road_mode
result_problematic_countries_inland_mode

These are the countries with the year in which there are problem with the Road mode


{'Albania': [1970,
  1971,
  1972,
  1973,
  1974,
  1975,
  1976,
  1977,
  1978,
  1979,
  1980,
  1981,
  1982,
  1983,
  1984,
  1985,
  1986,
  1987,
  1988,
  1989,
  1990,
  1991,
  1992,
  1993,
  1994,
  1995,
  1996,
  1997,
  1998,
  1999,
  2000,
  2001,
  2002,
  2003,
  2004,
  2005,
  2006,
  2007,
  2008,
  2009,
  2010,
  2011,
  2012,
  2013],
 'Armenia': [2001,
  2002,
  2003,
  2004,
  2005,
  2006,
  2007,
  2008,
  2009,
  2010,
  2011,
  2012,
  2013,
  2014,
  2015,
  2016,
  2017],
 'Australia': [1970,
  1971,
  1972,
  1973,
  1974,
  1975,
  1976,
  1977,
  1978,
  1979,
  1980,
  1981,
  1982,
  1983,
  1984,
  1985,
  1986,
  1987,
  1988,
  1989,
  1990,
  1991,
  1992,
  1993,
  1994,
  1995,
  1996,
  1997,
  1998,
  1999,
  2000,
  2001,
  2002,
  2003,
  2004,
  2005,
  2006,
  2007,
  2008,
  2009,
  2010,
  2011,
  2012,
  2013,
  2014,
  2015,
  2016],
 'Austria': [1970,
  1971,
  1972,
  1973,
  1974,
  1975,
  1976,
  1977,
  1978,
  1979,
  1980,

# Adding the "Service" column
    Rule: As seen from the above section, there are seven(7) variables. Six (6) of them are related to freight, while one is related to pipeline. Therefore, depending on the case, the service is either "Freight" or "Pipeline"

In [13]:
# Variables holding the service value for each row
service_column = []

# For each value row, determine
for index, row in df.iterrows():
    if row["Variable"] == "Pipelines transport":
        service_column.append("Pipeline")
    else:
        service_column.append("Freight")
        
# Adding the service column to the df
df[ColumnName.SERVICE.value] = service_column

# Validating the data by asserting that the service assigned matches with the variable
for index, row in df.iterrows():
    if row["Variable"] == "Pipelines transport":
        assert row["Service"] == "Pipeline"
    else:
        assert row["Service"] == "Freight"

df

Unnamed: 0,Source,Country,Variable,Year,Unit,PowerCode,Value,Service
0,International Transport Forum,Slovenia,Road freight transport on own account,1970,Tonnes-kilometres,Millions,534.0,Freight
1,International Transport Forum,Slovenia,Road freight transport on own account,1971,Tonnes-kilometres,Millions,700.0,Freight
2,International Transport Forum,Slovenia,Road freight transport on own account,1972,Tonnes-kilometres,Millions,1098.0,Freight
3,International Transport Forum,Slovenia,Road freight transport on own account,1973,Tonnes-kilometres,Millions,1078.0,Freight
4,International Transport Forum,Slovenia,Road freight transport on own account,1974,Tonnes-kilometres,Millions,745.0,Freight
...,...,...,...,...,...,...,...,...
13207,International Transport Forum,Korea,Road freight transport for hire and reward,2013,Tonnes-kilometres,Millions,118582.0,Freight
13208,International Transport Forum,Korea,Road freight transport for hire and reward,2014,Tonnes-kilometres,Millions,124650.0,Freight
13209,International Transport Forum,Korea,Road freight transport for hire and reward,2015,Tonnes-kilometres,Millions,132382.0,Freight
13210,International Transport Forum,Korea,Road freight transport for hire and reward,2016,Tonnes-kilometres,Millions,135259.0,Freight


# Adding the "Fuel" and "Technology" columns
    Rule: The dataset does not provide any information about fuel and technology. Therefore, we assing "All" to both cases.

In [14]:
dataframeManager.simple_column_insert(df,ColumnName.TECHNOLOGY.value,"All")
dataframeManager.simple_column_insert(df,ColumnName.FUEL.value,"All")
df

Unnamed: 0,Fuel,Technology,Source,Country,Variable,Year,Unit,PowerCode,Value,Service
0,All,All,International Transport Forum,Slovenia,Road freight transport on own account,1970,Tonnes-kilometres,Millions,534.0,Freight
1,All,All,International Transport Forum,Slovenia,Road freight transport on own account,1971,Tonnes-kilometres,Millions,700.0,Freight
2,All,All,International Transport Forum,Slovenia,Road freight transport on own account,1972,Tonnes-kilometres,Millions,1098.0,Freight
3,All,All,International Transport Forum,Slovenia,Road freight transport on own account,1973,Tonnes-kilometres,Millions,1078.0,Freight
4,All,All,International Transport Forum,Slovenia,Road freight transport on own account,1974,Tonnes-kilometres,Millions,745.0,Freight
...,...,...,...,...,...,...,...,...,...,...
13207,All,All,International Transport Forum,Korea,Road freight transport for hire and reward,2013,Tonnes-kilometres,Millions,118582.0,Freight
13208,All,All,International Transport Forum,Korea,Road freight transport for hire and reward,2014,Tonnes-kilometres,Millions,124650.0,Freight
13209,All,All,International Transport Forum,Korea,Road freight transport for hire and reward,2015,Tonnes-kilometres,Millions,132382.0,Freight
13210,All,All,International Transport Forum,Korea,Road freight transport for hire and reward,2016,Tonnes-kilometres,Millions,135259.0,Freight


# Adding the "Mode" column
    Rule: The mapping from each variable to the corresponding Mode is as follows:
    
         'Road freight transport' --> Road
         'Road freight transport for hire and reward' --> Road
         'Road freight transport on own account' --> Road
         'Rail freight transport' --> Rail
         'Pipelines transport' --> Pipeline
         'Inland waterways freight transport' --> Shipping
         'Total inland freight transport' --> Inland
         

In [15]:
# Variable holding info about mode of each column
mode_column = []

# For each row, do the following analysis
for index, row in df.iterrows():
    if "Road" in row["Variable"]:
        mode_column.append("Road")
        
    elif "Rail" in row["Variable"]:
        mode_column.append("Rail")
        
    elif "Pipelines" in row["Variable"]:
        mode_column.append("Pipeline")
        
    elif row["Variable"] == "Inland waterways freight transport" :
        mode_column.append("Shipping")
    
    elif row["Variable"] == "Total inland freight transport":
        mode_column.append("Inland")

# Assigning the mode column to the df
df[ColumnName.MODE.value] = mode_column
df

Unnamed: 0,Fuel,Technology,Source,Country,Variable,Year,Unit,PowerCode,Value,Service,Mode
0,All,All,International Transport Forum,Slovenia,Road freight transport on own account,1970,Tonnes-kilometres,Millions,534.0,Freight,Road
1,All,All,International Transport Forum,Slovenia,Road freight transport on own account,1971,Tonnes-kilometres,Millions,700.0,Freight,Road
2,All,All,International Transport Forum,Slovenia,Road freight transport on own account,1972,Tonnes-kilometres,Millions,1098.0,Freight,Road
3,All,All,International Transport Forum,Slovenia,Road freight transport on own account,1973,Tonnes-kilometres,Millions,1078.0,Freight,Road
4,All,All,International Transport Forum,Slovenia,Road freight transport on own account,1974,Tonnes-kilometres,Millions,745.0,Freight,Road
...,...,...,...,...,...,...,...,...,...,...,...
13207,All,All,International Transport Forum,Korea,Road freight transport for hire and reward,2013,Tonnes-kilometres,Millions,118582.0,Freight,Road
13208,All,All,International Transport Forum,Korea,Road freight transport for hire and reward,2014,Tonnes-kilometres,Millions,124650.0,Freight,Road
13209,All,All,International Transport Forum,Korea,Road freight transport for hire and reward,2015,Tonnes-kilometres,Millions,132382.0,Freight,Road
13210,All,All,International Transport Forum,Korea,Road freight transport for hire and reward,2016,Tonnes-kilometres,Millions,135259.0,Freight,Road


# Adding the "Vehicle Type" column
    Rule: The mapping from each variable to the corresponding Vehicle Type is as follows:
    
         'Pipelines transport' --> Pipeline
         'Rail freight transport' --> All
         'Road freight transport' --> All
         'Road freight transport for hire and reward' --> For Hire and Reward
         'Road freight transport on own account' --> For Own Account
         'Inland waterways freight transport' --> Inland
         'Total inland freight transport' --> All

In [16]:
# Variable holding the vehhicle type
vehicle_type = []

# For each row, do the following:
for index, row in df.iterrows():
    if row["Variable"] == "Pipelines transport":
        vehicle_type.append("Pipeline")
        
    elif row["Variable"] == "Rail freight transport":
        vehicle_type.append("All")
        
    elif row["Variable"] == "Road freight transport":
        vehicle_type.append("All")
        
    elif row["Variable"] == "Road freight transport for hire and reward":
        vehicle_type.append("For hire and reward")
    
    elif row["Variable"] == "Road freight transport on own account":
        vehicle_type.append("For Own Account")
    
    elif row["Variable"] == "Inland waterways freight transport":
        vehicle_type.append("Inland Waterway")
    
    elif row["Variable"] == "Total inland freight transport":
        vehicle_type.append("All")

# Adding the vehicle type column
df[ColumnName.VEHICLE_TYPE.value] = vehicle_type
df

Unnamed: 0,Fuel,Technology,Source,Country,Variable,Year,Unit,PowerCode,Value,Service,Mode,Vehicle Type
0,All,All,International Transport Forum,Slovenia,Road freight transport on own account,1970,Tonnes-kilometres,Millions,534.0,Freight,Road,For Own Account
1,All,All,International Transport Forum,Slovenia,Road freight transport on own account,1971,Tonnes-kilometres,Millions,700.0,Freight,Road,For Own Account
2,All,All,International Transport Forum,Slovenia,Road freight transport on own account,1972,Tonnes-kilometres,Millions,1098.0,Freight,Road,For Own Account
3,All,All,International Transport Forum,Slovenia,Road freight transport on own account,1973,Tonnes-kilometres,Millions,1078.0,Freight,Road,For Own Account
4,All,All,International Transport Forum,Slovenia,Road freight transport on own account,1974,Tonnes-kilometres,Millions,745.0,Freight,Road,For Own Account
...,...,...,...,...,...,...,...,...,...,...,...,...
13207,All,All,International Transport Forum,Korea,Road freight transport for hire and reward,2013,Tonnes-kilometres,Millions,118582.0,Freight,Road,For hire and reward
13208,All,All,International Transport Forum,Korea,Road freight transport for hire and reward,2014,Tonnes-kilometres,Millions,124650.0,Freight,Road,For hire and reward
13209,All,All,International Transport Forum,Korea,Road freight transport for hire and reward,2015,Tonnes-kilometres,Millions,132382.0,Freight,Road,For hire and reward
13210,All,All,International Transport Forum,Korea,Road freight transport for hire and reward,2016,Tonnes-kilometres,Millions,135259.0,Freight,Road,For hire and reward


# Analyzing how many units are available

In [17]:
set(df["Unit"])

{'Tonnes-kilometres'}

# Setting the "Unit" column
    Rule: Since there the dataset only has one unit, we will change it to the appropriate unit which is "10 ^9 tonnes - km / yr"

In [18]:
# Erasing the current "Unit" column
df.drop(columns=["Unit"], inplace = True)
dataframeManager.simple_column_insert(df,ColumnName.UNIT.value,"10^9 tonne-km / yr")
df

Unnamed: 0,Unit,Fuel,Technology,Source,Country,Variable,Year,PowerCode,Value,Service,Mode,Vehicle Type
0,10^9 tonne-km / yr,All,All,International Transport Forum,Slovenia,Road freight transport on own account,1970,Millions,534.0,Freight,Road,For Own Account
1,10^9 tonne-km / yr,All,All,International Transport Forum,Slovenia,Road freight transport on own account,1971,Millions,700.0,Freight,Road,For Own Account
2,10^9 tonne-km / yr,All,All,International Transport Forum,Slovenia,Road freight transport on own account,1972,Millions,1098.0,Freight,Road,For Own Account
3,10^9 tonne-km / yr,All,All,International Transport Forum,Slovenia,Road freight transport on own account,1973,Millions,1078.0,Freight,Road,For Own Account
4,10^9 tonne-km / yr,All,All,International Transport Forum,Slovenia,Road freight transport on own account,1974,Millions,745.0,Freight,Road,For Own Account
...,...,...,...,...,...,...,...,...,...,...,...,...
13207,10^9 tonne-km / yr,All,All,International Transport Forum,Korea,Road freight transport for hire and reward,2013,Millions,118582.0,Freight,Road,For hire and reward
13208,10^9 tonne-km / yr,All,All,International Transport Forum,Korea,Road freight transport for hire and reward,2014,Millions,124650.0,Freight,Road,For hire and reward
13209,10^9 tonne-km / yr,All,All,International Transport Forum,Korea,Road freight transport for hire and reward,2015,Millions,132382.0,Freight,Road,For hire and reward
13210,10^9 tonne-km / yr,All,All,International Transport Forum,Korea,Road freight transport for hire and reward,2016,Millions,135259.0,Freight,Road,For hire and reward


# Setting the "Value" column
    Rule: The magnitude of the value is millions. We are changing it to billion, given than 1B = 1000M

In [19]:
# Variable holding the new magnitude
magnitude_in_billion = []

for index, row in df.iterrows():
    current_magnitude = row["Value"]
    new_magnitude = current_magnitude/1000
    magnitude_in_billion.append(new_magnitude)

# Removing the old magnitude
df.drop(columns=["Value"], inplace=True)

# Adding the new values in Billion
df[ColumnName.VALUE.value] = magnitude_in_billion
df

Unnamed: 0,Unit,Fuel,Technology,Source,Country,Variable,Year,PowerCode,Service,Mode,Vehicle Type,Value
0,10^9 tonne-km / yr,All,All,International Transport Forum,Slovenia,Road freight transport on own account,1970,Millions,Freight,Road,For Own Account,0.534
1,10^9 tonne-km / yr,All,All,International Transport Forum,Slovenia,Road freight transport on own account,1971,Millions,Freight,Road,For Own Account,0.700
2,10^9 tonne-km / yr,All,All,International Transport Forum,Slovenia,Road freight transport on own account,1972,Millions,Freight,Road,For Own Account,1.098
3,10^9 tonne-km / yr,All,All,International Transport Forum,Slovenia,Road freight transport on own account,1973,Millions,Freight,Road,For Own Account,1.078
4,10^9 tonne-km / yr,All,All,International Transport Forum,Slovenia,Road freight transport on own account,1974,Millions,Freight,Road,For Own Account,0.745
...,...,...,...,...,...,...,...,...,...,...,...,...
13207,10^9 tonne-km / yr,All,All,International Transport Forum,Korea,Road freight transport for hire and reward,2013,Millions,Freight,Road,For hire and reward,118.582
13208,10^9 tonne-km / yr,All,All,International Transport Forum,Korea,Road freight transport for hire and reward,2014,Millions,Freight,Road,For hire and reward,124.650
13209,10^9 tonne-km / yr,All,All,International Transport Forum,Korea,Road freight transport for hire and reward,2015,Millions,Freight,Road,For hire and reward,132.382
13210,10^9 tonne-km / yr,All,All,International Transport Forum,Korea,Road freight transport for hire and reward,2016,Millions,Freight,Road,For hire and reward,135.259


# Setting the "Variable" column
    Rule: Since all data is associated to freight, the variable is "Freight Activity"

In [20]:
# Removing the current variable column
df.drop(columns=["Variable"], inplace=True)

# Adding the new "Variable" column
df[ColumnName.VARIABLE.value] = ["Freight Activity"] * len(df)
df

Unnamed: 0,Unit,Fuel,Technology,Source,Country,Year,PowerCode,Service,Mode,Vehicle Type,Value,Variable
0,10^9 tonne-km / yr,All,All,International Transport Forum,Slovenia,1970,Millions,Freight,Road,For Own Account,0.534,Freight Activity
1,10^9 tonne-km / yr,All,All,International Transport Forum,Slovenia,1971,Millions,Freight,Road,For Own Account,0.700,Freight Activity
2,10^9 tonne-km / yr,All,All,International Transport Forum,Slovenia,1972,Millions,Freight,Road,For Own Account,1.098,Freight Activity
3,10^9 tonne-km / yr,All,All,International Transport Forum,Slovenia,1973,Millions,Freight,Road,For Own Account,1.078,Freight Activity
4,10^9 tonne-km / yr,All,All,International Transport Forum,Slovenia,1974,Millions,Freight,Road,For Own Account,0.745,Freight Activity
...,...,...,...,...,...,...,...,...,...,...,...,...
13207,10^9 tonne-km / yr,All,All,International Transport Forum,Korea,2013,Millions,Freight,Road,For hire and reward,118.582,Freight Activity
13208,10^9 tonne-km / yr,All,All,International Transport Forum,Korea,2014,Millions,Freight,Road,For hire and reward,124.650,Freight Activity
13209,10^9 tonne-km / yr,All,All,International Transport Forum,Korea,2015,Millions,Freight,Road,For hire and reward,132.382,Freight Activity
13210,10^9 tonne-km / yr,All,All,International Transport Forum,Korea,2016,Millions,Freight,Road,For hire and reward,135.259,Freight Activity


# Creating a Mode "Inland (exl. Pipeline)" which is the same as the "Inland" mode but without pipeline data

In [21]:
# Creating the new variable 
df_grouping_by_country = df.groupby(df["Country"])

# Variable holding the complete data
complete_data_per_country = {}

# For each country group, do the following
for country_name, country_group in df_grouping_by_country:
    # Group country X by year
    year_group_country_x = country_group.groupby(country_group["Year"])
    
    # Variable holding the information about <year, inland exl pipeline>
    local_result_per_year = {}
    
    # Loop each year
    for year_name, year_group in year_group_country_x:
        # Variable holding the sum for the year
        inland_exl_pipeline = 0
        
        # Loop through each data available for each year
        for index, row in year_group.iterrows():
            if row["Mode"] != "Inland" and row["Mode"] != "Pipeline" and row["Vehicle Type"] != "For Own Account" and row["Vehicle Type"] != "For hire and reward":
                inland_exl_pipeline = inland_exl_pipeline + row["Value"]
        
        # Setting the result 
        local_result_per_year[year_name] = inland_exl_pipeline
    
    # Assing the result to the country
    complete_data_per_country[country_name] = local_result_per_year

# Printing all the data
complete_data_per_country

{'Albania': {1970: 0.936,
  1971: 1.025,
  1972: 1.2,
  1973: 1.134,
  1974: 1.16,
  1975: 1.3639999999999999,
  1976: 1.23,
  1977: 1.26,
  1978: 1.5030000000000001,
  1979: 1.649,
  1980: 1.779,
  1981: 1.7440000000000002,
  1982: 1.896,
  1983: 1.917,
  1984: 1.896,
  1985: 1.826,
  1986: 1.8250000000000002,
  1987: 1.897,
  1988: 1.894,
  1989: 1.9780000000000002,
  1990: 1.779,
  1991: 0.909,
  1992: 0.6040000000000001,
  1993: 1.464,
  1994: 1.9929999999999999,
  1995: 2.13,
  1996: 2.29,
  1997: 1.363,
  1998: 1.855,
  1999: 2.037,
  2000: 2.192,
  2001: 2.25,
  2002: 2.3729999999999998,
  2003: 2.562,
  2004: 2.83,
  2005: 3.2359999999999998,
  2006: 3.342,
  2007: 3.637,
  2008: 4.1499999999999995,
  2009: 4.4910000000000005,
  2010: 4.692,
  2011: 3.855,
  2012: 3.2479999999999998,
  2013: 3.52,
  2014: 0.039887,
  2015: 0.023124000000000002,
  2016: 0.0088,
  2017: 0.0253,
  2018: 0.020399999999999998},
 'Argentina': {1994: 6.663,
  1995: 7.613,
  1996: 8.506,
  1997: 9.835,

In [22]:
# Getting the list of countries
list_of_countries = complete_data_per_country.keys()

for country in list_of_countries:
    # Getting list of years
    dic_new_value_per_country = complete_data_per_country[country]
    list_of_years = list(complete_data_per_country[country].keys())
    
    # Loop through each year
    for year in list_of_years:
        value = dic_new_value_per_country[year]
        local_DF = pd.DataFrame({ColumnName.SOURCE.value:["International Transport Forum"],ColumnName.COUNTRY.value:[country],ColumnName.VARIABLE.value:["Freight Activity"],ColumnName.UNIT.value:["10^9 tonne-km / yr"],ColumnName.SERVICE.value:["Freight"], ColumnName.MODE.value:["Inland (exl. Pipeline)"], ColumnName.VEHICLE_TYPE.value:["All"], ColumnName.TECHNOLOGY.value:["All"], ColumnName.FUEL.value:["All"], ColumnName.VALUE.value:[value], ColumnName.YEAR.value:[year]})
        df.append(local_DF,ignore_index = True, sort=False)
        df = pd.concat([df, local_DF],ignore_index=True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


In [23]:
df

Unnamed: 0,Country,Fuel,Mode,PowerCode,Service,Source,Technology,Unit,Value,Variable,Vehicle Type,Year
0,Slovenia,All,Road,Millions,Freight,International Transport Forum,All,10^9 tonne-km / yr,0.534,Freight Activity,For Own Account,1970
1,Slovenia,All,Road,Millions,Freight,International Transport Forum,All,10^9 tonne-km / yr,0.700,Freight Activity,For Own Account,1971
2,Slovenia,All,Road,Millions,Freight,International Transport Forum,All,10^9 tonne-km / yr,1.098,Freight Activity,For Own Account,1972
3,Slovenia,All,Road,Millions,Freight,International Transport Forum,All,10^9 tonne-km / yr,1.078,Freight Activity,For Own Account,1973
4,Slovenia,All,Road,Millions,Freight,International Transport Forum,All,10^9 tonne-km / yr,0.745,Freight Activity,For Own Account,1974
...,...,...,...,...,...,...,...,...,...,...,...,...
13458,United States,All,Inland (exl. Pipeline),,Freight,International Transport Forum,All,10^9 tonne-km / yr,6098.067,Freight Activity,All,2014
13459,United States,All,Inland (exl. Pipeline),,Freight,International Transport Forum,All,10^9 tonne-km / yr,5928.872,Freight Activity,All,2015
13460,United States,All,Inland (exl. Pipeline),,Freight,International Transport Forum,All,10^9 tonne-km / yr,5707.326,Freight Activity,All,2016
13461,United States,All,Inland (exl. Pipeline),,Freight,International Transport Forum,All,10^9 tonne-km / yr,5402.670,Freight Activity,All,2017


# Removing inconsistent data from selected countries. See section: *"Determing what countries have missing data related to the "Road" mode and Determining what countries have missing data related to "Total Inland FT" mode"* Each Country is handled as follows:
    >> Albania: 
        - Remove all rows with mode "Inland" for range [1970 - 2002] --> Road data is missing
        - Remove all rows with mode "Inland (Exl. Pipeline)" for range [1970 - 2002] --> Road data is missing
    
    >> Armenia:
        - Remove all rows with mode "Inland (Exl. Pipeline)" for range [1970 - 2002] --> 
        
    >> Belarus:
        - Remove all rows with mode "Inland" for range [1970 - 1995] --> Pipeline data is missing
        
    >> Belgium
        - Remove all rows with mode "Inland (Exl. Pipeline)" for range [2012 - 2018] --> Rail data is missing
        
    >> Bosnia-Herzegovina:
        - Remove all rows with mode "Inland" for range [1998 - 2006] and 2010 --> Road data is missing
        - Remove all rows with mode "Inland (Exl. Pipeline)" for range [1998 - 2006] and 2010 --> Road data is missing
        
    >> Bulgaria:
        - Remove all rows with mode "Inland" for range [1970 - 1977] --> Pipeline data is missing
    
    >> Canada:
        - Remove all rows with mode "Inland" for range [1970 - 1994] and [2014-2018] --> Multiple data is missing
        - Remove all rows with mode "Inland (Exl. Pipeline)" for range [1970 - 1995] and [2014-2018] --> Multiple data is missing
        
    >> China:
        - Remove all rows with mode "Inland" for range [1970 - 1977], [1979], [1981 - 1984], and [1986 - 1989] --> Road data is missing
        - Remove all rows with mode "Inland (Exl. Pipeline)" for range [1970 - 1977], [1979], [1981 - 1984], and [1986 - 1989] --> Road data is missing
        
    >> Croatia
        - Remove all rows with mode "Inland" for range [1970 - 1995] --> Multiple data is missing
        - Remove all rows with mode "Inland (Exl. Pipeline)" for range [1970 - 1995] --> Multiple data is missing
        
    >> Czech Republic:
        - Remove all rows with mode "Inland" for range [1970 - 1992] --> Road data is missing
        - Remove all rows with mode "Inland (Exl. Pipeline)" for range [1970 - 1992] --> Road data is missing
        
    >> Denmark:
        - Remove all rows with mode "Inland" for range [1971 - 1983] --> Pipeline data is missing
        
    >> Greece:
        - Remove all rows with mode "Inland" for range [2018]
        - Remove all rows with mode "Inland (Exl. Pipeline)" for range [2018]
        
    >> India:
        - Remove all rows with mode "Inland" for range [1970 - 2004] and [2013 - 2017]
        - Remove all rows with mode "Inland (Exl. Pipeline)" for range [1970 - 2004]
        
    >> Ireland:
        - Remove all rows with mode "Inland" for range [1979]
        - Remove all rows with mode "Road" and VT "All" for range [1979]
        - Remove all rows with mode "Inland (Exl. Pipeline)" for range [1979]
        
    >> Japan
        - Remove all rows with mode "Inland" for range [2018]
        - Remove all rows with mode "Inland (Exl. Pipeline)" for range [2018]
        
    >> Korea:
        - Remove all rows with mode "Inland" for range [2018]
        - Remove all rows with mode "Inland (Exl. Pipeline)" for range [2018]
        
    >> Latvia:
        - Remove all rows with mode "Inland" for range [1970 - 1994]
        
    >> Liechtenstein:
        - Remove all rows with mode "Inland" for range [2005 - 2006]
        - Remove all rows with mode "Inland (Exl. Pipeline)" for range [2005 - 2006]
        
    >> New Zealand:
        - Remove all rows with mode "Inland (Exl. Pipeline)" for range [1970 - 1998]
        
    >> Norway:
        - Remove all rows with mode "Inland (Exl. Pipeline)" for range [2017]
    
    >> Portugal:
        - Remove all rows with mode "Inland (Exl. Pipeline)" for range [1970 - 1978] and [1981-1986]
        
    >> Serbia, Republic of:
        - Remove all rows with mode "Inland" for range [1997 - 2018]
        - Remove all rows with mode "Road" and VT "All" for range [1997 - 2018]
        - Remove all rows with mode "Inland (Exl. Pipeline)" for range [1997 - 2018]
    
    >> Spain:
        - Remove all rows with mode "Inland" for range [2017 - 2018]
        - Remove all rows with mode "Inland (Exl. Pipeline)" for range [2017 - 2018]
        
    >> Switzerland:
        - Remove all rows with mode "Inland" for range [2018]
        - Remove all rows with mode "Inland (Exl. Pipeline)" for range [2018]
        
    >> Ukraine:
        - Remove all rows with mode "Inland" for range [1990 - 2008]
        - Remove all rows with mode "Road" and VT "All" for range [1990 - 2008]
        - Remove all rows with mode "Inland (Exl. Pipeline)" for range [1990 - 2008]

    >> United Kingdom:
        - Remove all rows with mode "Inland" for range [2013 - 2018]
        
    >> United States:
        - Remove all rows with mode "Inland" for range [2012 - 2018]
        - Remove all rows with mode "Inland (Exl. Pipeline)" for range [2017 - 2018]


In [24]:
list_of_index_to_erase = []

# 27 Total countries = 21 handled here and 6 handled in coming cells
conditions = {}
conditions["Albania"] = {"years": list(range(1970,2003)), "modes": ["Inland","Inland (exl. Pipeline)"]}
conditions["Armenia"] = {"years": list(range(1970,2003)), "modes": ["Inland (exl. Pipeline)"]}
conditions["Belarus"] = {"years": list(range(1970,1995)), "modes": ["Inland"]}
conditions["Belgium"] = {"years": list(range(2012,2019)), "modes": ["Inland (exl. Pipeline)"]}
conditions["Bosnia-Herzegovina"] = {"years": list(range(1998,2007))+[2010], "modes": ["Inland (exl. Pipeline)"]}
conditions["Bulgaria"] = {"years": list(range(1970,1978)), "modes": ["Inland"]}
conditions["China"] = {"years": list(range(1970,1978))+[1979]+list(range(1981,1985))+list(range(1986,1990)), "modes": ["Inland","Inland (exl. Pipeline)"]}
conditions["Croatia"] = {"years": list(range(1970,1996)), "modes": ["Inland","Inland (exl. Pipeline)"]}
conditions["Czech Republic"] = {"years": list(range(1970,1993)), "modes": ["Inland","Inland (exl. Pipeline)"]}
conditions["Denmark"] = {"years": list(range(1971,1984)), "modes": ["Inland"]}
conditions["Greece"] = {"years": [2018], "modes": ["Inland","Inland (exl. Pipeline)"]}
conditions["Japan"] = {"years": [2018], "modes": ["Inland","Inland (exl. Pipeline)"]}
conditions["Korea"] = {"years": [2018], "modes": ["Inland","Inland (exl. Pipeline)"]}
conditions["Latvia"] = {"years": list(range(1970,1995)), "modes": ["Inland"]}
conditions["Liechtenstein"] = {"years": [2005,2006], "modes": ["Inland","Inland (exl. Pipeline)"]}
conditions["New Zealand"] = {"years": list(range(1970,1999)), "modes": ["Inland (exl. Pipeline)"]}
conditions["Norway"] = {"years": [2017], "modes": ["Inland (exl. Pipeline)"]}
conditions["Portugal"] = {"years": list(range(1970,1979))+list(range(1981,1987)), "modes": ["Inland (exl. Pipeline)"]}
conditions["Spain"] = {"years": list(range(2017,2019)), "modes": ["Inland","Inland (exl. Pipeline)"]}
conditions["Switzerland"] = {"years": [2018], "modes": ["Inland","Inland (exl. Pipeline)"]}
conditions["United Kingdom"] = {"years": list(range(2013,2019)), "modes": ["Inland"]}

list_of_countries_to_clean = list(conditions.keys())

# For each country, perform the following operations based on the data passed to each country
for index, row in df.iterrows():
    country = row["Country"]
    if country in list_of_countries_to_clean:
        list_of_years_to_erase = conditions[country]["years"]
        modes_to_erase = conditions[country]["modes"]
        if (row["Year"] in list_of_years_to_erase) and (row["Mode"] in modes_to_erase):
            list_of_index_to_erase.append(index)

len(list_of_index_to_erase)

249

### Note: Canada, India, and United States are going to be managed in this cell because they have more special conditions than the contries above

In [25]:
# Setting the special conditions
special_conditions = {}
special_conditions["Canada"] = {"years_1": list(range(1970,1995)) +list(range(2014,2019)), "modes_1": ["Inland"], "years_2": list(range(1970,1996))+list(range(2014,2019)), "modes_2": ["Inland (exl. Pipeline)"]}
special_conditions["India"] = {"years_1": list(range(1970,2005)) +list(range(2013,2018)), "modes_1": ["Inland"], "years_2": list(range(1970,2005)), "modes_2": ["Inland (exl. Pipeline)"]}
special_conditions["United States"] = {"years_1": list(range(2012,2019)), "modes_1": ["Inland"], "years_2": list(range(2017,2019)), "modes_2": ["Inland (exl. Pipeline)"]}

list_of_countries_with_special_conditions = list(special_conditions.keys())

# For each country, perform the following operations based on the data passed to each country
for index, row in df.iterrows():
    country = row["Country"]
    if country in list_of_countries_with_special_conditions:
        
        # Conditions 1
        modes_to_erase_condition_1 = special_conditions[country]["modes_1"]
        list_of_years_to_erase_condition_1 = special_conditions[country]["years_1"]
        
        # Conditions 2
        modes_to_erase_condition_2 = special_conditions[country]["modes_2"]
        list_of_years_to_erase_condition_2 = special_conditions[country]["years_1"]
        
        # Testing the conditions
        if row["Mode"] in modes_to_erase_condition_1 and row["Year"] in list_of_years_to_erase_condition_1:
            list_of_index_to_erase.append(index)
        elif row["Mode"] in modes_to_erase_condition_2 and row["Year"] in list_of_years_to_erase_condition_2:
            list_of_index_to_erase.append(index)

len(list_of_index_to_erase)

309

# Note: Ireland, Serbia, Republic of, and Ukraine will be managed in this cell because they have even more special conditions

In [26]:
finer_conditions = {}
finer_conditions["Ireland"] = {"years": [1979], "modes": ["Inland","Inland (exl. Pipeline)", "Road"]}
finer_conditions["Serbia, Republic of"] = {"years": list(range(1997,2019)), "modes": ["Inland","Inland (exl. Pipeline)", "Road"]}
finer_conditions["Ukraine"] = {"years": list(range(1990,2009)), "modes": ["Inland","Inland (exl. Pipeline)", "Road"]}

list_of_countries_to_clean = list(finer_conditions.keys())

# For each country, perform the following operations based on the data passed to each country
for index, row in df.iterrows():
    country = row["Country"]
    if country in list_of_countries_to_clean:
        list_of_years_to_erase = finer_conditions[country]["years"]
        modes_to_erase = finer_conditions[country]["modes"]
        if (row["Year"] in list_of_years_to_erase) and (row["Mode"] in modes_to_erase):
            if row["Mode"] == "Road" and row["Vehicle Type"] != "All":
                pass
            else:
                list_of_index_to_erase.append(index)

len(list_of_index_to_erase)

432

In [27]:
# Dropping the rows from the dataframe
df.drop(df.index[list_of_index_to_erase], inplace = True)
df

Unnamed: 0,Country,Fuel,Mode,PowerCode,Service,Source,Technology,Unit,Value,Variable,Vehicle Type,Year
0,Slovenia,All,Road,Millions,Freight,International Transport Forum,All,10^9 tonne-km / yr,0.534,Freight Activity,For Own Account,1970
1,Slovenia,All,Road,Millions,Freight,International Transport Forum,All,10^9 tonne-km / yr,0.700,Freight Activity,For Own Account,1971
2,Slovenia,All,Road,Millions,Freight,International Transport Forum,All,10^9 tonne-km / yr,1.098,Freight Activity,For Own Account,1972
3,Slovenia,All,Road,Millions,Freight,International Transport Forum,All,10^9 tonne-km / yr,1.078,Freight Activity,For Own Account,1973
4,Slovenia,All,Road,Millions,Freight,International Transport Forum,All,10^9 tonne-km / yr,0.745,Freight Activity,For Own Account,1974
...,...,...,...,...,...,...,...,...,...,...,...,...
13451,United States,All,Inland (exl. Pipeline),,Freight,International Transport Forum,All,10^9 tonne-km / yr,6770.271,Freight Activity,All,2007
13452,United States,All,Inland (exl. Pipeline),,Freight,International Transport Forum,All,10^9 tonne-km / yr,6998.545,Freight Activity,All,2008
13453,United States,All,Inland (exl. Pipeline),,Freight,International Transport Forum,All,10^9 tonne-km / yr,6292.634,Freight Activity,All,2009
13454,United States,All,Inland (exl. Pipeline),,Freight,International Transport Forum,All,10^9 tonne-km / yr,6610.056,Freight Activity,All,2010


# Getting the ISO Code for each country
    Rule: For each country, we have to assign their respective ISO code

## Determining which countries do not appear in the list of ISO Codes
    As seen from the code below, four countries appear to not have ISO code. However, the reason is because the countries are written in a format that is not understandable. So, this is how each those "missing" countries will be called in order to obtain their ISO code
    
    Original Name --> New name
        > Montenegro, Republic of --> Montenegro
        > Bosnia-Herzegovina --> Bosnia and Herzegovina
        > Korea --> Korea, Republic of
        > Serbia, Republic of --> Serbia

In [28]:
# Getting the list of countries available
list_of_countries = list(set(df["Country"]))

# Getting the list of countries with no ISO code
countries_with_no_ISO_code = countryCodeManager.get_list_of_countries_with_no_iso_code(list_of_countries)
        
# Print this list of countries with no ISO codes
countries_with_no_ISO_code

['Korea',
 'Bosnia-Herzegovina',
 'Serbia, Republic of',
 'Montenegro, Republic of']

## Adding the ISO column to the dataset

In [29]:
dirty_list_of_all_countries = df["Country"]
clean_list_of_all_countries = []

for country in dirty_list_of_all_countries:
    if country == "Montenegro, Republic of":
        clean_list_of_all_countries.append("Montenegro")
    elif country == "Bosnia-Herzegovina":
        clean_list_of_all_countries.append("Bosnia and Herzegovina")
    elif country == "Korea":
        clean_list_of_all_countries.append("Korea, Republic of")
    elif country == "Serbia, Republic of":
        clean_list_of_all_countries.append("Serbia")
    else:
        clean_list_of_all_countries.append(country)

# Ensure the size of the cleaned list is the same as the dirty list
assert len(clean_list_of_all_countries) == len(dirty_list_of_all_countries)

# Assert that for all elements in the new list, no country is left without an ISO code
assert len(countryCodeManager.get_list_of_countries_with_no_iso_code(clean_list_of_all_countries)) == 0

# Getting the list of iso codes
list_of_iso_codes = countryCodeManager.get_list_of_iso_for_countries(clean_list_of_all_countries)

# Adding the column to the dataframe
df[ColumnName.ISO_CODE.value] = list_of_iso_codes
df

Unnamed: 0,Country,Fuel,Mode,PowerCode,Service,Source,Technology,Unit,Value,Variable,Vehicle Type,Year,ISO Code
0,Slovenia,All,Road,Millions,Freight,International Transport Forum,All,10^9 tonne-km / yr,0.534,Freight Activity,For Own Account,1970,SVN
1,Slovenia,All,Road,Millions,Freight,International Transport Forum,All,10^9 tonne-km / yr,0.700,Freight Activity,For Own Account,1971,SVN
2,Slovenia,All,Road,Millions,Freight,International Transport Forum,All,10^9 tonne-km / yr,1.098,Freight Activity,For Own Account,1972,SVN
3,Slovenia,All,Road,Millions,Freight,International Transport Forum,All,10^9 tonne-km / yr,1.078,Freight Activity,For Own Account,1973,SVN
4,Slovenia,All,Road,Millions,Freight,International Transport Forum,All,10^9 tonne-km / yr,0.745,Freight Activity,For Own Account,1974,SVN
...,...,...,...,...,...,...,...,...,...,...,...,...,...
13451,United States,All,Inland (exl. Pipeline),,Freight,International Transport Forum,All,10^9 tonne-km / yr,6770.271,Freight Activity,All,2007,USA
13452,United States,All,Inland (exl. Pipeline),,Freight,International Transport Forum,All,10^9 tonne-km / yr,6998.545,Freight Activity,All,2008,USA
13453,United States,All,Inland (exl. Pipeline),,Freight,International Transport Forum,All,10^9 tonne-km / yr,6292.634,Freight Activity,All,2009,USA
13454,United States,All,Inland (exl. Pipeline),,Freight,International Transport Forum,All,10^9 tonne-km / yr,6610.056,Freight Activity,All,2010,USA


# Getting the ITEM region for each country
    Rule: For each country, we need to specify the ITEM region it belongs to.

## Determining which countries are missing an ITEM region
    As seen below, all countries belong to a region. Therefore, no special processing needs to be done.

In [30]:
# Getting the list of ISO codes
list_of_iso_codes = list(set(df["ISO Code"]))

# Getting the list of ISO code with no region
iso_code_with_no_region = countryCodeManager.get_list_of_iso_codes_with_no_region(list_of_iso_codes)

# printing the list of ISO codes
iso_code_with_no_region

[]

## Adding the ITEM region column to the dataset

In [31]:
# Getting the complete list of iso codes
list_of_all_codes = df["ISO Code"]

item_region = countryCodeManager.get_list_of_regions_for_iso_codes(list_of_all_codes)

# Adding the column to the dataframe
df[ColumnName.ITEM_REGION.value] = item_region
df

Unnamed: 0,Country,Fuel,Mode,PowerCode,Service,Source,Technology,Unit,Value,Variable,Vehicle Type,Year,ISO Code,Region
0,Slovenia,All,Road,Millions,Freight,International Transport Forum,All,10^9 tonne-km / yr,0.534,Freight Activity,For Own Account,1970,SVN,EU-27
1,Slovenia,All,Road,Millions,Freight,International Transport Forum,All,10^9 tonne-km / yr,0.700,Freight Activity,For Own Account,1971,SVN,EU-27
2,Slovenia,All,Road,Millions,Freight,International Transport Forum,All,10^9 tonne-km / yr,1.098,Freight Activity,For Own Account,1972,SVN,EU-27
3,Slovenia,All,Road,Millions,Freight,International Transport Forum,All,10^9 tonne-km / yr,1.078,Freight Activity,For Own Account,1973,SVN,EU-27
4,Slovenia,All,Road,Millions,Freight,International Transport Forum,All,10^9 tonne-km / yr,0.745,Freight Activity,For Own Account,1974,SVN,EU-27
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13451,United States,All,Inland (exl. Pipeline),,Freight,International Transport Forum,All,10^9 tonne-km / yr,6770.271,Freight Activity,All,2007,USA,U.S.
13452,United States,All,Inland (exl. Pipeline),,Freight,International Transport Forum,All,10^9 tonne-km / yr,6998.545,Freight Activity,All,2008,USA,U.S.
13453,United States,All,Inland (exl. Pipeline),,Freight,International Transport Forum,All,10^9 tonne-km / yr,6292.634,Freight Activity,All,2009,USA,U.S.
13454,United States,All,Inland (exl. Pipeline),,Freight,International Transport Forum,All,10^9 tonne-km / yr,6610.056,Freight Activity,All,2010,USA,U.S.


# Reordering the columns
    Rule: The columns have to be presented in the order established by the template.

In [32]:
df = dataframeManager.reorder_columns(df)
df

Unnamed: 0,Source,Country,ISO Code,Region,Variable,Unit,Service,Mode,Vehicle Type,Technology,Fuel,Value,Year
0,International Transport Forum,Slovenia,SVN,EU-27,Freight Activity,10^9 tonne-km / yr,Freight,Road,For Own Account,All,All,0.534,1970
1,International Transport Forum,Slovenia,SVN,EU-27,Freight Activity,10^9 tonne-km / yr,Freight,Road,For Own Account,All,All,0.700,1971
2,International Transport Forum,Slovenia,SVN,EU-27,Freight Activity,10^9 tonne-km / yr,Freight,Road,For Own Account,All,All,1.098,1972
3,International Transport Forum,Slovenia,SVN,EU-27,Freight Activity,10^9 tonne-km / yr,Freight,Road,For Own Account,All,All,1.078,1973
4,International Transport Forum,Slovenia,SVN,EU-27,Freight Activity,10^9 tonne-km / yr,Freight,Road,For Own Account,All,All,0.745,1974
...,...,...,...,...,...,...,...,...,...,...,...,...,...
13451,International Transport Forum,United States,USA,U.S.,Freight Activity,10^9 tonne-km / yr,Freight,Inland (exl. Pipeline),All,All,All,6770.271,2007
13452,International Transport Forum,United States,USA,U.S.,Freight Activity,10^9 tonne-km / yr,Freight,Inland (exl. Pipeline),All,All,All,6998.545,2008
13453,International Transport Forum,United States,USA,U.S.,Freight Activity,10^9 tonne-km / yr,Freight,Inland (exl. Pipeline),All,All,All,6292.634,2009
13454,International Transport Forum,United States,USA,U.S.,Freight Activity,10^9 tonne-km / yr,Freight,Inland (exl. Pipeline),All,All,All,6610.056,2010


# Exporting Results

In [59]:
# Programming Friendly View
dataframeManager.create_programming_friendly_file(df)

# User Friendly View
dataframeManager.create_user_friendly_file(df)

> PF File saved at: /Users/hlinero/Documents/database/item/historical/scripts
> UF File saved at: /Users/hlinero/Documents/database/item/historical/scripts
