In [339]:
#Imports
import os
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.formula.api as sm
import numpy as np


In [340]:
# Work from the Downloads folder so that relative paths used later in the
# notebook resolve there.
os.chdir('/Users/khangphamgia/Downloads')

# Non-year descriptor columns that are not needed for the analysis.
descriptor_columns = ["ObjectId", "ISO2", "ISO3", "Source", "CTS Code", "CTS Name", "CTS Full Descriptor"]
# Year columns that are removed from the table.
excluded_year_columns = ["1995", "1996", "1997", "1998", "1999", "2000", "2020", "2021"]

# Load the raw CSV and drop the unnecessary columns in a single chained step.
df_tax = pd.read_csv("/Users/khangphamgia/Downloads/Environmental_Taxes.csv").drop(
    columns=descriptor_columns + excluded_year_columns
)
df_tax.head()

Unnamed: 0,Country,Indicator,Unit,2001,2002,2003,2004,2005,2006,2007,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
0,Albania,Environmental Taxes,Domestic Currency,,,,,,,,...,,,,,,43993140000.0,47813790000.0,47548580000.0,51145590000.0,53415650000.0
1,Albania,Environmental Taxes,Percent of GDP,,,,,,,,...,,,,,,3.067206,3.247163,3.066373,3.124865,3.157133
2,Albania,Taxes on Energy (including fuel for transport),Domestic Currency,,,,,,,,...,,,,,,37741110000.0,40945620000.0,40400040000.0,43521820000.0,45165300000.0
3,Albania,Taxes on Energy (including fuel for transport),Percent of GDP,,,,,,,,...,,,,,,2.631314,2.780726,2.605369,2.659072,2.669496
4,Albania,Taxes on Pollution,Domestic Currency,,,,,,,,...,,,,,,1782069000.0,1879970000.0,1941324000.0,2226251000.0,2625011000.0


In [341]:
# Keep only the rows whose indicator is exactly "Environmental Taxes"
is_environmental_tax = df_tax['Indicator'].eq('Environmental Taxes')
df_tax = df_tax.loc[is_environmental_tax]
# List the distinct countries present in the filtered table
df_tax["Country"].unique()

array(['Albania', 'Andorra, Principality of', 'Antigua and Barbuda',
       'Argentina', 'Armenia, Rep. of', 'Australia', 'Austria',
       'Bahamas, The', 'Bangladesh', 'Belgium', 'Belize', 'Bhutan',
       'Bolivia', 'Botswana', 'Brazil', 'Bulgaria', 'Burkina Faso',
       'Cabo Verde', 'Cambodia', 'Cameroon', 'Canada', 'Chad', 'Chile',
       'China, P.R.: Mainland', 'Colombia', 'Congo, Dem. Rep. of the',
       'Congo, Rep. of', 'Cook Islands', 'Costa Rica', "Côte d'Ivoire",
       'Croatia, Rep. of', 'Cyprus', 'Czech Rep.', 'Denmark',
       'Dominican Rep.', 'Ecuador', 'Egypt, Arab Rep. of', 'El Salvador',
       'Equatorial Guinea, Rep. of', 'Estonia, Rep. of',
       'Eswatini, Kingdom of', 'Fiji, Rep. of', 'Finland', 'France',
       'Georgia', 'Germany', 'Ghana', 'Greece', 'Guatemala', 'Guyana',
       'Honduras', 'Hungary', 'Iceland', 'Ireland', 'Israel', 'Italy',
       'Jamaica', 'Japan', 'Kazakhstan, Rep. of', 'Kenya',
       'Korea, Rep. of', 'Kosovo, Rep. of', 'Kyrgyz R

In [342]:
# Inspect the filtered table. A bare last expression uses the notebook's rich
# table rendering instead of the wrapped plain-text output of print().
df_tax

                             Country            Indicator               Unit  \
0                            Albania  Environmental Taxes  Domestic Currency   
1                            Albania  Environmental Taxes     Percent of GDP   
10          Andorra, Principality of  Environmental Taxes  Domestic Currency   
15               Antigua and Barbuda  Environmental Taxes  Domestic Currency   
16               Antigua and Barbuda  Environmental Taxes     Percent of GDP   
...                              ...                  ...                ...   
1210  Venezuela, Rep. Bolivariana de  Environmental Taxes     Percent of GDP   
1219                         Vietnam  Environmental Taxes  Domestic Currency   
1220                         Vietnam  Environmental Taxes     Percent of GDP   
1229              West Bank and Gaza  Environmental Taxes  Domestic Currency   
1230              West Bank and Gaza  Environmental Taxes     Percent of GDP   

             2001         2002         

In [343]:
# Reshape the wide table (one column per year) into a Country x Year panel
# with one value column per unit of measurement, then export it.

# Step 1: Melt the dataframe to reshape year columns into rows
df_melted = df_tax.melt(id_vars=['Country', 'Indicator', 'Unit'], var_name='Year', value_name='Value')

# Step 2: Convert the Year column (former column labels) to numeric;
# labels that are not numbers become NaN
df_melted['Year'] = pd.to_numeric(df_melted['Year'], errors='coerce')

# Steps 3-4: Pivot so each unit becomes its own column, then give the
# columns descriptive names. Chained instead of mutating with inplace=True.
df_pivot = (
    df_melted
    .pivot_table(
        index=['Country', 'Year'],
        columns=['Unit'],
        values='Value',
        aggfunc='first',  # take the first non-null value if a key repeats
    )
    .rename_axis(None, axis='columns')  # drop the leftover "Unit" columns name
    .rename(columns={
        'Percent of GDP': 'Environmental Taxes (% of GDP)',
        'Domestic Currency': 'Environmental Taxes (Domestic Currency)',
    })
    .reset_index()
)

# Step 5: Create a full range of countries and years
all_countries = df_pivot['Country'].unique()  # Get all unique countries
# NOTE(review): the year columns 2000 and 2020 are dropped when the data is
# loaded, so these two years only ever appear as all-NaN rows in the panel.
# Kept as-is to preserve the exported layout - confirm this is intended.
all_years = list(range(2000, 2021))  # Define the desired range of years

# Create a full dataframe with all combinations of countries and years
full_index = pd.MultiIndex.from_product([all_countries, all_years], names=['Country', 'Year'])
df_full = pd.DataFrame(index=full_index).reset_index()

# Step 6: Left-merge the observed values onto the full grid. Combinations
# without data are left as NaN by the merge itself, so no fillna is needed.
df_main = pd.merge(df_full, df_pivot, on=['Country', 'Year'], how='left')

# Sanity checks: exactly one row per (Country, Year) combination
assert len(df_main) == len(all_countries) * len(all_years)
assert not df_main.duplicated(subset=['Country', 'Year']).any()

# Export to Excel (written relative to the current working directory)
df_main.to_excel("Environmental_Taxes_Cleaned.xlsx", index=False)

# Final dataframe (rich display instead of print)
df_main



                 Country  Year  Environmental Taxes (Domestic Currency)  \
0                Albania  2000                                      NaN   
1                Albania  2001                                      NaN   
2                Albania  2002                                      NaN   
3                Albania  2003                                      NaN   
4                Albania  2004                                      NaN   
...                  ...   ...                                      ...   
2641  West Bank and Gaza  2016                             2.752039e+07   
2642  West Bank and Gaza  2017                             2.752039e+07   
2643  West Bank and Gaza  2018                             2.752039e+07   
2644  West Bank and Gaza  2019                             2.752039e+07   
2645  West Bank and Gaza  2020                                      NaN   

      Environmental Taxes (% of GDP)  
0                                NaN  
1                    

In [344]:
# Per-country summary statistics for both tax measures.
# 'count' is the number of non-missing observations for each country.
summary_functions = ['mean', 'median', 'std', 'min', 'max', 'count']
summary_stats = (
    df_main
    .groupby('Country')
    .agg({
        'Environmental Taxes (% of GDP)': summary_functions,
        'Environmental Taxes (Domestic Currency)': summary_functions,
    })
    .reset_index()
)
# Bare expression -> rich table rendering instead of wrapped print() output
summary_stats

                            Country Environmental Taxes (% of GDP)            \
                                                              mean    median   
0                           Albania                       3.132548  3.124865   
1          Andorra, Principality of                            NaN       NaN   
2               Antigua and Barbuda                       0.537895  0.630000   
3                         Argentina                       1.234211  1.220000   
4                  Armenia, Rep. of                       1.034888  1.049627   
..                              ...                            ...       ...   
121                   United States                       0.791067  0.752718   
122                         Uruguay                       1.674216  1.670845   
123  Venezuela, Rep. Bolivariana de                       2.177000  1.640000   
124                         Vietnam                       0.469412  0.430000   
125              West Bank and Gaza     

In [345]:
# Number of non-missing "% of GDP" observations for each row's country
# (a Series aligned with df_main, so no temporary column is needed)
observations_per_country = (
    df_main
    .groupby('Country')['Environmental Taxes (% of GDP)']
    .transform('count')
)
# Keep only countries with at least 10 observations; the original row index
# is preserved and the result is an independent copy
df_main = df_main[observations_per_country >= 10].copy()
# List the countries that remain after filtering
df_main["Country"].unique()


array(['Antigua and Barbuda', 'Argentina', 'Australia', 'Austria',
       'Bahamas, The', 'Bangladesh', 'Belgium', 'Belize', 'Bolivia',
       'Bulgaria', 'Cabo Verde', 'Cambodia', 'Cameroon', 'Chad',
       'China, P.R.: Mainland', 'Congo, Dem. Rep. of the',
       'Congo, Rep. of', 'Costa Rica', 'Croatia, Rep. of', 'Cyprus',
       'Czech Rep.', "Côte d'Ivoire", 'Denmark', 'Ecuador',
       'Egypt, Arab Rep. of', 'Estonia, Rep. of', 'Eswatini, Kingdom of',
       'Fiji, Rep. of', 'Finland', 'France', 'Germany', 'Ghana', 'Greece',
       'Guyana', 'Hungary', 'Iceland', 'Ireland', 'Italy', 'Jamaica',
       'Japan', 'Kenya', 'Korea, Rep. of', 'Kyrgyz Rep.', 'Latvia',
       'Lithuania', 'Luxembourg', 'Malaysia', 'Maldives', 'Mali', 'Malta',
       'Mauritania, Islamic Rep. of', 'Mauritius', 'Mexico', 'Mongolia',
       'Morocco', 'Namibia', 'Netherlands, The', 'Nicaragua', 'Niger',
       'Nigeria', 'Norway', 'Panama', 'Papua New Guinea', 'Paraguay',
       'Peru', 'Philippines', 'Pola

In [346]:
import pandas as pd

# List of countries that remain after the observation-count filter
countries = [
    'Antigua and Barbuda', 'Argentina', 'Australia', 'Austria',
    'Bahamas, The', 'Bangladesh', 'Belgium', 'Belize', 'Bolivia',
    'Bulgaria', 'Cabo Verde', 'Cambodia', 'Cameroon', 'Chad',
    'China, P.R.: Mainland', 'Congo, Dem. Rep. of the',
    'Congo, Rep. of', 'Costa Rica', 'Croatia, Rep. of', 'Cyprus',
    'Czech Rep.', "Côte d'Ivoire", 'Denmark', 'Ecuador',
    'Egypt, Arab Rep. of', 'Estonia, Rep. of', 'Eswatini, Kingdom of',
    'Fiji, Rep. of', 'Finland', 'France', 'Germany', 'Ghana', 'Greece',
    'Guyana', 'Hungary', 'Iceland', 'Ireland', 'Italy', 'Jamaica',
    'Japan', 'Kenya', 'Korea, Rep. of', 'Kyrgyz Rep.', 'Latvia',
    'Lithuania', 'Luxembourg', 'Malaysia', 'Maldives', 'Mali', 'Malta',
    'Mauritania, Islamic Rep. of', 'Mauritius', 'Mexico', 'Mongolia',
    'Morocco', 'Namibia', 'Netherlands, The', 'Nicaragua', 'Niger',
    'Nigeria', 'Norway', 'Panama', 'Papua New Guinea', 'Paraguay',
    'Peru', 'Philippines', 'Poland, Rep. of', 'Portugal', 'Romania',
    'Rwanda', 'Serbia, Rep. of', 'Seychelles', 'Sierra Leone',
    'Slovak Rep.', 'Slovenia, Rep. of', 'Solomon Islands',
    'South Africa', 'St. Lucia', 'Sweden', 'Switzerland', 'Togo',
    'Trinidad and Tobago', 'Tunisia', 'Türkiye, Rep. of', 'Uganda',
    'United Kingdom', 'United States',
    'Venezuela, Rep. Bolivariana de', 'Vietnam'
]

# Country-region mapping (region -> member countries).
# Côte d'Ivoire, Niger, Nicaragua and Türkiye were previously absent from
# every region and therefore silently received a missing (NaN) Region.
region_mapping = {
    'North America': ['Antigua and Barbuda', 'Bahamas, The', 'Belize', 'Canada', 'Costa Rica', 'Jamaica',
                      'Mexico', 'Nicaragua', 'Panama', 'St. Lucia', 'Trinidad and Tobago', 'United States'],
    'South America': ['Argentina', 'Bolivia', 'Brazil', 'Chile', 'Colombia', 'Ecuador', 'Guyana', 'Paraguay',
                      'Peru', 'Suriname', 'Uruguay', 'Venezuela, Rep. Bolivariana de'],
    'Europe': ['Austria', 'Belgium', 'Bulgaria', 'Croatia, Rep. of', 'Cyprus', 'Czech Rep.', 'Denmark',
               'Estonia, Rep. of', 'Finland', 'France', 'Germany', 'Greece', 'Hungary', 'Iceland', 'Ireland',
               'Italy', 'Latvia', 'Lithuania', 'Luxembourg', 'Malta', 'Netherlands, The', 'Norway', 'Poland, Rep. of',
               'Portugal', 'Romania', 'Serbia, Rep. of', 'Slovak Rep.', 'Slovenia, Rep. of', 'Spain', 'Sweden',
               'Switzerland', 'United Kingdom'],
    # NOTE(review): Türkiye is transcontinental; it is placed under Asia here
    # (UN M49 lists it in Western Asia). Move it to 'Europe' if the analysis
    # should follow a different grouping.
    'Asia': ['Bangladesh', 'Cambodia', 'China, P.R.: Mainland', 'India', 'Japan', 'Korea, Rep. of', 'Kyrgyz Rep.',
             'Malaysia', 'Maldives', 'Mongolia', 'Philippines', 'Singapore', 'Thailand', 'Türkiye, Rep. of',
             'Vietnam'],
    'Africa': ['Cameroon', 'Cabo Verde', 'Chad', 'Congo, Dem. Rep. of the', 'Congo, Rep. of', "Côte d'Ivoire",
               'Egypt, Arab Rep. of', 'Eswatini, Kingdom of', 'Ghana', 'Kenya', 'Mali', 'Mauritania, Islamic Rep. of',
               'Mauritius', 'Morocco', 'Namibia', 'Niger', 'Nigeria', 'Rwanda', 'Seychelles', 'Sierra Leone',
               'South Africa', 'Togo', 'Tunisia', 'Uganda'],
    'Oceania': ['Australia', 'Fiji, Rep. of', 'Papua New Guinea', 'Solomon Islands'],
}

# Invert to a flat country -> region lookup. The loop variable is named
# `members` so it does not shadow the `countries` list above.
country_to_region = {
    country: region
    for region, members in region_mapping.items()
    for country in members
}

# Series.map yields NaN for keys missing from the lookup, so fail loudly
# rather than silently producing rows without a region.
unmapped_countries = sorted(set(df_main['Country']) - set(country_to_region))
assert not unmapped_countries, f"Countries without a region: {unmapped_countries}"

# Add the Region column without mutating the existing frame in place
df_main = df_main.assign(Region=df_main['Country'].map(country_to_region))
df_main.head()





Unnamed: 0,Country,Year,Environmental Taxes (Domestic Currency),Environmental Taxes (% of GDP),Region
42,Antigua and Barbuda,2000,,,North America
43,Antigua and Barbuda,2001,19240000.0,0.89,North America
44,Antigua and Barbuda,2002,19340000.0,0.88,North America
45,Antigua and Barbuda,2003,18840000.0,0.82,North America
46,Antigua and Barbuda,2004,20040000.0,0.81,North America
