In [60]:
# Importing necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pycountry
import pycountry_convert as pc

# File paths for the datasets
ghg_filepaths = {
    'ghg_per_capita': './ghg_emissions/ghg_per_capita_by_country.csv',
    'ghg_by_sector': './ghg_emissions/ghg_by_sector_and_country.csv',
    'ghg_totals': './ghg_emissions/ghg_totals_by_country.csv',
    'ghg_per_gdp': './ghg_emissions/ghg_per_gdp_by_country.csv'
}

co2_filepaths = {
    'co2_per_capital': './fossil_co2_emissions/fossil_CO2_per_capita_by_country.csv',
    'co2_by_sector': './fossil_co2_emissions/fossil_CO2_per_capita_by_country.csv',
    'co2_totals': './fossil_co2_emissions/fossil_co2_totals_by_country.csv',
    'co2_per_gdp': './fossil_co2_emissions/fossil_co2_per_gdp_by_country.csv'
}

def preprocess_dataset(df, non_year):
    year_columns = df.columns[non_year:]  # Assuming the first 4 columns are non-year columns

    for col in year_columns:
        # Replace commas with dots and remove spaces
        df[col] = df[col].astype(str).str.replace(',', '.').str.strip()

        # Try converting to numeric, flag values that cannot be converted
        df[col] = pd.to_numeric(df[col], errors='coerce')

        # Optional: Identify values that couldn't be converted
        non_numeric_values = df[pd.to_numeric(df[col], errors='coerce').isna() & ~df[col].isna()][col]
        if not non_numeric_values.empty:
            print(f"Non-numeric values found in {col}: {non_numeric_values.unique()}")

    return df


invalid = []

def country_to_continent(country_name):
    try:
        # Handling specific cases directly
        if country_name == 'Côte d’Ivoire':  # Make sure to use the exact character
            return 'Africa'
        elif country_name == 'International Aviation' or country_name == 'International Shipping' or country_name == 'EU27' or country_name == 'GLOBAL TOTAL' or country_name == None:
            return 'N/A'
        elif pd.isna(country_name):  # Handling NaN values
            return 'N/A'
        elif country_name == "Switzerland and Liechtenstein":
            return 'Europe'
        elif country_name == 'Faroes':
            return 'Europe'
        elif country_name == 'Western Sahara':
            return 'Africa'
        elif country_name == 'The Gambia':
            return 'Africa'
        elif country_name == 'Timor-Leste':
            return 'Asia'
        # if country_name == """Côte d'Ivoire""":
        #     return 'Africa'
        elif country_name == 'Spain and Andorra':
            return 'Europe'
        elif country_name == 'France and Monaco':
            return 'Europe'
        elif country_name == 'Israel and Palestine, State of':
            return 'Asia'
        elif country_name == 'Italy, San Marino and the Holy See':
            return 'Europe'
        elif country_name == 'Myanmar/Burma':
            return 'Asia'
        elif country_name == 'Serbia and Montenegro':
            return 'Europe'
        elif country_name == 'Sudan and South Sudan':
            return 'Africa'
        else:
            # Standard case
            country_alpha2 = pc.country_name_to_country_alpha2(country_name)
            country_continent_code = pc.country_alpha2_to_continent_code(country_alpha2)
            country_continent_name = pc.convert_continent_code_to_continent_name(country_continent_code)
            return country_continent_name
    except Exception as e:
        invalid.append(country_name)
        print("Invalid country is ", country_name)
        return None

In [61]:
# Loading and preprocessing all datasets
ghg_per_capita = pd.read_csv(ghg_filepaths['ghg_per_capita'], delimiter=';')
ghg_by_sector = pd.read_csv(ghg_filepaths['ghg_by_sector'], delimiter=';')
ghg_totals = pd.read_csv(ghg_filepaths['ghg_totals'], delimiter=';')
ghg_per_gdp = pd.read_csv(ghg_filepaths['ghg_per_gdp'], delimiter=';')

ghg_per_capita = preprocess_dataset(ghg_per_capita, 3)
ghg_by_sector = preprocess_dataset(ghg_by_sector, 4)
ghg_totals = preprocess_dataset(ghg_totals, 3)
ghg_per_gdp = preprocess_dataset(ghg_per_gdp, 3)

# Applying continent categorization
ghg_per_capita['Continent'] = ghg_per_capita['Country'].apply(country_to_continent)
ghg_by_sector['Continent'] = ghg_by_sector['Country'].apply(country_to_continent)
ghg_totals['Continent'] = ghg_totals['Country'].apply(country_to_continent)
ghg_per_gdp['Continent'] = ghg_per_gdp['Country'].apply(country_to_continent)

# Example: Displaying a sample from one of the datasets
print(ghg_by_sector.head())

  Substance       Sector EDGAR Country Code      Country      1970      1971  \
0       CO2  Agriculture                AFG  Afghanistan  0.029229  0.029229   
1       CO2  Agriculture                ALB      Albania  0.113300  0.113300   
2       CO2  Agriculture                ARG    Argentina  0.104343  0.104343   
3       CO2  Agriculture                ARM      Armenia  0.055288  0.055288   
4       CO2  Agriculture                AUS    Australia  0.311143  0.311143   

       1972      1973      1974      1975  ...      2014      2015      2016  \
0  0.029229  0.029229  0.039967  0.045310  ...  0.084490  0.116967  0.162800   
1  0.113300  0.113300  0.113614  0.112514  ...  0.056624  0.058719  0.049605   
2  0.104343  0.104343  0.087214  0.077314  ...  1.145152  0.892257  1.359547   
3  0.055288  0.055288  0.059966  0.059966  ...  0.022629  0.022629  0.022471   
4  0.311143  0.311143  0.311143  0.268190  ...  2.182924  2.291771  2.505224   

       2017      2018      2019      2

In [62]:
ghg_per_capita.columns

Index(['EDGAR Country Code', 'Country', '1970', '1971', '1972', '1973', '1974',
       '1975', '1976', '1977', '1978', '1979', '1980', '1981', '1982', '1983',
       '1984', '1985', '1986', '1987', '1988', '1989', '1990', '1991', '1992',
       '1993', '1994', '1995', '1996', '1997', '1998', '1999', '2000', '2001',
       '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010',
       '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019',
       '2020', '2021', '2022', 'Continent'],
      dtype='object')

In [64]:
# Reshaping the dataset from wide to long format
ghg_per_capita_long = pd.melt(ghg_per_capita, id_vars=['EDGAR Country Code', 'Country', 'Continent'],
                              var_name='Year', value_name='GHG_per_Capita')

# Convert 'Year' to a numeric data type
ghg_per_capita_long['Year'] = pd.to_numeric(ghg_per_capita_long['Year'], errors='coerce')

# Group by Continent and Year
ghg_per_capita_grouped = ghg_per_capita_long.groupby(['Continent', 'Year']).mean().reset_index()

# Visualization
plt.figure(figsize=(12, 6))
sns.lineplot(data=ghg_per_capita_grouped, x='Year', y='GHG_per_Capita', hue='Continent')
plt.title('GHG Emissions Per Capita by Continent Over Time')
plt.xlabel('Year')
plt.ylabel('Average GHG Emissions Per Capita')
plt.legend(title='Continent')
plt.show()

TypeError: Could not convert AGOBDIBENBFABWACAFCIVCMRCODCOGCOMCPVDJIDZAEGYERIESHETHGABGHAGINGMBGNBGNQKENLBRLBYLSOMARMDGMLIMOZMRTMUSMWINAMNERNGAREURWASDNSENSHNSLESOMSTPSWZSYCTCDTGOTUNTZAUGAZAFZMBZWE to numeric

In [63]:
# Set up visualization style
sns.set(style="whitegrid")

# Function to aggregate and visualize data for a given dataset
def analyze_continent_data(df, metric_name):
    # Aggregate data by continent and calculate mean for each year
    continent_agg = df.groupby('Continent').mean()

    # Plotting trends over time for each continent
    plt.figure(figsize=(12, 6))
    for continent in continent_agg.index:
        plt.plot(continent_agg.columns, continent_agg.loc[continent], marker='o', label=continent)

    plt.title(f'{metric_name} by Continent Over Time', fontsize=16)
    plt.xlabel('Year', fontsize=14)
    plt.ylabel(metric_name, fontsize=14)
    plt.legend(title='Continent')
    plt.show()

# Analyzing GHG emissions per capita by continent
analyze_continent_data(ghg_per_capita, 'GHG Emissions Per Capita')

# Analyzing total GHG emissions by continent
analyze_continent_data(ghg_totals, 'Total GHG Emissions')

# Analyzing GHG emissions by sector (considering a specific sector or aggregate sectors as needed)
# analyze_continent_data(ghg_by_sector, 'GHG Emissions by Sector')

# Analyzing GHG emissions per GDP by continent
analyze_continent_data(ghg_per_gdp, 'GHG Emissions Per GDP')

TypeError: Could not convert AGOBDIBENBFABWACAFCIVCMRCODCOGCOMCPVDJIDZAEGYERIESHETHGABGHAGINGMBGNBGNQKENLBRLBYLSOMARMDGMLIMOZMRTMUSMWINAMNERNGAREURWASDNSENSHNSLESOMSTPSWZSYCTCDTGOTUNTZAUGAZAFZMBZWE to numeric