In [6]:
import io
import zipfile
import pandas as pd
import requests
import re
from openpyxl import load_workbook

# Folder (relative to the working directory) that the CSV exports are written to.
_raw_dir = '../../../data/_raw/edgar'
# Common URL prefix; each route's 'route' value is appended to it to form the download URL.
base_url = "https://jeodpp.jrc.ec.europa.eu/ftp/jrc-opendata/EDGAR/datasets/v70_FT2021_GHG"

# Download plan: one dict per zip archive, with the keys
#   'route'        - archive path appended to `base_url`
#   'csv_name'     - output file stem appended to `_raw_dir` (suffix/extension added later)
#   'sheets'       - workbook sheets to export
#   'target_value' - cell value that identifies the header row of a sheet
routes = [
    {
        'route': archive,
        'csv_name': csv_stem,
        # list(...) gives every entry its own independent list object.
        'sheets': list(sheet_names),
        'target_value': header_marker,
    }
    for archive, csv_stem, sheet_names, header_marker in (
        ("/EDGAR_CH4_1970-2021.zip", '/edgar_file_em_ch4',
         ('IPCC 2006', 'TOTALS BY COUNTRY'), 'IPCC_annex'),
        ("/IEA_EDGAR_CO2_1970-2021.zip", '/edgar_file_em_co2',
         ('IPCC 2006', 'TOTALS BY COUNTRY'), 'IPCC_annex'),
        ("/EDGAR_F-gases_1990-2021.zip", '/edgar_file_em_fgas',
         ('IPCC 2006', 'TOTALS BY COUNTRY'), 'IPCC_annex'),
        ("/EDGAR_N2O_1970-2021.zip", '/edgar_file_em_n2o',
         ('IPCC 2006', 'TOTALS BY COUNTRY'), 'IPCC_annex'),
        ("/v70_FT2021_GHG_AR4_AR5b.zip", '/edgar_file_em_tot_co2eq',
         ('Total GHG by sector country AR5', 'Total GHG by country AR5'),
         'EDGAR Country Code'),
    )
]

# Sheets to iterate over are configured per archive via each route's 'sheets' entry
# (formerly one global list: ['IPCC 2006', 'TOTALS BY COUNTRY']).

# For every configured archive: download the zip, open the first .xlsx workbook
# found inside it, and export each configured sheet to a CSV file. The export
# starts at the row that contains the route's 'target_value' (the header row).
# A failure for one archive or sheet is reported and skipped so that the
# remaining ones are still processed.
for route in routes:
    url = base_url + route['route']
    # Output path stem shared by all CSVs of this archive. It is computed before
    # any fallible step so the error messages below always refer to the current
    # route (previously it was assigned inside the `try`, so the BadZipFile
    # handler could hit an undefined or stale value).
    csv = _raw_dir + route['csv_name']
    print('route:' + route['route'])

    # Download the zip file. HTTP errors are reported as such instead of being
    # passed on to zipfile, where an HTML error page would look like a bad zip.
    try:
        response = requests.get(url, timeout=60)
        response.raise_for_status()
    except requests.RequestException as ex:
        print(f'Download failed for {url}: {ex}')
        continue
    print('response.url:' + response.url)

    # Open the archive in memory and load the first Excel workbook it contains.
    zip_data = io.BytesIO(response.content)
    try:
        with zipfile.ZipFile(zip_data, 'r') as zip_ref:
            excel_file = next(
                (name for name in zip_ref.namelist() if name.endswith('.xlsx')),
                None)
            if excel_file is None:
                print("Excel file not found in the zip.")
                continue

            # Default (non read-only) mode loads the whole workbook into memory,
            # so the sheets remain usable after the archive is closed.
            with zip_ref.open(excel_file) as excel_data:
                workbook = load_workbook(excel_data)
    except zipfile.BadZipFile:
        print(f'BadZipFile:  {csv}')
        continue

    target_value = route['target_value']
    for sheet_name in route['sheets']:
        # Sheets broken down by sector / IPCC code get a '_sect' suffix.
        is_sector_sheet = 'IPCC' in sheet_name or 'sector' in sheet_name
        csv_filename = csv + ('_sect' if is_sector_sheet else '') + '.csv'

        if sheet_name not in workbook.sheetnames:
            print(f"Sheet '{sheet_name}' not found in '{excel_file}'.")
            continue
        sheet = workbook[sheet_name]

        # Single pass over the sheet: skip rows until the one holding the target
        # value in its first 5 columns. That row supplies the column headers and
        # whatever is left in the iterator is the data.
        rows = sheet.iter_rows(values_only=True)
        headers = next((row for row in rows if target_value in row[:5]), None)
        if headers is None:
            print(
                f"Target value '{target_value}' not found in sheet '{sheet_name}'.")
            continue

        # Year columns are kept wide here; reshaping happens in a later
        # processing step.
        df = pd.DataFrame(rows, columns=headers)

        df.to_csv(csv_filename, index=False)
        print(f'CSV: {csv_filename}')
        # Show a preview of the exported table.
        print(df.head())


route:/EDGAR_CH4_1970-2021.zip
response.url:https://jeodpp.jrc.ec.europa.eu/ftp/jrc-opendata/EDGAR/datasets/v70_FT2021_GHG/EDGAR_CH4_1970-2021.zip
CSV: ../../../data/_raw/edgar/edgar_file_em_ch4_sect.csv
    IPCC_annex       C_group_IM24_sh Country_code_A3   Name  \
0  Non-Annex_I  Rest Central America             ABW  Aruba   
1  Non-Annex_I  Rest Central America             ABW  Aruba   
2  Non-Annex_I  Rest Central America             ABW  Aruba   
3  Non-Annex_I  Rest Central America             ABW  Aruba   
4  Non-Annex_I  Rest Central America             ABW  Aruba   

  ipcc_code_2006_for_standard_report  \
0                            1.A.1.a   
1                              1.A.2   
2                              1.A.4   
3                              1.A.5   
4                              1.B.1   

         ipcc_code_2006_for_standard_report_name Substance fossil_bio  \
0  Main Activity Electricity and Heat Production       CH4        bio   
1      Manufacturing Industrie

CSV: ../../../data/_raw/edgar/edgar_file_em_fgas_sect.csv
    IPCC_annex C_group_IM24_sh Country_code_A3         Name  \
0  Non-Annex_I         India +             AFG  Afghanistan   
1  Non-Annex_I         India +             AFG  Afghanistan   
2  Non-Annex_I         India +             AFG  Afghanistan   
3  Non-Annex_I         India +             AFG  Afghanistan   
4  Non-Annex_I         India +             AFG  Afghanistan   

  ipcc_code_2006_for_standard_report  \
0                                2.F   
1                                2.F   
2                                2.F   
3                                2.F   
4                                2.F   

             ipcc_code_2006_for_standard_report_name  Substance fossil_bio  \
0  Product Uses as Substitutes for Ozone Depletin...    HFC-125     fossil   
1  Product Uses as Substitutes for Ozone Depletin...   HFC-134a     fossil   
2  Product Uses as Substitutes for Ozone Depletin...   HFC-143a     fossil   
3  Product