In [None]:
# successful_results is defined at module level (outside main) so later cells can use it

import asyncio
import aiohttp
import pandas as pd
import nest_asyncio
import time
from constants import taxud_mapping, GTT_mapping, headers

# Apply nest_asyncio to allow nested use of asyncio.run()
# (needed because asyncio.run(main()) is called further down from notebook code).
nest_asyncio.apply()

# Maximum number of attempts fetch_data makes per product before giving up.
MAX_RETRIES = 5
# Base of the exponential backoff: fetch_data waits RETRY_BACKOFF_FACTOR ** attempt seconds.
RETRY_BACKOFF_FACTOR = 1.5
BATCH_SIZE = 10  # Number of requests to send in a batch
DELAY_BETWEEN_BATCHES = 5  # Delay in seconds between batches

# Filled by main(): DataFrames of products that were fetched successfully ...
successful_results = []
# ... and the error messages of the products that were not.
failed_results = []

async def fetch_data(session, product):
    """Download the weekly TAXUD import data for a single product.

    Parameters
    ----------
    session : aiohttp.ClientSession
        Open session used to issue the GET request.
    product : dict
        Mapping entry with the keys ``"sector"``, ``"HS codes"`` (inserted
        verbatim into the query string) and ``"name"``.

    Returns
    -------
    tuple
        ``(df, None)`` on success, where ``df`` is built from the JSON payload
        and its ``kgEquivalent`` column is renamed to ``TAXUD_<name>``;
        ``(None, message)`` when the request failed.

    Notes
    -----
    HTTP statuses >= 500, ``aiohttp.ClientError`` and any other exception are
    retried up to ``MAX_RETRIES`` times with an exponential backoff of
    ``RETRY_BACKOFF_FACTOR ** attempt`` seconds. Any other non-200 status is
    reported immediately without retrying.
    """
    sector = product["sector"]
    HS_codes = product["HS codes"]
    product_name = product["name"]

    url = f"https://www.ec.europa.eu/agrifood/api/taxud/weeklyData/import?importCategories=Import%20-%20preferential&importCategories=Import%20-%20most favoured nation&sectors={sector}&{HS_codes}"

    for attempt in range(MAX_RETRIES):
        try:
            async with session.get(url, headers=headers) as response:
                if response.status == 200:
                    data = await response.json()
                    df = pd.DataFrame(data).rename(
                        columns={'kgEquivalent': f'TAXUD_{product_name}'}
                    )
                    return df, None  # Return DataFrame and None as error

                if response.status < 500:  # Client error, do not retry
                    return None, f"Failed to retrieve data for {product_name}: HTTP {response.status}"

                # Server error: fall through to the backoff below and retry
                print(f"Server error (HTTP {response.status}) for {product_name}, retrying...")
        except aiohttp.ClientError as e:
            print(f"Client error for {product_name}, retrying: {e}")
        except Exception as e:
            print(f"Unexpected error for {product_name}, retrying: {e}")

        # Back off only when another attempt follows; waiting after the final
        # attempt would just delay reporting the failure.
        if attempt < MAX_RETRIES - 1:
            await asyncio.sleep(RETRY_BACKOFF_FACTOR ** attempt)

    return None, f"Failed to retrieve data for {product_name} after {MAX_RETRIES} attempts"

async def fetch_batch(session, batch):
    """Fetch every product of ``batch`` concurrently.

    Returns the list of ``fetch_data`` results in the same order as the
    products appear in ``batch``.
    """
    pending = []
    for product in batch:
        pending.append(fetch_data(session, product))

    outcomes = await asyncio.gather(*pending)
    return outcomes

async def main():
    """Fetch all products in ``taxud_mapping`` batch by batch and report the outcome.

    Side effects: rebinds the module-level ``successful_results`` (list of
    DataFrames) and ``failed_results`` (list of error messages), and prints a
    preview of every retrieved DataFrame plus all error messages.
    """
    global successful_results, failed_results

    total_products = len(taxud_mapping)
    results = []

    async with aiohttp.ClientSession() as session:
        for i in range(0, total_products, BATCH_SIZE):
            batch = taxud_mapping[i:i + BATCH_SIZE]
            results.extend(await fetch_batch(session, batch))

            # Pause only between batches; there is nothing to throttle after
            # the last one, so skip the trailing delay.
            if i + BATCH_SIZE < total_products:
                await asyncio.sleep(DELAY_BETWEEN_BATCHES)

    successful_results = [result for result, error in results if result is not None]
    failed_results = [error for result, error in results if error is not None]

    if successful_results:
        print("Successfully retrieved data:")
        for df in successful_results:
            print(df.head(10))  # Print the first 10 rows of each successfully retrieved DataFrame
    else:
        print("No data retrieved for any product.")

    if failed_results:
        print("\nFailed to retrieve data for the following products:")
        for error in failed_results:
            print(error)

# Run the main function; it fills the module-level successful_results and
# failed_results lists as a side effect.
asyncio.run(main())

# Use the successful_results in the next block of code
print("Using the successful_results in the next block of code...")
# Add your code to process successful_results here


In [None]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from dateutil import parser


# Convert a week number within a marketing year (which starts in July and ends in June) into calendar dates, so the data can be aggregated by calendar month.
def week_to_date(marketing_year, week):
    """Convert a marketing-year week number into the calendar dates it covers.

    Weeks are Monday-based and counted continuously from marketing week 1,
    which is placed 26 weeks after the first Monday of the start year (i.e.
    at the beginning of July). Counting from a single anchor guarantees that
    consecutive week numbers map to consecutive, non-overlapping calendar
    weeks, including across the turn of the year.

    Parameters
    ----------
    marketing_year : str
        Marketing year as ``"<start year>/<end year>"``, e.g. ``"2022/2023"``.
        Only the start year is needed to locate the weeks.
    week : int
        1-based week number within the marketing year.

    Returns
    -------
    tuple of datetime
        ``(week_start, week_end)``: the Monday and the following Sunday.

    Raises
    ------
    ValueError
        If the start year or the week number cannot be converted to ``int``.
    """
    start_year = int(marketing_year.split('/')[0])
    week = int(week)  # also accepts numpy integers, which timedelta rejects

    jan_1 = datetime(start_year, 1, 1)
    # First Monday on or after 1 January. The modulo keeps 1 January itself
    # when it already is a Monday instead of jumping a full week ahead.
    first_monday = jan_1 + timedelta(days=(7 - jan_1.weekday()) % 7)

    # Marketing week 1 is the 27th Monday-based week of the start year.
    week_one_start = first_monday + timedelta(weeks=26)

    week_start = week_one_start + timedelta(weeks=week - 1)
    week_end = week_start + timedelta(days=6)

    return week_start, week_end

In [None]:
import pandas as pd

aggregated_results = []


def split_week_into_months(row):
    """Distribute one weekly record over the calendar month(s) it touches.

    ``row`` must provide 'Start date' and 'End date' plus one or more
    ``TAXUD_*`` value columns. Each value is split in proportion to the number
    of days of the week that fall into each month.

    Returns a list of dicts, one per (month, TAXUD column), with the keys
    'date' (first day of the month) and the TAXUD column name.
    """
    result = []
    week_start = row['Start date']
    week_end = row['End date']
    total_days = (week_end - week_start).days + 1
    taxud_columns = [col for col in row.index if col.startswith('TAXUD_')]

    current_date = week_start
    while current_date <= week_end:
        start_of_month = current_date.replace(day=1)
        end_of_month = start_of_month + pd.DateOffset(months=1) - pd.DateOffset(days=1)

        # The current piece ends with the month or with the week, whichever comes first
        end_of_period = min(end_of_month, week_end)

        days_in_period = (end_of_period - current_date).days + 1
        proportion = days_in_period / total_days

        for col in taxud_columns:
            # Division by 1000 rescales the value (presumably kg -> tonnes; TODO confirm)
            value_for_period = (row[col] * proportion) / 1000
            result.append({
                'date': start_of_month,
                col: value_for_period
            })

        current_date = end_of_period + pd.DateOffset(days=1)

    return result


for df in successful_results:
    df.columns = df.columns.str.strip()

    if 'marketingYear' not in df.columns or 'week' not in df.columns:
        print("DataFrame does not contain 'marketingYear' or 'week' columns.")
        continue

    if df.empty:
        # Nothing to aggregate; row-wise apply on an empty frame would fail below
        continue

    # Calculate start and end dates for each week
    df[['Start date', 'End date']] = df.apply(
        lambda row: pd.Series(week_to_date(row['marketingYear'], row['week'])),
        axis=1,
    )

    # Apply the function and expand the results into a new DataFrame
    expanded_rows = df.apply(split_week_into_months, axis=1)
    expanded_df = pd.DataFrame([item for sublist in expanded_rows for item in sublist])

    if expanded_df.empty:
        # No TAXUD_ columns were present, so there is no 'date' column to group by
        continue

    # Aggregate data by month
    monthly_aggregated = expanded_df.groupby(['date']).sum().reset_index()

    # Add the aggregated DataFrame to the list
    aggregated_results.append(monthly_aggregated)

# Merge all aggregated DataFrames on 'date'
if aggregated_results:
    final_aggregated_df = aggregated_results[0]
    for df in aggregated_results[1:]:
        final_aggregated_df = pd.merge(final_aggregated_df, df, on='date', how='outer')

    # Replace NaNs with zeros and order chronologically (new objects, so the
    # frames stored in aggregated_results are not modified)
    final_aggregated_df = final_aggregated_df.fillna(0).sort_values(by='date')
else:
    # Keep a typed, empty 'date' column so later merges on 'date' still work
    final_aggregated_df = pd.DataFrame({'date': pd.Series(dtype='datetime64[ns]')})

TAXUD_merged_df = final_aggregated_df

# Print the final aggregated DataFrame
print(TAXUD_merged_df.head(10))


In [None]:
# main_script.py
import os
from database_connection import query_data


# SSH tunnel and SQL connection settings used by the database cell below.
# NOTE(review): `X` is not defined anywhere in the visible notebook — it looks
# like a placeholder for redacted values, so this cell raises NameError as is.
# Supply the real values from a secure source (e.g. environment variables)
# rather than hardcoding credentials in the notebook.
a_ssh_host = X
a_ssh_user = X
a_ssh_port = X
a_ssh_private_key = X
a_sql_hostname = X
a_sql_username = X
a_sql_password = X
a_sql_database = X
a_sql_port = X

In [None]:
import pandas as pd
import os
from dotenv import load_dotenv
from database_connection import ssh_tunnel, db_connection, query_data

# Load environment variables from .env
load_dotenv()

# One DataFrame per GTT product, each with the columns 'date' and 'GTT_<name>'
dfs_GTT = []

with ssh_tunnel(a_ssh_host, a_ssh_port, a_ssh_user, a_ssh_private_key, a_sql_hostname, a_sql_port) as local_port:
    with db_connection(local_port, a_sql_username, a_sql_password, a_sql_database) as conn:
        for item in GTT_mapping:
            GTT_code = item['GTT_code']
            name = item['name']

            # NOTE(review): GTT_code is interpolated directly into the SQL text.
            # That is only safe while GTT_mapping is a trusted constant; switch to
            # a parameterized query if query_data supports one.
            query = f'''SELECT date, amount
                        FROM vesper.total_import_export_figures
                        WHERE country_id = 200 AND hs_code = {GTT_code} AND data_interval = 'monthly' AND type = 1 AND date > '2010-12-31'
                        ORDER BY date ASC'''

            result = query_data(conn, query)
            if result.empty:
                continue

            result['date'] = pd.to_datetime(result['date'])
            dfs_GTT.append(result.rename(columns={'amount': f'GTT_{name}'}))

if dfs_GTT:
    # Start with the first dataframe and outer-merge the rest on 'date'.
    # NaNs are deliberately kept: a missing month is not the same as a zero.
    merged_df = dfs_GTT[0]
    for df in dfs_GTT[1:]:
        merged_df = pd.merge(merged_df, df, on='date', how='outer')

    GTT_merged_df = merged_df

    print(GTT_merged_df.head(10))

else:
    print("No dataframes to merge")
    # Still define GTT_merged_df (empty, with a typed 'date' column) so the
    # merge in the next cell does not fail with a NameError.
    GTT_merged_df = pd.DataFrame({'date': pd.Series(dtype='datetime64[ns]')})


In [None]:
# Combine both sources on the month; the outer join keeps months present in only one of them
TAXUD_and_GTT = pd.merge(TAXUD_merged_df, GTT_merged_df, on='date', how='outer')

# Analysis window (both bounds inclusive)
start_date = pd.to_datetime('2012-01-01')
end_date = pd.to_datetime('2024-05-31')

in_window = TAXUD_and_GTT['date'].between(start_date, end_date)

# .copy() makes the result an independent frame, so adding columns to it later
# does not trigger pandas' SettingWithCopyWarning.
filtered_TAXUD_and_GTT = TAXUD_and_GTT.loc[in_window].copy()

# Print a summary of the DataFrame
print("Merged DataFrame head:")
print(filtered_TAXUD_and_GTT.head())

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages

# Expects filtered_TAXUD_and_GTT (defined in the previous cell) with a 'date'
# column and matching 'TAXUD_<name>' / 'GTT_<name>' column pairs.

# Detach from the frame it was sliced from, so the column assignments below
# modify an independent object (avoids SettingWithCopyWarning).
filtered_TAXUD_and_GTT = filtered_TAXUD_and_GTT.copy()

# Extract the product names. Split only on the first underscore so that names
# which themselves contain underscores are kept intact.
suffixes = {
    col.split('_', 1)[1]
    for col in filtered_TAXUD_and_GTT.columns
    if col.startswith(('TAXUD_', 'GTT_'))
}

# Ensure date column is in datetime format
filtered_TAXUD_and_GTT['date'] = pd.to_datetime(filtered_TAXUD_and_GTT['date'])

# Extract year and month for plotting purposes
filtered_TAXUD_and_GTT['year'] = filtered_TAXUD_and_GTT['date'].dt.year
filtered_TAXUD_and_GTT['month'] = filtered_TAXUD_and_GTT['date'].dt.month

# Write one page per product to a single PDF
with PdfPages('time_series_plots.pdf') as pdf:
    # Sorted so the page order is the same on every run
    for suffix in sorted(suffixes):
        taxud_col = f'TAXUD_{suffix}'
        gtt_col = f'GTT_{suffix}'

        # Only products available from both sources can be compared
        if taxud_col not in filtered_TAXUD_and_GTT.columns or gtt_col not in filtered_TAXUD_and_GTT.columns:
            continue

        fig, ax = plt.subplots(figsize=(12, 6))

        # Plot TAXUD data
        ax.plot(filtered_TAXUD_and_GTT['date'], filtered_TAXUD_and_GTT[taxud_col], label=f'TAXUD_{suffix}', color='blue')

        # Plot GTT data
        ax.plot(filtered_TAXUD_and_GTT['date'], filtered_TAXUD_and_GTT[gtt_col], label=f'GTT_{suffix}', color='red')

        # Set up the title, labels, and legend
        ax.set_title(f'Time Series for {suffix}')
        ax.set_xlabel('Date')
        ax.set_ylabel('Value')
        ax.legend()
        ax.grid(True)

        # Adjust layout and add the plot to the PDF
        fig.tight_layout()
        pdf.savefig(fig)  # Save this figure into the PDF
        plt.close(fig)  # Close the figure to free memory
