# DATA INTEGRATION MODULE

In [1]:
import pandas as pd
# Silence every warning category for the rest of the kernel session.
# NOTE(review): this blanket filter also hides pandas deprecation and
# SettingWithCopy warnings that later cells may raise; consider narrowing it.
import warnings
warnings.filterwarnings("ignore")


#### ECB €STR (euro short-term rate)

In [None]:
import pandas as pd

# Load the raw ECB export and parse the observation date.
est_data = pd.read_csv('FETCHED_DATA/ECB_API/EST.csv')
est_data['TIME_PERIOD'] = pd.to_datetime(est_data['TIME_PERIOD'])

# Rename the generic 'value' column to EST_VALUE before reshaping.
est_data_cleaned = est_data.rename(columns={'value': 'EST_VALUE'})

# Reshape to one row per date and one column per (BENCHMARK_ITEM, DATA_TYPE_EST) pair.
est_data_cleaned = est_data_cleaned.pivot(
    index='TIME_PERIOD',
    columns=['BENCHMARK_ITEM', 'DATA_TYPE_EST'],
    values='EST_VALUE',
).reset_index()

# Flatten the two-level column labels into 'ITEM_TYPE' strings.
# The date column comes out as 'TIME_PERIOD_' and is renamed below.
est_data_cleaned.columns = [
    '_'.join(col).strip() if isinstance(col, tuple) else col
    for col in est_data_cleaned.columns.values
]

# Fill gaps forward, then backward for any leading gaps.
# .ffill()/.bfill() replace the deprecated fillna(method=...) spelling.
est_data_cleaned = est_data_cleaned.ffill().bfill()

# Give the series their final, prefixed names.
est_data_cleaned = est_data_cleaned.rename(columns={
    'TIME_PERIOD_': 'TIME_PERIOD',
    'EU000A2QQF08_CI': 'ESTR_EU000A2QQF08_CI',
    'EU000A2X2A25_NT': 'ESTR_EU000A2X2A25_NT',
    'EU000A2X2A25_TT': 'ESTR_EU000A2X2A25_TT',
})

# Keep only the date and the three renamed series (missing ones are silently skipped by filter).
est_data_cleaned = est_data_cleaned.filter(
    items=['TIME_PERIOD', 'ESTR_EU000A2QQF08_CI', 'ESTR_EU000A2X2A25_NT', 'ESTR_EU000A2X2A25_TT']
)

# Sort chronologically.
est_data_cleaned = est_data_cleaned.sort_values(by='TIME_PERIOD')

# Subtract 100 from every 'ESTR_EU000A2QQF08_CI' value.
# NOTE(review): the original comment describes 100 as the series' base value since 2019 - confirm.
est_data_cleaned['ESTR_EU000A2QQF08_CI'] = est_data_cleaned['ESTR_EU000A2QQF08_CI'] - 100

est_data_cleaned.head()

#### ECB CISS  

In [None]:
import pandas as pd

# Read the raw ECB CISS export.
ciss_data = pd.read_csv('FETCHED_DATA/ECB_API/ECB_CISS.csv')

# Parse TIME_PERIOD; values that cannot be parsed become NaT instead of raising.
ciss_data['TIME_PERIOD'] = pd.to_datetime(ciss_data['TIME_PERIOD'], errors='coerce')

# Keep only rows for reference area 'U2'.
ciss_data_filtered = ciss_data[ciss_data['REF_AREA'] == 'U2']

# Drop the 'SS_CIN', 'SOV_EW' and 'SOV_GDPW' providers; the remaining
# providers are the ones pivoted into columns below.
ciss_data_filtered = ciss_data_filtered[
    ~ciss_data_filtered['PROVIDER_FM_ID'].isin(['SS_CIN', 'SOV_EW', 'SOV_GDPW'])
]

# One row per TIME_PERIOD, one column per (PROVIDER_FM_ID, DATA_TYPE_FM) pair,
# populated from the 'value' column.
# NOTE(review): pivot raises if a (date, provider, data type) combination occurs
# more than once - confirm the export holds a single frequency per series.
ecb_ciss_si_data_cleaned = ciss_data_filtered.pivot(
    index='TIME_PERIOD',
    columns=['PROVIDER_FM_ID', 'DATA_TYPE_FM'],
    values='value',
).reset_index()

# Flatten the two-level column labels into 'PROVIDER_TYPE' strings.
ecb_ciss_si_data_cleaned.columns = [
    '_'.join(col).strip() if isinstance(col, tuple) else col
    for col in ecb_ciss_si_data_cleaned.columns.values
]

# Fill gaps forward, then backward for any leading gaps.
# .ffill()/.bfill() replace the deprecated fillna(method=...) spelling.
ecb_ciss_si_data_cleaned = ecb_ciss_si_data_cleaned.ffill().bfill()

# Restore the date column name and give the four sub-indices their final names.
ecb_ciss_si_data_cleaned = ecb_ciss_si_data_cleaned.rename(columns={
    'TIME_PERIOD_': 'TIME_PERIOD',
    'SS_BM_CON': 'CISS_EA20_SS_BM',
    'SS_FI_CON': 'CISS_EA20_SS_FI',
    'SS_FX_CON': 'CISS_EA20_SS_FX',
    'SS_MM_CON': 'CISS_EA20_SS_MM',
})

# Keep only the date and the four renamed sub-index columns.
ecb_ciss_si_data_cleaned = ecb_ciss_si_data_cleaned.filter(
    items=['TIME_PERIOD', 'CISS_EA20_SS_BM', 'CISS_EA20_SS_FI', 'CISS_EA20_SS_FX', 'CISS_EA20_SS_MM']
)

# Sort chronologically.
ecb_ciss_si_data_cleaned = ecb_ciss_si_data_cleaned.sort_values(by='TIME_PERIOD')

# Show the first 5 rows of the cleaned DataFrame.
ecb_ciss_si_data_cleaned.head()


In [None]:
# Read the raw ECB CISS export.
ecb_ciss = pd.read_csv('FETCHED_DATA/ECB_API/ECB_CISS.csv')

# Select the daily ('D') 'SS_CIN' series for reference area 'U2'.
# .copy() gives an independent frame, so the column assignment below does not
# write into a view of ecb_ciss (chained assignment).
ea20_series_mask = (
    (ecb_ciss['REF_AREA'] == 'U2')
    & (ecb_ciss['FREQ'] == 'D')
    & (ecb_ciss['PROVIDER_FM_ID'] == 'SS_CIN')
)
ea20_ecb_ciss = ecb_ciss[ea20_series_mask].copy()

# Parse TIME_PERIOD into datetimes (the original comment states the raw format is YYYY-MM-DD).
ea20_ecb_ciss['TIME_PERIOD'] = pd.to_datetime(ea20_ecb_ciss['TIME_PERIOD'])

# Keep date + value, rename value to CISS_EA20_SS_CIN, and sort chronologically.
ea20_ecb_ciss_cleaned = (
    ea20_ecb_ciss[['TIME_PERIOD', 'value']]
    .rename(columns={'value': 'CISS_EA20_SS_CIN'})
    .sort_values(by='TIME_PERIOD')
)
ea20_ecb_ciss_cleaned.head()

#### WHO 

In [None]:
import pandas as pd

# Final names for the four WHO measures.
covid_column_names = {
    'New_cases': 'Covid_EA20_New_Cases',
    'Cumulative_cases': 'Covid_EA20_Cumulative_Cases',
    'New_deaths': 'Covid_EA20_New_Deaths',
    'Cumulative_deaths': 'Covid_EA20_Cumulative_Deaths'
}

# Load the WHO export, expose the report date as a datetime TIME_PERIOD column,
# order the rows chronologically, drop 'Country' and apply the final names.
covid_data_ea20 = (
    pd.read_csv('FETCHED_DATA/WHO_API/COVID_DATA_EA20.csv')
    .rename(columns={'Date_reported': 'TIME_PERIOD'})
    .assign(TIME_PERIOD=lambda frame: pd.to_datetime(frame['TIME_PERIOD']))
    .sort_values(by='TIME_PERIOD')
    .drop(columns=['Country'])
    .rename(columns=covid_column_names)
)

# Show the first rows for verification.
covid_data_ea20.head()

#### ESTAT | INFLATION 

In [None]:
from datetime import datetime

# Load the annual inflation export and keep EA20 rows from 2020 onwards.
# TIME_PERIOD is compared as a number here (the year); it is not converted to a datetime.
inflation = pd.read_csv('FETCHED_DATA/ESTAT_API/ESTAT_INFLATION_2024.csv')
inflation = inflation[inflation['geo'] == 'EA20']
inflation = inflation[inflation['TIME_PERIOD'] >= 2020]

# Daily calendar from 2020-01-01 up to today.
start_date = '2020-01-01'
end_date = datetime.now().strftime('%Y-%m-%d')
date_range = pd.date_range(start=start_date, end=end_date, freq='D')

# Repeat each annual value on every calendar day of its year by joining the
# annual rows to the daily calendar on the year. This replaces a row-by-row
# Python loop and also yields a correctly-shaped empty frame when no rows match.
# NOTE(review): if the export holds more than one EA20 row per year (e.g. several
# coicop/unit combinations), every date appears once per row - confirm.
annual_inflation = inflation[['TIME_PERIOD', 'value']].rename(
    columns={'TIME_PERIOD': 'year', 'value': 'INFLATION_EA20_VALUE'}
)
daily_calendar = pd.DataFrame({'TIME_PERIOD': date_range, 'year': date_range.year})
daily_inflation_ea20 = annual_inflation.merge(daily_calendar, on='year', how='inner')[
    ['TIME_PERIOD', 'INFLATION_EA20_VALUE']
]

daily_inflation_ea20.head()

#### ESTAT | GDP 

In [None]:
# Load the annual GDP export and keep EA20 rows in unit 'CLV_PCH_PRE' from 2020 onwards.
estat_gdp = pd.read_csv('FETCHED_DATA/ESTAT_API/ESTAT_GDP_2024.csv')
estat_gdp = estat_gdp[estat_gdp['geo'] == 'EA20']
estat_gdp = estat_gdp[estat_gdp['unit'] == 'CLV_PCH_PRE']
estat_gdp = estat_gdp[estat_gdp['TIME_PERIOD'] >= 2020]

# Daily calendar from 2020-01-01 to the fixed cut-off 2024-06-30
# (unlike the sibling cells, this range does not extend to today).
start_date = '2020-01-01'
end_date = '2024-06-30'
date_range = pd.date_range(start=start_date, end=end_date, freq='D')

# Repeat each annual value on every calendar day of its year by joining the
# annual rows to the daily calendar on the year. This replaces a row-by-row
# Python loop and also yields a correctly-shaped empty frame when no rows match.
annual_gdp = estat_gdp[['TIME_PERIOD', 'value']].rename(
    columns={'TIME_PERIOD': 'year', 'value': 'GDP_GROWTH_EA20'}
)
daily_calendar = pd.DataFrame({'TIME_PERIOD': date_range, 'year': date_range.year})
daily_gdp_ea20 = annual_gdp.merge(daily_calendar, on='year', how='inner')[
    ['TIME_PERIOD', 'GDP_GROWTH_EA20']
]

# Sort chronologically.
daily_gdp_ea20 = daily_gdp_ea20.sort_values(by='TIME_PERIOD')

daily_gdp_ea20.head()

#### ESTAT | UNEMPLOYMENT 

In [None]:
from datetime import datetime

estat_unemployment_ea20 = pd.read_csv('FETCHED_DATA/ESTAT_API/ESTAT_UNEMPLOYMENT.csv')

# Indicator codes left out of the integrated dataset.
excluded_indicators = ['LM-UN-F-GT25', 'LM-UN-F-LE25', 'LM-UN-M-GT25', 'LM-UN-M-LE25']

# Keep EA20 rows from 2020-01 onwards, drop the 'NSA' adjustment and the
# excluded indicators. TIME_PERIOD is still a string here, so the date test is
# a string comparison (assumes 'YYYY-MM' formatted values - TODO confirm).
# .copy() makes the result independent of the raw frame so the column
# assignment below is not a chained assignment on a slice.
filtered_data = estat_unemployment_ea20[
    (estat_unemployment_ea20['geo'] == 'EA20')
    & (estat_unemployment_ea20['TIME_PERIOD'] >= '2020-01')
    & (estat_unemployment_ea20['s_adj'] != 'NSA')
    & ~estat_unemployment_ea20['indic'].isin(excluded_indicators)
].copy()

# Convert TIME_PERIOD to datetime.
filtered_data['TIME_PERIOD'] = pd.to_datetime(filtered_data['TIME_PERIOD'])

# Daily calendar from 2020-01-01 up to today.
start_date = '2020-01-01'
end_date = datetime.now().strftime('%Y-%m-%d')
date_range = pd.date_range(start=start_date, end=end_date, freq='D')

# One row per TIME_PERIOD, one column per (indic, unit, s_adj) combination.
# pivot_table averages duplicates (its default aggregation).
pivot_data = filtered_data.pivot_table(
    index='TIME_PERIOD', columns=['indic', 'unit', 's_adj'], values='value'
).reset_index()

# Flatten the three-level column labels into 'indic_unit_sadj' strings.
pivot_data.columns = [
    '_'.join(col).strip() if isinstance(col, tuple) else col
    for col in pivot_data.columns.values
]

# The date column's lower levels are empty, so joining leaves 'TIME_PERIOD__';
# strip the doubled underscore.
pivot_data.columns = [
    col.replace('__', '') if isinstance(col, str) else col
    for col in pivot_data.columns
]

# Expand to one row per calendar day, carrying the last observation forward.
# .ffill() replaces the deprecated fillna(method='ffill') spelling.
pivot_data = pivot_data.set_index('TIME_PERIOD').reindex(date_range).ffill().reset_index()
pivot_data = pivot_data.rename(columns={'index': 'TIME_PERIOD'})

# Final column names for the five retained series.
pivot_data = pivot_data.rename(columns={
    'LM-UN-F-TOT_PC_ACT_SA': 'UN_EA20_F_TOT',
    'LM-UN-M-TOT_PC_ACT_SA': 'UN_EA20_M_TOT',
    'LM-UN-T-GT25_PC_ACT_SA': 'UN_EA20_GT25_TOT',
    'LM-UN-T-LE25_PC_ACT_SA': 'UN_EA20_LE25_TOT',
    'LM-UN-T-TOT_PC_ACT_SA': 'UN_EA20_T_TOT',
})

# Sort chronologically; the name is rebound from the raw export to the cleaned daily frame.
estat_unemployment_ea20 = pivot_data.sort_values(by='TIME_PERIOD')

estat_unemployment_ea20

#### ESTAT | CONSUMER CONFIDENCE

In [None]:
from datetime import datetime

estat_consumer_confidence_ea20 = pd.read_csv('FETCHED_DATA/ESTAT_API/ESTAT_CONSUMER_CONFIDENCE.csv')

# Indicator codes left out of the integrated dataset.
excluded_indicators = ['BS-FS-LY', 'BS-GES-LY', 'BS-MP-PR', 'BS-PT-LY', 'BS-SFSH', 'BS-UE-NY']

# Keep EA20 rows from 2020-01 onwards, drop the 'NSA' adjustment and the
# excluded indicators. TIME_PERIOD is still a string here, so the date test is
# a string comparison (assumes 'YYYY-MM' formatted values - TODO confirm).
# .copy() makes the result independent of the raw frame so the column
# assignment below is not a chained assignment on a slice.
filtered_data = estat_consumer_confidence_ea20[
    (estat_consumer_confidence_ea20['geo'] == 'EA20')
    & (estat_consumer_confidence_ea20['TIME_PERIOD'] >= '2020-01')
    & (estat_consumer_confidence_ea20['s_adj'] != 'NSA')
    & ~estat_consumer_confidence_ea20['indic'].isin(excluded_indicators)
].copy()

# Convert TIME_PERIOD to datetime.
filtered_data['TIME_PERIOD'] = pd.to_datetime(filtered_data['TIME_PERIOD'])

# Daily calendar from 2020-01-01 up to today.
start_date = '2020-01-01'
end_date = datetime.now().strftime('%Y-%m-%d')
date_range = pd.date_range(start=start_date, end=end_date, freq='D')

# One row per TIME_PERIOD, one column per (indic, unit, s_adj) combination.
# pivot_table averages duplicates (its default aggregation).
pivot_data = filtered_data.pivot_table(
    index='TIME_PERIOD', columns=['indic', 'unit', 's_adj'], values='value'
).reset_index()

# Flatten the three-level column labels into 'indic_unit_sadj' strings.
pivot_data.columns = [
    '_'.join(col).strip() if isinstance(col, tuple) else col
    for col in pivot_data.columns.values
]

# The date column's lower levels are empty, so joining leaves 'TIME_PERIOD__';
# strip the doubled underscore.
pivot_data.columns = [
    col.replace('__', '') if isinstance(col, str) else col
    for col in pivot_data.columns
]

# Expand to one row per calendar day, carrying the last observation forward.
# .ffill() replaces the deprecated fillna(method='ffill') spelling.
pivot_data = pivot_data.set_index('TIME_PERIOD').reindex(date_range).ffill().reset_index()
pivot_data = pivot_data.rename(columns={'index': 'TIME_PERIOD'})

# Final column names for the six retained series.
pivot_data = pivot_data.rename(columns={
    'BS-CSMCI_BAL_SA': 'BS_EA20_CSMCI',
    'BS-FS-NY_BAL_SA': 'BS_EA20_FS_NY',
    'BS-MP-NY_BAL_SA': 'BS_EA20_MP_NY',
    'BS-GES-NY_BAL_SA': 'BS_EA20_GES_NY',
    'BS-PT-NY_BAL_SA': 'BS_EA20_PT_NY',
    'BS-SV-NY_BAL_SA': 'BS_EA20_SV_NY',
})

# Sort chronologically; the name is rebound from the raw export to the cleaned daily frame.
estat_consumer_confidence_ea20 = pivot_data.sort_values(by='TIME_PERIOD')

estat_consumer_confidence_ea20


#### ESTAT | GOVYC 

In [None]:
from datetime import datetime

govdebt_df_ea20 = pd.read_csv('FETCHED_DATA/ESTAT_API/ESTAT_GOVYC_2024.csv')

# Keep only rows with geo == 'EA', reduce to date + value, and rename the value to GOVDEBT_EA20.
govdebt_df_ea20 = govdebt_df_ea20[govdebt_df_ea20['geo'] == 'EA']
govdebt_df_ea20 = govdebt_df_ea20[['TIME_PERIOD', 'value']]
govdebt_df_ea20 = govdebt_df_ea20.rename(columns={'value': 'GOVDEBT_EA20'})

# Parse 'YYYY-MM' strings; each month maps to its first day.
govdebt_df_ea20['TIME_PERIOD'] = pd.to_datetime(govdebt_df_ea20['TIME_PERIOD'], format='%Y-%m')

# Daily calendar from 2020-01-01 up to today.
start_date = '2020-01-01'
end_date = datetime.now().strftime('%Y-%m-%d')
date_range = pd.date_range(start=start_date, end=end_date, freq='D')

# Expand to one row per calendar day, carrying the last observation forward.
# .ffill() replaces the deprecated fillna(method='ffill') spelling.
# NOTE(review): reindex raises if TIME_PERIOD is not unique after the geo
# filter - confirm the export holds a single 'EA' series.
govdebt_df_ea20 = govdebt_df_ea20.set_index('TIME_PERIOD').reindex(date_range).ffill().reset_index()
govdebt_df_ea20 = govdebt_df_ea20.rename(columns={'index': 'TIME_PERIOD'})

govdebt_df_ea20

#### ECB | EXCHANGE RATES EUR to USD, CNY

In [None]:
# Load the EUR exchange-rate export (one row per date and currency).
exr = pd.read_csv("FETCHED_DATA/ECB_API/EXCHANGE_RATES_EUR_to_['USD', 'CNY'].csv")

# Final names for the two currency columns produced by the pivot.
exchange_rate_names = {'USD': 'ECB_EXCHANGE_RATES_USD_EUR', 'CNY': 'ECB_EXCHANGE_RATES_CNY_EUR'}

# Spread currencies into columns, rename them, keep date + both rates in a
# fixed order, and parse TIME_PERIOD into datetimes.
pivot_exchange_df = (
    exr.pivot(index='TIME_PERIOD', columns='CURRENCY', values='value')
    .reset_index()
    .rename(columns=exchange_rate_names)
    .loc[:, ['TIME_PERIOD', 'ECB_EXCHANGE_RATES_USD_EUR', 'ECB_EXCHANGE_RATES_CNY_EUR']]
    .assign(TIME_PERIOD=lambda frame: pd.to_datetime(frame['TIME_PERIOD']))
)
pivot_exchange_df

#### ALPHA VANTAGE | GLOBAL PRICES for OIL, GAS, WHEAT, CORN

In [None]:
import pandas as pd

# Commodity name -> CSV export path.
commodity_files = {
    'oil': 'FETCHED_DATA/ALPHA_VANTAGE_API/oil_prices.csv',
    'gas': 'FETCHED_DATA/ALPHA_VANTAGE_API/gas_prices.csv',
    'corn': 'FETCHED_DATA/ALPHA_VANTAGE_API/corn_prices.csv',
    'wheat': 'FETCHED_DATA/ALPHA_VANTAGE_API/wheat_prices.csv'
}

commodity_data = []

for commodity_name, csv_path in commodity_files.items():
    label = commodity_name.capitalize()
    # Index by the parsed 'date' column (exposed as TIME_PERIOD) and strip
    # whitespace from the column headers.
    prices = (
        pd.read_csv(csv_path, parse_dates=['date'])
        .set_index('date')
        .rename_axis('TIME_PERIOD')
        .rename(columns=str.strip)
    )
    # Keep close price and volume only, prefixed with the commodity name.
    prices = prices[['4. close', '5. volume']].rename(columns={
        '4. close': f'{label}_Close_Price',
        '5. volume': f'{label}_Volume'
    })
    commodity_data.append(prices)

# Place the per-commodity frames side by side, aligned on the TIME_PERIOD index.
commodity_merged_df = pd.concat(commodity_data, axis=1)

# Show the combined frame to verify the result.
commodity_merged_df


#### PYTRENDS 

In [None]:
import pandas as pd

# Load the Google Trends export.
trends_df = pd.read_csv("FETCHED_DATA/PY_GOOGLE_TRENDS_API/PY_GOOGLE_TRENDS.csv")

# Rename 'date' to 'TIME_PERIOD' so the frame can be joined with the other sources.
trends_df = trends_df.rename(columns={'date': 'TIME_PERIOD'})

# Parse TIME_PERIOD into datetimes.
trends_df['TIME_PERIOD'] = pd.to_datetime(trends_df['TIME_PERIOD'])

# Move TIME_PERIOD to the first position, leaving the other columns in their existing order.
columns_order = ['TIME_PERIOD'] + [col for col in trends_df.columns if col != 'TIME_PERIOD']
trends_df = trends_df[columns_order]

# Search-term headers (which include literal double quotes) -> final column names.
new_column_names = {
    '"Russia Ukraine War"': 'Russia_Ukraine_War_Trend',
    '"Israel Gaza War"': 'Israel_Gaza_War_Trend',
    '"Financial Crisis"': 'Financial_Crisis_Trend',
    '"Climate Change"': 'Climate_Change_Trend',
    '"Covid-19"': 'Covid19_Trend'
}

trends_df = trends_df.rename(columns=new_column_names)

# Show only the first few rows; the previous head(2000) contradicted this intent.
trends_df.head()

#### YOUTUBE 

In [None]:
import pandas as pd

# Search-term result-count headers (which include literal double quotes) -> final column names.
new_column_names = {
    '"Russia Ukraine War"_total_results': 'Russia_Ukraine_War_VideoCount',
    '"Israel Gaza War"_total_results': 'Israel_Gaza_War_VideoCount',
    '"Financial Crisis"_total_results': 'Financial_Crisis_VideoCount',
    '"Climate Change"_total_results': 'Climate_Change_VideoCount',
    '"Covid-19"_total_results': 'Covid19_VideoCount'
}

# Load the YouTube daily export and expose its 'Date' column as a datetime TIME_PERIOD.
videocount_df = (
    pd.read_csv("FETCHED_DATA/YOUTUBE_API/YOUTUBE_DAILY_RESULTS.csv")
    .rename(columns={'Date': 'TIME_PERIOD'})
    .assign(TIME_PERIOD=lambda frame: pd.to_datetime(frame['TIME_PERIOD']))
)

# Put TIME_PERIOD first, keep the remaining columns in their existing order,
# then apply the final names.
columns_order = ['TIME_PERIOD'] + [col for col in videocount_df.columns if col != 'TIME_PERIOD']
videocount_df = videocount_df[columns_order].rename(columns=new_column_names)

# Show the resulting frame to verify the changes.
videocount_df


## EA20 INFUSION DATASET

In [None]:
# Outer join every prepared source on TIME_PERIOD, in this order.
# commodity_merged_df carries TIME_PERIOD as its index name, which
# pd.merge(on=...) accepts as a join key.
frames_to_merge = [
    est_data_cleaned,
    ea20_ecb_ciss_cleaned,
    ecb_ciss_si_data_cleaned,
    daily_inflation_ea20,
    daily_gdp_ea20,
    govdebt_df_ea20,
    estat_unemployment_ea20,
    estat_consumer_confidence_ea20,
    pivot_exchange_df,
    commodity_merged_df,
    covid_data_ea20,
    trends_df,
    videocount_df,
]

merged_data_ea20 = frames_to_merge[0]
for frame in frames_to_merge[1:]:
    merged_data_ea20 = pd.merge(merged_data_ea20, frame, on='TIME_PERIOD', how='outer')

# Forward/backward filling is only meaningful on chronologically ordered rows,
# so make the ordering explicit instead of relying on the merge's key ordering.
merged_data_ea20 = merged_data_ea20.sort_values(by='TIME_PERIOD', kind='stable', ignore_index=True)

# Series whose gaps are filled with the previous observation, and with the
# next observation for any leading gaps.
columns_to_ffill_bfill = [
    'ESTR_EU000A2QQF08_CI', 'ESTR_EU000A2X2A25_NT', 'ESTR_EU000A2X2A25_TT',
    'CISS_EA20_SS_CIN', 'CISS_EA20_SS_BM', 'CISS_EA20_SS_FI', 'CISS_EA20_SS_FX',
    'CISS_EA20_SS_MM', 'ECB_EXCHANGE_RATES_USD_EUR', 'ECB_EXCHANGE_RATES_CNY_EUR',
    'Oil_Close_Price', 'Oil_Volume', 'Gas_Close_Price', 'Gas_Volume', 'Corn_Close_Price',
    'Corn_Volume', 'Wheat_Close_Price', 'Wheat_Volume'
]

# .ffill()/.bfill() replace the deprecated fillna(method=...) spelling.
merged_data_ea20[columns_to_ffill_bfill] = merged_data_ea20[columns_to_ffill_bfill].ffill().bfill()

# Keep only data up to 2024-06-30. .copy() detaches the result from the
# unfiltered frame so later column assignments do not write into a slice.
merged_data_ea20 = merged_data_ea20[merged_data_ea20['TIME_PERIOD'] <= '2024-06-30'].copy()

print("Column names in merged_data_ea20:")
print(merged_data_ea20.columns)

def interpolate_weekly(data, columns):
    """Linearly interpolate missing values in the given columns.

    Each listed column is replaced by its linear interpolation with
    ``limit_direction='both'``, so gaps at either end of the column are
    filled as well as interior ones.

    Args:
        data: DataFrame to update. It is modified in place.
        columns: Iterable of column names to interpolate.

    Returns:
        The same ``data`` object that was passed in.
    """
    interpolation_options = {'method': 'linear', 'limit_direction': 'both'}
    for name in columns:
        data[name] = data[name].interpolate(**interpolation_options)
    return data

# WHO measures that are coerced to numbers and then gap-filled by interpolation.
covid_columns = ['Covid_EA20_New_Cases', 'Covid_EA20_Cumulative_Cases', 'Covid_EA20_New_Deaths', 'Covid_EA20_Cumulative_Deaths']

# Coerce each Covid column to numeric; values that cannot be parsed become NaN.
merged_data_ea20[covid_columns] = merged_data_ea20[covid_columns].apply(pd.to_numeric, errors='coerce')

# Fill the remaining gaps in the Covid columns by linear interpolation.
merged_data_ea20 = interpolate_weekly(merged_data_ea20, covid_columns)

# Persist the integrated dataset without the row index.
merged_data_ea20.to_csv('INFUSED_DATA/merged_data_EA20.csv', index=False)

merged_data_ea20
