# Acquiring data from the web

## S3 Storage (AWS)

In [None]:
# Standard-library modules
import os
import sys

# Load environment variables from the project's .env file
from dotenv import load_dotenv

load_dotenv('../config/.env')

# Make the scripts folder importable, then pull in the S3 loader helpers
sys.path.append("../scripts/")
from data_acquisition import load_csv_file, load_parquet_file

# Name of the S3 bucket the datasets are read from
bucket_name = 'migraine-pressure-6week-analysis'

### Acquire Weather Data from S3 Bucket

In [None]:
# Cities: S3 object key and the dataframe loaded from it
url_cities = 'weather/cities.csv'
url_countries = 'weather/countries.csv'
url_daily_weather = 'weather/daily_weather.parquet'

# Fetch each object from the bucket (CSV for the lookup tables,
# parquet for the daily weather records)
df_cities = load_csv_file(bucket_name, url_cities)
df_countries = load_csv_file(bucket_name, url_countries)
df_daily_weather = load_parquet_file(bucket_name, url_daily_weather)

# Preview the first rows of every dataframe that was actually returned
for weather_frame in (df_cities, df_countries, df_daily_weather):
    if weather_frame is not None:
        print(weather_frame.head())


### Combine Weather Data

In [None]:
# Combine cities and daily weather dataframes into one.
# NOTE(review): the loaders appear to return None on failure (the loading cell
# checks for it), so fail here with a clear message instead of an opaque
# AttributeError/TypeError raised from inside merge.
if df_cities is None or df_daily_weather is None:
    raise RuntimeError(
        "Cannot combine weather data: df_cities and/or df_daily_weather failed to load"
    )

# Left join keeps every city row, matched on the columns shared by both frames;
# `on=` replaces the duplicated left_on/right_on lists.
df_cities_daily_weather = df_cities.merge(
    df_daily_weather,
    how='left',
    on=['station_id', 'city_name'],
)

# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
df_cities_daily_weather.info()
print(df_cities_daily_weather.head())


### Acquire Migraine Data from S3 Bucket

In [None]:
# S3 object key of the migraine CSV
url_migraine = 'health/IHME-GBD_2019_DATA-361f72c5-1.csv'

# Fetch the CSV from the bucket into a dataframe
df_migraine = load_csv_file(bucket_name, url_migraine)

# Preview the first rows, but only when the loader returned something
migraine_loaded = df_migraine is not None
if migraine_loaded:
    print(df_migraine.head())

### Acquire Population Data from S3 Bucket

In [None]:
# Import other required libraries
import pandas as pd
import boto3

# Initialize S3 client with credentials read from environment variables
s3 = boto3.client(
    's3',
    aws_access_key_id=os.getenv('AWS_ACCESS_KEY_ID'),
    aws_secret_access_key=os.getenv('AWS_SECRET_ACCESS_KEY'),
)

# Every object under this prefix is read as one CSV part of the population data
prefix = 'population/'

# A single list call returns at most 1000 keys, so walk all result pages;
# otherwise files beyond the first page would be silently dropped.
paginator = s3.get_paginator('list_objects_v2')

# Collect the per-file frames and concatenate once at the end: calling
# pd.concat inside the loop re-copies the accumulated data on every iteration.
population_frames = []
for page in paginator.paginate(Bucket=bucket_name, Prefix=prefix):
    # 'Contents' is missing from the response when nothing matches the prefix
    for object_summary in page.get('Contents', []):
        file_name = object_summary['Key']
        # Skip zero-byte "folder" placeholder keys (e.g. 'population/'), which
        # pd.read_csv cannot parse
        if file_name.endswith('/') or object_summary['Size'] == 0:
            continue
        obj = s3.get_object(Bucket=bucket_name, Key=file_name)
        population_frames.append(pd.read_csv(obj['Body']))

# pd.concat raises on an empty list, so fall back to an empty DataFrame
if population_frames:
    population_combined = pd.concat(population_frames, ignore_index=True)
else:
    population_combined = pd.DataFrame()

# Display the combined population DataFrame
if population_combined.empty:
    print(f"No population data found under s3://{bucket_name}/{prefix}")
else:
    print(population_combined.head())
    print(population_combined.tail())
