In [None]:
# import necessary libraries and dependencies

import requests
import pandas as pd
from io import BytesIO
import boto3  # For uploading to AWS S3
from io import StringIO

# Function to download data from a specific sheet and extract the necessary columns and store those columns as a df
def extract_sheet_data(url, sheet_name, columns_to_extract, skiprows=5, timeout=60):
    """Download an Excel workbook and return selected columns of one sheet.

    Args:
        url: Address of the Excel (.xlsx) file to download.
        sheet_name: Name of the worksheet to load.
        columns_to_extract: Column labels to keep, in the desired order.
        skiprows: Number of leading rows to skip before the header row.
        timeout: Seconds to wait for the server before giving up; forwarded
            to ``requests.get``.

    Returns:
        A new DataFrame containing only ``columns_to_extract``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
        KeyError: If a requested column is absent from the sheet.
    """
    # Without an explicit timeout, requests waits indefinitely on a server
    # that accepts the connection but never answers.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()  # Fail loudly if the download did not succeed

    # The context manager closes the workbook once the sheet has been parsed.
    with pd.ExcelFile(BytesIO(response.content), engine="openpyxl") as xlsx:
        df = pd.read_excel(xlsx, sheet_name=sheet_name, skiprows=skiprows)

    # Return an independent copy so that later in-place edits by the caller
    # (e.g. renaming a column) do not refer back to the full sheet.
    return df[columns_to_extract].copy()


# Function to upload DataFrame to S3 in JSON format
def upload_to_s3(df, bucket_name, file_name):
    """Serialize a DataFrame as line-delimited JSON and store it in S3.

    Args:
        df: DataFrame to serialize.
        bucket_name: Name of the destination S3 bucket.
        file_name: Object key under which the payload is stored.
    """
    # With no target buffer, to_json returns the serialized text directly:
    # 'records' orientation, one JSON object per line.
    payload = df.to_json(orient='records', lines=True)

    # Create the S3 client and push the payload in a single request.
    boto3.client('s3').put_object(
        Bucket=bucket_name,
        Key=file_name,
        Body=payload,
    )

# Source workbook: social housing data for France, covering
# January 2 2023 to January 1 2024.
url = "https://www.statistiques.developpement-durable.gouv.fr/media/7970/download?inline"

# Columns kept from each sheet. Legend:
#   'REG'        = region number
#   'LIBREG'     = region name
#   'DEP'        = department number
#   'Unnamed: 1' = department name (the sheet has no header for this column);
#                  meant to be renamed to 'LIBDEP' later in the notebook
#   'DEPCOM_ARM' = INSEE commune code (looks like a postal code but is not one)
#   'LIBCOM'     = commune name
#   'densite'    = density of social housing units per 100 main residences
#                  (source dating from RPLS 2021)
#   'nb_ls'      = total social housing stock: units offered for rent (vacant
#                  or rented) + vacant units managed by an association + units
#                  occupied with or without financial compensation + units
#                  occupied for temporary accommodation
#   'tx_vac'     = vacancy rate: percentage of vacant units among those
#                  offered for rent as of January 1 of year n-1
#   'tx_mob'     = mobility rate: percentage of move-ins among the units
#                  offered for rent
columns_region = ['REG', 'LIBREG', 'densite', 'nb_ls', 'tx_vac', 'tx_mob']
columns_departement = ['REG', 'DEP', 'Unnamed: 1', 'densite', 'nb_ls', 'tx_vac', 'tx_mob']
columns_commune = ['REG', 'DEP', 'DEPCOM_ARM', 'LIBCOM', 'densite', 'nb_ls', 'tx_vac', 'tx_mob']

# Load one DataFrame per sheet, keeping only the columns listed above.
df_region = extract_sheet_data(url, sheet_name="REGION", columns_to_extract=columns_region)
df_departement = extract_sheet_data(url, sheet_name="DEPARTEMENT", columns_to_extract=columns_departement)
df_commune = extract_sheet_data(url, sheet_name="COMMUNES", columns_to_extract=columns_commune)

# Destination bucket (placeholder: replace with the ODIS S3 bucket name).
bucket_name = 'insert_bucket_name_here'

# Upload each DataFrame to S3 as its own JSON file.
# NOTE(review): no rename of 'Unnamed: 1' happens before this point, so the
# department upload keeps that key rather than 'LIBDEP' - confirm intended.
for frame, object_key in (
    (df_region, 'region_data.json'),
    (df_departement, 'departement_data.json'),
    (df_commune, 'commune_data.json'),
):
    upload_to_s3(frame, bucket_name, object_key)

print("Files uploaded successfully to S3.")



