# Etude de cas - LivraisonCo

## Weather forecast features


## Requirements

In [139]:
import calendar
import datetime
import os
from datetime import date
from datetime import datetime  # must stay after `import datetime`: later cells call datetime.strptime on the class

import pandas as pd
import requests
from dateutil.parser import parse
from tqdm import tqdm

## Data import

In [140]:
# World Weather Online credentials and endpoint.
# The key is read from the WWO_API_KEY environment variable when it is set, so it does not
# have to be stored in the notebook. The literal is kept only as a backward-compatible
# fallback. NOTE(review): this key is exposed in the notebook source; consider rotating it
# and removing the fallback.
WWO_API_KEY = os.environ.get("WWO_API_KEY") or "d6dd1b8b56d34d83b38134048220512"  # world weather online
WWO_BASE_URL = "https://api.worldweatheronline.com/premium/v1/past-weather.ashx"

# API query functions

def get_weather_data(start_date, end_date=None, location="paris,france", data_format="json", forecast_time_interval="1", key=WWO_API_KEY, timeout=30):
    """
        Query the World Weather Online past-weather endpoint; limit : 500 calls / day
        parameters :
            start_date, end_date : "yyyy-mm-dd" (end_date is optional)
            location : city or latitude and longitude : "XX.XXX,XX.XXX"; when using "paris", the location is 48.867, 2.333
            data_format : response format requested from the API; the response is decoded with r.json()
            forecast_time_interval : time interval (in hours)
            key : API key sent with the request
            timeout : seconds passed to requests.get so a stalled connection cannot hang the notebook
        output:
            list of dictionaries, one for each day between start_date and end_date,
            or None when the API reports an error (the error message is printed)
    """
    params_dict = {'date': start_date, 'q': location,
                   'format': data_format, 'tp': forecast_time_interval, 'key': key}
    if end_date:
        params_dict['enddate'] = end_date

    r = requests.get(WWO_BASE_URL, params=params_dict, timeout=timeout)
    json_data = r.json()

    # Look for an error list at the top level (what this function always checked) and also
    # under 'data'. NOTE(review): the API appears to nest errors under 'data' -- confirm
    # against the WWO documentation.
    errors = json_data.get('error')
    if not errors and isinstance(json_data.get('data'), dict):
        errors = json_data['data'].get('error')
    if errors:
        print(errors[0]['msg'])
        return None

    return json_data['data']['weather']


def get_month_weather_data(month, year):
    """
        Fetch the weather data of one calendar month through get_weather_data.
        parameters :
            month, year : month and year numbers, as strings or ints (e.g. "3", "2021")
        output:
            the value returned by get_weather_data for the month;
            None, without querying the API, when the month lies entirely in the future
        For the current month the requested range stops at today's date.
    """
    first_day = date(int(year), int(month), 1)
    # calendar.monthrange returns (weekday of the 1st, number of days in the month)
    days_in_month = calendar.monthrange(int(year), int(month))[1]
    last_day = first_day.replace(day=days_in_month)

    # verify if the dates are available
    today = date.today()
    if today < first_day:
        return None
    last_day = min(last_day, today)

    # isoformat() yields zero-padded "yyyy-mm-dd", the format get_weather_data documents
    return get_weather_data(first_day.isoformat(), last_day.isoformat())


In [152]:

# Helper functions

def convert_time_format(time):
  """Turn a raw hour code ("0", "100", ..., "2300") into an "HH:00:00" string.

  "0" maps to midnight, a 3-character code uses its first character as the hour and a
  4-character code uses its first two. Any other input yields None.
  """
  if time == "0":
    return "00:00:00"

  width = len(time)
  if width not in (3, 4):
    return None

  hour = "0" + time[0] if width == 3 else time[:2]
  return hour + ":00:00"

def get_datetime(date, time):
  """Join a date string and a raw hour code into one "<date> HH:00:00" string."""
  prefix = date + " "
  return prefix + convert_time_format(time)

def convert_format_sunrise(time):
  """Convert a 12-hour clock string such as "08:43 AM" into 24-hour "HH:MM:SS"."""
  return datetime.strptime(time, "%I:%M %p").strftime("%H:%M:%S")

def is_day_criteria(time, sunrise,sunset):
  """Tell whether `time` falls between `sunrise` and `sunset`, both bounds included."""
  not_before_sunrise = time >= sunrise
  not_after_sunset = time <= sunset
  # `&` rather than `and`, as in the original expression
  return not_before_sunrise & not_after_sunset

# Main function

def convert_data_to_df(data):
  """Flatten a list of per-day weather dictionaries into one row per hourly record.

  Parameters:
    data: list of per-day dictionaries. Each must provide 'date', 'astronomy' (a list whose
      first element is a dict of astronomy fields) and 'hourly' (a list of dicts holding
      the hourly fields listed in `hourly_columns`). None or an empty list is accepted.

  Returns:
    A DataFrame with the remaining daily columns, the hourly columns, a 'datetime'
    column (date + hour) and a boolean 'is_day' column (sunrise <= datetime <= sunset).
    An empty DataFrame is returned when `data` is None or empty.

  Raises:
    KeyError: when an expected field or a column listed for dropping is missing.
  """
  # None or an empty list cannot be flattened: return an empty frame
  # instead of failing on the missing 'astronomy' column.
  if not data:
    return pd.DataFrame()

  daily = pd.DataFrame(data)

  # flatten astronomy data: one column per key of the first day's astronomy dict
  astronomy = daily.pop('astronomy')
  for feature in list(astronomy.iloc[0][0].keys()):
    daily[feature] = [entry[0][feature] for entry in astronomy]

  # one row per hourly record; ignore_index gives a clean 0..n-1 index
  hourly = daily.explode('hourly', ignore_index=True)
  hourly_records = hourly.pop('hourly')

  hourly_columns = ['time', 'tempC', 'windspeedKmph', 'winddirDegree', 'weatherCode', 'precipMM', 'humidity',
                    'visibility', 'pressure', 'cloudcover', 'HeatIndexC', 'DewPointC', 'WindChillC', 'WindGustKmph', 'FeelsLikeC', 'uvIndex']
  # an hourly field sharing its name with a daily column (e.g. 'uvIndex') overwrites it
  for feature in hourly_columns:
    hourly[feature] = [record[feature] for record in hourly_records]

  # hour codes look like "0", "100", ..., "2300": left-pad to "HHMM" before parsing
  clock = hourly["time"].str.zfill(4)
  hourly["datetime"] = pd.to_datetime(hourly["date"] + " " + clock, format="%Y-%m-%d %H%M")

  # sunrise / sunset are 12-hour clock strings such as "08:43 AM"
  sunrise = pd.to_datetime(hourly["date"] + " " + hourly["sunrise"], format="%Y-%m-%d %I:%M %p")
  sunset = pd.to_datetime(hourly["date"] + " " + hourly["sunset"], format="%Y-%m-%d %I:%M %p")

  # between() includes both bounds by default
  hourly["is_day"] = hourly["datetime"].between(sunrise, sunset)

  return hourly.drop(columns=[
      "date",
      "sunHour",
      "sunrise",
      "sunset",
      "moonrise",
      "moonset",
      "moon_phase",
      "moon_illumination",
      "time",
      "maxtempF",
      "mintempF",
      "avgtempF"
      ])

In [175]:
# Download every month of 2020-2022 and stack the hourly frames into df_final.
df_list=[]

for year in range(2020,2023):
  for month in tqdm(range(1,13)):
    data = get_month_weather_data(str(month),str(year))
    # get_month_weather_data returns None for future months (and get_weather_data returns
    # None on API errors): there is nothing to convert, so skip the month
    if not data:
      continue
    df = convert_data_to_df(data)
    df_list.append(df)

# reset_index() keeps the old per-month row labels in an 'index' column
df_final = pd.concat(df_list, axis=0).reset_index()

100%|██████████| 12/12 [00:18<00:00,  1.52s/it]
100%|██████████| 12/12 [00:19<00:00,  1.66s/it]
100%|██████████| 12/12 [00:15<00:00,  1.25s/it]


In [181]:
# Remove the 'index' column left behind by reset_index(). errors="ignore" keeps this cell
# re-runnable: a second execution no longer fails once the column is gone.
df_final = df_final.drop(columns=["index"], errors="ignore")
df_final

Unnamed: 0,maxtempC,mintempC,avgtempC,totalSnow_cm,uvIndex,tempC,windspeedKmph,winddirDegree,weatherCode,precipMM,...,visibility,pressure,cloudcover,HeatIndexC,DewPointC,WindChillC,WindGustKmph,FeelsLikeC,datetime,is_day
0,6,3,4,0.0,1,4,6,127,116,0.0,...,10,1032,5,4,1,3,8,3,2020-01-01 00:00:00,False
1,6,3,4,0.0,1,4,6,129,116,0.0,...,10,1032,7,4,1,3,8,3,2020-01-01 01:00:00,False
2,6,3,4,0.0,1,4,5,130,116,0.0,...,10,1031,9,4,1,3,8,3,2020-01-01 02:00:00,False
3,6,3,4,0.0,1,3,5,131,116,0.0,...,10,1031,10,3,1,2,8,2,2020-01-01 03:00:00,False
4,6,3,4,0.0,1,3,5,129,116,0.0,...,10,1031,14,3,1,2,8,2,2020-01-01 04:00:00,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25699,6,2,5,0.0,1,5,9,38,122,0.0,...,10,1020,100,5,1,3,12,3,2022-12-06 19:00:00,False
25700,6,2,5,0.0,1,5,9,38,122,0.0,...,10,1020,100,5,1,3,12,3,2022-12-06 20:00:00,False
25701,6,2,5,0.0,1,5,8,38,122,0.0,...,10,1020,100,5,0,3,12,3,2022-12-06 21:00:00,False
25702,6,2,5,0.0,1,4,7,27,122,0.0,...,10,1021,79,4,1,3,11,3,2022-12-06 22:00:00,False


In [182]:
# Export the assembled frame as a semicolon-separated CSV, without the row index.
df_final.to_csv(
    "/content/drive/MyDrive/CentraleSupelec/3A/BCG_datathon/Data/meteo.csv",
    sep=';',
    index=False,
)

In [178]:
# List the column names of the assembled frame.
# NOTE: the recorded output still shows an 'index' column because this cell (execution
# count 178) was run before the cell that drops it (execution count 181).
df_final.columns

Index(['index', 'maxtempC', 'mintempC', 'avgtempC', 'totalSnow_cm', 'uvIndex',
       'tempC', 'windspeedKmph', 'winddirDegree', 'weatherCode', 'precipMM',
       'humidity', 'visibility', 'pressure', 'cloudcover', 'HeatIndexC',
       'DewPointC', 'WindChillC', 'WindGustKmph', 'FeelsLikeC', 'datetime',
       'is_day'],
      dtype='object')