## Step 1) Import libraries

In [1]:
from pathlib import Path
from urllib.parse import quote_plus

import pandas as pd
from dotenv import dotenv_values
from sqlalchemy import create_engine, types
from sqlalchemy import event
from sqlalchemy import text # to be able to pass string

## Step 2) Create connection string

In [2]:
# Read the connection settings from the .env file.
# (`dotenv_values` is imported in the import cell at the top of the notebook.)
config = dotenv_values()

# Login variables. Each key label must match the one used in your .env file;
# a label that is absent from the file raises a KeyError right here.
pg_user = config['POSTGRES_USER']
pg_host = config['POSTGRES_HOST']
pg_port = config['POSTGRES_PORT']
pg_db = config['POSTGRES_DB']
pg_schema = config['POSTGRES_SCHEMA']
pg_pass = config['POSTGRES_PASS']

### Building the URL

In [3]:
# Build the connection URL from the values loaded from the .env file.
# The user name and password are percent-encoded so that special characters
# in the credentials (for example '@', ':' or '/') cannot be mistaken for
# URL delimiters when the URL is parsed.
url = (
    f'postgresql://{quote_plus(pg_user)}:{quote_plus(pg_pass)}'
    f'@{pg_host}:{pg_port}/{pg_db}'
)

# The URL names only the database, not a schema: without specifying the
# schema, the default connection is to the schema `public`.

## Step 3) Create engine

In [4]:
# Create the SQLAlchemy engine for the connection URL built above.
# echo=False: do not log the emitted SQL statements.
engine = create_engine(
    url,
    echo=False,
)

### Connecting to the schema

In [5]:
my_schema = 'clear_skies'


# `SET search_path` is a per-session setting. Running it once on a single
# pooled connection does not cover connections the pool opens later (after a
# recycle, an invalidation, or a second concurrent checkout), so it is
# registered as a `connect` hook that runs for every new DBAPI connection.
@event.listens_for(engine, 'connect')
def _set_search_path(dbapi_connection, connection_record):
    """Point each newly opened DBAPI connection at `my_schema`."""
    # Run the SET in autocommit mode so it takes effect immediately and does
    # not leave a transaction open on the fresh connection.
    previous_autocommit = dbapi_connection.autocommit
    dbapi_connection.autocommit = True
    cursor = dbapi_connection.cursor()
    cursor.execute(f'SET search_path TO {my_schema};')
    cursor.close()
    dbapi_connection.autocommit = previous_autocommit


# Drop connections that may already sit in the pool (e.g. when this cell is
# re-run) so that every connection used from here on has gone through the hook.
engine.dispose()

### Step 4) Run SQL statement and load the data from the database

In [23]:
# Fetch every row of prep_flights together with the column names of the result.
with engine.begin() as conn: # Done with echo=False
    result = conn.execute(text('SELECT * FROM prep_flights'))
    columns = list(result.keys())  # must be read while the result is still open
    data = result.all()

### Let's create a dataframe out of that
# The column names are passed explicitly so the frame keeps its columns even
# when the query returns no rows (they would otherwise be inferred from the
# first row only).
master_df = pd.DataFrame(data, columns=columns)
master_df

Unnamed: 0,date,sched_dep_time,actual_dep_time,dep_delay,sched_arr_time,actual_arr_time,arr_delay,airline,tail_number,flight_number,...,diverted,avg_temp_c,min_temp_c,max_temp_c,precipitation_mm,max_snow_mm,avg_wind_direction,avg_wind_speed_kmh,wind_peakgust_kmh,avg_pressure_hpa
0,2014-10-01,09:00:00,08:54:00,-6.0,12:10:00,12:02:00,-8.0,AA,N785AA,1,...,0,17.7,16.7,19.4,0.0,0,22.0,18.4,,1016.8
1,2014-10-02,09:00:00,08:54:00,-6.0,12:10:00,12:07:00,-3.0,AA,N783AA,1,...,0,17.4,16.1,20.0,0.0,0,,13.3,,1019.9
2,2014-10-03,09:00:00,08:55:00,-5.0,12:10:00,11:45:00,-25.0,AA,N786AA,1,...,0,16.8,12.8,20.0,0.0,0,89.0,11.5,,1017.2
3,2014-10-04,09:00:00,08:52:00,-8.0,12:10:00,12:03:00,-7.0,AA,N784AA,1,...,0,18.8,11.1,22.2,17.8,0,175.0,26.3,,1004.1
4,2014-10-05,09:00:00,08:57:00,-3.0,12:10:00,12:18:00,8.0,AA,N792AA,1,...,0,12.7,8.3,16.1,0.0,0,273.0,20.9,,1010.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
207894,2016-03-18,20:10:00,20:06:00,-4.0,23:09:00,22:43:00,-26.0,UA,N53442,712,...,0,11.9,2.2,18.3,0.0,0,266.0,16.9,,1013.1
207895,2016-03-18,17:27:00,17:29:00,2.0,20:15:00,20:15:00,0.0,UA,N27421,719,...,0,11.9,2.2,18.3,0.0,0,266.0,16.9,,1013.1
207896,2016-03-18,12:33:00,12:24:00,-9.0,14:31:00,14:31:00,0.0,UA,N406UA,735,...,0,11.9,2.2,18.3,0.0,0,266.0,16.9,,1013.1
207897,2016-03-18,19:04:00,19:02:00,-2.0,21:52:00,21:45:00,-7.0,UA,N38424,751,...,0,11.9,2.2,18.3,0.0,0,266.0,16.9,,1013.1


### Step 5) Cleaning and Formatting

In [28]:
# Convert `date` explicitly. The recorded info() output of this cell lists
# `date` as datetime64[ns]; doing the conversion here guarantees that dtype on
# a fresh "Restart & Run All", however the database driver returns the column.
master_df['date'] = pd.to_datetime(master_df['date'], errors='coerce', format='%Y-%m-%d')

# Columns that must be numeric. errors='coerce' turns every value that cannot
# be parsed as a number into NaN instead of raising.
numeric_columns = [
    'dep_delay',
    'arr_delay',
    'precipitation_mm',
    'max_snow_mm',
    'avg_wind_speed_kmh',
    'avg_pressure_hpa',
    'avg_temp_c',
    'min_temp_c',
    'max_temp_c',
    'avg_wind_direction',
    'wind_peakgust_kmh',
]
for column in numeric_columns:
    master_df[column] = pd.to_numeric(master_df[column], errors='coerce')

master_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 207899 entries, 0 to 207898
Data columns (total 25 columns):
 #   Column                        Non-Null Count   Dtype          
---  ------                        --------------   -----          
 0   date                          207899 non-null  datetime64[ns] 
 1   sched_dep_time                207899 non-null  object         
 2   actual_dep_time               203694 non-null  object         
 3   dep_delay                     203694 non-null  float64        
 4   sched_arr_time                207899 non-null  object         
 5   actual_arr_time               203512 non-null  object         
 6   arr_delay                     203051 non-null  float64        
 7   airline                       207899 non-null  object         
 8   tail_number                   206311 non-null  object         
 9   flight_number                 207899 non-null  int64          
 10  airport_code                  207899 non-null  object         
 11  

### Step 6) Export data to CSV

In [30]:
# Make sure the target folder exists before writing: to_csv does not create
# missing directories and would fail on a fresh checkout without ./data.
Path('./data').mkdir(parents=True, exist_ok=True)
master_df.to_csv('./data/master_df.csv')