# Setup 

### Importing 

In [1]:
from src.scraping import scrape_wiki_cities, scrape_weather, icao_airport_codes, city_airport_distance
from src.utils import ConfigHandler, MyMySQLConnection, load_csv_or_execute
from src.utils import intersect_cols

### Load Private Keys

In [2]:
# Requires a config file '.env_aws' at the same level as the main file, containing:
#      [APIs]
#      openweather_key = <key1>
#      aerodatabox_key = <key2>
#      [SQL]
#      <credentials handed to MyMySQLConnection; the key names are not visible in this notebook>
# where each <key> is the bare key, without quotes or anything else.

relative_path_to_file = '.env_aws'
keys = ConfigHandler(relative_path_to_file)
api_key_dict = keys.load_config('APIs')   # read later via 'openweather_key' and 'aerodatabox_key'
sql_cred_dict = keys.load_config('SQL')   # passed unchanged to MyMySQLConnection

### Create MySQL Connection

In [3]:
# Connection object used by every database cell below. The second argument is
# presumably the database name (the disabled reset cell drops a database called
# 'gans_scooters') - confirm in src.utils.
cnx = MyMySQLConnection(sql_cred_dict, 'gans_scooters')

# Scraping
## CITIES

In [4]:
# City master data. load_csv_or_execute receives the CSV path and the scraper;
# presumably it reads the CSV when it exists and otherwise runs the scraper -
# confirm in src.utils.
relative_city_csv = 'data/cities.csv'
cities_df = load_csv_or_execute(relative_city_csv, scrape_wiki_cities)

In [5]:
# Sanity check of the city table: column names, dtypes and non-null counts.
cities_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 94 entries, 0 to 93
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   city                94 non-null     object 
 1   member_state        94 non-null     object 
 2   officialpopulation  94 non-null     int64  
 3   date                94 non-null     object 
 4   lat                 94 non-null     float64
 5   lon                 94 non-null     float64
dtypes: float64(2), int64(1), object(3)
memory usage: 5.1+ KB


## WEATHER

In [6]:
# Weather data for every city in cities_df, keyed by city name.
# The scraper is not called directly: it is handed to load_csv_or_execute
# together with its keyword arguments and the CSV path.
relative_city_csv = 'data/weather.csv'
func = scrape_weather
args = dict(
    city_lst=list(cities_df['city']),
    openweather_key=api_key_dict['openweather_key'],
)
weather_df = load_csv_or_execute(relative_city_csv, func, args)

In [7]:
# Sanity check of the weather table: column names, dtypes and non-null counts.
weather_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3760 entries, 0 to 3759
Data columns (total 13 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   city                     3760 non-null   object 
 1   date                     3760 non-null   object 
 2   temp_celcius             3760 non-null   float64
 3   temp_feels_like_celcius  3760 non-null   float64
 4   humidity_percent         3760 non-null   int64  
 5   weather_description      3760 non-null   object 
 6   visibility               3760 non-null   int64  
 7   wind_speed_meter_sec     3760 non-null   float64
 8   wind_direction_degree    3760 non-null   int64  
 9   wind_gust_meter_sec      3760 non-null   float64
 10  pop_percent              3760 non-null   float64
 11  rain_3h_mm               3760 non-null   float64
 12  pod                      3760 non-null   object 
dtypes: float64(6), int64(3), object(4)
memory usage: 411.2+ KB


## AIRPORTS

In [22]:
# Inputs for the airport lookup: name, latitude and longitude of each city,
# taken column-wise from cities_df so the three lists line up by position.
cities = cities_df['city'].tolist()
latitudes = cities_df['lat'].tolist()
longitudes = cities_df['lon'].tolist()

# Same pattern as for the other data sets: CSV path + function + keyword arguments.
relative_city_csv = 'data/airports.csv'
func = icao_airport_codes
args = dict(
    city_val=cities,
    latitudes=latitudes,
    longitudes=longitudes,
    aerodatabox_key=api_key_dict['aerodatabox_key'],
)
airports_df = load_csv_or_execute(relative_city_csv, func, args)


In [None]:
# Sanity check of the airport table: column names, dtypes and non-null counts.
airports_df.info()

## AIRPORT DISTANCES

In [23]:
# City-to-airport distances (columns: city, icao, distance_in_km), computed from
# the two frames loaded above.
airport_distances_df = load_csv_or_execute(
    'data/airport_distances.csv',
    city_airport_distance,
    {'cities_df': cities_df, 'airports_df': airports_df},
)

In [24]:
# Sanity check of the distance table: column names, dtypes and non-null counts.
airport_distances_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 260 entries, 0 to 259
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   city            260 non-null    object 
 1   icao            260 non-null    object 
 2   distance_in_km  260 non-null    float64
dtypes: float64(1), object(2)
memory usage: 8.1+ KB


In [25]:
airport_distances_df  # rendered preview of the city / icao / distance_in_km rows

Unnamed: 0,city,icao,distance_in_km
0,Berlin,EDDB,17.76
1,Berlin,EDDT,27.83
2,Madrid,LEMD,29.75
3,Rome,LIRF,30.23
4,Rome,LIRA,39.17
...,...,...,...
255,Constanța,LRCK,30.39
256,Catania,LICC,44.63
257,Catania,LICR,86.32
258,Catania,LICB,88.30


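## FLIGHTS

> **TODO:** no cell in this section creates `flights_df`, although the table-creation and upload cells below use it.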

# Adding to Database

![Database Schema](data/GansScooter.png)

In [31]:
# Manual reset, intentionally left commented out: uncommenting these two lines
# drops the whole 'gans_scooters' database and deletes the connection object.
#cnx.execute('drop database gans_scooters')
#del cnx

In [37]:
###   CREATE TABLES
################################
# Every table is described as {tablename: (column names, SQL column definitions)}.
# The two lists are parallel: definition i belongs to column i. All specs are
# merged into one dict and handed to cnx.create_tables() at the end of the cell.


def _table_spec(tablename, columns, definitions):
    """Build one ``{tablename: (columns, definitions)}`` entry.

    Args:
        tablename: name of the table to create.
        columns: list of column names.
        definitions: list of SQL column definitions, parallel to ``columns``.

    Returns:
        A one-item dict mapping ``tablename`` to ``(columns, definitions)``.

    Raises:
        ValueError: if the two lists differ in length, which would otherwise
            pair columns with the wrong SQL definition (or none at all).
    """
    if len(columns) != len(definitions):
        raise ValueError(
            f'{tablename}: {len(columns)} columns but {len(definitions)} column definitions')
    return {tablename: (columns, definitions)}


# --- cities: surrogate key followed by the columns of cities_df -----------------
cities_primary = 'city_id'
cities_cols = [cities_primary, *cities_df.columns]
cities_dict = _table_spec('cities', cities_cols, [
    f'int NOT NULL AUTO_INCREMENT, PRIMARY KEY ({cities_primary})',  # city_id
    'varchar(255) NOT NULL',       # city
    'varchar(255)',                # member_state
    'int',                         # officialpopulation
    'DATETIME',                    # date
    'DECIMAL(6, 4)',               # lat: |lat| <= 90 fits two integer digits
    'DECIMAL(7, 4)',               # lon: |lon| can reach 180, so three integer digits
                                   #      are needed (DECIMAL(6, 4) stops at 99.9999)
])

# --- weather: defined by column NAME, not by position in weather_df -------------
# The scraped frame does not always carry the same precipitation columns (the
# frame shown above has rain_3h_mm but no snow_3h_mm). Pairing names and types
# explicitly keeps every column on its intended type regardless of which columns
# the scraper returned or in which order.
cities_foreign_key = f'FOREIGN KEY ({cities_primary}) REFERENCES cities({cities_primary})'
weather_definitions = {
    cities_primary:            f'int NOT NULL, {cities_foreign_key}',
    'date':                    'DATETIME NOT NULL',
    'temp_celcius':            'DECIMAL(4, 2)',
    'temp_feels_like_celcius': 'DECIMAL(4, 2)',
    'humidity_percent':        'INT(3)',
    'weather_description':     'varchar(255)',
    'visibility':              'INT(5) DEFAULT 10000',
    'wind_speed_meter_sec':    'DECIMAL(5, 2)',
    'wind_direction_degree':   'INT(3)',
    'wind_gust_meter_sec':     'DECIMAL(5, 2)',
    'pop_percent':             'DECIMAL(5, 2)',
    'rain_3h_mm':              'DECIMAL(5, 2) DEFAULT 0.00',
    'snow_3h_mm':              'DECIMAL(5, 2) DEFAULT 0.00',
}
weather_cols = list(weather_definitions)
weather_dict = _table_spec('weather', weather_cols, list(weather_definitions.values()))

# --- airports: columns 1 and 2 of airports_df (the same slice is uploaded later) -
airports_primary = 'icao'
airports_cols = list(airports_df.columns[1:3])
airports_dict = _table_spec('airports', airports_cols, [
    f'CHAR(4) NOT NULL, PRIMARY KEY ({airports_primary})',  # icao
    'VARCHAR(255)',                                         # airport name
])

# --- airport_distances: links a city to an airport -------------------------------
# Template with two slots: (referencing column, referenced column in airports).
airports_foreign_key = 'FOREIGN KEY ({}) REFERENCES airports({})'
airport_distances_cols = [cities_primary, *airport_distances_df.columns[1:]]
airport_distances_dict = _table_spec('airport_distances', airport_distances_cols, [
    f'int NOT NULL, {cities_foreign_key}',                                             # city_id
    'CHAR(4) NOT NULL, ' + airports_foreign_key.format(airports_primary, airports_primary),  # icao
    'DECIMAL(5, 2)',                                                                   # distance_in_km
])

# --- flights ----------------------------------------------------------------------
# NOTE(review): flights_df is not created by any cell visible in this notebook;
# it has to exist before this cell is run.
flights_cols = list(flights_df.columns)
flights_dict = _table_spec('flights', flights_cols, [
    'CHAR(4) NOT NULL, '
    + airports_foreign_key.format('arrival_' + airports_primary, airports_primary),  # arrival_icao
    'CHAR(4) NOT NULL',    # departure_icao: cannot be a foreign key because there
                           # might be non-european departure airports
    'DATETIME NOT NULL',   # arrival_time_local
    'CHAR(6)',             # timezone
    'VARCHAR(100)',        # airline_name
    'VARCHAR(100)',        # flight_number
    'DATE NOT NULL',       # data_retrieved_on
])

# cities and airports come first because the other tables reference them.
create_tables_dict = {**cities_dict, **weather_dict, **airports_dict, **airport_distances_dict, **flights_dict}
cnx.create_tables(create_tables_dict)

In [38]:
# Inspect the column specifications the database reports for the weather table
# (use 'DESCRIBE cities;' instead to look at the cities table).
weather_schema = cnx.execute('DESCRIBE weather;')
for column_spec in weather_schema:
    print(column_spec)

('city_id', 'int', 'NO', 'MUL', None, '')
('date', 'datetime', 'NO', '', None, '')
('temp_celcius', 'decimal(4,2)', 'YES', '', None, '')
('temp_feels_like_celcius', 'decimal(4,2)', 'YES', '', None, '')
('humidity_percent', 'int', 'YES', '', None, '')
('weather_description', 'varchar(255)', 'YES', '', None, '')
('visibility', 'int', 'YES', '', '10000', '')
('wind_speed_meter_sec', 'decimal(5,2)', 'YES', '', None, '')
('wind_direction_degree', 'int', 'YES', '', None, '')
('wind_gust_meter_sec', 'decimal(5,2)', 'YES', '', None, '')
('pop_percent', 'decimal(5,2)', 'YES', '', None, '')
('rain_3h_mm', 'decimal(5,2)', 'YES', '', '0.00', '')
('snow_3h_mm', 'decimal(5,2)', 'YES', '', '0.00', '')


## Populate Tables

In [39]:
###   ADD CITY, AIRPORT AND FLIGHT DATA
################################
# (table name, frame) pairs, all uploaded in 'append' mode by a single call.
uploads = [
    ('cities', cities_df),
    ('airports', airports_df.iloc[:, 1:3]),  # same two columns the airports schema was built from
    ('flights', flights_df),
]

cnx.add_tables_to_db(
    dfs=[frame for _, frame in uploads],
    tablenames=[tablename for tablename, _ in uploads],
    insert_modes=['append'] * len(uploads),
)

In [40]:
# check tables
tmp = cnx.execute('SELECT * FROM airports LIMIT 20;')
for i in tmp:
    print(i)

('EBAW', 'Antwerp, Antwerp   (Deurne)')
('EBBR', 'Brussels')
('EBCI', 'Brussels, Brussels South Charleroi')
('EBKT', 'Wevelgem')
('EBLG', 'Liège')
('EBOS', 'Ostend, Ostend-Bruges')
('EDAH', 'Heringsdorf')
('EDDB', 'Berlin, Berlin Brandenburg')
('EDDC', 'Dresden')
('EDDE', 'Erfurt')
('EDDF', 'Frankfurt-am-Main')
('EDDG', 'Münster, Münster Osnabrück')
('EDDH', 'Hamburg')
('EDDK', 'Cologne, Cologne Bonn')
('EDDL', 'Duesseldorf, Düsseldorf')
('EDDM', 'Munich')
('EDDN', 'Nuremberg')
('EDDP', 'Leipzig, Leipzig Halle')
('EDDR', 'Saarbrücken')
('EDDS', 'Stuttgart')


In [41]:
###   ADD WEATHER DATA
################################

# Columns of weather_df that are uploaded. 'snow_3h_mm' is listed although the
# frame shown above does not contain it, so the selection goes through
# intersect_cols (presumably the columns present in both arguments - confirm in
# src.utils).
# NOTE(review): 'visibility' is not listed, so the scraped visibility values are
# not uploaded and the table column falls back to its DEFAULT of 10000 - confirm
# that this is intended.
keep_cols = ['city', 'date', 'temp_celcius', 'temp_feels_like_celcius',
       'humidity_percent', 'weather_description',
       'wind_speed_meter_sec', 'wind_direction_degree', 'wind_gust_meter_sec',
       'pop_percent', 'rain_3h_mm', 'snow_3h_mm']

cnx.add_to_db_with_foreign_key(
            df = weather_df[intersect_cols(weather_df.columns, keep_cols)],
            tablename = 'weather',
            foreigntables = ['cities'], # list of all tables with a foreign key that need to be extracted
            foreigncolumns = [['city_id']], # list of lists: naming the columns to extract for each table
            matchcolumns = [['city']], # list of lists: naming the columns to merge on for each table;
                                 # needs to be the same name in the foreign table and df
            insert_mode = 'append',
            dropcolumns = [['city']]) # 'city' is dropped; the table stores city_id instead

In [42]:
# check weather table data
tmp = cnx.execute('SELECT * FROM weather LIMIT 5;')
for i in tmp:
    print(i)

(1, datetime.datetime(2022, 6, 15, 18, 0), Decimal('25.19'), Decimal('24.70'), 36, 'few clouds', 10000, Decimal('2.90'), 71, Decimal('3.11'), Decimal('0.00'), Decimal('0.00'), Decimal('0.00'))
(1, datetime.datetime(2022, 6, 15, 21, 0), Decimal('20.75'), Decimal('20.16'), 49, 'few clouds', 10000, Decimal('3.78'), 93, Decimal('8.92'), Decimal('0.00'), Decimal('0.00'), Decimal('0.00'))
(1, datetime.datetime(2022, 6, 16, 0, 0), Decimal('15.84'), Decimal('15.15'), 64, 'scattered clouds', 10000, Decimal('2.66'), 86, Decimal('5.99'), Decimal('0.00'), Decimal('0.00'), Decimal('0.00'))
(1, datetime.datetime(2022, 6, 16, 3, 0), Decimal('15.35'), Decimal('14.66'), 66, 'overcast clouds', 10000, Decimal('1.84'), 71, Decimal('2.54'), Decimal('0.00'), Decimal('0.00'), Decimal('0.00'))
(1, datetime.datetime(2022, 6, 16, 6, 0), Decimal('17.61'), Decimal('16.97'), 59, 'overcast clouds', 10000, Decimal('2.72'), 8, Decimal('3.27'), Decimal('0.00'), Decimal('0.00'), Decimal('0.00'))


In [43]:
###   ADD AIRPORT DISTANCES DATA
################################

# How the 'city' name in the frame is resolved to the key of the cities table.
city_key_lookup = dict(
    foreigntables=['cities'],      # tables whose key columns have to be extracted
    foreigncolumns=[['city_id']],  # per table: the columns to extract
    matchcolumns=[['city']],       # per table: the columns to merge on; they must carry
                                   # the same name in the foreign table and in the frame
    dropcolumns=[['city']],        # per table: the columns to drop
)

cnx.add_to_db_with_foreign_key(
    df=airport_distances_df,
    tablename='airport_distances',
    insert_mode='append',
    **city_key_lookup,
)

In [44]:
# Spot check: print the first 5 rows stored in the airport_distances table.
distance_rows = cnx.execute('SELECT * FROM airport_distances LIMIT 5;')
for distance_row in distance_rows:
    print(distance_row)

(1, 'EDDB', Decimal('17.76'))
(1, 'EDDT', Decimal('27.83'))
(2, 'LEMD', Decimal('29.75'))
(3, 'LIRF', Decimal('30.23'))
(3, 'LIRA', Decimal('39.17'))
