# Setup 

### Importing 

In [1]:
# imports functions from the other notebooks
# requires nbimporter
import sys, os
sys.path.append(os.path.join(os.path.dirname(''), '..'))
#sys.path.append(os.path.join(os.path.dirname(''), '../..'))
from proj3_gans_scooters.src.scraping import scrape_wiki_cities, scrape_weather, icao_airport_codes, city_airport_distance
from proj3_gans_scooters.src.utils import PrivateKeysHandler, MyMySQLConnection, load_or_execute_df

### Load Private Keys

In [2]:
# requires a file '.env' at the same level at the main file with :
#      [APIs]
#      openweather_key = <key1>
#      aerodatabox_key = <key2>
# where <key> are the keys without quotes or anything

relative_path_to_file = '.env'
keys = PrivateKeysHandler(relative_path_to_file)
api_key_dict = keys.load_keys('APIs')
sql_cred_dict = keys.load_keys('SQL')

### Create MySql Connection

In [3]:
cnx = MyMySQLConnection(sql_cred_dict)

# Scraping
* cities
* weather
* flight data

In [4]:
relative_city_csv = 'cities.csv'
cities_df = load_or_execute_df(relative_city_csv, scrape_wiki_cities)

In [5]:
cities_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 94 entries, 0 to 93
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   city                94 non-null     object 
 1   member_state        94 non-null     object 
 2   officialpopulation  94 non-null     int64  
 3   date                94 non-null     object 
 4   lat                 94 non-null     float64
 5   lon                 94 non-null     float64
dtypes: float64(2), int64(1), object(3)
memory usage: 5.1+ KB


In [6]:
relative_city_csv = 'data/weather.csv'
func = scrape_weather
args = {'city_lst' : list(cities_df.city),  'openweather_key' : api_key_dict['openweather_key']}
weather_df = load_or_execute_df(relative_city_csv, func, args)

In [7]:
weather_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3760 entries, 0 to 3759
Data columns (total 13 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   city                     3760 non-null   object 
 1   date                     3760 non-null   object 
 2   temp_celcius             3760 non-null   float64
 3   temp_feels_like_celcius  3760 non-null   float64
 4   humidity_percent         3760 non-null   int64  
 5   weather_description      3760 non-null   object 
 6   visibility               3760 non-null   int64  
 7   wind_speed_meter_sec     3760 non-null   float64
 8   wind_direction_degree    3760 non-null   int64  
 9   wind_gust_meter_sec      3760 non-null   float64
 10  pop_percent              3760 non-null   float64
 11  rain_3h_mm               3760 non-null   float64
 12  pod                      3760 non-null   object 
dtypes: float64(6), int64(3), object(4)
memory usage: 411.2+ KB


In [8]:
latitudes = cities_df.lat.tolist()
longitudes = cities_df.lon.tolist()
cities = cities_df.city.tolist()

#for testing purposes
latitudes = [latitudes[0]]
longitudes = [longitudes[0]]
cities = [cities[0]]

relative_city_csv = 'data/airports.csv'
func = icao_airport_codes
args = {'city_val' : cities, 'latitudes' : latitudes, 'longitudes' : longitudes, 'aerodatabox_key' : api_key_dict['aerodatabox_key']}
icao_airports_df = load_or_execute_df(relative_city_csv, func, args)

In [9]:
airports_df = icao_airports_df.copy()
keep_cols = ['city', 'icao', 'name', 'location.lat', 'location.lon']
new_cols = ['city', 'icao', 'name', 'lat', 'lon']
airports_df = airports_df[keep_cols].rename(columns=dict(zip(keep_cols, new_cols)))
airport_distances_df = city_airport_distance(cities_df, airports_df)

In [10]:
airports_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2 entries, 0 to 1
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   city    2 non-null      object 
 1   icao    2 non-null      object 
 2   name    2 non-null      object 
 3   lat     2 non-null      float64
 4   lon     2 non-null      float64
dtypes: float64(2), object(3)
memory usage: 96.0+ bytes


In [15]:
airport_distances_df#.info()

Unnamed: 0,city,icao,distance_in_km
0,Berlin,EDDB,17.757229
1,Berlin,EDDT,27.827624


# Adding to Database

![Database Schema](data/GansScooter.png)

In [110]:
#cnx.execute('drop database gans_scooters')
#del cnx

In [111]:
###   CREATE TABLES
################################
cities_primary = 'city_id'
cities_cols = [cities_primary]
cities_cols.extend(cities_df.columns)
cities_dict = {'cities': (cities_cols, 
                    [f'int NOT NULL AUTO_INCREMENT, PRIMARY KEY ({cities_primary})', #city_id
                    'varchar(255) NOT NULL',       #city 
                    'varchar(255)',                #member_state
                    'int',                         #officialpopulation
                    'DATETIME',                    #date
                    'DECIMAL(6, 4)',               #lat
                    'DECIMAL(6, 4)']) }            #lon

cities_foreign_key = f'FOREIGN KEY ({cities_primary}) REFERENCES cities({cities_primary})'
weather_cols = [cities_primary]
weather_cols.extend(weather_df.columns[1:-1])
weather_cols.append('snow_3h_mm')
weather_dict = {'weather': (weather_cols, 
                    [f'int NOT NULL, {cities_foreign_key}',#city_id
                    'DATETIME NOT NULL',        #date
                    'DECIMAL(4, 2)',             #temp_celcius
                    'DECIMAL(4, 2)',             #temp_feels_like_celcius                                 
                    'INT(3)',                    #humidity_percent                                     
                    'varchar(255)',              #weather_description                                       
                    'INT(5) DEFAULT 10000',      #visibility     
                    'DECIMAL(5, 2)',             #wind_speed_meter_sec                                     
                    'INT(3)',                    #wind_direction_degree       
                    'DECIMAL(5, 2)',             #wind_gust_meter_sec     
                    'DECIMAL(5, 2)',             #pop_percent       
                    'DECIMAL(5, 2) DEFAULT 0.00',#rain_3h_mm        
                    'DECIMAL(5, 2) DEFAULT 0.00'])} #snow_3h_mm   

airports_primary = 'icao'
airports_cols = list(airports_df.columns[1:3])
airports_dict = {'airport_distances': (airports_cols,
                   [f'CHAR(4) NOT NULL, PRIMARY KEY ({airports_primary})', #icao
                    'VARCHAR(255)'])}

airports_foreign_key = 'FOREIGN KEY ({}) REFERENCES airports({})'
airport_distances_cols = [cities_primary]
airport_distances_cols.extend(airport_distances_df.columns[1:])
airport_distances_dict = {'airports': (airport_distances_cols,
                     [f'int NOT NULL, {cities_foreign_key}',            #city_id
                      f'CHAR(4) NOT NULL, {airports_foreign_key}'
                          .format(airports_primary,airports_primary),   #icao
                      'DECIMAL(5, 2)'])}                                #distance_in_km

flights_cols = list(airports_df.columns[1:3])
flights_dict = {'airport_distances': (airports_cols,
                   [f'CHAR(4) NOT NULL, {airports_foreign_key}'
                          .format('arrival_' + airports_primary,airports_primary), #arrival_icao
                    f'CHAR(4) NOT NULL, {airports_foreign_key}'
                          .format('departure' + airports_primary,airports_primary),  #departure_icao
                    'DATETIME NOT NULL',                                         #scheduled arrival time
                    'DATETIME',                                                  #actual arrival time
                    'VARCHAR(255)'])}                                            # airline_name

create_tables_dict = {**cities_dict, **weather_dict, **airports_dict, **airport_distances_dict, **flights_dict}
cnx.create_tables(create_tables_dict)

In [118]:
# check table specifications
#tmp = cnx.execute('DESCRIBE cities;')
tmp = cnx.execute('DESCRIBE weather;')
for i in tmp:
    print(i)

('city_id', 'int', 'NO', 'MUL', None, '')
('date', 'datetime', 'NO', '', None, '')
('temp_celcius', 'decimal(4,2)', 'YES', '', None, '')
('temp_feels_like_celcius', 'decimal(4,2)', 'YES', '', None, '')
('humidity_percent', 'int', 'YES', '', None, '')
('weather_description', 'varchar(255)', 'YES', '', None, '')
('visibility', 'int', 'YES', '', None, '')
('wind_speed_meter_sec', 'decimal(5,2)', 'YES', '', None, '')
('wind_direction_degree', 'int', 'YES', '', None, '')
('wind_gust_meter_sec', 'decimal(5,2)', 'YES', '', None, '')
('pop_percent', 'decimal(5,2)', 'YES', '', None, '')
('rain_3h_mm', 'decimal(5,2)', 'YES', '', '0.00', '')
('snow_3h_mm', 'decimal(5,2)', 'YES', '', '0.00', '')


## Populate Tables

In [112]:
###   ADD CITY DATA
################################
###   ADD AIRPORT DATA
################################
###   ADD FLIGHT DATA
################################

cnx.add_tables_to_db(
            dfs = [cities_df, airports_df],      #, flights_df],
            tablenames = ['cities', 'airports'], #, 'flights'],
            insert_modes = ['append', 'append']  #, 'append']
)

None


In [None]:
# check tables
tmp = cnx.execute('SELECT * FROM cities LIMIT 20;')
for i in tmp:
    print(i)

In [113]:
###   ADD WEATHER DATA
################################

tmp_weather_df = weather_df[['city', 'date', 'temp_celcius', 'temp_feels_like_celcius',
       'humidity_percent', 'weather_description',
       'wind_speed_meter_sec', 'wind_direction_degree', 'wind_gust_meter_sec',
       'pop_percent', 'rain_3h_mm']]

cnx.add_to_db_with_foreign_key(
            tmp_weather_df,
            'weather',
            ['cities'], # list of all tables with a foreign key that need to be extracted
            [['city_id']], # list of lists: naming the columns to extract for each table
            [['city']], # list of lists: naming the columns to merge on for each table; 
                                 # needs to be the same name in the foreign table and df 
            'append')

weather ['cities'] [['city_id']] [['city']] [[False]]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[foreigncolumns[i][j]] = df.merge(tmp_df, on=matchcolumns[i][j])[foreigncolumns[i][j]]


In [117]:
# check weather table data
tmp = cnx.execute('SELECT * FROM weather LIMIT 20;')
for i in tmp:
    print(i)

(1, datetime.datetime(2022, 6, 10, 12, 0), Decimal('21.88'), Decimal('21.30'), 45, 'few clouds', None, Decimal('3.83'), 290, Decimal('5.34'), Decimal('0.00'), Decimal('0.00'), Decimal('0.00'))
(1, datetime.datetime(2022, 6, 10, 15, 0), Decimal('23.64'), Decimal('23.03'), 37, 'clear sky', None, Decimal('3.64'), 290, Decimal('4.69'), Decimal('0.00'), Decimal('0.00'), Decimal('0.00'))
(1, datetime.datetime(2022, 6, 10, 18, 0), Decimal('23.00'), Decimal('22.35'), 38, 'few clouds', None, Decimal('3.12'), 280, Decimal('4.44'), Decimal('0.00'), Decimal('0.00'), Decimal('0.00'))
(1, datetime.datetime(2022, 6, 10, 21, 0), Decimal('18.78'), Decimal('18.28'), 60, 'broken clouds', None, Decimal('2.43'), 309, Decimal('5.13'), Decimal('0.00'), Decimal('0.00'), Decimal('0.00'))
(1, datetime.datetime(2022, 6, 11, 0, 0), Decimal('17.98'), Decimal('17.56'), 66, 'broken clouds', None, Decimal('1.56'), 281, Decimal('2.55'), Decimal('0.00'), Decimal('0.00'), Decimal('0.00'))
(1, datetime.datetime(2022, 6, 

In [7]:
###   ADD AIRPORT DISTANCES DATA
################################

cnx.add_to_db_with_foreign_key(
            airport_distances_df,
            'airport_distances',
            ['cities'], # list of all tables with a foreign key that need to be extracted
            [['city_id']], # list of lists: naming the columns to extract for each table
            [['city']], # list of lists: naming the columns to merge on for each table; 
                                 # needs to be the same name in the foreign table and df 
            'append')