# Setup 

### Importing 

In [1]:
# imports functions from the other notebooks
# requires nbimporter
import sys, os
sys.path.append(os.path.join(os.path.dirname(''), '..'))
#sys.path.append(os.path.join(os.path.dirname(''), '../..'))
from proj3_gans_scooters.src.scraping import scrape_wiki_cities, scrape_weather, icao_airport_codes
from proj3_gans_scooters.src.utils import PrivateKeysHandler, MyMySQLConnection, load_or_execute_df

### Load Private Keys

In [2]:
# requires a file '.env' at the same level at the main file with :
#      [APIs]
#      openweather_key = <key1>
#      aerodatabox_key = <key2>
# where <key> are the keys without quotes or anything

relative_path_to_file = '.env'
keys = PrivateKeysHandler(relative_path_to_file)
api_key_dict = keys.load_keys('APIs')
sql_cred_dict = keys.load_keys('SQL')

### Create MySql Connection

In [3]:
cnx = MyMySQLConnection(sql_cred_dict)

# Scraping
* cities
* weather
* flight data

In [4]:
relative_city_csv = 'cities.csv'
cities_df = load_or_execute_df(relative_city_csv, scrape_wiki_cities)

In [5]:
cities_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 94 entries, 0 to 93
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   city                94 non-null     object 
 1   member_state        94 non-null     object 
 2   officialpopulation  94 non-null     int64  
 3   date                94 non-null     object 
 4   lat                 94 non-null     float64
 5   lon                 94 non-null     float64
dtypes: float64(2), int64(1), object(3)
memory usage: 5.1+ KB


In [6]:
relative_city_csv = 'data/weather.csv'
func = scrape_weather
args = {'city_lst' : list(cities_df.city),  'openweather_key' : api_key_dict['openweather_key']}
weather_df = load_or_execute_df(relative_city_csv, func, args)

In [7]:
weather_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3760 entries, 0 to 3759
Data columns (total 13 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   city                     3760 non-null   object 
 1   date                     3760 non-null   object 
 2   temp_celcius             3760 non-null   float64
 3   temp_feels_like_celcius  3760 non-null   float64
 4   humidity_percent         3760 non-null   int64  
 5   weather_description      3760 non-null   object 
 6   visibility               3760 non-null   int64  
 7   wind_speed_meter_sec     3760 non-null   float64
 8   wind_direction_degree    3760 non-null   int64  
 9   wind_gust_meter_sec      3760 non-null   float64
 10  pop_percent              3760 non-null   float64
 11  rain_3h_mm               3760 non-null   float64
 12  pod                      3760 non-null   object 
dtypes: float64(6), int64(3), object(4)
memory usage: 411.2+ KB


In [8]:
latitudes = cities_df.lat.tolist()
longitudes = cities_df.lon.tolist()
latitudes = [latitudes[0]]
longitudes = [longitudes[0]]

relative_city_csv = 'data/airports.csv'
func = icao_airport_codes
args = {'latitudes' : latitudes, 'longitudes' : longitudes, 'aerodatabox_key' : api_key_dict['aerodatabox_key']}
icao_airports_df = load_or_execute_df(relative_city_csv, func, args)

In [11]:
icao_airports_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2 entries, 0 to 1
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   icao              2 non-null      object 
 1   iata              2 non-null      object 
 2   name              2 non-null      object 
 3   shortName         2 non-null      object 
 4   municipalityName  2 non-null      object 
 5   countryCode       2 non-null      object 
 6   location.lat      2 non-null      float64
 7   location.lon      2 non-null      float64
dtypes: float64(2), object(6)
memory usage: 256.0+ bytes


# Adding to Database

In [110]:
#cnx.execute('drop database gans_scooters')
#del cnx

In [111]:
###   CREATE TABLES
################################
cities_primary = 'city_id'
cities_cols = [cities_primary]
cities_cols.extend(cities_df.columns)

weather_cols = [cities_primary]
weather_cols.extend(weather_df.columns[1:-1])
weather_cols.append('snow_3h_mm')
weather_foreign_key = f'FOREIGN KEY ({cities_primary}) REFERENCES cities({cities_primary})'

d = {'cities': (cities_cols, [f'int NOT NULL AUTO_INCREMENT, PRIMARY KEY ({cities_primary})', #city_id
                              'varchar(255) NOT NULL',       #city 
                              'varchar(255)',                #member_state
                              'int',                         #officialpopulation
                              'DATETIME',                    #date
                              'DECIMAL(6, 4)',               #lat
                              'DECIMAL(6, 4)'])              #lon
     'weather': (weather_cols, [f'int NOT NULL, {weather_foreign_key}',#city_id
                                'DATETIME NOT NULL',        #date
                                'DECIMAL(4, 2)',             #temp_celcius
                                'DECIMAL(4, 2)',             #temp_feels_like_celcius                                 
                                'INT(3)',                    #humidity_percent                                     
                                'varchar(255)',              #weather_description                                       
                                'INT(5) DEFAULT 10000',      #visibility     
                                'DECIMAL(5, 2)',             #wind_speed_meter_sec                                     
                                'INT(3)',                    #wind_direction_degree       
                                'DECIMAL(5, 2)',             #wind_gust_meter_sec     
                                'DECIMAL(5, 2)',             #pop_percent       
                                'DECIMAL(5, 2) DEFAULT 0.00',#rain_3h_mm        
                                'DECIMAL(5, 2) DEFAULT 0.00']), #snow_3h_mm   
    }

cnx.create_tables(d)

In [118]:
# check table specifications
#tmp = cnx.execute('DESCRIBE cities;')
tmp = cnx.execute('DESCRIBE weather;')
for i in tmp:
    print(i)

('city_id', 'int', 'NO', 'MUL', None, '')
('date', 'datetime', 'NO', '', None, '')
('temp_celcius', 'decimal(4,2)', 'YES', '', None, '')
('temp_feels_like_celcius', 'decimal(4,2)', 'YES', '', None, '')
('humidity_percent', 'int', 'YES', '', None, '')
('weather_description', 'varchar(255)', 'YES', '', None, '')
('visibility', 'int', 'YES', '', None, '')
('wind_speed_meter_sec', 'decimal(5,2)', 'YES', '', None, '')
('wind_direction_degree', 'int', 'YES', '', None, '')
('wind_gust_meter_sec', 'decimal(5,2)', 'YES', '', None, '')
('pop_percent', 'decimal(5,2)', 'YES', '', None, '')
('rain_3h_mm', 'decimal(5,2)', 'YES', '', '0.00', '')
('snow_3h_mm', 'decimal(5,2)', 'YES', '', '0.00', '')


In [112]:
###   ADD CITY DATA
################################

res = cnx.add_table_to_db(
            df = cities_df,
            tablename = 'cities',
            insert_mode = 'append'
)
print(res)

None


In [113]:
###   ADD WEATHER DATA
################################

tmp_weather_df = weather_df[['city', 'date', 'temp_celcius', 'temp_feels_like_celcius',
       'humidity_percent', 'weather_description',
       'wind_speed_meter_sec', 'wind_direction_degree', 'wind_gust_meter_sec',
       'pop_percent', 'rain_3h_mm']]

cnx.add_to_db_with_foreign_key(
            tmp_weather_df,
            'weather',
            ['cities'], # list of all tables with a foreign key that need to be extracted
            [['city_id']], # list of lists: naming the columns to extract for each table
            [['city']], # list of lists: naming the columns to merge on for each table; 
                                 # needs to be the same name in the foreign table and df 
            'append')

weather ['cities'] [['city_id']] [['city']] [[False]]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[foreigncolumns[i][j]] = df.merge(tmp_df, on=matchcolumns[i][j])[foreigncolumns[i][j]]


In [117]:
# check weather table data
tmp = cnx.execute('SELECT * FROM weather LIMIT 20;')
for i in tmp:
    print(i)

(1, datetime.datetime(2022, 6, 10, 12, 0), Decimal('21.88'), Decimal('21.30'), 45, 'few clouds', None, Decimal('3.83'), 290, Decimal('5.34'), Decimal('0.00'), Decimal('0.00'), Decimal('0.00'))
(1, datetime.datetime(2022, 6, 10, 15, 0), Decimal('23.64'), Decimal('23.03'), 37, 'clear sky', None, Decimal('3.64'), 290, Decimal('4.69'), Decimal('0.00'), Decimal('0.00'), Decimal('0.00'))
(1, datetime.datetime(2022, 6, 10, 18, 0), Decimal('23.00'), Decimal('22.35'), 38, 'few clouds', None, Decimal('3.12'), 280, Decimal('4.44'), Decimal('0.00'), Decimal('0.00'), Decimal('0.00'))
(1, datetime.datetime(2022, 6, 10, 21, 0), Decimal('18.78'), Decimal('18.28'), 60, 'broken clouds', None, Decimal('2.43'), 309, Decimal('5.13'), Decimal('0.00'), Decimal('0.00'), Decimal('0.00'))
(1, datetime.datetime(2022, 6, 11, 0, 0), Decimal('17.98'), Decimal('17.56'), 66, 'broken clouds', None, Decimal('1.56'), 281, Decimal('2.55'), Decimal('0.00'), Decimal('0.00'), Decimal('0.00'))
(1, datetime.datetime(2022, 6, 

In [7]:
from bs4 import BeautifulSoup
import requests

#get html code
doc_url = 'https://en.wikipedia.org/wiki/List_of_cities_in_the_European_Union_by_population_within_city_limits'
response = requests.get(doc_url)
if response.status_code != 200:
    raise Exception(f'wikipedia returned code {response.status_code} for url = {doc_url}')
soup = BeautifulSoup(response.content, 'html.parser')
table = soup.select('table.wikitable > tbody > tr')

# prettify the names and take only selected ones
header = [h.text.strip().replace(' ', '_').lower() for h in table[0].select('th')][1:-2]
cities = [[cell.text.strip() for cell in city.select('td')[1:-2]] for city in table[1:]]
city_url = 'https://en.wikipedia.org'
links = [city_url + city.select('td:nth-child(2) a:first-of-type')[0]['href'] for city in table[1:]]

import pandas as pd
df = pd.DataFrame(data=cities, columns=header)
latitude = []
longitude = []
for city_url in links:
    city_response = requests.get(city_url)
    soup = BeautifulSoup(city_response.content, "html.parser")
    latitude.append(soup.select(".latitude")[0].get_text())
    longitude.append(soup.select(".longitude")[0].get_text())
df = df.assign(lat = latitude, lon = longitude)


In [6]:
df.iloc[80:, :]


NameError: name 'df' is not defined

In [12]:


df.loc[:, 'officialpopulation'] = df['officialpopulation'].str.replace(',', '').astype(int)
df.loc[:, 'date'] = pd.to_datetime(df['date'])

# since all latitudes gathered are on the north half and not too close to the equator,
# we can just remove all non-digits and put a floating point after the second digit.
# If we would look at cities all over the world, we should proceed as we do for the longitude
df.loc[:,'lat'] = df.lat.str.replace('\D','', regex=True)
df.loc[:,'lat'] = df.lat.str[:2] + '.' + df.lat.str[2:]

df.loc[:,'lon'] = df.lon.str.replace('°','.')
mask = df.lon.str[-1] == 'W','lon'
df.loc[mask] = '-' + df.loc[mask]
df.loc[:,'lon'] = df.lon.replace('[EW]|″|′', '', regex=True)
df.loc[:,'lat'] = pd.to_numeric(df['lat'])
df.loc[:,'lon'] = pd.to_numeric(df['lon'])

  new_ix = Index(new_ix)


In [13]:
df

Unnamed: 0,city,member_state,officialpopulation,date,lat,lon
0,Berlin,Germany,3664088.0,2020-12-31 00:00:00,52°31′12″N,13°24′18″E
1,Madrid,Spain,3305408.0,2021-01-01 00:00:00,40°25′00″N,--03°42′09″W
2,Rome,Italy,2770226.0,2021-01-01 00:00:00,41°53′36″N,12°28′58″E
3,Bucharest,Romania,2161347.0,2021-07-01 00:00:00,44°25′57″N,26°6′14″E
4,Paris,France,2139907.0,2022-01-01 00:00:00,48°51′24″N,2°21′08″E
...,...,...,...,...,...,...
91,Galați,Romania,304957.0,2021-07-01 00:00:00,45°25′24″N,28°2′33″E
92,Catania,Italy,300356.0,2021-01-01 00:00:00,37°30′0″N,15°5′25″E
93,Vila Nova de Gaia,Portugal,300018.0,2020-12-31 00:00:00,41°08′N,--8°37′W
lat,,,,,,
