# Processing world flights data

### Load Python tools

In [1]:
import pandas as pd
import geopandas as gpd
import matplotlib
import matplotlib.pyplot as plt
import geojson
import json
import glob
import io
import os
import pyarrow
from shapely.geometry import Point, LineString

### Download data from Flightradar24

In [2]:
# !wget --user general_media --password OpE0SimNCt -r -np -nH --cut-dirs=3 -R index.html '{u}'

---

## Process 'flights' metadata about each set of points

### Set path for flights and define the files we'll concatenate

In [3]:
a_flight = pd.read_csv('coronavirus/20200317_flights.csv')

In [4]:
a_flight.head()

Unnamed: 0,flight_id,aircraft_id,reg,equip,callsign,flight,schd_from,schd_to,real_to,reserved
0,607326734,4845756,,,OSTRAHA,,OSR,,,
1,607319067,10909827,N516JB,A320,MX516,,LGB,,,
2,607329341,3949337,V83,GRND,V83,,,,,
3,607289287,10591780,N203LB,LOON,HBAL129,,,,,
4,607328140,4688509,,GRND,BR085,,,,,


In [5]:
path = 'coronavirus/'
files = glob.glob(os.path.join(path, "*flights.csv"))

### Read the csv and create a 'date' field

In [6]:
file_df = (pd.read_csv(f, encoding = "ISO-8859-1", low_memory=False)\
           .assign(date=os.path.basename(f)) for f in files)

### Combined our newly processed flight files

In [7]:
flights_df = pd.concat(file_df, ignore_index=True)

In [8]:
flights_df.head()

Unnamed: 0,flight_id,aircraft_id,reg,equip,callsign,flight,schd_from,schd_to,real_to,reserved,date
0,608291364,4976852,,GRND,FLWME3,,AYT,,,,20200324_flights.csv
1,608298331,5313483,,,SAATJA4,,TLL,,,,20200324_flights.csv
2,608288050,4546612,,GRND,RANGER2,,SOF,,,,20200324_flights.csv
3,608300515,4735585,LJ-Light,GRND,LI,,,,,,20200324_flights.csv
4,608298634,11402224,,,BIRD,,DMK,,,,20200324_flights.csv


### Clean up our dates for use later

In [9]:
flights_df['date'] = flights_df['date']\
    .str.replace('_flights.csv','')

In [10]:
flights_df['date'] = pd.to_datetime(flights_df.date, format='%Y%m%d')
flights_df['month'] = flights_df['date'].dt.month 
flights_df['day'] = flights_df['date'].dt.day 
flights_df['weekday'] = flights_df['date'].dt.weekday_name

### Create a new dataframe with flights and export to CSV

In [11]:
flights = pd.DataFrame(flights_df)

In [12]:
flights.head()

Unnamed: 0,flight_id,aircraft_id,reg,equip,callsign,flight,schd_from,schd_to,real_to,reserved,date,month,day,weekday
0,608291364,4976852,,GRND,FLWME3,,AYT,,,,2020-03-24,3,24,Tuesday
1,608298331,5313483,,,SAATJA4,,TLL,,,,2020-03-24,3,24,Tuesday
2,608288050,4546612,,GRND,RANGER2,,SOF,,,,2020-03-24,3,24,Tuesday
3,608300515,4735585,LJ-Light,GRND,LI,,,,,,2020-03-24,3,24,Tuesday
4,608298634,11402224,,,BIRD,,DMK,,,,2020-03-24,3,24,Tuesday


### Just flights from LAX, SFO, MXP, VCE

In [13]:
from_lax = pd.DataFrame(flights[(flights['schd_from'] == 'LAX')|\
                  (flights['schd_from'] == 'SFO')|\
                  (flights['schd_from'] == 'MXP')|\
                  (flights['schd_from'] == 'VCE')])

In [14]:
from_lax.schd_from.value_counts()

LAX    16811
SFO    10232
MXP     2745
VCE      907
Name: schd_from, dtype: int64

In [15]:
from_lax['flight_id'] = from_lax['flight_id'].astype(str)

In [16]:
from_lax.to_csv('output/from_lax.csv')

In [17]:
from_lax_slim = from_lax[['flight_id', 'reg', 'equip', 'flight', 'schd_from', 'real_to']]

---

## Process 'positions' data showing each point along a flight

### Set path for latest positions directory and define the files we'll concatenate

In [50]:
a_position = pd.read_csv('/Users/mhustiles/data/data/flights/\
coronavirus/20200324_positions/20200324_608444394.csv')
a_position.head()

Unnamed: 0,snapshot_id,altitude,heading,latitude,longitude,radar_id,speed,squawk
0,1585073474,0,246,29.98221,-95.35144,5798,0,0
1,1585073483,0,281,29.98213,-95.35208,5798,12,0
2,1585073490,0,337,29.98241,-95.3523,5798,16,0
3,1585073498,0,332,29.98303,-95.35276,5798,14,0
4,1585073515,0,339,29.9839,-95.35322,5798,24,0


In [51]:
path = '/Users/mhustiles/data/data/flights/coronavirus/20200324_positions'
files = glob.glob(os.path.join(path, "*.csv"))

### Read the csv and create a 'flightid' field so we can track unique flights

In [52]:
file_df = (pd.read_csv(f, encoding = "ISO-8859-1", low_memory=False)\
           .assign(flightid=os.path.basename(f)) for f in files)

### Concateate the frames

In [None]:
positions_df = pd.concat(file_df, ignore_index=True)
positions_df.head()

In [None]:
len(positions_df)

### Combined our newly processed flight positions

In [None]:
positions_df['flightid'] = positions_df['flightid']\
    .str.replace('.csv','')

### Split the flightid field so we have a date string to convert later and also a flightid

In [None]:
positions_df[['datestr','flight_id']] = positions_df.flightid.str.split("_",expand=True,)

In [None]:
positions_df.head()

### Merge and filter positions data to flights departing just our four airports

In [26]:
all_positions_lax_sfo_mxp_vce = from_lax_slim.merge(positions_df, on='flight_id', how='inner')

In [27]:
len(all_positions_lax_sfo_mxp_vce)

309300

In [28]:
all_positions_lax_sfo_mxp_vce.to_csv('/Users/mhustiles/data/data/flights/coronavirus/all_positions_lax_sfo_mxp_vce.csv')

In [29]:
positions_df = pd.DataFrame(all_positions_lax_sfo_mxp_vce)

In [30]:
all_positions_lax_sfo_mxp_vce.iloc[0]

flight_id               608290635
reg_x                       9VSGC
equip_x                      A359
flight_x                     SQ37
schd_from_x                   LAX
real_to_x                     SIN
Unnamed: 0                    NaN
altitude                    40750
date                          NaN
equip_y                       NaN
flight_y                      NaN
flightid       20200324_608290635
heading                       232
latitude                  1.51936
longitude                 105.992
month                         NaN
radar_id                     1080
real_to_y                     NaN
reg_y                         NaN
schd_from_y                   NaN
snapshot_id           1.58501e+09
speed                         502
squawk                       7254
time                          NaN
weekday                       NaN
datestr                  20200324
Name: 0, dtype: object

### Process the 'datestr' field into something we can use

In [31]:
positions_df['date'] = pd.to_datetime(positions_df.datestr, format='%Y%m%d')
positions_df['month'] = positions_df['date'].dt.month 
positions_df['day'] = positions_df['date'].dt.day 
positions_df['weekday'] = positions_df['date'].dt.weekday_name

### Convert the unix timestampt to human datetime and localize

In [32]:
positions_df['date_time'] = pd.to_datetime(positions_df['snapshot_id'],unit='s')
positions_df['utc_datetime'] = \
    pd.to_datetime(positions_df['date_time'], format='%Y-%m-%dT%H:%M:%SZ').dt.tz_localize('UTC')

In [33]:
positions_df['datetime_pst'] = positions_df['utc_datetime'].dt.tz_convert('America/Los_Angeles')

In [34]:
positions_df['date'] = pd.to_datetime(positions_df['datetime_pst']).dt.strftime('%m/%d/%Y')
positions_df['time'] = pd.to_datetime(positions_df['datetime_pst']).dt.strftime('%H:%M:%S')
positions_df['display_time'] = pd.to_datetime(positions_df['datetime_pst']).dt.strftime('%I:%M %p')

In [35]:
positions_df = \
    positions_df.drop(['snapshot_id', 'radar_id', 'day',\
                          'datestr','utc_datetime','date_time', 'datetime_pst', 'display_time'], axis=1)

In [36]:
positions = pd.DataFrame(positions_df)

In [37]:
positions.sort_values(by='date', ascending=True).head()

Unnamed: 0.1,flight_id,reg_x,equip_x,flight_x,schd_from_x,real_to_x,Unnamed: 0,altitude,date,equip_y,...,latitude,longitude,month,real_to_y,reg_y,schd_from_y,speed,squawk,time,weekday
0,608290635,9VSGC,A359,SQ37,LAX,SIN,,40750,03/23/2020,,...,1.51936,105.99191,3,,,,502,7254,17:00:19,Tuesday
78196,608380306,N864AS,CRJ2,UA5827,LAX,ACV,,16000,03/23/2020,,...,33.93809,-118.97414,3,,,,307,1001,20:09:30,Tuesday
78195,608380306,N864AS,CRJ2,UA5827,LAX,ACV,,15675,03/23/2020,,...,33.93267,-118.96591,3,,,,310,1001,20:09:24,Tuesday
78194,608380306,N864AS,CRJ2,UA5827,LAX,ACV,,15400,03/23/2020,,...,33.9283,-118.95721,3,,,,310,1001,20:09:18,Tuesday
78193,608380306,N864AS,CRJ2,UA5827,LAX,ACV,,15125,03/23/2020,,...,33.92485,-118.94795,3,,,,310,1001,20:09:12,Tuesday


In [38]:
positions.to_csv('/Users/mhustiles/data/data/flights/coronavirus/20200324_positions.csv')

---

## Geography

### Convert to positions to a GeoDataFrame using lon/lat for each point in the flight

In [39]:
positions.loc[112000]

flight_id               608388349
reg_x                       GXLEI
equip_x                      A388
flight_x                    BA268
schd_from_x                   LAX
real_to_x                     LHR
Unnamed: 0                    NaN
altitude                    37000
date                   03/24/2020
equip_y                       NaN
flight_y                      NaN
flightid       20200324_608388349
heading                        68
latitude                  54.7336
longitude                -80.1865
month                           3
real_to_y                     NaN
reg_y                         NaN
schd_from_y                   NaN
speed                         510
squawk                       1003
time                     02:36:45
weekday                   Tuesday
Name: 112000, dtype: object

In [40]:
positions_geo = gpd.GeoDataFrame(positions, \
                geometry=gpd.points_from_xy(positions['longitude'], positions['latitude']))

In [41]:
positions_geo.crs = "epsg:4326"

In [42]:
positions_geo.to_file('/Users/mhustiles/data/data/flights/coronavirus/20200324_positions_geo.geojson', driver='GeoJSON')

---

### Convert point data into linestrings