# Processing world flights data

### Load Python tools

In [34]:
import pandas as pd
import geopandas as gpd
import matplotlib
import matplotlib.pyplot as plt
import geojson
import json
import glob
import io
import os
import pyarrow
from shapely.geometry import Point, LineString

### Download data from Flightradar24

In [35]:
# !wget --user general_media --password OpE0SimNCt -r -np -nH --cut-dirs=3 -R index.html '{u}'

---

---

## Process 'flights' metadata about each set of points

### Set path for flights and define the files we'll concatenate

In [36]:
a_flight = pd.read_csv('coronavirus/20200317_flights.csv')

In [37]:
a_flight.head()

Unnamed: 0,flight_id,aircraft_id,reg,equip,callsign,flight,schd_from,schd_to,real_to,reserved
0,607326734,4845756,,,OSTRAHA,,OSR,,,
1,607319067,10909827,N516JB,A320,MX516,,LGB,,,
2,607329341,3949337,V83,GRND,V83,,,,,
3,607289287,10591780,N203LB,LOON,HBAL129,,,,,
4,607328140,4688509,,GRND,BR085,,,,,


In [38]:
path = 'coronavirus/'
files = glob.glob(os.path.join(path, "*flights.csv"))

### Read the csv and create a 'date' field

In [39]:
file_df = (pd.read_csv(f, encoding = "ISO-8859-1", low_memory=False)\
           .assign(date=os.path.basename(f)) for f in files)

### Combined our newly processed flight files

In [40]:
flights_df = pd.concat(file_df, ignore_index=True)

In [41]:
flights_df.head()

Unnamed: 0,flight_id,aircraft_id,reg,equip,callsign,flight,schd_from,schd_to,real_to,reserved,date
0,608291364,4976852,,GRND,FLWME3,,AYT,,,,20200324_flights.csv
1,608298331,5313483,,,SAATJA4,,TLL,,,,20200324_flights.csv
2,608288050,4546612,,GRND,RANGER2,,SOF,,,,20200324_flights.csv
3,608300515,4735585,LJ-Light,GRND,LI,,,,,,20200324_flights.csv
4,608298634,11402224,,,BIRD,,DMK,,,,20200324_flights.csv


### Clean up our dates for use later

In [42]:
flights_df['date'] = flights_df['date']\
    .str.replace('_flights.csv','')

In [43]:
flights_df['date'] = pd.to_datetime(flights_df.date, format='%Y%m%d')
flights_df['month'] = flights_df['date'].dt.month 
flights_df['day'] = flights_df['date'].dt.day 
flights_df['weekday'] = flights_df['date'].dt.weekday_name

### Create a new dataframe with flights and export to CSV

In [44]:
flights = pd.DataFrame(flights_df)

In [45]:
flights.head()

Unnamed: 0,flight_id,aircraft_id,reg,equip,callsign,flight,schd_from,schd_to,real_to,reserved,date,month,day,weekday
0,608291364,4976852,,GRND,FLWME3,,AYT,,,,2020-03-24,3,24,Tuesday
1,608298331,5313483,,,SAATJA4,,TLL,,,,2020-03-24,3,24,Tuesday
2,608288050,4546612,,GRND,RANGER2,,SOF,,,,2020-03-24,3,24,Tuesday
3,608300515,4735585,LJ-Light,GRND,LI,,,,,,2020-03-24,3,24,Tuesday
4,608298634,11402224,,,BIRD,,DMK,,,,2020-03-24,3,24,Tuesday


### Just flights from LAX, SFO, MXP, VCE

In [85]:
from_lax = pd.DataFrame(flights[(flights['schd_from'] == 'LAX')|\
                  (flights['schd_from'] == 'SFO')|\
                  (flights['schd_from'] == 'MXP')|\
                  (flights['schd_from'] == 'VCE')])

In [86]:
from_lax.schd_from.value_counts()

LAX    16811
SFO    10232
MXP     2745
VCE      907
Name: schd_from, dtype: int64

In [87]:
from_lax['flight_id'] = from_lax['flight_id'].astype(str)

In [None]:
from_lax.to_csv('output/from_lax.csv')

In [106]:
from_lax_slim = from_lax[['flight_id', 'reg', 'equip', 'flight', 'schd_from', 'real_to']]

---

## Process 'positions' data showing each point along a flight

### Set path for latest positions directory and define the files we'll concatenate

In [113]:
a_position = pd.read_csv('/Users/mhustiles/data/data/flights/\
coronavirus/20200324_positions/20200324_608444394.csv')
a_position.head()

Unnamed: 0,snapshot_id,altitude,heading,latitude,longitude,radar_id,speed,squawk
0,1585073474,0,246,29.98221,-95.35144,5798,0,0
1,1585073483,0,281,29.98213,-95.35208,5798,12,0
2,1585073490,0,337,29.98241,-95.3523,5798,16,0
3,1585073498,0,332,29.98303,-95.35276,5798,14,0
4,1585073515,0,339,29.9839,-95.35322,5798,24,0


In [114]:
path = '/Users/mhustiles/data/data/flights/coronavirus/20200324_positions'
files = glob.glob(os.path.join(path, "*.csv"))

### Read the csv and create a 'flightid' field so we can track unique flights

In [115]:
file_df = (pd.read_csv(f, encoding = "ISO-8859-1", low_memory=False)\
           .assign(flightid=os.path.basename(f)) for f in files)

### Concateate the frames

In [116]:
positions_df = pd.concat(file_df, ignore_index=True)
positions_df.head()

Unnamed: 0,snapshot_id,altitude,heading,latitude,longitude,radar_id,speed,squawk,flightid
0,1585073474,0,246,29.98221,-95.35144,5798,0,0,20200324_608444394.csv
1,1585073483,0,281,29.98213,-95.35208,5798,12,0,20200324_608444394.csv
2,1585073490,0,337,29.98241,-95.3523,5798,16,0,20200324_608444394.csv
3,1585073498,0,332,29.98303,-95.35276,5798,14,0,20200324_608444394.csv
4,1585073515,0,339,29.9839,-95.35322,5798,24,0,20200324_608444394.csv


In [117]:
len(positions_df)

17889922

### Combined our newly processed flight positions

In [118]:
positions_df['flightid'] = positions_df['flightid']\
    .str.replace('.csv','')

### Split the flightid field so we have a date string to convert later and also a flightid

In [119]:
positions_df[['datestr','flight_id']] = positions_df.flightid.str.split("_",expand=True,)

In [120]:
positions_df.head()

Unnamed: 0,snapshot_id,altitude,heading,latitude,longitude,radar_id,speed,squawk,flightid,datestr,flight_id
0,1585073474,0,246,29.98221,-95.35144,5798,0,0,20200324_608444394,20200324,608444394
1,1585073483,0,281,29.98213,-95.35208,5798,12,0,20200324_608444394,20200324,608444394
2,1585073490,0,337,29.98241,-95.3523,5798,16,0,20200324_608444394,20200324,608444394
3,1585073498,0,332,29.98303,-95.35276,5798,14,0,20200324_608444394,20200324,608444394
4,1585073515,0,339,29.9839,-95.35322,5798,24,0,20200324_608444394,20200324,608444394


### Merge and filter positions data to flights departing just our four airports

In [121]:
all_positions_lax_sfo_mxp_vce = from_lax_slim.merge(positions_df, on='flight_id', how='inner')

In [122]:
len(all_positions_lax_sfo_mxp_vce)

309300

In [123]:
all_positions_lax_sfo_mxp_vce.to_csv('/Users/mhustiles/data/data/flights/coronavirus/all_positions_lax_sfo_mxp_vce.csv')

In [124]:
positions_df = pd.DataFrame(all_positions_lax_sfo_mxp_vce)

In [125]:
all_positions_lax_sfo_mxp_vce.iloc[0]

flight_id               608290635
reg                         9VSGC
equip                        A359
flight                       SQ37
schd_from                     LAX
real_to                       SIN
snapshot_id            1585008019
altitude                    40750
heading                       232
latitude                  1.51936
longitude                 105.992
radar_id                     1080
speed                         502
squawk                       7254
flightid       20200324_608290635
datestr                  20200324
Name: 0, dtype: object

### Process the 'datestr' field into something we can use

In [126]:
positions_df['date'] = pd.to_datetime(positions_df.datestr, format='%Y%m%d')
positions_df['month'] = positions_df['date'].dt.month 
positions_df['day'] = positions_df['date'].dt.day 
positions_df['weekday'] = positions_df['date'].dt.weekday_name

### Convert the unix timestampt to human datetime and localize

In [127]:
positions_df['date_time'] = pd.to_datetime(positions_df['snapshot_id'],unit='s')
positions_df['utc_datetime'] = \
    pd.to_datetime(positions_df['date_time'], format='%Y-%m-%dT%H:%M:%SZ').dt.tz_localize('UTC')

In [128]:
positions_df['datetime_pst'] = positions_df['utc_datetime'].dt.tz_convert('America/Los_Angeles')

In [129]:
positions_df['date'] = pd.to_datetime(positions_df['datetime_pst']).dt.strftime('%m/%d/%Y')
positions_df['time'] = pd.to_datetime(positions_df['datetime_pst']).dt.strftime('%H:%M:%S')
positions_df['display_time'] = pd.to_datetime(positions_df['datetime_pst']).dt.strftime('%I:%M %p')

In [130]:
positions_df = \
    positions_df.drop(['snapshot_id', 'radar_id', 'day',\
                          'datestr','utc_datetime','date_time', 'datetime_pst', 'display_time'], axis=1)

In [131]:
positions = pd.DataFrame(positions_df)

In [132]:
positions.sort_values(by='date', ascending=True).head()

Unnamed: 0,flight_id,reg,equip,flight,schd_from,real_to,altitude,heading,latitude,longitude,speed,squawk,flightid,date,month,weekday,time
0,608290635,9VSGC,A359,SQ37,LAX,SIN,40750,232,1.51936,105.99191,502,7254,20200324_608290635,03/23/2020,3,Tuesday,17:00:19
78196,608380306,N864AS,CRJ2,UA5827,LAX,ACV,16000,309,33.93809,-118.97414,307,1001,20200324_608380306,03/23/2020,3,Tuesday,20:09:30
78195,608380306,N864AS,CRJ2,UA5827,LAX,ACV,15675,303,33.93267,-118.96591,310,1001,20200324_608380306,03/23/2020,3,Tuesday,20:09:24
78194,608380306,N864AS,CRJ2,UA5827,LAX,ACV,15400,296,33.9283,-118.95721,310,1001,20200324_608380306,03/23/2020,3,Tuesday,20:09:18
78193,608380306,N864AS,CRJ2,UA5827,LAX,ACV,15125,289,33.92485,-118.94795,310,1001,20200324_608380306,03/23/2020,3,Tuesday,20:09:12


In [133]:
positions.to_csv('/Users/mhustiles/data/data/flights/coronavirus/20200324_positions.csv')

---

## Geography

### Convert to positions to a GeoDataFrame using lon/lat for each point in the flight

In [134]:
positions.loc[112000]

flight_id             608388349
reg                       GXLEI
equip                      A388
flight                    BA268
schd_from                   LAX
real_to                     LHR
altitude                  37000
heading                      68
latitude                54.7336
longitude              -80.1865
speed                       510
squawk                     1003
flightid     20200324_608388349
date                 03/24/2020
month                         3
weekday                 Tuesday
time                   02:36:45
Name: 112000, dtype: object

In [135]:
positions_geo = gpd.GeoDataFrame(positions, \
                geometry=gpd.points_from_xy(positions['longitude'], positions['latitude']))

In [136]:
positions_geo.crs = "epsg:4326"

In [137]:
positions_geo.to_file('/Users/mhustiles/data/data/flights/coronavirus/20200324_positions_geo.geojson', driver='GeoJSON')

---