# Processing world flights data

### Load Python tools

In [1]:
import pandas as pd
import geopandas as gpd
import matplotlib
import matplotlib.pyplot as plt
import geojson
import json
import glob
import io
import os
import pyarrow
from shapely.geometry import Point, LineString

### Next, read our data from Flightradar24

In [2]:
# os.chdir('/Users/mhustiles/data/github/notebooks/flights-data/coronavirus/')

In [3]:
#  urls = 'https://secure.flightradar24.com/general_media/',

In [4]:
# NOTE(review): the commented-out command below embeds a username and password
# in plain text; consider rotating them and reading credentials from the
# environment instead of keeping them in the notebook.
# for u in urls: 
#     !wget --user general_media --password OpE0SimNCt -r -np -nH --cut-dirs=3 -R index.html '{u}'

In [23]:
# Extract every .zip archive in the working directory, then delete the
# archives and any leftover .tmp files.
!unzip \*.zip
!rm -f *.zip
!rm -f *.tmp

In [24]:
!mkdir positions
!mkdir flights

In [25]:
# Move the files whose names end in "flights.csv" into the flights/ folder.
!mv -f *flights.csv flights

In [26]:
# Move every CSV still left in the working directory into positions/.
# This relies on the *flights.csv files having been moved out beforehand.
!mv -f *.csv positions

---

## Process 'positions' data showing each point along a flight

### Set path for positions and define the files we'll concatenate

In [27]:
# Load one positions file on its own and preview it to check the column layout.
sample_positions_csv = 'positions/20190101_520786143.csv'
a_position = pd.read_csv(sample_positions_csv)
a_position.head()

Unnamed: 0,snapshot_id,altitude,heading,latitude,longitude,radar_id,speed,squawk
0,1546302500,682,351,33.8297,-118.15107,8444,96,1206
1,1546302513,799,354,33.83565,-118.15157,8444,97,1206
2,1546302537,875,74,33.84734,-118.15132,8444,101,1206
3,1546302545,899,19,33.85181,-118.15102,8444,104,1206
4,1546302552,800,7,33.85536,-118.15085,8444,106,1206


In [28]:
# Collect the path of every CSV file inside the positions folder.
path = 'positions'
csv_pattern = os.path.join(path, "*.csv")
files = glob.glob(csv_pattern)

### Read the csv and create a 'flightid' field so we can track unique flights

In [29]:
# Lazily read each positions CSV and tag its rows with the source file name
# (stored in 'flightid'); nothing is read until the generator is consumed.
file_df = (
    pd.read_csv(csv_file, encoding="ISO-8859-1", low_memory=False).assign(
        flightid=os.path.basename(csv_file)
    )
    for csv_file in files
)

### Concatenate the frames

In [30]:
# Consume the per-file generator and stack all frames into one, discarding the
# per-file row indexes (ignore_index=True); then preview the first rows.
positions_df = pd.concat(file_df, ignore_index=True)
positions_df.head()

Unnamed: 0,snapshot_id,altitude,heading,latitude,longitude,radar_id,speed,squawk,flightid
0,1548698910,500,310,33.97671,-118.26622,26826,0,0,20190128_525530318.csv
1,1571361703,575,237,34.05377,-118.23032,3021,36,0,20191018_579082629.csv
2,1571361709,675,268,34.05359,-118.23167,3021,46,0,20191018_579082629.csv
3,1571361729,875,299,34.05492,-118.23851,3021,67,0,20191018_579082629.csv
4,1571361735,900,306,34.05597,-118.24036,3021,65,0,20191018_579082629.csv


In [31]:
len(positions_df)

5871403

### Strip the '.csv' extension from the combined positions' flightid field

In [32]:
# Remove the ".csv" extension inherited from the source file name.
# regex=False makes the pattern a literal string: on pandas versions where
# str.replace defaults to regex matching, the "." would match any character.
positions_df['flightid'] = positions_df['flightid'].str.replace(
    '.csv', '', regex=False
)

### Split the flightid field so we have a date string to convert later and also a flightid

In [33]:
# Break "<datestr>_<flight_id>" apart at the underscore into two new columns.
flightid_parts = positions_df['flightid'].str.split("_", expand=True)
positions_df[['datestr', 'flight_id']] = flightid_parts

In [34]:
positions_df.head()

Unnamed: 0,snapshot_id,altitude,heading,latitude,longitude,radar_id,speed,squawk,flightid,datestr,flight_id
0,1548698910,500,310,33.97671,-118.26622,26826,0,0,20190128_525530318,20190128,525530318
1,1571361703,575,237,34.05377,-118.23032,3021,36,0,20191018_579082629,20191018,579082629
2,1571361709,675,268,34.05359,-118.23167,3021,46,0,20191018_579082629,20191018,579082629
3,1571361729,875,299,34.05492,-118.23851,3021,67,0,20191018_579082629,20191018,579082629
4,1571361735,900,306,34.05597,-118.24036,3021,65,0,20191018_579082629,20191018,579082629


### Process the 'datestr' field into something we can use

In [None]:
# Parse the YYYYMMDD string taken from the file name and derive calendar fields.
positions_df['date'] = pd.to_datetime(positions_df['datestr'], format='%Y%m%d')
positions_df['month'] = positions_df['date'].dt.month
positions_df['day'] = positions_df['date'].dt.day
# .dt.weekday_name was removed in pandas 1.0; .dt.day_name() yields the same
# weekday names and is available on older versions as well.
positions_df['weekday'] = positions_df['date'].dt.day_name()

### Convert the Unix timestamp to a human-readable datetime and localize

In [None]:
# 'snapshot_id' holds Unix epoch seconds; turn it into naive datetimes first.
positions_df['date_time'] = pd.to_datetime(positions_df['snapshot_id'], unit='s')
# Mark those datetimes as UTC. The column is already datetime-typed, so it is
# localized directly instead of being re-parsed with a string format that
# never applies to it.
positions_df['utc_datetime'] = positions_df['date_time'].dt.tz_localize('UTC')

In [None]:
# Convert the UTC timestamps to the America/Los_Angeles time zone.
# NOTE(review): the column is named 'pst', but this zone presumably follows
# daylight saving time as well — confirm the name is not misleading downstream.
positions_df['datetime_pst'] = positions_df['utc_datetime'].dt.tz_convert('America/Los_Angeles')

In [None]:
# Render the Los Angeles timestamps as display strings. Note that 'date' is
# overwritten here with an mm/dd/YYYY string.
local_stamps = pd.to_datetime(positions_df['datetime_pst'])
positions_df['date'] = local_stamps.dt.strftime('%m/%d/%Y')
positions_df['time'] = local_stamps.dt.strftime('%H:%M:%S')
positions_df['display_time'] = local_stamps.dt.strftime('%I:%M %p')

In [None]:
# Discard identifier and intermediate columns that are no longer needed.
unneeded_columns = [
    'snapshot_id', 'radar_id', 'day',
    'datestr', 'utc_datetime', 'date_time', 'datetime_pst', 'display_time',
]
positions_df = positions_df.drop(columns=unneeded_columns)

In [None]:
# Wrap the processed frame in a DataFrame bound to the shorter name `positions`,
# which the later merge and geography cells operate on.
positions = pd.DataFrame(positions_df)

In [None]:
# Preview the rows with the smallest 'date' values.
# NOTE(review): 'date' was re-assigned earlier to an mm/dd/YYYY string, so this
# sort is presumably lexicographic (month first), not chronological across
# years — confirm that is acceptable for this check.
positions.sort_values(by='date', ascending=True).head()

---

## Process 'flights' metadata about each set of points

### Set path for flights and define the files we'll concatenate

In [None]:
# Load one flights file on its own so its columns can be inspected.
sample_flights_csv = 'flights/20191215_flights.csv'
a_flight = pd.read_csv(sample_flights_csv)

In [None]:
a_flight.head()

In [None]:
# Collect the path of every CSV file inside the flights folder.
path = 'flights'
csv_pattern = os.path.join(path, "*.csv")
files = glob.glob(csv_pattern)

### Read the csv and create a 'date' field

In [None]:
# Lazily read each flights CSV and record its source file name in 'date';
# the name is cleaned up and parsed into a real date further down.
file_df = (
    pd.read_csv(csv_file, encoding="ISO-8859-1", low_memory=False).assign(
        date=os.path.basename(csv_file)
    )
    for csv_file in files
)

### Combine our newly processed flight files

In [None]:
# Consume the per-file generator and stack all flights frames into one,
# discarding the per-file row indexes (ignore_index=True).
flights_df = pd.concat(file_df, ignore_index=True)

In [None]:
flights_df.head()

### Clean up our dates for use later

In [None]:
# Reduce the file name to its YYYYMMDD prefix by removing "_flights.csv".
# regex=False makes the pattern a literal string: on pandas versions where
# str.replace defaults to regex matching, the "." would match any character.
flights_df['date'] = flights_df['date'].str.replace(
    '_flights.csv', '', regex=False
)

In [None]:
# Parse the YYYYMMDD string taken from the file name and derive calendar fields.
flights_df['date'] = pd.to_datetime(flights_df['date'], format='%Y%m%d')
flights_df['month'] = flights_df['date'].dt.month
flights_df['day'] = flights_df['date'].dt.day
# .dt.weekday_name was removed in pandas 1.0; .dt.day_name() yields the same
# weekday names and is available on older versions as well.
flights_df['weekday'] = flights_df['date'].dt.day_name()

### Create a new dataframe with flights and export to CSV

In [None]:
# Wrap the processed frame in a DataFrame bound to the shorter name `flights`,
# which the grouping cell below operates on.
flights = pd.DataFrame(flights_df)

In [None]:
flights.head()

In [None]:
# flights.to_csv('../output/all_flights.csv')

### Group by flight ID to associate each flight with an aircraft

In [None]:
# Reduce the flights table to one row per (flight_id, reg) pair; the group
# sizes are only a by-product of the grouping and are discarded.
flight_id_grouped = (
    flights.groupby(['flight_id', 'reg'])
    .size()
    .reset_index(name='count')
    .drop(columns=['count'])
)

In [None]:
len(flight_id_grouped)

---

### Merge to add aircraft ID and registration N number to each position

In [None]:
# Cast the merge key to str: the positions frame's 'flight_id' was produced by a
# string split of the file name, so both sides need the same type to match.
flight_id_grouped['flight_id'] = flight_id_grouped['flight_id'].astype(str)

In [None]:
# Attach each flight's 'reg' value to its position rows via the shared
# 'flight_id' key (merge's default join type is used).
positions = positions.merge(flight_id_grouped, on='flight_id')

In [None]:
# NOTE(review): `src` is not defined anywhere in the visible notebook; unless it
# is created outside this view, this cell fails with NameError. It presumably
# should be an aircraft lookup table with an 'n_number' column (the right-hand
# join key below) — confirm and load it before this cell.
positions = gpd.GeoDataFrame(positions.merge(src, left_on='reg', right_on='n_number'))

In [None]:
len(positions)

In [None]:
# positions.reset_index().to_feather('/Users/mhustiles/data/data/helicopters/all_positions.feather')

---

## Geography

### Convert positions to a GeoDataFrame using lon/lat for each point in the flight

In [None]:
positions.loc[112000]

In [None]:
# Build one point geometry per row (x = longitude, y = latitude) and attach
# it to the positions table as the GeoDataFrame's geometry.
point_geometry = gpd.points_from_xy(positions['longitude'], positions['latitude'])
positions_geo = gpd.GeoDataFrame(positions, geometry=point_geometry)

In [None]:
# Declare the coordinate reference system of the lon/lat points as EPSG:4326.
positions_geo.crs = "epsg:4326"

### Loop through all the aircraft, creating frames for each set of positions to export

In [None]:
# Count the position rows recorded for each aircraft registration.
n_numbers = positions_geo.groupby(['reg']).size().reset_index(name='count')

In [None]:
# Pull the 'reg' column of the per-registration counts out as a plain Python list.
choppers_list = n_numbers['reg'].tolist()

In [None]:
# Rebind n_numbers to a list of {'n_number': <registration>} records,
# one per entry in choppers_list.
n_numbers = [{'n_number': registration} for registration in choppers_list]

In [None]:
# df = pd.DataFrame()

# for l in n_numbers:
#     n = l['n_number']
#     aircraft = positions_geo[positions_geo['n_number'] == n]
#     aircraft.to_file(f'/Users/mhustiles/data/data/helicopters/' + n + '.geojson', driver='GeoJSON')

In [None]:
# Run tippecanoe to build an .mbtiles tileset from a single aircraft's GeoJSON
# (-Z8/-z11 presumably set the minimum/maximum zoom — confirm against the
# tippecanoe docs).
# NOTE(review): the input N661PD.geojson is only written by the export loop
# that is commented out above, and both paths are absolute and machine-specific.
!tippecanoe --generate-ids --force -Z8 -z11 -r1 -pk -pf -o \
/Users/mhustiles/data/data/helicopters/N661PD.mbtiles \
/Users/mhustiles/data/data/helicopters/N661PD.geojson

---