# Exploring OpenSky states data

In [1]:
import math
import pandas as pd
import numpy as np
from scipy.stats import stats, norm, skew
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, ElasticNet, Lasso
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from keras.models import Sequential
from keras.callbacks import ModelCheckpoint
from keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPool2D
from keras.utils.np_utils import to_categorical
from keras.wrappers.scikit_learn import KerasRegressor
from scipy.special import boxcox1p
import lightgbm as lgb
import xgboost as xgb

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Seed NumPy's global random generator so random draws repeat across runs.
np.random.seed(2)

Using TensorFlow backend.


In [57]:
# Load three state-vector CSV dumps (consecutive hours of 2019-01-14, judging by
# the file names) and stack them into a single frame.
dataset1 = pd.read_csv('../input/states_2019-01-14-00.csv')
print('dataset1 loaded')
dataset2 = pd.read_csv('../input/states_2019-01-14-01.csv')
print('dataset2 loaded')
dataset3 = pd.read_csv('../input/states_2019-01-14-02.csv')
print('dataset3 loaded')
# ignore_index=True renumbers the combined rows 0..n-1. Without it each file keeps
# its own default 0-based labels, so the stacked frame would carry duplicate index
# labels, which makes label-based selection and index alignment ambiguous.
dataset = pd.concat([dataset1, dataset2, dataset3], axis=0, ignore_index=True)

dataset1 loaded
dataset2 loaded
dataset3 loaded


In [58]:
# Show the first five rows to get a feel for the columns.
dataset.head(5)

Unnamed: 0,time,icao24,lat,lon,velocity,heading,vertrate,callsign,onground,alert,spi,squawk,baroaltitude,geoaltitude,lastposupdate,lastcontact
0,1547424000,4ca1fe,51.671308,2.311249,127.868055,288.289158,-4.8768,RYR4WA,False,False,False,3117.0,7757.16,7620.0,1547424000.0,1547424000.0
1,1547424000,4b187d,40.537491,-73.051758,204.903762,76.940691,10.07872,SWR17P,False,False,False,1547.0,4099.56,4091.94,1547424000.0,1547424000.0
2,1547424000,a776b2,41.119446,-71.720581,198.12774,246.430252,3.90144,JBU321,False,False,False,2076.0,10195.56,10088.88,1547424000.0,1547424000.0
3,1547424000,40762c,51.964233,8.027344,227.736637,45.732175,0.0,NPT4069,False,False,False,7130.0,11277.6,10904.22,1547424000.0,1547424000.0
4,1547424000,a278f3,40.795402,-73.304714,222.625901,48.278499,0.0,CNS818,False,False,False,3055.0,7010.4,6964.68,1547424000.0,1547424000.0


## Dropping useless columns

In [59]:
# Remove, in a single pass, the columns this exploration does not use.
unused_columns = ['onground', 'squawk', 'time', 'lastcontact', 'spi']
dataset = dataset.drop(columns=unused_columns)

In [60]:
# Count the missing values in each remaining column.
dataset.isna().sum()

icao24                 0
lat              1212768
lon              1212768
velocity         1165636
heading          1165636
vertrate         1144668
callsign         1046304
alert                  0
baroaltitude      477801
geoaltitude      1576856
lastposupdate    1212768
dtype: int64

## Dropping useless rows

In [61]:
# Discard every row that lacks a latitude or a longitude, then re-check the nulls.
position_columns = ['lat', 'lon']
dataset = dataset.dropna(subset=position_columns)
dataset.isna().sum()

icao24                0
lat                   0
lon                   0
velocity         177473
heading          177473
vertrate         173074
callsign         203760
alert                 0
baroaltitude     167155
geoaltitude      445952
lastposupdate         0
dtype: int64

## Filling barometric altitude holes with geoaltitudes and removing the remaining nulls

In [62]:
# Build a single `altitude` column: take `baroaltitude` where present, fall back to
# `geoaltitude` otherwise, drop rows where both are missing, and then remove the
# two source columns.
dataset = (
    dataset
    .assign(altitude=dataset['baroaltitude'].fillna(dataset['geoaltitude']))
    .dropna(subset=['altitude'])
    .drop(columns=['geoaltitude', 'baroaltitude'])
)
dataset.isna().sum()

icao24                0
lat                   0
lon                   0
velocity          61992
heading           61992
vertrate          57729
callsign         167431
alert                 0
lastposupdate         0
altitude              0
dtype: int64

## Data cleaned

The remaining nulls could be filled via interpolation once the dataset is split by `icao24` value.

## Plotting some flights

In [71]:
# Bucket the cleaned rows by their `icao24` value.
groups = dataset.groupby('icao24')

# Report how many distinct `icao24` groups there are.
print(groups.ngroups)

# Collect the [lat, lon] point list of at most the first `path_limit` groups;
# zipping against a range stops the iteration once the limit is reached.
path_limit = 1000
flight_paths = [
    track[['lat', 'lon']].values.tolist()
    for _, (_, track) in zip(range(path_limit), groups)
]


12203


In [72]:
from ipyleaflet import Map, Polyline

print(len(flight_paths))

# Hard-coded map centre; despite the names it is not computed from the data.
av_lat, av_lon = 41, 12
print(av_lat, av_lon)

# NOTE(review): `flight_paths` is already a list of point lists; wrapping it in a
# further list adds one more nesting level — confirm this is intended.
line = Polyline(locations=[flight_paths], color="green", fill=False, weight=2)

# Create the map, size the widget, and attach the polyline layer.
m = Map(center=(av_lat, av_lon), zoom=4)
m.layout.width = '100%'
m.layout.height = '800px'
m.add_layer(line)
m

1000
41 12


Map(basemap={'url': 'https://{s}.tile.openstreetmap.org/{z}/{x}/{y}.png', 'max_zoom': 19, 'attribution': 'Map …