In [1]:
#############################################################################################################
# SCRIPT TO CREATE DATASET INPUT FOR THE LIGHT GBM MODEL
# IT SAMPLES AN EQUALLY DISTRIBUTED DATAFRAME CONSIDERING ONLY RISKY EVENTS
# RISKY EVENTS ARE THOSE WHOSE COLLISSION_PROBABILITY IS GREATER THAN 10^-6
##############################################################################################################
import pandas as pd
import datetime as dt
import numpy as np
import os

from preparing_data import *



# LOAD THE RAW TRAINING DATA
df = pd.read_csv("./data/train_data.csv")

# CONVERT KELVIN DATASET TO CDM FORMAT TO SIMULATE ACTUAL INPUT
cdm = convertKelvinDatasetToCDMFormat(df)

# DROP ROWS WITH A NULL IN THE ONE COLUMN NEEDED TO RUN THE FOLLOWING TIME CONVERSIONS
# (each step rebinds `cdm` instead of mutating it in place)
cdm = cdm.dropna(subset=["OBJECT2_TIME_LASTOB_START"])

# CONVERT TIME STRING TO TIMEDATE
cdm = convertTimestringToTimedate(cdm)
# CONVERT TIMEDATE TO RANGE IN DAYS
cdm = convertTimedateToDaysRange(cdm)
# CONVERT THE RISK COLUMN TO THE SCALE USED FOR COLLISSION_PROBABILITY
# NOTE(review): the helper's name and the values displayed further down
# (e.g. -10.2, minimum -30) suggest the result is on a log10 scale, while the
# original comment here said "natural scale" -- confirm against preparing_data.
cdm = convertPCto10logaritmicscale(cdm)

# DROP ROWS WITH NULLS IN ANY OF THE REMAINING COLUMNS
cdm = cdm.dropna()

# DROP NON NUMERIC COLUMNS
# Pass the column labels explicitly; the previous code handed a whole DataFrame
# to drop(), which only worked because iterating a DataFrame yields its labels.
non_numeric_cols = cdm.select_dtypes(exclude='number').columns
cdm = cdm.drop(columns=non_numeric_cols)

print("Adding correlation matrix elements to the dataframe \n")

# CALCULATE AND ADD CORRELATION COLUMNS TO IMPROVE MACHINE LEARNING MODEL
cdm = addCorrelationColumns(cdm)

# DELETE COVARIANCE MATRIX NON DIAGONAL ELEMENTS
print("Deleting covariance matrix elements from the dataframe \n")

cdm = deleteCovarianceNonDiagonalElements(cdm)
print("Dataframe size without feature engineering {} x {}".format(cdm.shape[0], cdm.shape[1]))

# DELETE OBSERVATION COLUMNS NOT NEEDED IN THE MODEL
cdm = cdm.drop(columns=[
    'OBJECT1_TIME_LASTOB_START',
    'OBJECT1_TIME_LASTOB_END',
    'OBJECT2_TIME_LASTOB_START',
    'OBJECT2_TIME_LASTOB_END',
])

# REORDER COLUMNS: BRING __time_to_tca TO THE FRONT
cdm = cdm[['__time_to_tca'] + [col for col in cdm.columns if col != '__time_to_tca']]

# SORT DATAFRAME BY event_id ASCENDING AND THEN BY __time_to_tca DESCENDING
# (rebinding avoids sorting a column selection in place)
cdm = cdm.sort_values(by=['event_id', '__time_to_tca'], ascending=[True, False])

Adding correlation matrix elements to the dataframe 

Deleting covariance matrix elements from the dataframe 

Dataframe size without feature engineering 153393 x 81


In [2]:
# Show the rows whose __time_to_tca is zero or negative.
cdm.loc[cdm["__time_to_tca"] <= 0]

Unnamed: 0,__time_to_tca,event_id,MISS_DISTANCE,RELATIVE_SPEED,RELATIVE_POSITION_R,RELATIVE_POSITION_T,RELATIVE_POSITION_N,RELATIVE_VELOCITY_R,RELATIVE_VELOCITY_T,RELATIVE_VELOCITY_N,...,OBJECT2_CORR_CRDOT_N,OBJECT2_CORR_CTDOT_R,OBJECT2_CORR_CTDOT_T,OBJECT2_CORR_CTDOT_N,OBJECT2_CORR_CTDOT_RDOT,OBJECT2_CORR_CNDOT_R,OBJECT2_CORR_CNDOT_T,OBJECT2_CORR_CNDOT_N,OBJECT2_CORR_CNDOT_RDOT,OBJECT2_CORR_CNDOT_TDOT
842,-0.001596,60,15937.0,10678.0,-115.5,11174.2,11363.4,-21.9,-7613.8,7487.4,...,-0.021585,-0.999830,0.319152,0.228060,-0.306886,-0.142060,0.007982,0.680366,-0.056514,0.141551
1260,-0.007561,89,25691.0,9966.0,-1405.1,19667.6,16470.6,-457.6,-6410.9,7617.1,...,-0.444555,-0.532837,-0.237173,0.368578,0.345395,0.012492,0.598190,0.016380,-0.694953,-0.757884
2788,-0.144270,214,1354.0,14677.0,-635.6,-211.1,1177.6,-38.8,-14442.4,-2616.4,...,-0.244423,-0.999507,0.387241,0.442451,-0.491668,-0.611512,0.189405,0.938286,-0.287396,0.610530
3293,-0.005489,256,9087.0,14833.0,-395.6,653.8,9055.5,-34.2,-14795.3,1068.1,...,0.146256,-0.998872,0.268482,-0.287496,-0.281347,-0.506428,-0.199869,0.224557,0.163286,0.514597
3830,-0.029272,301,25835.0,1218.0,-95.4,-25831.7,-455.3,10.7,-21.5,1218.0,...,-0.770624,-0.999843,-0.944665,-0.755666,0.936050,0.778832,0.738488,0.916936,-0.717078,-0.778513
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
161827,-0.006722,13091,37085.0,14173.0,184.3,-12136.4,35042.5,0.2,-13392.4,-4639.4,...,-0.101516,-0.999793,0.281527,0.475940,-0.276026,0.115751,-0.038025,0.794315,0.042480,-0.116501
162040,-0.010056,13105,9222.0,5529.0,180.3,8554.3,3441.4,-5.0,-2063.6,5130.2,...,-0.630130,-0.999717,0.649309,0.706870,-0.643575,0.134227,-0.205075,-0.174718,0.197859,-0.134130
162137,-0.017494,13113,2571.0,14635.0,-197.5,-542.3,-2505.8,0.4,-14299.6,3118.8,...,-0.438570,-0.999373,0.568272,0.922983,-0.481558,-0.126302,0.154221,0.080820,-0.110421,0.126510
162339,-0.011776,13131,48848.0,6672.0,-551.6,43732.1,-21757.5,-41.9,-2972.5,-5973.1,...,0.189697,-0.995861,-0.552428,-0.072762,0.556452,-0.037205,-0.293071,0.691740,0.240138,0.034749


In [3]:
# Pull the collision-probability column out of the prepared frame as a Series.
PC_data = cdm["COLLISSION_PROBABILITY"]

In [4]:
PC_data.shape

(153393,)

In [5]:
# Cast the float probabilities to integers so each row lands in a whole-number bucket.
# astype(int) truncates toward zero (it does not floor), so e.g. -10.2 becomes -10.
PC_data=PC_data.astype(int)

In [6]:
type(PC_data)

pandas.core.series.Series

In [7]:
# Wrap the Series in a one-column DataFrame whose column is named "PC".
# NOTE(review): this cell cannot be re-run on its own -- once PC_data is a
# DataFrame it no longer has to_frame(); re-run from the cell that creates PC_data.
PC_data=PC_data.to_frame(name="PC")

In [8]:
# NOTE: `df` is rebound here to the one-column PC frame; from this point on it
# no longer refers to the raw CSV data loaded in the first cell.
df = PC_data
# Subsets restricted to buckets at or above -15 and at or above -6.
df15 = PC_data.loc[PC_data["PC"] >= -15]
df6 = PC_data.loc[PC_data["PC"] >= -6]

In [13]:
import numpy as np

from bokeh.layouts import gridplot
from bokeh.plotting import figure, show

In [106]:
def make_plot(title, hist, edges,
              x_label='Probabilidad de Colisión', y_label='Cantidad',
              fill_color="grey"):
    """Build a Bokeh histogram figure from pre-computed bin counts.

    Parameters
    ----------
    title : str
        Title shown above the figure.
    hist : sequence of numbers
        Height of each bar; the call sites pass the counts returned by
        ``np.histogram``.
    edges : sequence of numbers
        Bin edges, one element longer than ``hist``. Bar ``i`` spans
        ``edges[i]`` to ``edges[i + 1]``.
    x_label, y_label : str, optional
        Axis labels. The defaults are the labels that used to be hard-coded.
    fill_color : str, optional
        Bar fill colour (default ``"grey"``).

    Returns
    -------
    The configured Bokeh figure; the caller is responsible for showing or
    exporting it.
    """
    # The background is white. The previous version first asked for "#fafafa"
    # in the constructor and then overwrote it with "white" a few lines later.
    p = figure(title=title, tools='', background_fill_color="white")
    p.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:],
           fill_color=fill_color, line_color="white", alpha=0.5)

    # Anchor the bars on the x axis.
    p.y_range.start = 0

    p.xaxis.axis_label = x_label
    p.yaxis.axis_label = y_label
    # White grid lines on a white background: the grid is effectively hidden.
    p.grid.grid_line_color = "white"
    return p


In [108]:
def _unit_bin_histogram(frame):
    """Return ``(counts, edges)`` with one unit-wide bin per integer value.

    The PC values are integers, so the bin edges are placed on the integers
    themselves instead of letting ``np.histogram`` split the range into an
    arbitrary number of equal parts. A fixed count such as ``bins=4`` over the
    six values -6..-1 would merge -6 with -5 and -2 with -1.
    """
    values = frame.to_numpy().ravel()
    # Edges run from the smallest value to one past the largest, so every
    # integer (including the maximum) gets a bin of its own.
    unit_edges = np.arange(values.min(), values.max() + 2)
    return np.histogram(values, bins=unit_edges, density=False)


# One panel per PC range; the titles are unchanged.
panels = []
for panel_title, frame in (
    ("Rango PC ε [-30, 0)", df),
    ("Rango PC ε [-15, 0)", df15),
    ("Rango PC ε [-6, 0)", df6),
):
    hist, edges = _unit_bin_histogram(frame)
    panels.append(make_plot(panel_title, hist, edges))

# Keep the individual names: a later cell rebuilds the grid from p1, p2, p3.
p1, p2, p3 = panels

show(gridplot([p1,p2,p3], ncols=1, width=800, height=400, toolbar_location=None))

In [16]:
from bokeh.io import export_png

#export_png(p1, filename="plot.png")

In [109]:
# Rebuild the three-panel grid (height=300 here, versus height=400 in the grid
# displayed on screen) and save it as a PNG.
# NOTE(review): the output path is absolute and specific to one machine; the
# export will fail anywhere that directory does not exist. Consider a path
# relative to the project, as the ./data and ./report paths in this notebook are.
full_grid=gridplot([p1,p2,p3], ncols=1, width=800, height=300, toolbar_location=None)
export_png(full_grid, filename="/home/esteban/Documents/ITBA/Collision-avoidance/images/plot-grid.png")


'/home/esteban/Documents/ITBA/Collision-avoidance/images/plot-grid.png'

In [18]:
# # SELECTING DATA TO BUILD MODEL

# print("Building dataframe... \n")
# aux1=cdm[(cdm["COLLISSION_PROBABILITY"]>-4)& (cdm["__time_to_tca"]<1)]
# aux2=cdm[(cdm["COLLISSION_PROBABILITY"]<-4) & (cdm["COLLISSION_PROBABILITY"]>-5)& (cdm["__time_to_tca"]<1)]
# aux3=cdm[(cdm["COLLISSION_PROBABILITY"]<-5) & (cdm["COLLISSION_PROBABILITY"]>-6)& (cdm["__time_to_tca"]<1)]


# # APPEND SUBPART OF DATAFRAMES EVENTS WITH PROBABILITIES LOWER THAN 10-6 TO CREATE AN EQUALLY DISTRIBUITED PROBABILITY DATAFRAME



In [19]:
# Count the rows in each integer PC bucket; the counts go in a column named "cant".
countdf = df["PC"].value_counts().to_frame(name="cant")

In [20]:
# Expose the bucket value (so far only the index) as a regular "PC" column too.
countdf = countdf.assign(PC=countdf.index)

In [21]:
# Order the buckets from the highest PC value down to the lowest.
countdf = countdf.sort_values(by="PC", ascending=False)

In [22]:
countdf.head()

Unnamed: 0,cant,PC
-1,8,-1
-2,42,-2
-3,647,-3
-4,2525,-4
-5,6450,-5


In [23]:
sum(countdf.cant)

153393

In [24]:
# Fraction of all rows that falls in each bucket.
countdf["p"] = countdf["cant"] / countdf["cant"].sum()

In [25]:
# Running total of the fractions, accumulated in the current row order.
countdf["cumsum1"] = countdf["p"].cumsum()

In [26]:
# Complement of the running total: the fraction not yet accumulated.
countdf["cumsumR"] = 1 - countdf["cumsum1"]

In [27]:
countdf

Unnamed: 0,cant,PC,p,cumsum1,cumsumR
-1,8,-1,5.2e-05,5.2e-05,0.999948
-2,42,-2,0.000274,0.000326,0.999674
-3,647,-3,0.004218,0.004544,0.995456
-4,2525,-4,0.016461,0.021005,0.978995
-5,6450,-5,0.042049,0.063054,0.936946
-6,10380,-6,0.067669,0.130723,0.869277
-7,10065,-7,0.065616,0.196339,0.803661
-8,7517,-8,0.049005,0.245344,0.754656
-9,5971,-9,0.038926,0.28427,0.71573
-10,5251,-10,0.034232,0.318502,0.681498


In [28]:
# Build the report table from a COPY of countdf.
# The previous `aux=countdf` only created an alias, so rounding the columns and
# dropping 'cumsumR' in place silently modified countdf itself, and running the
# cell a second time raised KeyError because 'cumsumR' was already gone.
aux = countdf.drop(columns='cumsumR')
# Round the fractions to 5 decimals for the report; countdf keeps full precision.
aux = aux.assign(p=np.round(aux.p, 5), cumsum1=np.round(aux.cumsum1, 5))
# Put the PC bucket first, followed by the remaining columns in their current order.
aux = aux[['PC'] + [col for col in aux.columns if col != 'PC']]
aux.to_csv("./report/table.csv", index=False)


In [53]:
from bokeh.plotting import figure, output_file, show
from bokeh.models import BoxAnnotation
from bokeh.models import ColumnDataSource, Label, LabelSet, Range1d

In [97]:
# Running total from countdf expressed as a percentage, rounded to 2 decimals.
yvalues = np.round((countdf.cumsum1) * 100, 2)

p = figure(width=400, height=600, toolbar_location=None)

# Line of the cumulative percentage against the PC bucket.
p.line(countdf.PC, yvalues, line_width=4)
p.xaxis.axis_label = 'Probabilidad de Colisión'
p.yaxis.axis_label = 'Proporción'
p.grid.grid_line_color = "white"
p.y_range = Range1d(0, 100)

# Shaded bands: red from PC = -4 to the right edge, yellow between -6 and -4.
collision_box = BoxAnnotation(left=-4, bottom=0, top=100,
                              fill_alpha=0.4, fill_color='red')
monitoring_box = BoxAnnotation(left=-6, right=-4, bottom=0, top=100,
                               fill_alpha=0.4, fill_color='yellow')
p.add_layout(collision_box)
p.add_layout(monitoring_box)

# Text captions placed in screen coordinates: (x, y, text, colour).
for x_px, y_px, caption, colour in (
    (260, 200, 'Maniobra', "red"),
    (220, 250, 'Alerta', "orange"),
    (50, 140, 'Fuera de protocolo', "black"),
):
    citation = Label(x=x_px, y=y_px, x_units='screen', y_units='screen',
                     text=caption, render_mode='css',
                     text_alpha=1, text_font_style="bold", text_color=colour)
    p.add_layout(citation)

show(p)

In [98]:
# Save the cumulative PC figure as a PNG.
# NOTE(review): the output path is absolute and specific to one machine; the
# export will fail anywhere that directory does not exist. Consider a path
# relative to the project, as the ./data and ./report paths in this notebook are.
export_png(p, filename="/home/esteban/Documents/ITBA/Collision-avoidance/images/PC_relation.png")


'/home/esteban/Documents/ITBA/Collision-avoidance/images/PC_relation.png'

In [30]:
# One-column frame holding the (non-truncated) collision-probability values.
df2 = cdm["COLLISSION_PROBABILITY"].to_frame()

In [31]:
df2.head()

Unnamed: 0,COLLISSION_PROBABILITY
0,-10.204955
1,-10.355758
2,-10.345631
3,-10.337809
4,-10.39126


In [32]:
# import plotly.express as px
# fig = px.box(df2, y="COLLISSION_PROBABILITY")
# fig.show()


In [33]:
# aux_describe=np.round(df2.describe(),2)
# aux_describe.to_csv("table_description.csv")


In [34]:
# Summary statistics for three columns of interest.
# `headers` gives the column names used in the summary; note the first one is
# spelled COLLISION_PROBABILITY, unlike the source column COLLISSION_PROBABILITY.
source_columns = ["COLLISSION_PROBABILITY", "MISS_DISTANCE", "__time_to_tca"]
headers = ["COLLISION_PROBABILITY","MISS_DISTANCE", "__time_to_tca"]
newdf = [cdm[column_name] for column_name in source_columns]

selected = pd.concat(newdf, axis=1, keys=headers)
df3 = selected.describe()

In [35]:
df3

Unnamed: 0,COLLISION_PROBABILITY,MISS_DISTANCE,__time_to_tca
count,153393.0,153393.0,153393.0
mean,-19.284374,16209.889702,3.356155
std,10.017741,14047.739057,2.012437
min,-30.0,9.0,-0.149808
25%,-30.0,4589.0,1.594216
50%,-17.708187,12138.0,3.306945
75%,-9.111652,24768.0,5.089688
max,-1.442854,67373.0,6.993832


In [37]:
# Round the summary statistics to 2 decimals and write them out for the report.
aux_describe = df3.round(2)
aux_describe.to_csv("./report/table_description_3_variables.csv")

In [100]:
cdm.shape

(153393, 77)