In [1]:
#############################################################################################################
# SCRIPT TO CREATE DATASET INPUT FOR THE LIGHT GBM MODEL
# IT SAMPLES AN EQUALLY DISTRIBUTED DATAFRAME CONDISERING ONLY RISKY EVENTS 
# RISKY EVENTS ARE THOSE THAT COLLISSION_PROBABILITY IS GREATER THAN 10 E-6
##############################################################################################################
import pandas as pd
import datetime as dt
import numpy as np
import os

from preparing_data import *



df=pd.read_csv("./data/train_data.csv")

# CONVERT KELVIN DATASET TO CDM FORMAT TO SIMULATE ACTUAL INPUT
cdm=convertKelvinDatasetToCDMFormat(df)

# DELETE NULLS FROM ONE COLUMN NEEDED TO RUN FOLLOWING TIME CONVERSIONS
cdm.dropna(subset = ["OBJECT2_TIME_LASTOB_START"], inplace=True)

# CONVERT TIME STRING TO TIMEDATE
cdm=convertTimestringToTimedate(cdm)
# CONVERT TIMEDATE TO RANGE IN DAYS
cdm=convertTimedateToDaysRange(cdm)
# CONVERT RISK IN LOGARITHMIC SCALE TO NATURAL SCALE THE SAME THAT COLLISSION PROBABILITY USES IN THE CDMs
cdm=convertPCto10logaritmicscale(cdm)

#DELETE NULS FROM ALL THER OTHER ROWS
cdm.dropna(inplace=True)

# DROP NON NUMERIC COLUMNS
numeric_cols=cdm.select_dtypes(exclude='number')
cdm.drop(numeric_cols, axis=1, inplace=True)

print("Adding correlation matrix elements to the dataframe \n")

# CALCULATE AND ADD CORRELATION COLUMNS TO IMPROVE MACHINE LEARNING MODEL
cdm=addCorrelationColumns(cdm)

#DELETE COVARIANCE MATRIX NON DIAGONAL ELEMENTS
print("Deleting covariance matrix elements from the dataframe \n")

cdm=deleteCovarianceNonDiagonalElements(cdm)
print("Dataframe size without feature engineering {} x {}".format(cdm.shape[0],cdm.shape[1]))
cdm.head()


#DELETING OBSERVATION COLUMNS NO NEEDED IN THE MODEL
cdm.drop([     'OBJECT1_TIME_LASTOB_START',
                'OBJECT1_TIME_LASTOB_END',
                'OBJECT2_TIME_LASTOB_START',
                'OBJECT2_TIME_LASTOB_END'
                ], inplace=True, axis=1)


# REORDERING COLUMNS BRING __time_to_tca TO FRONT
cdm=cdm[ ['__time_to_tca'] + [ col for col in cdm.columns if col != '__time_to_tca' ] ]


#SORT DATAFRAME BY event_id AND THEN BY __time_to_tca DESCENDING
cdm.sort_values(by=['event_id', '__time_to_tca'],ascending=[True, False],inplace=True)

Adding correlation matrix elements to the dataframe 

Deleting covariance matrix elements from the dataframe 

Dataframe size without feature engineering 153393 x 81


In [2]:
PC_data=cdm.COLLISSION_PROBABILITY

In [3]:
PC_data.shape

(153393,)

In [4]:
PC_data=PC_data.astype(int)

In [5]:
type(PC_data)

pandas.core.series.Series

In [6]:
PC_data=PC_data.to_frame(name="PC")

In [7]:
df = PC_data
df15 = PC_data[PC_data.PC >= -15]
df6 = PC_data[PC_data.PC >= -6]

In [8]:
# import plotly.express as px
# df = PC_data
# fig = px.histogram(df, x="PC",template="presentation",
#     labels={"PC":"PC"})
# fig.update_layout(yaxis_title="Cantidad")
# fig.show()

In [9]:
# from plotly.subplots import make_subplots
# import plotly.graph_objects as go

# fig = go.Figure(data=[go.Histogram(x=df.PC)])
# fig.show()

In [10]:
# from plotly.subplots import make_subplots
# import plotly.graph_objects as go
# df = PC_data
# df15 = PC_data[PC_data.PC >= -15]
# df6 = PC_data[PC_data.PC >= -6]

# fig = make_subplots(rows=3, cols=1)

# fig.append_trace(go.Histogram(x=df.PC), row=1, col=1)

# fig.append_trace(go.Histogram(x=df15.PC), row=2, col=1)

# fig.append_trace(go.Histogram(x=df6.PC), row=3, col=1)


# fig.update_layout(height=800, width=700)#, title_text="Stacked Subplots")

# fig.show()


In [11]:
# from plotly.subplots import make_subplots
# import plotly.graph_objects as go
# df = PC_data

# fig = make_subplots(rows=3, cols=1)

# fig.append_trace(go.Histogram(x=df.PC,line=dict(color="grey")), row=1, col=1)

# fig.append_trace(go.Histogram(x=df.PC,line=dict(color="grey")), row=2, col=1)

# fig.append_trace(go.Histogram(x=df.PC), row=3, col=1)


# fig.update_layout(height=600, width=600)#, title_text="Stacked Subplots")
#fig.show()

In [12]:
import numpy as np

from bokeh.layouts import gridplot
from bokeh.plotting import figure, show

In [13]:
def make_plot(title, hist, edges):
    p = figure(title=title, tools='', background_fill_color="#fafafa")
    p.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:],
           fill_color="navy", line_color="white", alpha=0.5)

    p.y_range.start = 0
    # p.legend.location = "center_right"
    # p.legend.background_fill_color = "#fefefe"
    p.xaxis.axis_label = 'Probabilidad de Colisión'
    p.yaxis.axis_label = 'Cantidad'
    p.grid.grid_line_color="white"
    return p


In [14]:
measured = df.to_numpy()
hist, edges = np.histogram(measured, density=False,bins=30)
p1 = make_plot("Rango PC ε [-30, 0)", hist, edges)

measured = df15.to_numpy()
hist, edges = np.histogram(measured, density=False,bins=15)
p2 = make_plot("Rango PC ε [-15, 0)", hist, edges)

measured = df6.to_numpy()
hist, edges = np.histogram(measured, density=False,bins=4)
p3 = make_plot("Rango PC ε [-6, 0)", hist, edges)

show(gridplot([p1,p2,p3], ncols=1, width=800, height=400, toolbar_location=None))

In [15]:
from bokeh.io import export_png

export_png(p1, filename="plot.png")

'/home/esteban/automatic-collision-avoidance/automatic-collision-detection/plot.png'

In [16]:
full_grid=gridplot([p1,p2,p3], ncols=1, width=800, height=300, toolbar_location=None)
export_png(full_grid, filename="/home/esteban/Documents/ITBA/Collision-avoidance/images/plot-grid.png")


'/home/esteban/Documents/ITBA/Collision-avoidance/images/plot-grid.png'

In [17]:
# # SELECTING DATA TO BUILD MODEL

# print("Building dataframe... \n")
# aux1=cdm[(cdm["COLLISSION_PROBABILITY"]>-4)& (cdm["__time_to_tca"]<1)]
# aux2=cdm[(cdm["COLLISSION_PROBABILITY"]<-4) & (cdm["COLLISSION_PROBABILITY"]>-5)& (cdm["__time_to_tca"]<1)]
# aux3=cdm[(cdm["COLLISSION_PROBABILITY"]<-5) & (cdm["COLLISSION_PROBABILITY"]>-6)& (cdm["__time_to_tca"]<1)]


# # APPEND SUBPART OF DATAFRAMES EVENTS WITH PROBABILITIES LOWER THAN 10-6 TO CREATE AN EQUALLY DISTRIBUITED PROBABILITY DATAFRAME

