In [1]:
############################################################################
# SCRIPT TO CREATE THE DATASET USED AS INPUT FOR THE LIGHTGBM MODEL
# IT SAMPLES AN EQUALLY DISTRIBUTED DATAFRAME CONSIDERING ONLY RISKY EVENTS
# RISKY EVENTS ARE THOSE WHOSE COLLISSION_PROBABILITY IS GREATER THAN 10 E-6
# NOTE(review): NO RISK FILTERING OR RESAMPLING IS VISIBLE IN THIS CELL - CONFIRM WHERE IT HAPPENS
############################################################################
import pandas as pd
import datetime as dt
import numpy as np
import os

from preparing_data import *

# Start-up banner followed by the "loading" progress messages.
for banner_line in (
    "------------------------------------------------------",
    "This script creates the dataframe needed as input for the computation of the Machine Learning Model",
    "----------------------------------------------------- \n",
    "Loading full Kelvin's Challenge dataframe...............",
    "............................................",
):
    print(banner_line)

# Raw training table, read from a path relative to the working directory.
df = pd.read_csv("./data/train_data.csv")

# Re-shape the Kelvin dataset into CDM format to simulate the actual input.
cdm = convertKelvinDatasetToCDMFormat(df)

# The time conversions below need OBJECT2_TIME_LASTOB_START, so rows where it
# is null are removed before anything else.
cdm.dropna(subset=["OBJECT2_TIME_LASTOB_START"], inplace=True)

# Run the preparing_data converters one after another, each taking and
# returning the frame:
#   1. time strings -> timedate
#   2. timedate     -> range in days
#   3. collision-probability scale conversion
# NOTE(review): what each helper does is taken from its name and the original
# comments only; the original comment for step 3 ("logarithmic to natural
# scale") and the helper name ("...10logaritmicscale") disagree on the
# direction - confirm in preparing_data.
for convert_step in (
    convertTimestringToTimedate,
    convertTimedateToDaysRange,
    convertPCto10logaritmicscale,
):
    cdm = convert_step(cdm)

# Finally remove every row that still holds a null in any column.
cdm.dropna(inplace=True)

# DROP NON NUMERIC COLUMNS
# Only the labels of the non-numeric columns are needed, so take `.columns`
# instead of handing a whole sub-DataFrame to drop() as the label list.
non_numeric_cols = cdm.select_dtypes(exclude='number').columns
cdm = cdm.drop(columns=non_numeric_cols)

print("Adding correlation matrix elements to the dataframe \n")

# CALCULATE AND ADD CORRELATION COLUMNS TO IMPROVE MACHINE LEARNING MODEL
cdm = addCorrelationColumns(cdm)

# DELETE COVARIANCE MATRIX NON DIAGONAL ELEMENTS
print("Deleting covariance matrix elements from the dataframe \n")

cdm = deleteCovarianceNonDiagonalElements(cdm)

# Report rows x columns before any engineered feature is added.
# (A bare `cdm.head()` used to follow here; in the middle of a cell its result
# is never displayed, so it has been removed.)
print("Dataframe size without feature engineering {} x {}".format(*cdm.shape))


# Observation-window columns that are not needed in the model.
observation_window_cols = [
    'OBJECT1_TIME_LASTOB_START',
    'OBJECT1_TIME_LASTOB_END',
    'OBJECT2_TIME_LASTOB_START',
    'OBJECT2_TIME_LASTOB_END',
]
cdm.drop(columns=observation_window_cols, inplace=True)


# REORDER COLUMNS: BRING __time_to_tca TO THE FRONT
cdm = cdm[['__time_to_tca'] + [col for col in cdm.columns if col != '__time_to_tca']]

# SORT BY event_id (ASCENDING) AND, INSIDE EACH EVENT, BY __time_to_tca DESCENDING
cdm = cdm.sort_values(by=['event_id', '__time_to_tca'], ascending=[True, False])

# COLUMN TO PREDICT: the COLLISSION_PROBABILITY of the following row, i.e. the
# next CDM in the sorted order.
# NOTE(review): this is the *next* CDM's value, not the event's final CDM -
# confirm this is the intended label.
cdm["TARGET_PC"] = cdm["COLLISSION_PROBABILITY"].shift(-1)

# Keep only rows whose following row belongs to the same event. This removes
# the last CDM of every event (its shifted target would come from another
# event, or be NaN on the very last row).
has_next_cdm = cdm["event_id"] == cdm["event_id"].shift(-1)
# .copy() gives an independent frame, so the column assignments that follow do
# not operate on a slice of the unfiltered frame.
cdm = cdm[has_next_cdm].copy()

# FEATURE ENGINEERING: progress banner announcing what is computed next.
for banner_line in (
    "-----------------------------------------------------------",
    "------------- FEATURE ENGINEERING -------------------------",
    "-----------------------------------------------------------",
    "FROM MAIN VARIABLES MISS_DISTANCE and COLLISSION_PROBABILITY...",
    "Computing...",
    "trends...",
    "gradients...",
    "............................................................",
):
    print(banner_line)
################## COLLISSION_PROBABILITY / MISS_DISTANCE ##################
# For each main variable x and each lag k in (1, 3), using rows of the SAME
# event_id only:
#   <prefix>_trend_k    = x_i - x_(i-k)
#   <prefix>_gradient_k = (x_i - x_(i-k)) / |t_i - t_(i-k)|,  t = __time_to_tca
# Differences are taken per event, so a row never mixes values from a
# neighbouring event; rows with fewer than k earlier rows in their event get NaN.
# The moving-average (window 3) features remain disabled.
# NOTE(review): two rows of one event with the same __time_to_tca give a zero
# time delta and therefore inf/NaN gradients - confirm this cannot happen.
feature_lags = (1, 3)
main_variables = (("PC", "COLLISSION_PROBABILITY"), ("MD", "MISS_DISTANCE"))

# Compute every difference before adding any column to the frame.
cdm_by_event = cdm.groupby("event_id", sort=False)
elapsed_by_lag = {
    lag: cdm_by_event["__time_to_tca"].diff(lag).abs() for lag in feature_lags
}
change_by_key = {
    (prefix, lag): cdm_by_event[source_col].diff(lag)
    for prefix, source_col in main_variables
    for lag in feature_lags
}

# Column order is kept: trends first, then gradients, PC before MD.
for prefix, _ in main_variables:
    for lag in feature_lags:
        cdm["{}_trend_{}".format(prefix, lag)] = change_by_key[(prefix, lag)]
    for lag in feature_lags:
        cdm["{}_gradient_{}".format(prefix, lag)] = (
            change_by_key[(prefix, lag)] / elapsed_by_lag[lag]
        )



# Keep only rows whose third-previous row belongs to the same event_id, so the
# lag-1 and lag-3 features of every remaining row come from a single event
# (the frame is sorted by event_id at this point).
has_three_prior_cdms = cdm["event_id"] == cdm["event_id"].shift(3)

# drop=True discards the old index directly instead of materialising it as an
# "index" column that has to be removed again (which fails when the index is
# named or an "index" column already exists).
cdm = cdm[has_three_prior_cdms].reset_index(drop=True)

# Final layout: every column except event_id, with TARGET_PC moved to the end.
feature_cols = [col for col in cdm.columns if col not in ("TARGET_PC", "event_id")]
cdm = cdm[feature_cols + ["TARGET_PC"]]

# CREATE PICKLE FILE TO LOAD WHEN NEEDED
print("Creating pickle file... \n")
# The file name carries the current local timestamp, so each run writes a new file.
filename="./dataframe/_PRUEBA_df_{}.pkl".format(dt.datetime.now().strftime("%Y%m%d_%H%M%S"))
# to_pickle() does not create missing folders; make sure the target one exists.
os.makedirs(os.path.dirname(filename), exist_ok=True)
print("Saving dataframe for future usage filename = {}".format(filename))
cdm.to_pickle(filename)

------------------------------------------------------
This script creates the dataframe needed as input for the computation of the Machine Learning Model
----------------------------------------------------- 

Loading full Kelvin's Challenge dataframe...............
............................................
Adding correlation matrix elements to the dataframe 

Deleting covariance matrix elements from the dataframe 

Dataframe size without feature engineering 153393 x 81
-----------------------------------------------------------
------------- FEATURE ENGINEERING -------------------------
-----------------------------------------------------------
FROM MAIN VARIABLES MISS_DISTANCE and COLLISSION_PROBABILITY...
Computing...
trends...
gradients...
............................................................
Creating pickle file... 

Saving dataframe for future usage filename = ./dataframe/_PRUEBA_df_20211225_143242.pkl


In [2]:
# Sanity check of the final frame: (rows, columns), the column names, and the
# first rows as the cell's displayed value.
print(cdm.shape, list(cdm.columns), sep="\n")
cdm.head()

(110337, 85)
['__time_to_tca', 'MISS_DISTANCE', 'RELATIVE_SPEED', 'RELATIVE_POSITION_R', 'RELATIVE_POSITION_T', 'RELATIVE_POSITION_N', 'RELATIVE_VELOCITY_R', 'RELATIVE_VELOCITY_T', 'RELATIVE_VELOCITY_N', 'COLLISSION_PROBABILITY', 'OBJECT1_CR_R', 'OBJECT1_CT_T', 'OBJECT1_CN_N', 'OBJECT1_CRDOT_RDOT', 'OBJECT1_CTDOT_TDOT', 'OBJECT1_CNDOT_NDOT', 'OBJECT1_RECOMMENDED_OD_SPAN', 'OBJECT1_ACTUAL_OD_SPAN', 'OBJECT1_OBS_AVAILABLE', 'OBJECT1_OBS_USED', 'OBJECT1_RESIDUALS_ACCEPTED', 'OBJECT1_WEIGHTED_RMS', 'OBJECT1_SEDR', 'OBJECT1_CD_AREA_OVER_MASS', 'OBJECT1_CR_AREA_OVER_MASS', 'OBJECT1_APOGEE_ALTITUDE', 'OBJECT1_PERIGEE_ALTITUDE', 'OBJECT1_INCLINATION', 'OBJECT2_CR_R', 'OBJECT2_CT_T', 'OBJECT2_CN_N', 'OBJECT2_CRDOT_RDOT', 'OBJECT2_CTDOT_TDOT', 'OBJECT2_CNDOT_NDOT', 'OBJECT2_RECOMMENDED_OD_SPAN', 'OBJECT2_ACTUAL_OD_SPAN', 'OBJECT2_OBS_AVAILABLE', 'OBJECT2_OBS_USED', 'OBJECT2_RESIDUALS_ACCEPTED', 'OBJECT2_WEIGHTED_RMS', 'OBJECT2_SEDR', 'OBJECT2_CD_AREA_OVER_MASS', 'OBJECT2_CR_AREA_OVER_MASS', 'OBJ

Unnamed: 0,__time_to_tca,MISS_DISTANCE,RELATIVE_SPEED,RELATIVE_POSITION_R,RELATIVE_POSITION_T,RELATIVE_POSITION_N,RELATIVE_VELOCITY_R,RELATIVE_VELOCITY_T,RELATIVE_VELOCITY_N,COLLISSION_PROBABILITY,...,OBJECT2_CORR_CNDOT_TDOT,PC_trend_1,PC_trend_3,PC_gradient_1,PC_gradient_3,MD_trend_1,MD_trend_3,MD_gradient_1,MD_gradient_3,TARGET_PC
0,0.579669,14579.0,13792.0,472.7,5838.9,-13350.7,-7.0,-12637.0,-5525.9,-10.337809,...,-0.675656,0.007822,-0.132854,0.020997,-0.134587,104.0,-344.0,279.177108,-348.485402,-10.39126
1,6.042352,18842.0,14347.0,-700.0,-5192.1,18099.4,14.4,-13791.4,-3957.2,-30.0,...,-0.699265,0.0,-19.183839,0.0,-20.384022,57.0,-4060.0,250.409557,-4314.002473,-30.0
2,5.711716,19015.0,14347.0,-709.9,-5242.1,18264.8,14.5,-13791.4,-3957.2,-30.0,...,-0.69978,0.0,-19.149527,0.0,-19.542437,173.0,-3951.0,523.23376,-4032.066516,-30.0
3,5.377642,19137.0,14347.0,-710.3,-5273.6,18382.8,14.5,-13791.4,-3957.2,-30.0,...,-0.699793,0.0,0.0,0.0,0.0,122.0,352.0,365.188863,394.469816,-30.0
4,5.028915,18918.0,14347.0,-714.8,-5213.9,18172.0,14.5,-13791.4,-3957.2,-30.0,...,-0.699527,0.0,0.0,0.0,0.0,-219.0,76.0,-627.997277,74.992291,-30.0
