In [1]:
############################################################################
# SCRIPT TO CREATE DATASET INPUT FOR THE LIGHT GBM MODEL
# IT SAMPLES AN EQUALLY DISTRIBUTED DATAFRAME CONSIDERING ONLY RISKY EVENTS
# RISKY EVENTS ARE THOSE WHOSE COLLISSION_PROBABILITY IS GREATER THAN 10 E-6
#############################################################################
import pandas as pd
import datetime as dt
import numpy as np
import os

from preparing_data import *

# BANNER + LOAD OF THE RAW TRAINING DATA, THEN CONVERSION TO THE CDM COLUMN LAYOUT
print("------------------------------------------------------")
print("This script creates the dataframe needed as input for the computation of the Machine Learning Model")
print("----------------------------------------------------- \n")
print("Loading full Kelvin's Challenge dataframe...............")
print("............................................")
df = pd.read_csv("./data/train_data.csv")

# CONVERT KELVIN DATASET TO CDM FORMAT TO SIMULATE ACTUAL INPUT
cdm = convertKelvinDatasetToCDMFormat(df)

# UNTOUCHED REFERENCE OF THE CONVERTED FRAME, USED LATER ONLY TO COMPARE COLUMN NAMES.
# IT IS TAKEN AS A COPY SO THE (EXPENSIVE) CONVERSION RUNS ONCE INSTEAD OF TWICE, WHILE
# cdm CAN STILL BE MODIFIED WITHOUT AFFECTING IT.
# NOTE(review): ASSUMES convertKelvinDatasetToCDMFormat IS DETERMINISTIC - CONFIRM.
standard_df = cdm.copy()



------------------------------------------------------
This script creates the dataframe needed as input for the computation of the Machine Learning Model
----------------------------------------------------- 

Loading full Kelvin's Challenge dataframe...............
............................................


In [2]:
# NUMBER OF COLUMNS IN THE RAW TRAINING FRAME
df.shape[1]

103

In [3]:
# COLUMN NAMES OF THE RAW TRAINING FRAME
print(df.columns.tolist())

['event_id', 'time_to_tca', 'mission_id', 'risk', 'max_risk_estimate', 'max_risk_scaling', 'miss_distance', 'relative_speed', 'relative_position_r', 'relative_position_t', 'relative_position_n', 'relative_velocity_r', 'relative_velocity_t', 'relative_velocity_n', 't_time_lastob_start', 't_time_lastob_end', 't_recommended_od_span', 't_actual_od_span', 't_obs_available', 't_obs_used', 't_residuals_accepted', 't_weighted_rms', 't_rcs_estimate', 't_cd_area_over_mass', 't_cr_area_over_mass', 't_sedr', 't_j2k_sma', 't_j2k_ecc', 't_j2k_inc', 't_ct_r', 't_cn_r', 't_cn_t', 't_crdot_r', 't_crdot_t', 't_crdot_n', 't_ctdot_r', 't_ctdot_t', 't_ctdot_n', 't_ctdot_rdot', 't_cndot_r', 't_cndot_t', 't_cndot_n', 't_cndot_rdot', 't_cndot_tdot', 'c_object_type', 'c_time_lastob_start', 'c_time_lastob_end', 'c_recommended_od_span', 'c_actual_od_span', 'c_obs_available', 'c_obs_used', 'c_residuals_accepted', 'c_weighted_rms', 'c_rcs_estimate', 'c_cd_area_over_mass', 'c_cr_area_over_mass', 'c_sedr', 'c_j2k_sm

In [4]:
# (ROWS, COLUMNS) OF THE RAW FRAME READ FROM ./data/train_data.csv
df.shape

(162634, 103)

In [5]:
# COLUMN NAMES AFTER THE CONVERSION TO CDM FORMAT
print(cdm.columns.tolist())

['event_id', 'TCA', 'CREATION_DATE', 'MISS_DISTANCE', 'RELATIVE_SPEED', 'RELATIVE_POSITION_R', 'RELATIVE_POSITION_T', 'RELATIVE_POSITION_N', 'RELATIVE_VELOCITY_R', 'RELATIVE_VELOCITY_T', 'RELATIVE_VELOCITY_N', 'COLLISSION_PROBABILITY', 'OBJECT1_CR_R', 'OBJECT1_CT_R', 'OBJECT1_CT_T', 'OBJECT1_CN_R', 'OBJECT1_CN_T', 'OBJECT1_CN_N', 'OBJECT1_CRDOT_R', 'OBJECT1_CRDOT_T', 'OBJECT1_CRDOT_N', 'OBJECT1_CRDOT_RDOT', 'OBJECT1_CTDOT_R', 'OBJECT1_CTDOT_T', 'OBJECT1_CTDOT_N', 'OBJECT1_CTDOT_RDOT', 'OBJECT1_CTDOT_TDOT', 'OBJECT1_CNDOT_R', 'OBJECT1_CNDOT_T', 'OBJECT1_CNDOT_N', 'OBJECT1_CNDOT_RDOT', 'OBJECT1_CNDOT_TDOT', 'OBJECT1_CNDOT_NDOT', 'OBJECT1_RECOMMENDED_OD_SPAN', 'OBJECT1_ACTUAL_OD_SPAN', 'OBJECT1_OBS_AVAILABLE', 'OBJECT1_OBS_USED', 'OBJECT1_RESIDUALS_ACCEPTED', 'OBJECT1_WEIGHTED_RMS', 'OBJECT1_SEDR', 'OBJECT1_TIME_LASTOB_START', 'OBJECT1_TIME_LASTOB_END', 'OBJECT1_CD_AREA_OVER_MASS', 'OBJECT1_CR_AREA_OVER_MASS', 'OBJECT1_APOGEE_ALTITUDE', 'OBJECT1_PERIGEE_ALTITUDE', 'OBJECT1_INCLINATION', '

In [6]:
# (ROWS, COLUMNS) OF THE FRAME RETURNED BY convertKelvinDatasetToCDMFormat
cdm.shape

(162634, 83)

In [7]:
# CLEANING STAGE: REMOVE NULLS, CONVERT TIME/RISK COLUMNS, KEEP ONLY NUMERIC COLUMNS,
# REPLACE COVARIANCE OFF-DIAGONAL TERMS BY CORRELATION COLUMNS AND PUT __time_to_tca FIRST.
# EVERY STEP REBINDS cdm TO A NEW FRAME INSTEAD OF MUTATING IT IN PLACE.

# DELETE NULLS FROM ONE COLUMN NEEDED TO RUN FOLLOWING TIME CONVERSIONS
cdm = cdm.dropna(subset=["OBJECT2_TIME_LASTOB_START"])

# CONVERT TIME STRING TO TIMEDATE
cdm = convertTimestringToTimedate(cdm)
# CONVERT TIMEDATE TO RANGE IN DAYS
cdm = convertTimedateToDaysRange(cdm)
# CONVERT RISK IN LOGARITHMIC SCALE TO NATURAL SCALE THE SAME THAT COLLISSION PROBABILITY USES IN THE CDMs
cdm = convertPCto10logaritmicscale(cdm)

# DELETE ROWS WITH NULLS IN ANY OF THE REMAINING COLUMNS
cdm = cdm.dropna()

# DROP NON NUMERIC COLUMNS (PASS THE COLUMN LABELS EXPLICITLY, NOT THE SUB-DATAFRAME)
non_numeric_cols = cdm.select_dtypes(exclude='number').columns
cdm = cdm.drop(columns=non_numeric_cols)

print("Adding correlation matrix elements to the dataframe \n")

# CALCULATE AND ADD CORRELATION COLUMNS TO IMPROVE MACHINE LEARNING MODEL
cdm = addCorrelationColumns(cdm)

# DELETE COVARIANCE MATRIX NON DIAGONAL ELEMENTS
print("Deleting covariance matrix elements from the dataframe \n")

cdm = deleteCovarianceNonDiagonalElements(cdm)
print("Dataframe size without feature engineering {} x {}".format(cdm.shape[0], cdm.shape[1]))

# DELETE OBSERVATION COLUMNS NOT NEEDED IN THE MODEL
cdm = cdm.drop(columns=[
    'OBJECT1_TIME_LASTOB_START',
    'OBJECT1_TIME_LASTOB_END',
    'OBJECT2_TIME_LASTOB_START',
    'OBJECT2_TIME_LASTOB_END',
])

# REORDER COLUMNS: BRING __time_to_tca TO THE FRONT
cdm = cdm[['__time_to_tca'] + [col for col in cdm.columns if col != '__time_to_tca']]

Adding correlation matrix elements to the dataframe 

Deleting covariance matrix elements from the dataframe 

Dataframe size without feature engineering 153393 x 81


In [8]:
# SNAPSHOT OF THE CLEANED FRAME BEFORE FEATURE ENGINEERING ("antes de FE" = "before FE")
antes_de_FE = cdm.copy(deep=True)

In [9]:
# BUILD THE MODEL INPUT TABLE: TREND/GRADIENT FEATURES PLUS THE TARGET_PC COLUMN.
# THE CELL STARTS FROM THE antes_de_FE SNAPSHOT (NOT FROM THE CURRENT cdm), SO IT CAN BE
# RE-RUN: cdm IS REBUILT FROM SCRATCH INSTEAD OF FAILING ON THE ALREADY DROPPED event_id.

# SORT DATAFRAME BY event_id AND THEN BY __time_to_tca DESCENDING
cdm = antes_de_FE.sort_values(by=['event_id', '__time_to_tca'], ascending=[True, False])

# COLUMN TO PREDICT: COLLISSION_PROBABILITY OF THE FOLLOWING ROW
cdm["TARGET_PC"] = cdm["COLLISSION_PROBABILITY"].shift(-1)

# KEEP ONLY ROWS WHOSE FOLLOWING ROW HAS THE SAME event_id. THIS REMOVES THE LAST CDM OF
# EVERY EVENT (THE ONE WE WANT TO PREDICT), WHOSE TARGET_PC WOULD COME FROM ANOTHER EVENT
has_next_cdm = cdm["event_id"] == cdm["event_id"].shift(-1)
cdm = cdm[has_next_cdm].copy()

# FEATURE ENGINEERING
print("-----------------------------------------------------------")
print("------------- FEATURE ENGINEERING -------------------------")
print("-----------------------------------------------------------")
print("FROM MAIN VARIABLES MISS_DISTANCE and COLLISSION_PROBABILITY...")
print("Computing...")
print("trends...")
print("gradients...")
print("............................................................")

# (OUTPUT PREFIX, SOURCE COLUMN) PAIRS AND THE ROW LAGS USED FOR THE FEATURES
feature_sources = (("PC", "COLLISSION_PROBABILITY"), ("MD", "MISS_DISTANCE"))
lags = (1, 3)

# ABSOLUTE __time_to_tca DIFFERENCE BETWEEN ROW i AND ROW i-lag, SHARED BY ALL GRADIENTS
# NOTE(review): ROWS WITH IDENTICAL __time_to_tca GIVE A ZERO DELTA AND THEREFORE
# INF/NaN GRADIENTS - CONFIRM WHETHER THIS HAPPENS IN THE DATA
elapsed = {lag: abs(cdm["__time_to_tca"] - cdm["__time_to_tca"].shift(lag)) for lag in lags}

for prefix, source_col in feature_sources:
    # X_i - X_(i-lag)  (TREND)
    deltas = {lag: cdm[source_col] - cdm[source_col].shift(lag) for lag in lags}
    for lag in lags:
        cdm["{}_trend_{}".format(prefix, lag)] = deltas[lag]
    # ( X_i - X_(i-lag) ) / time_delta  (GRADIENT)
    for lag in lags:
        cdm["{}_gradient_{}".format(prefix, lag)] = deltas[lag] / elapsed[lag]

# KEEP ONLY ROWS WHOSE 3-ROWS-BACK NEIGHBOUR HAS THE SAME event_id, SO NO FEATURE MIXES
# CDMs OF DIFFERENT EVENTS (THE FRAME IS SORTED BY event_id), THEN RENUMBER ROWS FROM 0
same_event_3_back = cdm["event_id"] == cdm["event_id"].shift(3)
cdm = cdm[same_event_3_back].reset_index(drop=True)

# FINAL LAYOUT: event_id REMOVED, TARGET_PC MOVED TO THE END
feature_cols = [col for col in cdm.columns if col not in ("TARGET_PC", "event_id")]
cdm = cdm[feature_cols + ["TARGET_PC"]]

-----------------------------------------------------------
------------- FEATURE ENGINEERING -------------------------
-----------------------------------------------------------
FROM MAIN VARIABLES MISS_DISTANCE and COLLISSION_PROBABILITY...
Computing...
trends...
gradients...
............................................................


In [10]:
# (ROWS, COLUMNS) OF cdm AFTER THE FEATURE ENGINEERING CELL
cdm.shape

(110337, 85)

In [11]:
# LOAD THE PREVIOUSLY SAVED FILTERED DATAFRAME AND RENUMBER ITS ROWS FROM 0,
# DISCARDING THE OLD INDEX
data = pd.read_pickle("dataframe/_PRUEBA_df_filtered_20220115_112153.pkl").reset_index(drop=True)
print(data.shape)
data.head()

(5394, 85)


Unnamed: 0,__time_to_tca,MISS_DISTANCE,RELATIVE_SPEED,RELATIVE_POSITION_R,RELATIVE_POSITION_T,RELATIVE_POSITION_N,RELATIVE_VELOCITY_R,RELATIVE_VELOCITY_T,RELATIVE_VELOCITY_N,COLLISSION_PROBABILITY,...,OBJECT2_CORR_CNDOT_TDOT,PC_trend_1,PC_trend_3,PC_gradient_1,PC_gradient_3,MD_trend_1,MD_trend_3,MD_gradient_1,MD_gradient_3,TARGET_PC
0,5.775947,568.0,2001.0,-20.9,-562.8,-75.9,0.8,-268.6,1983.8,-5.415895,...,-0.054742,0.22617,0.773872,0.65261,0.763062,-123.0,144.0,-354.914301,141.988456,-5.345246
1,5.420762,611.0,2001.0,-19.9,-605.4,-81.8,0.8,-268.6,1983.8,-5.345246,...,0.029148,0.070649,0.237282,0.198907,0.234313,43.0,50.0,121.063636,49.374243,-4.792366
2,5.119489,576.0,2001.0,-9.6,-571.2,-77.4,0.8,-268.6,1983.8,-4.792366,...,-0.03092,0.55288,0.8497,1.835148,0.847141,-35.0,-115.0,-116.173757,-114.653689,-4.20845
3,4.750068,328.0,2001.0,2.7,-325.3,-43.7,0.5,-268.6,1983.8,-4.20845,...,-0.104064,0.583916,1.207445,1.580622,1.176985,-248.0,-240.0,-671.319754,-233.945596,-4.049879
4,4.087221,56.0,2001.0,9.2,-55.1,-7.3,0.2,-268.6,1983.8,-4.049879,...,-0.117181,0.158571,1.295367,0.239227,0.971374,-272.0,-555.0,-410.351242,-416.185112,-5.289798


In [12]:
# COLUMN NAMES OF THE SAVED DATAFRAME, CAPTURED ONCE AND THEN REPORTED
lista_final = data.columns.tolist()
print(len(lista_final))
print(lista_final)

85
['__time_to_tca', 'MISS_DISTANCE', 'RELATIVE_SPEED', 'RELATIVE_POSITION_R', 'RELATIVE_POSITION_T', 'RELATIVE_POSITION_N', 'RELATIVE_VELOCITY_R', 'RELATIVE_VELOCITY_T', 'RELATIVE_VELOCITY_N', 'COLLISSION_PROBABILITY', 'OBJECT1_CR_R', 'OBJECT1_CT_T', 'OBJECT1_CN_N', 'OBJECT1_CRDOT_RDOT', 'OBJECT1_CTDOT_TDOT', 'OBJECT1_CNDOT_NDOT', 'OBJECT1_RECOMMENDED_OD_SPAN', 'OBJECT1_ACTUAL_OD_SPAN', 'OBJECT1_OBS_AVAILABLE', 'OBJECT1_OBS_USED', 'OBJECT1_RESIDUALS_ACCEPTED', 'OBJECT1_WEIGHTED_RMS', 'OBJECT1_SEDR', 'OBJECT1_CD_AREA_OVER_MASS', 'OBJECT1_CR_AREA_OVER_MASS', 'OBJECT1_APOGEE_ALTITUDE', 'OBJECT1_PERIGEE_ALTITUDE', 'OBJECT1_INCLINATION', 'OBJECT2_CR_R', 'OBJECT2_CT_T', 'OBJECT2_CN_N', 'OBJECT2_CRDOT_RDOT', 'OBJECT2_CTDOT_TDOT', 'OBJECT2_CNDOT_NDOT', 'OBJECT2_RECOMMENDED_OD_SPAN', 'OBJECT2_ACTUAL_OD_SPAN', 'OBJECT2_OBS_AVAILABLE', 'OBJECT2_OBS_USED', 'OBJECT2_RESIDUALS_ACCEPTED', 'OBJECT2_WEIGHTED_RMS', 'OBJECT2_SEDR', 'OBJECT2_CD_AREA_OVER_MASS', 'OBJECT2_CR_AREA_OVER_MASS', 'OBJECT2_APOGE

In [13]:
# COLUMN NAMES OF THE PRE-FEATURE-ENGINEERING SNAPSHOT, CAPTURED ONCE AND THEN REPORTED
lista_antes = antes_de_FE.columns.tolist()
print(len(lista_antes))
print(lista_antes)

77
['__time_to_tca', 'event_id', 'MISS_DISTANCE', 'RELATIVE_SPEED', 'RELATIVE_POSITION_R', 'RELATIVE_POSITION_T', 'RELATIVE_POSITION_N', 'RELATIVE_VELOCITY_R', 'RELATIVE_VELOCITY_T', 'RELATIVE_VELOCITY_N', 'COLLISSION_PROBABILITY', 'OBJECT1_CR_R', 'OBJECT1_CT_T', 'OBJECT1_CN_N', 'OBJECT1_CRDOT_RDOT', 'OBJECT1_CTDOT_TDOT', 'OBJECT1_CNDOT_NDOT', 'OBJECT1_RECOMMENDED_OD_SPAN', 'OBJECT1_ACTUAL_OD_SPAN', 'OBJECT1_OBS_AVAILABLE', 'OBJECT1_OBS_USED', 'OBJECT1_RESIDUALS_ACCEPTED', 'OBJECT1_WEIGHTED_RMS', 'OBJECT1_SEDR', 'OBJECT1_CD_AREA_OVER_MASS', 'OBJECT1_CR_AREA_OVER_MASS', 'OBJECT1_APOGEE_ALTITUDE', 'OBJECT1_PERIGEE_ALTITUDE', 'OBJECT1_INCLINATION', 'OBJECT2_CR_R', 'OBJECT2_CT_T', 'OBJECT2_CN_N', 'OBJECT2_CRDOT_RDOT', 'OBJECT2_CTDOT_TDOT', 'OBJECT2_CNDOT_NDOT', 'OBJECT2_RECOMMENDED_OD_SPAN', 'OBJECT2_ACTUAL_OD_SPAN', 'OBJECT2_OBS_AVAILABLE', 'OBJECT2_OBS_USED', 'OBJECT2_RESIDUALS_ACCEPTED', 'OBJECT2_WEIGHTED_RMS', 'OBJECT2_SEDR', 'OBJECT2_CD_AREA_OVER_MASS', 'OBJECT2_CR_AREA_OVER_MASS', 'O

In [14]:
# COLUMNS PRESENT IN ONLY ONE OF THE TWO LISTS (SNAPSHOT vs SAVED DATAFRAME), ORDER KEPT
list1 = lista_antes
list2 = lista_final
res = [name for name in list1 + list2 if not (name in list1 and name in list2)]

print(len(res))
print(res)

10
['event_id', 'PC_trend_1', 'PC_trend_3', 'PC_gradient_1', 'PC_gradient_3', 'MD_trend_1', 'MD_trend_3', 'MD_gradient_1', 'MD_gradient_3', 'TARGET_PC']


In [15]:
# COLUMNS PRESENT IN ONLY ONE OF THE TWO LISTS (SNAPSHOT vs CONVERTED CDM FRAME), ORDER KEPT
list0 = standard_df.columns.tolist()
res = [name for name in list1 + list0 if not (name in list1 and name in list0)]

print(len(res))
print(res)

68
['__time_to_tca', 'OBJECT1_CORR_CT_R', 'OBJECT1_CORR_CN_R', 'OBJECT1_CORR_CN_T', 'OBJECT1_CORR_CRDOT_R', 'OBJECT1_CORR_CRDOT_T', 'OBJECT1_CORR_CRDOT_N', 'OBJECT1_CORR_CTDOT_R', 'OBJECT1_CORR_CTDOT_T', 'OBJECT1_CORR_CTDOT_N', 'OBJECT1_CORR_CTDOT_RDOT', 'OBJECT1_CORR_CNDOT_R', 'OBJECT1_CORR_CNDOT_T', 'OBJECT1_CORR_CNDOT_N', 'OBJECT1_CORR_CNDOT_RDOT', 'OBJECT1_CORR_CNDOT_TDOT', 'OBJECT2_CORR_CT_R', 'OBJECT2_CORR_CN_R', 'OBJECT2_CORR_CN_T', 'OBJECT2_CORR_CRDOT_R', 'OBJECT2_CORR_CRDOT_T', 'OBJECT2_CORR_CRDOT_N', 'OBJECT2_CORR_CTDOT_R', 'OBJECT2_CORR_CTDOT_T', 'OBJECT2_CORR_CTDOT_N', 'OBJECT2_CORR_CTDOT_RDOT', 'OBJECT2_CORR_CNDOT_R', 'OBJECT2_CORR_CNDOT_T', 'OBJECT2_CORR_CNDOT_N', 'OBJECT2_CORR_CNDOT_RDOT', 'OBJECT2_CORR_CNDOT_TDOT', 'TCA', 'CREATION_DATE', 'OBJECT1_CT_R', 'OBJECT1_CN_R', 'OBJECT1_CN_T', 'OBJECT1_CRDOT_R', 'OBJECT1_CRDOT_T', 'OBJECT1_CRDOT_N', 'OBJECT1_CTDOT_R', 'OBJECT1_CTDOT_T', 'OBJECT1_CTDOT_N', 'OBJECT1_CTDOT_RDOT', 'OBJECT1_CNDOT_R', 'OBJECT1_CNDOT_T', 'OBJECT1_CNDO

In [16]:
# COLUMNS OF THE CONVERTED CDM FRAME (list0) THAT ARE MISSING FROM THE SNAPSHOT (list1)
res = list(filter(lambda name: name not in list1, list0))
print(len(res))
print(res)

37
['TCA', 'CREATION_DATE', 'OBJECT1_CT_R', 'OBJECT1_CN_R', 'OBJECT1_CN_T', 'OBJECT1_CRDOT_R', 'OBJECT1_CRDOT_T', 'OBJECT1_CRDOT_N', 'OBJECT1_CTDOT_R', 'OBJECT1_CTDOT_T', 'OBJECT1_CTDOT_N', 'OBJECT1_CTDOT_RDOT', 'OBJECT1_CNDOT_R', 'OBJECT1_CNDOT_T', 'OBJECT1_CNDOT_N', 'OBJECT1_CNDOT_RDOT', 'OBJECT1_CNDOT_TDOT', 'OBJECT1_TIME_LASTOB_START', 'OBJECT1_TIME_LASTOB_END', 'OBJECT2_CT_R', 'OBJECT2_CN_R', 'OBJECT2_CN_T', 'OBJECT2_CRDOT_R', 'OBJECT2_CRDOT_T', 'OBJECT2_CRDOT_N', 'OBJECT2_CTDOT_R', 'OBJECT2_CTDOT_T', 'OBJECT2_CTDOT_N', 'OBJECT2_CTDOT_RDOT', 'OBJECT2_CNDOT_R', 'OBJECT2_CNDOT_T', 'OBJECT2_CNDOT_N', 'OBJECT2_CNDOT_RDOT', 'OBJECT2_CNDOT_TDOT', 'OBJECT2_OBJECT_TYPE', 'OBJECT2_TIME_LASTOB_START', 'OBJECT2_TIME_LASTOB_END']


In [18]:
# HAND-WRITTEN SUBSET OF THE COLUMNS MISSING FROM THE SNAPSHOT: THE ONES THAT ARE NOT
# COVARIANCE NON-DIAGONAL ELEMENTS
diff01 = [
    'TCA',
    'CREATION_DATE',
    'OBJECT1_TIME_LASTOB_START',
    'OBJECT1_TIME_LASTOB_END',
    'OBJECT2_OBJECT_TYPE',
    'OBJECT2_TIME_LASTOB_START',
    'OBJECT2_TIME_LASTOB_END',
]

In [19]:
# NUMBER OF ENTRIES IN THE diff01 LIST
len(diff01)

7

In [17]:
# COLUMNS OF THE SNAPSHOT (list1) THAT ARE MISSING FROM THE CONVERTED CDM FRAME (list0)
res = list(filter(lambda name: name not in list0, list1))
print(len(res))
print(res)

31
['__time_to_tca', 'OBJECT1_CORR_CT_R', 'OBJECT1_CORR_CN_R', 'OBJECT1_CORR_CN_T', 'OBJECT1_CORR_CRDOT_R', 'OBJECT1_CORR_CRDOT_T', 'OBJECT1_CORR_CRDOT_N', 'OBJECT1_CORR_CTDOT_R', 'OBJECT1_CORR_CTDOT_T', 'OBJECT1_CORR_CTDOT_N', 'OBJECT1_CORR_CTDOT_RDOT', 'OBJECT1_CORR_CNDOT_R', 'OBJECT1_CORR_CNDOT_T', 'OBJECT1_CORR_CNDOT_N', 'OBJECT1_CORR_CNDOT_RDOT', 'OBJECT1_CORR_CNDOT_TDOT', 'OBJECT2_CORR_CT_R', 'OBJECT2_CORR_CN_R', 'OBJECT2_CORR_CN_T', 'OBJECT2_CORR_CRDOT_R', 'OBJECT2_CORR_CRDOT_T', 'OBJECT2_CORR_CRDOT_N', 'OBJECT2_CORR_CTDOT_R', 'OBJECT2_CORR_CTDOT_T', 'OBJECT2_CORR_CTDOT_N', 'OBJECT2_CORR_CTDOT_RDOT', 'OBJECT2_CORR_CNDOT_R', 'OBJECT2_CORR_CNDOT_T', 'OBJECT2_CORR_CNDOT_N', 'OBJECT2_CORR_CNDOT_RDOT', 'OBJECT2_CORR_CNDOT_TDOT']
