In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
# Load the training set from disk; `size` (its row count) is reused by later cells.
filename = 'data/train.csv'
raw_df = pd.read_csv(filename)

size = len(raw_df)
print('Total size: {}'.format(size))

# Preview the first rows as the cell's rich output.
raw_df.head()

Total size: 250000


Unnamed: 0,Id,Prediction,DER_mass_MMC,DER_mass_transverse_met_lep,DER_mass_vis,DER_pt_h,DER_deltaeta_jet_jet,DER_mass_jet_jet,DER_prodeta_jet_jet,DER_deltar_tau_lep,...,PRI_met_phi,PRI_met_sumet,PRI_jet_num,PRI_jet_leading_pt,PRI_jet_leading_eta,PRI_jet_leading_phi,PRI_jet_subleading_pt,PRI_jet_subleading_eta,PRI_jet_subleading_phi,PRI_jet_all_pt
0,100000,s,138.47,51.655,97.827,27.98,0.91,124.711,2.666,3.064,...,-0.277,258.733,2,67.435,2.15,0.444,46.062,1.24,-2.475,113.497
1,100001,b,160.937,68.768,103.235,48.146,-999.0,-999.0,-999.0,3.473,...,-1.916,164.546,1,46.226,0.725,1.158,-999.0,-999.0,-999.0,46.226
2,100002,b,-999.0,162.172,125.953,35.635,-999.0,-999.0,-999.0,3.148,...,-2.186,260.414,1,44.251,2.053,-2.028,-999.0,-999.0,-999.0,44.251
3,100003,b,143.905,81.417,80.943,0.414,-999.0,-999.0,-999.0,3.31,...,0.06,86.062,0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,0.0
4,100004,b,175.864,16.915,134.805,16.405,-999.0,-999.0,-999.0,3.891,...,-0.871,53.131,0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,0.0


In [3]:
def compute_missing_values(df, column_name, missing_value=-999.0):
    """Count the entries of one column that equal the missing-value placeholder.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains the column to inspect.
    column_name : str
        Label of the column to inspect.
    missing_value : scalar, optional
        Value that marks a missing entry. Defaults to -999.0, the placeholder
        this notebook counts as "missing".

    Returns
    -------
    int
        Number of entries in the column equal to ``missing_value``.
    """
    column = df[column_name].to_numpy()

    # Counting the True entries of the comparison mask gives the answer
    # directly, without building an index array and a filtered copy first.
    return int(np.count_nonzero(column == missing_value))

In [4]:
# For every column after the first two, report how many entries are flagged
# as missing and what share of all rows that represents.
report_line = '{column} is missing {n_nan} values, with a percentage of {rel_nan}'

for column in raw_df.columns[2:]:
    n_nan = compute_missing_values(raw_df, column)
    # NOTE: this is a fraction of the row count (0-1), even though the
    # message calls it a percentage.
    rel_nan = n_nan / size
    print(report_line.format(column=column, n_nan=n_nan, rel_nan=rel_nan))

DER_mass_MMC is missing 38114 values, with a percentage of 0.152456
DER_mass_transverse_met_lep is missing 0 values, with a percentage of 0.0
DER_mass_vis is missing 0 values, with a percentage of 0.0
DER_pt_h is missing 0 values, with a percentage of 0.0
DER_deltaeta_jet_jet is missing 177457 values, with a percentage of 0.709828
DER_mass_jet_jet is missing 177457 values, with a percentage of 0.709828
DER_prodeta_jet_jet is missing 177457 values, with a percentage of 0.709828
DER_deltar_tau_lep is missing 0 values, with a percentage of 0.0
DER_pt_tot is missing 0 values, with a percentage of 0.0
DER_sum_pt is missing 0 values, with a percentage of 0.0
DER_pt_ratio_lep_tau is missing 0 values, with a percentage of 0.0
DER_met_phi_centrality is missing 0 values, with a percentage of 0.0
DER_lep_eta_centrality is missing 177457 values, with a percentage of 0.709828
PRI_tau_pt is missing 0 values, with a percentage of 0.0
PRI_tau_eta is missing 0 values, with a percentage of 0.0
PRI_tau_p

We can see that several features have a high percentage of missing values (about 40% or more; the filter below uses a 30% cutoff), so we start by training on the data without those features. Based on what we learn from that first pass, we will then decide how to treat the `DER_mass_MMC` feature, which is missing ~15% of its values.

In [5]:
def nan_columns(df, threshold=0.3, missing_value=-999.0):
    """Return the names of the columns whose share of missing values is too high.

    The first two columns of ``df`` are skipped and never reported. The share
    is computed against the number of rows of ``df`` itself, so the result
    does not depend on any notebook-level variable.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    threshold : float, optional
        A column is reported when its fraction of missing entries is strictly
        greater than this value. Defaults to 0.3.
    missing_value : scalar, optional
        Value that marks a missing entry. Defaults to -999.0.

    Returns
    -------
    list
        Column labels, in their original order, whose missing fraction
        exceeds ``threshold``. Empty when ``df`` has no rows.
    """
    features = df.iloc[:, 2:]

    # The mean of a boolean mask is the fraction of True entries per column.
    # For a frame with no rows it is NaN, which never exceeds the threshold.
    missing_fraction = features.eq(missing_value).mean()

    return missing_fraction[missing_fraction > threshold].index.tolist()

In [6]:
# Build a reduced copy of the data without the columns flagged by
# nan_columns; raw_df itself is left unchanged.
columns = nan_columns(raw_df)

clean_df = raw_df.drop(columns=columns)

clean_df.head()

Unnamed: 0,Id,Prediction,DER_mass_MMC,DER_mass_transverse_met_lep,DER_mass_vis,DER_pt_h,DER_deltar_tau_lep,DER_pt_tot,DER_sum_pt,DER_pt_ratio_lep_tau,...,PRI_tau_eta,PRI_tau_phi,PRI_lep_pt,PRI_lep_eta,PRI_lep_phi,PRI_met,PRI_met_phi,PRI_met_sumet,PRI_jet_num,PRI_jet_all_pt
0,100000,s,138.47,51.655,97.827,27.98,3.064,41.928,197.76,1.582,...,1.017,0.381,51.626,2.273,-2.414,16.824,-0.277,258.733,2,113.497
1,100001,b,160.937,68.768,103.235,48.146,3.473,2.078,125.157,0.879,...,2.039,-3.011,36.918,0.501,0.103,44.704,-1.916,164.546,1,46.226
2,100002,b,-999.0,162.172,125.953,35.635,3.148,9.336,197.814,3.776,...,-0.705,-2.093,121.409,-0.953,1.052,54.283,-2.186,260.414,1,44.251
3,100003,b,143.905,81.417,80.943,0.414,3.31,0.414,75.968,2.354,...,-1.655,0.01,53.321,-0.522,-3.1,31.082,0.06,86.062,0,0.0
4,100004,b,175.864,16.915,134.805,16.405,3.891,16.405,57.983,1.056,...,-2.197,-2.231,29.774,0.798,1.569,2.723,-0.871,53.131,0,0.0
