## Preparing data for Machine learning tasks

In [84]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split


In [85]:
# Load the dataset from CSV file
exoplanets_df = pd.read_csv('exoplanets.csv')
exoplanets_df.head()


Unnamed: 0,loc_rowid,kepid,kepoi_name,kepler_name,koi_disposition,koi_pdisposition,koi_score,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,1,10797460,K00752.01,Kepler-227 b,CONFIRMED,CANDIDATE,1.0,0,0,0,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,2,10797460,K00752.02,Kepler-227 c,CONFIRMED,CANDIDATE,0.969,0,0,0,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
2,3,10811496,K00753.01,,CANDIDATE,CANDIDATE,0.0,0,0,0,...,-176.0,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
3,4,10848459,K00754.01,,FALSE POSITIVE,FALSE POSITIVE,0.0,0,1,0,...,-174.0,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
4,5,10854555,K00755.01,Kepler-664 b,CONFIRMED,CANDIDATE,1.0,0,0,0,...,-211.0,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509


In [86]:
# Analysing the data
exoplanets_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5659 entries, 0 to 5658
Data columns (total 50 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   loc_rowid          5659 non-null   int64  
 1   kepid              5659 non-null   int64  
 2   kepoi_name         5659 non-null   object 
 3   kepler_name        2628 non-null   object 
 4   koi_disposition    5659 non-null   object 
 5   koi_pdisposition   5659 non-null   object 
 6   koi_score          5318 non-null   float64
 7   koi_fpflag_nt      5659 non-null   int64  
 8   koi_fpflag_ss      5659 non-null   int64  
 9   koi_fpflag_co      5659 non-null   int64  
 10  koi_fpflag_ec      5659 non-null   int64  
 11  koi_period         5659 non-null   float64
 12  koi_period_err1    5558 non-null   float64
 13  koi_period_err2    5558 non-null   float64
 14  koi_time0bk        5659 non-null   float64
 15  koi_time0bk_err1   5558 non-null   float64
 16  koi_time0bk_err2   5557 

### Data cleaning

In [87]:
## Removing irrelevent data
exoplanets_df.drop(columns=['kepler_name','kepoi_name','koi_teq_err1', 'koi_teq_err2',
                 'kepid',
                 'koi_fpflag_nt','koi_fpflag_ss','koi_fpflag_co',
                 'koi_fpflag_ec','koi_tce_delivname'], inplace=True)

In [88]:
## Checking for NULL values
exoplanets_df.isna().any()

loc_rowid            False
koi_disposition      False
koi_pdisposition     False
koi_score             True
koi_period           False
koi_period_err1       True
koi_period_err2       True
koi_time0bk          False
koi_time0bk_err1      True
koi_time0bk_err2      True
koi_impact            True
koi_impact_err1       True
koi_impact_err2       True
koi_duration          True
koi_duration_err1     True
koi_duration_err2     True
koi_depth             True
koi_depth_err1        True
koi_depth_err2        True
koi_prad              True
koi_prad_err1         True
koi_prad_err2         True
koi_teq               True
koi_insol             True
koi_insol_err1        True
koi_insol_err2        True
koi_model_snr         True
koi_tce_plnt_num      True
koi_steff             True
koi_steff_err1        True
koi_steff_err2        True
koi_slogg             True
koi_slogg_err1        True
koi_slogg_err2        True
koi_srad              True
koi_srad_err1         True
koi_srad_err2         True
r

In [89]:
## Filling 'NaN' values with mean
exoplanets_df = exoplanets_df.fillna(exoplanets_df.mean())

  exoplanets_df = exoplanets_df.fillna(exoplanets_df.mean())


In [90]:
## Cleaning the missing values
exoplanets_df.dropna(inplace=True)
exoplanets_df.shape


(5659, 40)

In [91]:
## Remove outliners from dataset
for col in exoplanets_df.columns:
    if exoplanets_df[col].dtype == 'float64':
        # Calculate the IQR
        q1 = exoplanets_df[col].quantile(0.25)
        q3 = exoplanets_df[col].quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr

        # Remove outliers
        exoplanets_df[col] = exoplanets_df[col].where((exoplanets_df[col] >= lower_bound) & (exoplanets_df[col] <= upper_bound))

In [92]:
exoplanets_df.shape

(5659, 40)

### Data transformtions

In [93]:
## Encoding labels
le_pdisposition = LabelEncoder()
le_disposition = LabelEncoder()

exoplanets_df['exoplanet_confirmed'] = le_disposition.fit_transform(exoplanets_df['koi_disposition'].apply(lambda x: 1 if x == 'CONFIRMED' else 0))
exoplanets_df['exoplanet_candidate'] = le_pdisposition.fit_transform(exoplanets_df['koi_pdisposition'].apply(lambda x: 1 if x == 'CANDIDATE' else 0))

exoplanets_df.drop(['koi_pdisposition', 'koi_disposition'], axis=1, inplace=True)

exoplanets_df.head()


Unnamed: 0,loc_rowid,koi_score,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,koi_time0bk_err2,koi_impact,koi_impact_err1,...,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag,exoplanet_confirmed,exoplanet_candidate
0,1,1.0,9.488036,2.775e-05,-2.775e-05,170.53875,0.00216,-0.00216,0.146,0.318,...,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347,1,1
1,2,0.969,54.418383,0.0002479,-0.0002479,162.51384,0.00352,-0.00352,0.586,0.059,...,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347,1,1
2,3,0.0,19.89914,1.494e-05,-1.494e-05,175.850252,0.000581,-0.000581,0.969,,...,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436,0,1
3,4,0.0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,-0.000115,1.276,0.115,...,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597,0,0
4,5,1.0,2.525592,3.761e-06,-3.761e-06,171.59555,0.00113,-0.00113,0.701,0.235,...,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509,1,1


In [94]:
## Changing the data types to float
exoplanets_df.astype(np.float64)

Unnamed: 0,loc_rowid,koi_score,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,koi_time0bk_err2,koi_impact,koi_impact_err1,...,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag,exoplanet_confirmed,exoplanet_candidate
0,1.0,1.000000,9.488036,2.775000e-05,-2.775000e-05,170.538750,0.002160,-0.002160,0.146000,0.3180,...,0.064000,-0.096000,0.927000,0.105000,-0.061000,291.934230,48.141651,15.347000,1.0,1.0
1,2.0,0.969000,54.418383,2.479000e-04,-2.479000e-04,162.513840,0.003520,-0.003520,0.586000,0.0590,...,0.064000,-0.096000,0.927000,0.105000,-0.061000,291.934230,48.141651,15.347000,1.0,1.0
2,3.0,0.000000,19.899140,1.494000e-05,-1.494000e-05,175.850252,0.000581,-0.000581,0.969000,,...,0.044000,-0.176000,0.868000,0.233000,-0.078000,297.004820,48.134129,15.436000,0.0,1.0
3,4.0,0.000000,1.736952,2.630000e-07,-2.630000e-07,170.307565,0.000115,-0.000115,1.276000,0.1150,...,0.053000,-0.168000,0.791000,0.201000,-0.067000,285.534610,48.285210,15.597000,0.0,0.0
4,5.0,1.000000,2.525592,3.761000e-06,-3.761000e-06,171.595550,0.001130,-0.001130,0.701000,0.2350,...,0.070000,-0.210000,1.046000,0.334000,-0.133000,288.754880,48.226200,15.509000,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5654,5655.0,0.621719,,,,,,,0.129600,0.3117,...,0.079000,-0.300000,1.112000,0.569000,-0.126000,295.039030,43.954540,15.487000,0.0,0.0
5655,5656.0,0.621719,45.603048,,,171.320010,0.007178,-0.006279,0.620349,,...,0.107614,-0.131753,1.439144,0.287637,-0.293413,293.697420,39.036388,15.206000,0.0,0.0
5656,5657.0,0.001000,2.468101,1.408000e-05,-1.408000e-05,132.076920,0.004900,-0.004900,0.689000,0.2410,...,0.100000,-0.150000,0.872000,0.196000,-0.105000,298.849790,44.615009,14.874000,0.0,0.0
5657,5658.0,0.621719,12.534499,,,134.950780,0.007178,-0.006279,0.620349,,...,0.107614,-0.131753,1.439144,0.287637,-0.293413,294.907650,40.278831,15.725000,0.0,0.0


In [None]:
## Normalizing the dataset
scaler = StandardScaler()
exoplanets_df = scaler.fit_transform(exoplanets_df)


### Data splitting

In [None]:
## Normalizing the dataset
scaler = StandardScaler()
exoplanets_df = scaler.fit_transform(exoplanets_df)


In [None]:
## Extracting the features and labels
X = exoplanets_df.drop(columns=['exoplanet_candidate','exoplanet_confirmed']) # features
y = exoplanets_df.exoplanet_candidate # targets


In [None]:
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4) # split the data into train (60%) and temp (40%)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.25) # split the remaining 40% into val (75%) and test (25%)
