In [1]:
# Import packages
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from toolz import interleave

In [2]:
df = pd.read_csv('exoplanets_2018.csv')

df = df.rename(columns={'kepid':'KepID',
'kepoi_name':'KOIName',
'kepler_name':'KeplerName',
'koi_disposition':'ExoplanetArchiveDisposition',
'koi_pdisposition':'DispositionUsingKeplerData',
'koi_score':'DispositionScore',
'koi_fpflag_nt':'NotTransit-LikeFalsePositiveFlag',
'koi_fpflag_ss':'koi_fpflag_ss',
'koi_fpflag_co':'CentroidOffsetFalsePositiveFlag',
'koi_fpflag_ec':'EphemerisMatchIndicatesContaminationFalsePositiveFlag',
'koi_period':'OrbitalPeriod[days',
'koi_period_err1':'OrbitalPeriodUpperUnc.[days',
'koi_period_err2':'OrbitalPeriodLowerUnc.[days',
'koi_time0bk':'TransitEpoch[BKJD',
'koi_time0bk_err1':'TransitEpochUpperUnc.[BKJD',
'koi_time0bk_err2':'TransitEpochLowerUnc.[BKJD',
'koi_impact':'ImpactParamete',
'koi_impact_err1':'ImpactParameterUpperUnc',
'koi_impact_err2':'ImpactParameterLowerUnc',
'koi_duration':'TransitDuration[hrs',
'koi_duration_err1':'TransitDurationUpperUnc.[hrs',
'koi_duration_err2':'TransitDurationLowerUnc.[hrs',
'koi_depth':'TransitDepth[ppm',
'koi_depth_err1':'TransitDepthUpperUnc.[ppm',
'koi_depth_err2':'TransitDepthLowerUnc.[ppm',
'koi_prad':'PlanetaryRadius[Earthradii',
'koi_prad_err1':'PlanetaryRadiusUpperUnc.[Earthradii',
'koi_prad_err2':'PlanetaryRadiusLowerUnc.[Earthradii',
'koi_teq':'EquilibriumTemperature[K',
'koi_teq_err1':'EquilibriumTemperatureUpperUnc.[K',
'koi_teq_err2':'EquilibriumTemperatureLowerUnc.[K',
'koi_insol':'InsolationFlux[Earthflux',
'koi_insol_err1':'InsolationFluxUpperUnc.[Earthflux',
'koi_insol_err2':'InsolationFluxLowerUnc.[Earthflux',
'koi_model_snr':'TransitSignal-to-Nois',
'koi_tce_plnt_num':'TCEPlanetNumbe',
'koi_tce_delivname':'TCEDeliver',
'koi_steff':'StellarEffectiveTemperature[K',
'koi_steff_err1':'StellarEffectiveTemperatureUpperUnc.[K',
'koi_steff_err2':'StellarEffectiveTemperatureLowerUnc.[K',
'koi_slogg':'StellarSurfaceGravity[log10(cm/s**2)',
'koi_slogg_err1':'StellarSurfaceGravityUpperUnc.[log10(cm/s**2)',
'koi_slogg_err2':'StellarSurfaceGravityLowerUnc.[log10(cm/s**2)',
'koi_srad':'StellarRadius[Solarradii',
'koi_srad_err1':'StellarRadiusUpperUnc.[Solarradii',
'koi_srad_err2':'StellarRadiusLowerUnc.[Solarradii',
'ra':'RA[decimaldegrees',
'dec':'Dec[decimaldegrees',
'koi_kepmag':'Kepler-band[mag]'
})


df['ExoplanetCandidate'] = df['DispositionUsingKeplerData'].apply(lambda x: 1 if x == 'CANDIDATE' else 0)
df['ExoplanetConfirmed'] = df['ExoplanetArchiveDisposition'].apply(lambda x: 2 if x == 'CONFIRMED' else 1 if x == 'CANDIDATE' else 0 )
df.drop(columns=['KeplerName','KOIName','EquilibriumTemperatureUpperUnc.[K',
                 'KepID','ExoplanetArchiveDisposition','DispositionUsingKeplerData',
                 'NotTransit-LikeFalsePositiveFlag','koi_fpflag_ss','CentroidOffsetFalsePositiveFlag',
                 'EphemerisMatchIndicatesContaminationFalsePositiveFlag','TCEDeliver',
                 'EquilibriumTemperatureLowerUnc.[K', 'ExoplanetConfirmed'], inplace=True)
df.dropna(inplace=True)
df.shape

(7803, 38)

In [3]:
def clean_dataset(df):
    assert isinstance(df, pd.DataFrame), "df needs to be a pd.DataFrame"
    df.dropna(inplace=True)
    indices_to_keep = ~df.isin([np.nan, np.inf, -np.inf]).any(axis=1)
    return df[indices_to_keep].astype(np.float64)

clean_dataset(df)
train, test = train_test_split(df, test_size=0.2, random_state=42)

train.to_csv('planet_1.txt', sep=',', index=False)
test.to_csv('planet_2.txt', sep=',', index=False)


In [4]:
df_candidate_0 = df[df['ExoplanetCandidate'] == 0]
df_candidate_1 = df[df['ExoplanetCandidate'] == 1]

sample_candidate_0 = df_candidate_0.sample(n=3700, random_state=42)
sample_candidate_1 = df_candidate_1.sample(n=3700, random_state=42)

# Concatenate the two sampled DataFrames
columns_from_df1 = sample_candidate_0.columns
combined_sample = pd.DataFrame(interleave([sample_candidate_0.values, sample_candidate_1.values]))
combined_sample.columns = columns_from_df1

#combined_sample = pd.concat([sample_candidate_0, sample_candidate_1]).sort_index().reset_index(drop=True)
# Display the combined DataFrame
combined_sample.head(20)

Unnamed: 0,DispositionScore,OrbitalPeriod[days,OrbitalPeriodUpperUnc.[days,OrbitalPeriodLowerUnc.[days,TransitEpoch[BKJD,TransitEpochUpperUnc.[BKJD,TransitEpochLowerUnc.[BKJD,ImpactParamete,ImpactParameterUpperUnc,ImpactParameterLowerUnc,...,StellarSurfaceGravity[log10(cm/s**2),StellarSurfaceGravityUpperUnc.[log10(cm/s**2),StellarSurfaceGravityLowerUnc.[log10(cm/s**2),StellarRadius[Solarradii,StellarRadiusUpperUnc.[Solarradii,StellarRadiusLowerUnc.[Solarradii,RA[decimaldegrees,Dec[decimaldegrees,Kepler-band[mag],ExoplanetCandidate
0,0.0,9.080194,5.98e-07,-5.98e-07,136.817021,5.1e-05,-5.1e-05,0.614,0.007,-0.011,...,4.396,0.062,-0.175,1.131,0.322,-0.138,297.17203,46.953949,15.163,0.0
1,0.135,40.651102,0.000762,-0.000762,164.5528,0.0181,-0.0181,0.682,0.271,-0.442,...,4.26,0.095,-0.116,1.369,0.249,-0.166,289.91971,40.16853,13.935,1.0
2,0.0,37.809539,0.00138,-0.00138,158.4002,0.0329,-0.0329,0.03,0.416,-0.03,...,4.57,0.038,-0.152,0.82,0.181,-0.078,289.77826,38.262741,15.752,0.0
3,0.966,3.295346,2.57e-05,-2.57e-05,132.48799,0.0072,-0.0072,0.729,0.009,-0.534,...,4.5,0.046,-0.184,0.954,0.252,-0.079,288.32791,47.381592,15.342,1.0
4,0.0,8.480398,5.76e-05,-5.76e-05,135.85018,0.00613,-0.00613,0.486,0.446,-0.279,...,4.364,0.153,-0.187,1.05,0.297,-0.183,298.73007,46.688671,14.673,0.0
5,0.519,367.947848,0.00479,-0.00479,416.20998,0.00972,-0.00972,0.902,0.062,-0.638,...,4.561,0.03,-0.17,0.855,0.207,-0.069,297.73398,46.961529,15.719,1.0
6,0.0,3.187752,1.82e-05,-1.82e-05,134.418,0.00437,-0.00437,1.178,0.387,-0.114,...,4.489,0.069,-0.161,0.896,0.222,-0.095,285.37082,47.946499,14.64,0.0
7,1.0,114.73658,0.00015,-0.00015,165.150671,0.000977,-0.000977,0.595,0.037,-0.057,...,3.963,0.054,-0.045,1.699,0.147,-0.123,297.91687,46.96513,14.6,1.0
8,0.0,0.579517,3.63e-06,-3.63e-06,132.00256,0.0049,-0.0049,0.638,0.342,-0.431,...,4.387,0.087,-0.203,1.102,0.353,-0.151,294.41693,43.799587,13.559,0.0
9,1.0,7.362721,1.93e-05,-1.93e-05,136.86143,0.00214,-0.00214,0.253,0.198,-0.253,...,4.53,0.042,-0.179,0.894,0.231,-0.077,293.82663,38.856621,14.941,1.0


In [5]:
combined_sample.to_csv('planet.txt', sep=',', index=False, header=False)

In [12]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler


# Separate features and result column
features = combined_sample.drop(columns=['ExoplanetCandidate'])
result = combined_sample['ExoplanetCandidate']

# Apply PCA on features only
pca = PCA(n_components=20)  # Adjust `n_components` as needed
pca_features = pca.fit_transform(features)

# Convert PCA results to a DataFrame
pca_df = pd.DataFrame(pca_features, columns=[f'PC{i+1}' for i in range(pca.n_components)])

# Combine the PCA-transformed features with the result column
final_combined = pd.concat([pca_df, result.reset_index(drop=True)], axis=1)

# Display the final DataFrame
final_combined

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,...,PC12,PC13,PC14,PC15,PC16,PC17,PC18,PC19,PC20,ExoplanetCandidate
0,-8935.860891,298160.395995,-55.990257,-1563.361520,364.184437,1407.147473,-84.731094,-110.261683,235.112795,-40.524970,...,-3.327175,4.205792,-19.226070,8.266468,-1.498197,0.736401,4.908055,1.256245,3.050098,0.0
1,-7719.211079,-26715.502204,-818.645565,-1197.242986,254.325459,34.374248,-722.500798,13.449825,69.221184,-45.245577,...,9.068376,2.265264,-21.731569,-2.253346,-0.781609,5.066188,-1.770378,0.668954,-3.634311,1.0
2,-7751.882173,-26365.591070,-828.216157,-1204.063835,-406.816261,54.449709,-457.250050,21.807179,5.028337,-36.014254,...,-0.621770,-1.892162,6.382382,-3.480418,-2.545963,1.519808,-2.397852,0.543350,-5.797311,0.0
3,-7313.547013,-26708.297211,-662.858751,-997.777611,-82.449818,-144.553531,72.693245,-33.920984,31.056855,-35.417199,...,-7.819762,-2.999997,5.902275,-0.972782,-2.383246,-3.072933,-3.636604,0.246093,3.346218,1.0
4,-7603.383362,-26591.862786,-788.301313,-1127.357026,-351.754142,-51.421336,-44.392522,-24.119363,42.097999,-43.799633,...,-6.894511,-3.140132,13.022281,-2.185971,-1.802098,2.413952,6.278229,-0.026335,2.905278,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7395,-7658.959218,-26135.223433,-799.645017,-1189.314664,-58.263787,56.179438,-286.333554,-8.081972,49.387125,-52.900998,...,-0.850355,-0.478084,0.301174,-2.202450,-1.163432,0.066516,3.543899,0.526382,0.076375,1.0
7396,-4626.461796,30470.224156,-225.056850,262.544877,849.613996,322.831496,387.393888,-2.679922,-40.051652,-4.877364,...,-6.242838,-1.631708,17.187928,1.695540,-1.742882,-0.869565,3.851709,-0.177541,-0.091288,0.0
7397,-7019.908965,-26759.036588,-607.732070,-1061.884309,871.886123,-168.987624,-407.277488,-14.499577,53.829321,-55.275206,...,-1.739166,2.235377,-23.919308,-0.130123,-0.156454,-0.607628,3.600401,0.857773,-2.972811,1.0
7398,-3728.621980,-26801.584754,480.029685,778.197104,691.884071,-373.606759,407.851539,-74.332079,26.867814,0.803371,...,-6.981427,-4.845967,8.520680,1.447383,-1.196033,-0.872810,-0.476313,-1.029915,4.091563,0.0


In [18]:
scaler = StandardScaler()
scaled_data = scaler.fit_transform(features)

pca2 = PCA(n_components=20)  # Adjust `n_components` as needed
pca_features = pca.fit_transform(scaled_data)
pca_df = pd.DataFrame(pca_features, columns=[f'PC{i+1}' for i in range(pca.n_components)])

pca_df

<bound method NDFrame.head of            PC1       PC2       PC3       PC4       PC5       PC6       PC7  \
0    -0.332006 -1.103695  2.713078 -1.609298 -1.759507 -1.208250  1.413773   
1     0.374029  0.997704 -0.691152  0.509041  0.129037 -0.534699 -0.452495   
2     0.796969  1.421821  0.207496 -0.510873 -0.273747  1.133074 -0.660913   
3     0.730914 -0.456826 -0.518796  0.418522  0.481572  1.117805 -0.477999   
4     0.113450 -0.240281  0.459484 -0.537287  0.520924  0.465364 -0.074586   
...        ...       ...       ...       ...       ...       ...       ...   
7395  0.475826 -0.917421 -0.754275  0.387835  0.338453 -0.365196 -0.132102   
7396 -1.075603 -0.939758  1.762030 -1.373583  0.410287 -0.477917 -0.071250   
7397  0.002201 -0.340027 -1.226287  0.285109  0.591700 -1.069723 -1.268919   
7398 -0.168967  0.991682  0.472202 -0.763174  0.752975  1.165761 -1.240350   
7399  0.851533 -0.504438 -1.670434  1.150574 -0.121769 -0.463460 -0.578133   

           PC8       PC9      PC1

In [26]:
final_combined.to_csv('planet_pca.txt', sep=',', index=False, header=False)