In [1]:
# Import packages
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [14]:
# Load the raw table and relabel its columns in a single chained expression.
# NOTE(review): several target labels appear to be cut off by one character
# (e.g. 'OrbitalPeriod[days', 'ImpactParamete', 'TCEDeliver') and 'koi_fpflag_ss'
# is mapped to itself. They are reproduced exactly because later cells and the
# exported files refer to these labels verbatim — confirm before renaming.
df = pd.read_csv('exoplanets_2018.csv').rename(
    columns={
        # Identifiers and dispositions
        'kepid': 'KepID',
        'kepoi_name': 'KOIName',
        'kepler_name': 'KeplerName',
        'koi_disposition': 'ExoplanetArchiveDisposition',
        'koi_pdisposition': 'DispositionUsingKeplerData',
        'koi_score': 'DispositionScore',
        # False-positive flags
        'koi_fpflag_nt': 'NotTransit-LikeFalsePositiveFlag',
        'koi_fpflag_ss': 'koi_fpflag_ss',
        'koi_fpflag_co': 'CentroidOffsetFalsePositiveFlag',
        'koi_fpflag_ec': 'EphemerisMatchIndicatesContaminationFalsePositiveFlag',
        # Orbital period
        'koi_period': 'OrbitalPeriod[days',
        'koi_period_err1': 'OrbitalPeriodUpperUnc.[days',
        'koi_period_err2': 'OrbitalPeriodLowerUnc.[days',
        # Transit epoch
        'koi_time0bk': 'TransitEpoch[BKJD',
        'koi_time0bk_err1': 'TransitEpochUpperUnc.[BKJD',
        'koi_time0bk_err2': 'TransitEpochLowerUnc.[BKJD',
        # Impact parameter
        'koi_impact': 'ImpactParamete',
        'koi_impact_err1': 'ImpactParameterUpperUnc',
        'koi_impact_err2': 'ImpactParameterLowerUnc',
        # Transit duration
        'koi_duration': 'TransitDuration[hrs',
        'koi_duration_err1': 'TransitDurationUpperUnc.[hrs',
        'koi_duration_err2': 'TransitDurationLowerUnc.[hrs',
        # Transit depth
        'koi_depth': 'TransitDepth[ppm',
        'koi_depth_err1': 'TransitDepthUpperUnc.[ppm',
        'koi_depth_err2': 'TransitDepthLowerUnc.[ppm',
        # Planetary radius
        'koi_prad': 'PlanetaryRadius[Earthradii',
        'koi_prad_err1': 'PlanetaryRadiusUpperUnc.[Earthradii',
        'koi_prad_err2': 'PlanetaryRadiusLowerUnc.[Earthradii',
        # Equilibrium temperature
        'koi_teq': 'EquilibriumTemperature[K',
        'koi_teq_err1': 'EquilibriumTemperatureUpperUnc.[K',
        'koi_teq_err2': 'EquilibriumTemperatureLowerUnc.[K',
        # Insolation flux
        'koi_insol': 'InsolationFlux[Earthflux',
        'koi_insol_err1': 'InsolationFluxUpperUnc.[Earthflux',
        'koi_insol_err2': 'InsolationFluxLowerUnc.[Earthflux',
        # Signal-to-noise and TCE bookkeeping
        'koi_model_snr': 'TransitSignal-to-Nois',
        'koi_tce_plnt_num': 'TCEPlanetNumbe',
        'koi_tce_delivname': 'TCEDeliver',
        # Stellar effective temperature
        'koi_steff': 'StellarEffectiveTemperature[K',
        'koi_steff_err1': 'StellarEffectiveTemperatureUpperUnc.[K',
        'koi_steff_err2': 'StellarEffectiveTemperatureLowerUnc.[K',
        # Stellar surface gravity
        'koi_slogg': 'StellarSurfaceGravity[log10(cm/s**2)',
        'koi_slogg_err1': 'StellarSurfaceGravityUpperUnc.[log10(cm/s**2)',
        'koi_slogg_err2': 'StellarSurfaceGravityLowerUnc.[log10(cm/s**2)',
        # Stellar radius
        'koi_srad': 'StellarRadius[Solarradii',
        'koi_srad_err1': 'StellarRadiusUpperUnc.[Solarradii',
        'koi_srad_err2': 'StellarRadiusLowerUnc.[Solarradii',
        # Sky position and magnitude
        'ra': 'RA[decimaldegrees',
        'dec': 'Dec[decimaldegrees',
        'koi_kepmag': 'Kepler-band[mag]',
    }
)


# Binary target: 1 when the Kepler-data disposition is 'CANDIDATE', otherwise 0.
# A vectorised comparison replaces the per-row lambda; int64 is the dtype the
# lambda-based version produced.
df['ExoplanetCandidate'] = (df['DispositionUsingKeplerData'] == 'CANDIDATE').astype('int64')

# Remove identifier, disposition, false-positive-flag and delivery columns together
# with the equilibrium-temperature uncertainty columns, then discard every row that
# still has a missing value. No three-class 'ExoplanetConfirmed' label is built here:
# it was never used and was removed again in this same step.
# Reassigning (instead of inplace=True) keeps the cell free of hidden in-place edits.
df = df.drop(columns=['KeplerName','KOIName','EquilibriumTemperatureUpperUnc.[K',
                      'KepID','ExoplanetArchiveDisposition','DispositionUsingKeplerData',
                      'NotTransit-LikeFalsePositiveFlag','koi_fpflag_ss','CentroidOffsetFalsePositiveFlag',
                      'EphemerisMatchIndicatesContaminationFalsePositiveFlag','TCEDeliver',
                      'EquilibriumTemperatureLowerUnc.[K']).dropna()
df.shape

(7803, 38)

In [15]:
def clean_dataset(df):
    """Return a float64 copy of ``df`` without rows holding NaN or +/-inf.

    The input frame is left untouched; callers must use the returned frame.

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose columns can all be cast to ``float64``.

    Returns
    -------
    pd.DataFrame
        The rows of ``df`` that contain neither NaN nor +/-inf, with their
        original index labels and every column cast to ``np.float64``.

    Raises
    ------
    AssertionError
        If ``df`` is not a ``pd.DataFrame``.
    """
    assert isinstance(df, pd.DataFrame), "df needs to be a pd.DataFrame"
    # dropna() without inplace=True returns a new frame, so the caller's data
    # is not modified as a side effect.
    without_nan = df.dropna()
    # Flag rows in which any cell is NaN or infinite.
    has_bad_value = without_nan.isin([np.nan, np.inf, -np.inf]).any(axis=1)
    return without_nan[~has_bad_value].astype(np.float64)

# clean_dataset returns the filtered frame, so keep its result: otherwise rows
# containing +/-inf are never removed from the data that is split and exported.
df = clean_dataset(df)
# clean_dataset casts every column to float64; turn the 0/1 label back into
# integers so the exported files keep integer class labels.
df['ExoplanetCandidate'] = df['ExoplanetCandidate'].astype('int64')

# Reproducible 80/20 split of the cleaned data.
train, test = train_test_split(df, test_size=0.2, random_state=42)

train.to_csv('planet_1.txt', sep=',', index=False)
test.to_csv('planet_2.txt', sep=',', index=False)


In [19]:
# Split the data by label value so each class can be sampled separately.
df_candidate_0 = df[df['ExoplanetCandidate'] == 0]
df_candidate_1 = df[df['ExoplanetCandidate'] == 1]

# Draw the same number of rows from each class: 3700 when both classes are large
# enough, otherwise the size of the smaller class. DataFrame.sample raises
# ValueError when n exceeds the number of available rows (replace=False).
n_per_class = min(3700, len(df_candidate_0), len(df_candidate_1))

sample_candidate_0 = df_candidate_0.sample(n=n_per_class, random_state=42)
sample_candidate_1 = df_candidate_1.sample(n=n_per_class, random_state=42)

# Concatenate the two sampled DataFrames (label-0 rows first, fresh 0..N-1 index)
combined_sample = pd.concat([sample_candidate_0, sample_candidate_1], ignore_index=True)
# Display the combined DataFrame
combined_sample

Unnamed: 0,DispositionScore,OrbitalPeriod[days,OrbitalPeriodUpperUnc.[days,OrbitalPeriodLowerUnc.[days,TransitEpoch[BKJD,TransitEpochUpperUnc.[BKJD,TransitEpochLowerUnc.[BKJD,ImpactParamete,ImpactParameterUpperUnc,ImpactParameterLowerUnc,...,StellarSurfaceGravity[log10(cm/s**2),StellarSurfaceGravityUpperUnc.[log10(cm/s**2),StellarSurfaceGravityLowerUnc.[log10(cm/s**2),StellarRadius[Solarradii,StellarRadiusUpperUnc.[Solarradii,StellarRadiusLowerUnc.[Solarradii,RA[decimaldegrees,Dec[decimaldegrees,Kepler-band[mag],ExoplanetCandidate
0,0.000,9.080194,5.980000e-07,-5.980000e-07,136.817021,0.000051,-0.000051,0.614,0.007,-0.011,...,4.396,0.062,-0.175,1.131,0.322,-0.138,297.17203,46.953949,15.163,0
1,0.000,37.809539,1.380000e-03,-1.380000e-03,158.400200,0.032900,-0.032900,0.030,0.416,-0.030,...,4.570,0.038,-0.152,0.820,0.181,-0.078,289.77826,38.262741,15.752,0
2,0.000,8.480398,5.760000e-05,-5.760000e-05,135.850180,0.006130,-0.006130,0.486,0.446,-0.279,...,4.364,0.153,-0.187,1.050,0.297,-0.183,298.73007,46.688671,14.673,0
3,0.000,3.187752,1.820000e-05,-1.820000e-05,134.418000,0.004370,-0.004370,1.178,0.387,-0.114,...,4.489,0.069,-0.161,0.896,0.222,-0.095,285.37082,47.946499,14.640,0
4,0.000,0.579517,3.630000e-06,-3.630000e-06,132.002560,0.004900,-0.004900,0.638,0.342,-0.431,...,4.387,0.087,-0.203,1.102,0.353,-0.151,294.41693,43.799587,13.559,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7395,0.989,6.634480,9.810000e-05,-9.810000e-05,137.069400,0.010200,-0.010200,0.116,0.339,-0.116,...,4.313,0.105,-0.195,1.225,0.384,-0.192,296.92773,40.429661,13.805,1
7396,0.995,4.361259,3.540000e-05,-3.540000e-05,132.532280,0.006830,-0.006830,0.374,0.084,-0.374,...,4.544,0.021,-0.119,0.873,0.118,-0.039,294.08270,39.556099,14.951,1
7397,1.000,14.006404,1.730000e-05,-1.730000e-05,142.208850,0.001000,-0.001000,0.568,0.286,-0.368,...,4.323,0.132,-0.108,1.124,0.164,-0.148,295.16382,43.963039,13.837,1
7398,0.995,6.936509,8.200000e-05,-8.200000e-05,133.983090,0.009480,-0.009480,0.331,0.131,-0.331,...,4.173,0.108,-0.132,1.574,0.297,-0.216,294.89734,40.798168,12.209,1


In [18]:
# Write the balanced sample as comma-separated text, without the index column.
combined_sample.to_csv(
    'planet.txt',
    sep=',',
    index=False,
)

In [21]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler


# Separate features and result column
features = combined_sample.drop(columns=['ExoplanetCandidate'])
result = combined_sample['ExoplanetCandidate']

# Standardize the features (zero mean, unit variance per column) before PCA
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

# Apply PCA on features only.
# Ask for 20 components, but never more than the data can provide: PCA rejects
# an integer n_components larger than min(n_samples, n_features).
pca = PCA(n_components=min(20, *scaled_features.shape))  # Adjust the 20 as needed
pca_features = pca.fit_transform(scaled_features)

# Convert PCA results to a DataFrame.
# Name the columns from the fitted attribute n_components_ (the number of components
# actually kept) rather than the constructor argument, which is not necessarily an int.
pca_df = pd.DataFrame(pca_features, columns=[f'PC{i+1}' for i in range(pca.n_components_)])

# Combine the PCA-transformed features with the result column; the label index is
# reset so that it lines up with pca_df's default 0..N-1 index.
final_combined = pd.concat([pca_df, result.reset_index(drop=True)], axis=1)

# Display the final DataFrame
final_combined

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,...,PC12,PC13,PC14,PC15,PC16,PC17,PC18,PC19,PC20,ExoplanetCandidate
0,-0.332006,-1.103695,2.713078,-1.609298,1.759507,-1.208250,1.413773,1.187088,2.233750,0.424125,...,0.796780,0.386945,0.388769,0.553638,-0.065769,1.197436,0.033353,-0.313569,0.288193,0
1,0.796969,1.421821,0.207496,-0.510873,0.273747,1.133074,-0.660913,1.080336,0.420467,1.243399,...,-1.478791,0.817592,-0.602130,-0.339659,-0.280404,-0.595151,0.306486,0.030731,-0.182421,0
2,0.113450,-0.240281,0.459484,-0.537287,-0.520924,0.465364,-0.074586,0.266549,0.040719,1.051815,...,1.258526,-0.119752,0.628966,-0.434787,0.372410,-0.378270,-0.124378,-0.315152,-0.333727,0
3,0.318151,0.028717,0.388084,0.028091,-0.271489,0.243232,0.178140,0.303087,0.450981,0.161803,...,0.617117,-0.910060,-1.873836,-0.264628,-0.239760,-1.280778,-0.365983,-0.704657,-0.728827,0
4,-0.790653,-0.596192,1.081748,-1.372305,-1.228528,0.861634,-0.447744,0.262336,-0.114987,-0.069385,...,-0.043742,-0.586114,0.478600,0.494643,-0.545718,-0.118334,-0.308984,-0.705105,0.079330,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7395,-0.002278,-0.152837,0.231895,-0.973478,-0.925153,0.262047,-0.796337,0.600010,-0.544158,0.040402,...,-0.163855,0.820198,0.293372,-0.413641,-0.152279,1.479952,0.404811,-0.027968,-0.041068,1
7396,0.938940,-0.756714,-1.421597,0.900960,0.173760,0.282856,-0.521700,0.154260,0.429856,0.418973,...,-0.495806,0.348665,0.299625,-0.259101,0.095085,1.323176,-0.146316,-0.472997,0.089374,1
7397,0.475826,-0.917421,-0.754275,0.387835,-0.338453,-0.365196,-0.132102,-0.155215,-0.087297,-0.272423,...,0.387865,0.052425,0.446709,-0.642664,0.037843,0.899057,-0.137440,0.238450,-0.248375,1
7398,0.002201,-0.340027,-1.226287,0.285109,-0.591700,-1.069723,-1.268919,-0.552593,0.129980,-1.351425,...,0.072392,1.124851,0.562742,0.259184,-0.334284,1.184550,0.220686,-1.067798,0.050124,1


In [None]:
# Write the PCA components plus label as comma-separated text, without the index column.
final_combined.to_csv(
    'planet_pca.txt',
    sep=',',
    index=False,
)