In [2]:
!git clone https://github.com/SimonMen65/QML-Project.git
%cd QML-Project/

Cloning into 'QML-Project'...
remote: Enumerating objects: 63, done.[K
remote: Counting objects: 100% (63/63), done.[K
remote: Compressing objects: 100% (41/41), done.[K
remote: Total 63 (delta 27), reused 51 (delta 19), pack-reused 0 (from 0)[K
Receiving objects: 100% (63/63), 5.44 MiB | 866.00 KiB/s, done.
Resolving deltas: 100% (27/27), done.
/content/QML-Project


In [3]:
# Import packages
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from toolz import interleave

In [41]:
def compute_metrics(SVM,data_pts,true_labels):
    tp, fp, tn, fn = 0, 0, 0, 0
    print("Len: ", len(data_pts))
    print(data_pts.shape)
    pred = SVM.predict(data_pts)
    print("Pred: ", pred.shape)
    for i in range(len(pred)):

        #predicted_cls = SVM.predict_class(data[i], alpha, b)

        predicted_cls = pred[i]
        #print("predicted: ", predicted_cls)

        y_i = true_labels[i]
        #print("y_i: ", y_i)
        if(y_i == 1):
            if(predicted_cls > 0):
                tp += 1
            else:
                fp += 1
        else:
            if(predicted_cls == 0):
                tn += 1
            else:
                fn += 1

    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    f_score = tp/(tp + 1/2*(fp+fn))
    accuracy = (tp + tn)/(tp+tn+fp+fn)
    print("tp: ",tp)
    print("tn: ",tn)
    print("fp: ",fp)
    print("fn: ", fn)

    return precision,recall,f_score,accuracy


In [5]:


df = pd.read_csv('exoplanets_2018.csv')

df = df.rename(columns={'kepid':'KepID',
'kepoi_name':'KOIName',
'kepler_name':'KeplerName',
'koi_disposition':'ExoplanetArchiveDisposition',
'koi_pdisposition':'DispositionUsingKeplerData',
'koi_score':'DispositionScore',
'koi_fpflag_nt':'NotTransit-LikeFalsePositiveFlag',
'koi_fpflag_ss':'koi_fpflag_ss',
'koi_fpflag_co':'CentroidOffsetFalsePositiveFlag',
'koi_fpflag_ec':'EphemerisMatchIndicatesContaminationFalsePositiveFlag',
'koi_period':'OrbitalPeriod[days',
'koi_period_err1':'OrbitalPeriodUpperUnc.[days',
'koi_period_err2':'OrbitalPeriodLowerUnc.[days',
'koi_time0bk':'TransitEpoch[BKJD',
'koi_time0bk_err1':'TransitEpochUpperUnc.[BKJD',
'koi_time0bk_err2':'TransitEpochLowerUnc.[BKJD',
'koi_impact':'ImpactParamete',
'koi_impact_err1':'ImpactParameterUpperUnc',
'koi_impact_err2':'ImpactParameterLowerUnc',
'koi_duration':'TransitDuration[hrs',
'koi_duration_err1':'TransitDurationUpperUnc.[hrs',
'koi_duration_err2':'TransitDurationLowerUnc.[hrs',
'koi_depth':'TransitDepth[ppm',
'koi_depth_err1':'TransitDepthUpperUnc.[ppm',
'koi_depth_err2':'TransitDepthLowerUnc.[ppm',
'koi_prad':'PlanetaryRadius[Earthradii',
'koi_prad_err1':'PlanetaryRadiusUpperUnc.[Earthradii',
'koi_prad_err2':'PlanetaryRadiusLowerUnc.[Earthradii',
'koi_teq':'EquilibriumTemperature[K',
'koi_teq_err1':'EquilibriumTemperatureUpperUnc.[K',
'koi_teq_err2':'EquilibriumTemperatureLowerUnc.[K',
'koi_insol':'InsolationFlux[Earthflux',
'koi_insol_err1':'InsolationFluxUpperUnc.[Earthflux',
'koi_insol_err2':'InsolationFluxLowerUnc.[Earthflux',
'koi_model_snr':'TransitSignal-to-Nois',
'koi_tce_plnt_num':'TCEPlanetNumbe',
'koi_tce_delivname':'TCEDeliver',
'koi_steff':'StellarEffectiveTemperature[K',
'koi_steff_err1':'StellarEffectiveTemperatureUpperUnc.[K',
'koi_steff_err2':'StellarEffectiveTemperatureLowerUnc.[K',
'koi_slogg':'StellarSurfaceGravity[log10(cm/s**2)',
'koi_slogg_err1':'StellarSurfaceGravityUpperUnc.[log10(cm/s**2)',
'koi_slogg_err2':'StellarSurfaceGravityLowerUnc.[log10(cm/s**2)',
'koi_srad':'StellarRadius[Solarradii',
'koi_srad_err1':'StellarRadiusUpperUnc.[Solarradii',
'koi_srad_err2':'StellarRadiusLowerUnc.[Solarradii',
'ra':'RA[decimaldegrees',
'dec':'Dec[decimaldegrees',
'koi_kepmag':'Kepler-band[mag]'
})


df['ExoplanetCandidate'] = df['DispositionUsingKeplerData'].apply(lambda x: 1 if x == 'CANDIDATE' else 0)
df['ExoplanetConfirmed'] = df['ExoplanetArchiveDisposition'].apply(lambda x: 2 if x == 'CONFIRMED' else 1 if x == 'CANDIDATE' else 0 )
df.drop(columns=['KeplerName','KOIName','EquilibriumTemperatureUpperUnc.[K',
                 'KepID','ExoplanetArchiveDisposition','DispositionUsingKeplerData',
                 'NotTransit-LikeFalsePositiveFlag','koi_fpflag_ss','CentroidOffsetFalsePositiveFlag',
                 'EphemerisMatchIndicatesContaminationFalsePositiveFlag','TCEDeliver',
                 'EquilibriumTemperatureLowerUnc.[K', 'ExoplanetConfirmed'], inplace=True)
df.dropna(inplace=True)
df.shape


(7803, 38)

In [6]:
def clean_dataset(df):
    assert isinstance(df, pd.DataFrame), "df needs to be a pd.DataFrame"
    df.dropna(inplace=True)
    indices_to_keep = ~df.isin([np.nan, np.inf, -np.inf]).any(axis=1)
    return df[indices_to_keep].astype(np.float64)

clean_dataset(df)
train, test = train_test_split(df, test_size=0.2, random_state=42)

train.to_csv('planet_1.txt', sep=',', index=False)
test.to_csv('planet_2.txt', sep=',', index=False)


In [7]:
df_candidate_0 = df[df['ExoplanetCandidate'] == 0]
df_candidate_1 = df[df['ExoplanetCandidate'] == 1]

sample_candidate_0 = df_candidate_0.sample(n=3700, random_state=42)
sample_candidate_1 = df_candidate_1.sample(n=3700, random_state=42)

# Concatenate the two sampled DataFrames
columns_from_df1 = sample_candidate_0.columns
combined_sample = pd.DataFrame(interleave([sample_candidate_0.values, sample_candidate_1.values]))
combined_sample.columns = columns_from_df1

#combined_sample = pd.concat([sample_candidate_0, sample_candidate_1]).sort_index().reset_index(drop=True)
# Display the combined DataFrame
combined_sample.head(20)


Unnamed: 0,DispositionScore,OrbitalPeriod[days,OrbitalPeriodUpperUnc.[days,OrbitalPeriodLowerUnc.[days,TransitEpoch[BKJD,TransitEpochUpperUnc.[BKJD,TransitEpochLowerUnc.[BKJD,ImpactParamete,ImpactParameterUpperUnc,ImpactParameterLowerUnc,...,StellarSurfaceGravity[log10(cm/s**2),StellarSurfaceGravityUpperUnc.[log10(cm/s**2),StellarSurfaceGravityLowerUnc.[log10(cm/s**2),StellarRadius[Solarradii,StellarRadiusUpperUnc.[Solarradii,StellarRadiusLowerUnc.[Solarradii,RA[decimaldegrees,Dec[decimaldegrees,Kepler-band[mag],ExoplanetCandidate
0,0.0,9.080194,5.98e-07,-5.98e-07,136.817021,5.1e-05,-5.1e-05,0.614,0.007,-0.011,...,4.396,0.062,-0.175,1.131,0.322,-0.138,297.17203,46.953949,15.163,0.0
1,0.135,40.651102,0.000762,-0.000762,164.5528,0.0181,-0.0181,0.682,0.271,-0.442,...,4.26,0.095,-0.116,1.369,0.249,-0.166,289.91971,40.16853,13.935,1.0
2,0.0,37.809539,0.00138,-0.00138,158.4002,0.0329,-0.0329,0.03,0.416,-0.03,...,4.57,0.038,-0.152,0.82,0.181,-0.078,289.77826,38.262741,15.752,0.0
3,0.966,3.295346,2.57e-05,-2.57e-05,132.48799,0.0072,-0.0072,0.729,0.009,-0.534,...,4.5,0.046,-0.184,0.954,0.252,-0.079,288.32791,47.381592,15.342,1.0
4,0.0,8.480398,5.76e-05,-5.76e-05,135.85018,0.00613,-0.00613,0.486,0.446,-0.279,...,4.364,0.153,-0.187,1.05,0.297,-0.183,298.73007,46.688671,14.673,0.0
5,0.519,367.947848,0.00479,-0.00479,416.20998,0.00972,-0.00972,0.902,0.062,-0.638,...,4.561,0.03,-0.17,0.855,0.207,-0.069,297.73398,46.961529,15.719,1.0
6,0.0,3.187752,1.82e-05,-1.82e-05,134.418,0.00437,-0.00437,1.178,0.387,-0.114,...,4.489,0.069,-0.161,0.896,0.222,-0.095,285.37082,47.946499,14.64,0.0
7,1.0,114.73658,0.00015,-0.00015,165.150671,0.000977,-0.000977,0.595,0.037,-0.057,...,3.963,0.054,-0.045,1.699,0.147,-0.123,297.91687,46.96513,14.6,1.0
8,0.0,0.579517,3.63e-06,-3.63e-06,132.00256,0.0049,-0.0049,0.638,0.342,-0.431,...,4.387,0.087,-0.203,1.102,0.353,-0.151,294.41693,43.799587,13.559,0.0
9,1.0,7.362721,1.93e-05,-1.93e-05,136.86143,0.00214,-0.00214,0.253,0.198,-0.253,...,4.53,0.042,-0.179,0.894,0.231,-0.077,293.82663,38.856621,14.941,1.0


In [8]:
combined_sample.to_csv('planet.txt', sep=',', index=False, header=False)

In [9]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler


# Separate features and result column
features = combined_sample.drop(columns=['ExoplanetCandidate'])
result = combined_sample['ExoplanetCandidate']

# Standardize the features
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

# Apply PCA on features only
pca = PCA(n_components=20)  # Adjust `n_components` as needed
pca_features = pca.fit_transform(scaled_features)

# Convert PCA results to a DataFrame
pca_df = pd.DataFrame(pca_features, columns=[f'PC{i+1}' for i in range(pca.n_components)])

# Combine the PCA-transformed features with the result column
final_combined = pd.concat([pca_df, result.reset_index(drop=True)], axis=1)

# Display the final DataFrame
final_combined

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,...,PC12,PC13,PC14,PC15,PC16,PC17,PC18,PC19,PC20,ExoplanetCandidate
0,-0.332006,-1.103695,2.713078,-1.609298,-1.759507,-1.208250,1.413773,1.187088,2.233750,0.424125,...,0.796780,0.386945,0.388769,0.553638,-0.065769,1.197436,0.033353,-0.313569,0.288193,0.0
1,0.374029,0.997704,-0.691152,0.509041,0.129037,-0.534699,-0.452495,-0.559086,1.188330,0.022555,...,-0.501392,0.291491,-0.682592,-0.749535,-0.438934,0.009446,-0.281145,-1.093045,0.071379,1.0
2,0.796969,1.421821,0.207496,-0.510873,-0.273747,1.133074,-0.660913,1.080336,0.420467,1.243399,...,-1.478791,0.817592,-0.602130,-0.339659,-0.280404,-0.595151,0.306486,0.030731,-0.182421,0.0
3,0.730914,-0.456826,-0.518796,0.418522,0.481572,1.117805,-0.477999,0.312939,-1.211396,-1.033682,...,0.609840,1.082982,-0.680661,1.768705,0.502444,-0.330017,-0.322505,-0.361838,-0.012457,1.0
4,0.113450,-0.240281,0.459484,-0.537287,0.520924,0.465364,-0.074586,0.266549,0.040719,1.051815,...,1.258526,-0.119752,0.628966,-0.434787,0.372410,-0.378270,-0.124378,-0.315152,-0.333727,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7395,0.475826,-0.917421,-0.754275,0.387835,0.338453,-0.365196,-0.132102,-0.155215,-0.087297,-0.272423,...,0.387865,0.052425,0.446709,-0.642664,0.037843,0.899057,-0.137440,0.238450,-0.248375,1.0
7396,-1.075603,-0.939758,1.762030,-1.373583,0.410287,-0.477917,-0.071250,0.169691,0.277609,0.238928,...,0.210552,-0.067008,0.874780,0.141961,0.370863,-0.221615,-0.114313,0.009658,-0.119748,0.0
7397,0.002201,-0.340027,-1.226287,0.285109,0.591700,-1.069723,-1.268919,-0.552593,0.129980,-1.351425,...,0.072392,1.124851,0.562742,0.259184,-0.334284,1.184550,0.220686,-1.067798,0.050124,1.0
7398,-0.168967,0.991682,0.472202,-0.763174,0.752975,1.165761,-1.240350,0.655422,0.359254,0.449055,...,0.811299,-1.022410,-0.379032,0.242277,0.051172,-0.372388,-0.004308,-0.933459,0.808491,0.0


In [10]:
final_combined.to_csv('planet_pca.txt', sep=',', index=False, header=False)

In [11]:
print(final_combined.shape) # dataset
print(result) # labels

(7400, 21)
0       0.0
1       1.0
2       0.0
3       1.0
4       0.0
       ... 
7395    1.0
7396    0.0
7397    1.0
7398    0.0
7399    1.0
Name: ExoplanetCandidate, Length: 7400, dtype: float64


In [12]:
print(type(final_combined))
print(type(result))
final_combined = final_combined.to_numpy()
result = result.to_numpy()


X_train = final_combined[:5000,]
Y_train = result[:5000,]
X_test = final_combined[5000:,]
Y_test = result[5000:,]


<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>


In [13]:
print(X_train.shape)
print(Y_train.shape)
print(X_test.shape)
print(Y_test.shape)

(5000, 21)
(5000,)
(2400, 21)
(2400,)


In [14]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import time


svm = SVC(kernel='linear')
start_time = time.time()
svm.fit(X_train, Y_train)
end_time = time.time()
print(end_time-start_time)

0.039096832275390625


In [15]:
Y_pred_train = svm.predict(X_train)
train_acc = accuracy_score(Y_pred_train, Y_train)

print(np.mean(Y_pred_train == Y_train))
print("Train Accuracy:", train_acc)

1.0
None
Train Accuracy: 1.0


In [16]:
Y_pred_test = svm.predict(X_test)
test_acc = accuracy_score(Y_pred_test, Y_test)
print(np.mean(Y_pred_test == Y_test))
print("Testing Accuracy:", test_acc)

1.0
None
Testing Accuracy: 1.0


In [17]:
print(Y_pred_test)
print(Y_test)

[0. 1. 0. ... 1. 0. 1.]
[0. 1. 0. ... 1. 0. 1.]


In [20]:
print(type(X_train))
print(len(X_train))

<class 'numpy.ndarray'>
5000


In [42]:
precision,recall,f_score,accuracy = compute_metrics(svm,X_train,Y_train)
print(f'Training: {precision=} {recall=} {f_score=} {accuracy=}')



Len:  5000
(5000, 21)
Pred:  (5000,)
tp:  2500
tn:  2500
fp:  0
fn:  0
Training: precision=1.0 recall=1.0 f_score=1.0 accuracy=1.0


In [43]:

precision,recall,f_score,accuracy = compute_metrics(svm,X_test,Y_test)
print(f'Testing: {precision=} {recall=} {f_score=} {accuracy=}')


Len:  2400
(2400, 21)
Pred:  (2400,)
tp:  1200
tn:  1200
fp:  0
fn:  0
Testing: precision=1.0 recall=1.0 f_score=1.0 accuracy=1.0
