In [82]:
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import plotly.graph_objects as go
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

So far, this notebook has 3 topics:
1. Basic Exploration
2. Feature Importance for Kepler Identification
3. Planet Habitability

In [83]:
exo = pd.read_csv("data/composite_with_lum_dist.csv", skiprows=42)
exo.head()

Unnamed: 0,pl_name,hostname,sy_snum,sy_pnum,discoverymethod,disc_year,disc_facility,pl_controv_flag,pl_orbper,pl_orbsmax,...,st_lum,st_logg,rastr,ra,decstr,dec,sy_dist,sy_vmag,sy_kmag,sy_gaiamag
0,11 Com b,11 Com,2,1,Radial Velocity,2007,Xinglong Station,0,326.03,1.29,...,2.243,2.31,12h20m42.91s,185.178779,+17d47m35.71s,17.793252,93.1846,4.72307,2.282,4.44038
1,11 UMi b,11 UMi,1,1,Radial Velocity,2009,Thueringer Landessternwarte Tautenburg,0,516.21997,1.53,...,2.43,1.93,15h17m05.90s,229.274595,+71d49m26.19s,71.823943,125.321,5.013,1.939,4.56216
2,14 And b,14 And,1,1,Radial Velocity,2008,Okayama Astrophysical Observatory,0,185.84,0.83,...,1.763,2.63,23h31m17.80s,352.82415,+39d14m09.01s,39.235837,75.4392,5.23133,2.331,4.91781
3,14 Her b,14 Her,1,2,Radial Velocity,2002,W. M. Keck Observatory,0,1765.0389,2.774,...,-0.153,4.45,16h10m24.50s,242.602101,+43d48m58.90s,43.816362,17.9323,6.61935,4.714,6.383
4,16 Cyg B b,16 Cyg B,3,1,Radial Velocity,1996,Multiple Observatories,0,798.5,1.66,...,0.097,4.36,19h41m51.75s,295.465642,+50d31m00.57s,50.516824,21.1397,6.215,4.651,6.06428


In [84]:
exo.columns.tolist()

['pl_name',
 'hostname',
 'sy_snum',
 'sy_pnum',
 'discoverymethod',
 'disc_year',
 'disc_facility',
 'pl_controv_flag',
 'pl_orbper',
 'pl_orbsmax',
 'pl_rade',
 'pl_radj',
 'pl_bmasse',
 'pl_bmassj',
 'pl_bmassprov',
 'pl_orbeccen',
 'pl_insol',
 'pl_eqt',
 'ttv_flag',
 'pl_ratdor',
 'st_spectype',
 'st_teff',
 'st_rad',
 'st_mass',
 'st_met',
 'st_metratio',
 'st_lum',
 'st_logg',
 'rastr',
 'ra',
 'decstr',
 'dec',
 'sy_dist',
 'sy_vmag',
 'sy_kmag',
 'sy_gaiamag']

## 1. Basic Exploration

In [85]:
def generalize_spectype(row):
      if "B" in row["st_spectype"]:
            return "B"
      elif "A" in row["st_spectype"]:
            return "A"
      elif "F" in row["st_spectype"]:
            return "F"
      elif "G" in row["st_spectype"]:
            return "G"
      elif "K" in row["st_spectype"]:
            return "K"
      elif "M" in row["st_spectype"]:
            return "M"
      else:
            return "Undefined"

exo["st_spectype"].fillna('', inplace=True)
exo["generalized_spectype"] = exo.apply(generalize_spectype, axis=1)
exo

Unnamed: 0,pl_name,hostname,sy_snum,sy_pnum,discoverymethod,disc_year,disc_facility,pl_controv_flag,pl_orbper,pl_orbsmax,...,st_logg,rastr,ra,decstr,dec,sy_dist,sy_vmag,sy_kmag,sy_gaiamag,generalized_spectype
0,11 Com b,11 Com,2,1,Radial Velocity,2007,Xinglong Station,0,326.030000,1.290000,...,2.31,12h20m42.91s,185.178779,+17d47m35.71s,17.793252,93.1846,4.72307,2.282,4.44038,G
1,11 UMi b,11 UMi,1,1,Radial Velocity,2009,Thueringer Landessternwarte Tautenburg,0,516.219970,1.530000,...,1.93,15h17m05.90s,229.274595,+71d49m26.19s,71.823943,125.3210,5.01300,1.939,4.56216,K
2,14 And b,14 And,1,1,Radial Velocity,2008,Okayama Astrophysical Observatory,0,185.840000,0.830000,...,2.63,23h31m17.80s,352.824150,+39d14m09.01s,39.235837,75.4392,5.23133,2.331,4.91781,K
3,14 Her b,14 Her,1,2,Radial Velocity,2002,W. M. Keck Observatory,0,1765.038900,2.774000,...,4.45,16h10m24.50s,242.602101,+43d48m58.90s,43.816362,17.9323,6.61935,4.714,6.38300,K
4,16 Cyg B b,16 Cyg B,3,1,Radial Velocity,1996,Multiple Observatories,0,798.500000,1.660000,...,4.36,19h41m51.75s,295.465642,+50d31m00.57s,50.516824,21.1397,6.21500,4.651,6.06428,G
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5465,ups And b,ups And,2,3,Radial Velocity,1996,Lick Observatory,0,4.617033,0.059222,...,4.13,01h36m47.60s,24.198353,+41d24m13.73s,41.403815,13.4054,4.09565,2.859,3.98687,F
5466,ups And c,ups And,2,3,Radial Velocity,1999,Multiple Observatories,0,241.258000,0.827774,...,4.13,01h36m47.60s,24.198353,+41d24m13.73s,41.403815,13.4054,4.09565,2.859,3.98687,F
5467,ups And d,ups And,2,3,Radial Velocity,1999,Multiple Observatories,0,1276.460000,2.513290,...,4.13,01h36m47.60s,24.198353,+41d24m13.73s,41.403815,13.4054,4.09565,2.859,3.98687,F
5468,ups Leo b,ups Leo,1,1,Radial Velocity,2021,Okayama Astrophysical Observatory,0,385.200000,1.180000,...,2.46,11h36m56.93s,174.237219,-00d49m24.83s,-0.823564,52.5973,4.30490,2.184,4.03040,G


In [86]:
select_columns = ["hostname", "st_spectype", "discoverymethod"]
def plot_basic_characteristic(column, data):

      if column == "st_spectype":

            def bolometric_cor_calc(row):
                  if "B" in row["st_spectype"]:
                        return "B"
                  elif "A" in row["st_spectype"]:
                        return "A"
                  elif "F" in row["st_spectype"]:
                        return "F"
                  elif "G" in row["st_spectype"]:
                        return "G"
                  elif "K" in row["st_spectype"]:
                        return "G"
                  elif "M" in row["st_spectype"]:
                        return "M"
                  else:
                        return "Undefined"

            data["st_spectype"].fillna('', inplace=True)
            data["st_spectype_modified"] = data.apply(bolometric_cor_calc, axis=1)
            column = "st_spectype_modified"
            
      unique_c = data[column].unique().tolist()
      findtop10 = exo.groupby([column]).size().to_frame()
      findtop10 = findtop10.reset_index()
      findtop10.columns = [column, "count"]
      findtop10 = findtop10.sort_values(by=["count"], ascending=False)
      top10list = findtop10[column].iloc[0:10].tolist()
      data = data[data[column].isin(top10list)]
      values = [len(data[data[column] == item]) for item in top10list]
      fig = px.pie(data, values=values, names=top10list)
      fig.show()

plot_basic_characteristic("disc_facility", exo)

In [87]:
def exploratory_pie_chart(col_list):
      pie_fig = go.Figure()
      button_list = []
      for ind, column in enumerate(col_list):
            find_unique_cols = exo[column].nunique()
            labels = list(set(exo[column].tolist()))
            if find_unique_cols > 10:
                  findtop10 = exo.groupby([column]).size().to_frame()
                  findtop10 = findtop10.reset_index()
                  findtop10.columns = [column, "count"]
                  findtop10 = findtop10.sort_values(by=["count"], ascending=False)
                  top10list = findtop10[column].iloc[0:10].tolist()
                  values = [len(exo[exo[column] == item]) for item in top10list]
                  
            else:
                  values = [len(exo[exo[column] == item]) for item in labels]

            pie_fig.add_trace(go.Pie(labels=labels, values=values, name=column, visible=True if ind == 0 else False))

            col_count = [False] * len(col_list)
            modified_list = []
            for ind_count, col_name in enumerate(col_count):
                  modified_list.append(True if ind_count == ind else False)
            button = dict(label=column, args=[{"visible":modified_list, "title":column}], method="update")
            button_list.append(button)

      pie_fig.update_layout(
            updatemenus=[dict(buttons=list(button_list),
                      direction="down",
                      showactive=True,
                      x=-0.1,
                      xanchor="left",
                      y=1.3,
                      yanchor="top"),
                ],
                title=go.layout.Title(text="Exoplanet Categorical Variable Observation",
                                            x=0.5,
                                            y=0.95),
                                            width=800,
                                            height=500,
                                            autosize=True
                )
                                         
      pie_fig.show()

exploratory_pie_chart(["hostname", "generalized_spectype", "discoverymethod"])

### Exoplanet Classification
- Gas Giant: planets the size of Saturn or Jupiter, the largest planet in our solar system, or much, much larger.
- Neptunian: similar in size to Neptune or Uranus in our solar system. (Neptune is about four times the size, or radius, of Earth and almost 17 times its mass, or weight.)
- Super-Earth: more massive than Earth, but lighter than Neptune.
- Terrestrial: Earth sized and smaller, composed of rock, silicate, water or carbon.

Source: 
- https://exoplanets.nasa.gov/what-is-an-exoplanet/planet-types/overview/#:~:text=So%20far%20scientists%20have%20categorized,%2C%20super%2DEarth%20and%20terrestrial.
- https://iopscience.iop.org/article/10.3847/1538-4357/aab205#apjaab205s4

In [88]:
all_list = {"rocky" : np.arange(0.5, 1.0, step=0.1),
            "super_earth" : np.arange(1.0, 1.75, step=0.1),
            "sub_neptune" : np.arange(1.75, 3.5, step=0.1),
            "sub_jovian" : np.arange(3.5, 6.0, step=0.1),
            "jovians" : np.arange(6.0, 14.3, step=0.1)}

def classify_on_rad(val):
    if val in all_list["rocky"]:
        return "Rocky"
    elif val in all_list["super_earth"]:
        return "Super Earth"
    elif val in all_list["sub_neptune"]:
        return "Sub Neptune"
    elif val in all_list["sub_jovian"]:
        return "Sub Jovian"
    elif val in all_list["jovians"]:
        return "Jovian"
    else:
        return "Undefined"

exo["rad_classification"] = exo["pl_rade"].apply(lambda row: classify_on_rad(row))
display(exo.groupby(["rad_classification"]).size().to_frame())
exo.head()

Unnamed: 0_level_0,0
rad_classification,Unnamed: 1_level_1
Jovian,1
Rocky,4
Sub Jovian,14
Sub Neptune,27
Super Earth,21
Undefined,5403


Unnamed: 0,pl_name,hostname,sy_snum,sy_pnum,discoverymethod,disc_year,disc_facility,pl_controv_flag,pl_orbper,pl_orbsmax,...,rastr,ra,decstr,dec,sy_dist,sy_vmag,sy_kmag,sy_gaiamag,generalized_spectype,rad_classification
0,11 Com b,11 Com,2,1,Radial Velocity,2007,Xinglong Station,0,326.03,1.29,...,12h20m42.91s,185.178779,+17d47m35.71s,17.793252,93.1846,4.72307,2.282,4.44038,G,Undefined
1,11 UMi b,11 UMi,1,1,Radial Velocity,2009,Thueringer Landessternwarte Tautenburg,0,516.21997,1.53,...,15h17m05.90s,229.274595,+71d49m26.19s,71.823943,125.321,5.013,1.939,4.56216,K,Undefined
2,14 And b,14 And,1,1,Radial Velocity,2008,Okayama Astrophysical Observatory,0,185.84,0.83,...,23h31m17.80s,352.82415,+39d14m09.01s,39.235837,75.4392,5.23133,2.331,4.91781,K,Undefined
3,14 Her b,14 Her,1,2,Radial Velocity,2002,W. M. Keck Observatory,0,1765.0389,2.774,...,16h10m24.50s,242.602101,+43d48m58.90s,43.816362,17.9323,6.61935,4.714,6.383,K,Undefined
4,16 Cyg B b,16 Cyg B,3,1,Radial Velocity,1996,Multiple Observatories,0,798.5,1.66,...,19h41m51.75s,295.465642,+50d31m00.57s,50.516824,21.1397,6.215,4.651,6.06428,G,Undefined


In [89]:
apply_koi = lambda row: "KOI" if "Kepler" in row["hostname"] else "Exoplanet"
exo["koi_classification"] = exo.apply(apply_koi, axis=1)
exo = exo.replace(np.nan, 0)
display(exo.groupby(["koi_classification"]).size().to_frame())
exo.head()

Unnamed: 0_level_0,0
koi_classification,Unnamed: 1_level_1
Exoplanet,2725
KOI,2745


Unnamed: 0,pl_name,hostname,sy_snum,sy_pnum,discoverymethod,disc_year,disc_facility,pl_controv_flag,pl_orbper,pl_orbsmax,...,ra,decstr,dec,sy_dist,sy_vmag,sy_kmag,sy_gaiamag,generalized_spectype,rad_classification,koi_classification
0,11 Com b,11 Com,2,1,Radial Velocity,2007,Xinglong Station,0,326.03,1.29,...,185.178779,+17d47m35.71s,17.793252,93.1846,4.72307,2.282,4.44038,G,Undefined,Exoplanet
1,11 UMi b,11 UMi,1,1,Radial Velocity,2009,Thueringer Landessternwarte Tautenburg,0,516.21997,1.53,...,229.274595,+71d49m26.19s,71.823943,125.321,5.013,1.939,4.56216,K,Undefined,Exoplanet
2,14 And b,14 And,1,1,Radial Velocity,2008,Okayama Astrophysical Observatory,0,185.84,0.83,...,352.82415,+39d14m09.01s,39.235837,75.4392,5.23133,2.331,4.91781,K,Undefined,Exoplanet
3,14 Her b,14 Her,1,2,Radial Velocity,2002,W. M. Keck Observatory,0,1765.0389,2.774,...,242.602101,+43d48m58.90s,43.816362,17.9323,6.61935,4.714,6.383,K,Undefined,Exoplanet
4,16 Cyg B b,16 Cyg B,3,1,Radial Velocity,1996,Multiple Observatories,0,798.5,1.66,...,295.465642,+50d31m00.57s,50.516824,21.1397,6.215,4.651,6.06428,G,Undefined,Exoplanet


In [90]:
pie_fig = go.Figure()
pie_fig.add_trace(go.Pie(labels=["Exoplanet", "KOI"], values=exo["koi_classification"].value_counts(), 
                         name="KOI", visible=True))
pie_fig.add_trace(go.Pie(labels=exo["rad_classification"].unique().tolist(),
                         values=exo["rad_classification"].value_counts(),
                        name="Mass", visible=False))
pie_fig.update_layout(
    updatemenus=[dict(buttons=list([dict(label="KOI Classification",
                                         args=[{"visible": [True, False]},
                                               {"title": "KOI Classification"}],
                                         method="update"),
                                    dict(args=[{"visible": [False, True]},
                                               {"title": "Mass Classification"}],
                                         label="Mass Classification",
                                         method="update")
                                   ]),
                      direction="down",
                      showactive=True,
                      x=-0.1,
                      xanchor="left",
                      y=1.3,
                      yanchor="top"),
                ]
)

pie_fig.update_layout(title=go.layout.Title(text="Exoplanet Classifications",
                                            x=0.5,
                                            y=0.95),
                      width=700,
                      height=500,
                      autosize=False)
                                            

pie_fig.show()

## 2. Distinguish KOI
- What factor contributes the most in classifying an exoplanet as KOI?

In [91]:
all_int_cols = list(exo.select_dtypes(include=['int', 'float']).columns)
all_int_cols.append("koi_classification")
koi = exo[all_int_cols]
koi.head()

Unnamed: 0,sy_snum,sy_pnum,disc_year,pl_controv_flag,pl_orbper,pl_orbsmax,pl_rade,pl_radj,pl_bmasse,pl_bmassj,...,st_met,st_lum,st_logg,ra,dec,sy_dist,sy_vmag,sy_kmag,sy_gaiamag,koi_classification
0,2,1,2007,0,326.03,1.29,12.1,1.08,6165.6,19.4,...,-0.35,2.243,2.31,185.178779,17.793252,93.1846,4.72307,2.282,4.44038,Exoplanet
1,1,1,2009,0,516.21997,1.53,12.3,1.09,4684.8142,14.74,...,-0.02,2.43,1.93,229.274595,71.823943,125.321,5.013,1.939,4.56216,Exoplanet
2,1,1,2008,0,185.84,0.83,12.9,1.15,1525.5,4.8,...,-0.24,1.763,2.63,352.82415,39.235837,75.4392,5.23133,2.331,4.91781,Exoplanet
3,1,2,2002,0,1765.0389,2.774,12.6,1.12,2559.47216,8.053,...,0.405,-0.153,4.45,242.602101,43.816362,17.9323,6.61935,4.714,6.383,Exoplanet
4,3,1,1996,0,798.5,1.66,13.5,1.2,565.7374,1.78,...,0.06,0.097,4.36,295.465642,50.516824,21.1397,6.215,4.651,6.06428,Exoplanet


In [92]:
X_train, X_test, y_train, y_test =  train_test_split(koi.iloc[:, :-1],koi["koi_classification"],test_size=0.3,random_state=0)
# categorical_dtypes = list(X_train.select_dtypes(include=['object']).columns)
# col_transformer = [("cat", encoder, categorical_dtypes)]
# col_transform = ColumnTransformer(transformers=col_transformer)


#### Logistic Regression

In [128]:
def generate_h_bar_fig(df, model_type):
      h_bar = px.bar(df,x="coef", y="feature", orientation="h", height=600, color="color",
                        title=f"{model_type} Feature Importance for KOI Classification")
      h_bar.update_layout(
            title={'y':0.9,
            'x':0.5,
            'xanchor': 'center',
            'yanchor': 'top'})
      
      h_bar.show()

In [135]:
lr_model = LogisticRegression(max_iter=2000)
lr_model.fit(X_train,  y_train)
lr_y_hat = lr_model.predict(X_test)
print(f"LR Model score is: {accuracy_score(y_test, lr_y_hat)}")
lr_importance = list(zip(lr_model.coef_[0], X_train.columns.tolist()))

lr_df = pd.DataFrame(columns=["feature", "coef"])
for ind, item in lr_importance:
    lr_df.loc[len(lr_df)] = item, ind
color = lambda row: "Positive" if row["coef"] >= 0 else "Negative"
lr_df["color"] = lr_df.apply(color, axis=1)
    
display(lr_df.sort_values(by="coef", ascending=False))
generate_h_bar_fig(lr_df.sort_values(by="coef"), "Logistic Regression")

LR Model score is: 0.9762340036563071


Unnamed: 0,feature,coef,color
25,sy_kmag,0.951765,Positive
19,st_lum,0.42922,Positive
17,st_mass,0.171139,Positive
0,sy_snum,0.153302,Positive
24,sy_vmag,0.145462,Positive
22,dec,0.104282,Positive
13,ttv_flag,0.052219,Positive
21,ra,0.031487,Positive
18,st_met,0.028916,Positive
26,sy_gaiamag,0.01719,Positive


#### Decision Tree

In [134]:
tree_model = DecisionTreeClassifier()
tree_model.fit(X_train, y_train)
tree_importance = list(zip(tree_model.feature_importances_, X_train.columns.tolist()))
tree_y_hat = tree_model.predict(X_test)
print(f"Decision Tree Classifier Model score is: {accuracy_score(y_test, tree_y_hat)}")

tree_df = pd.DataFrame(columns=["feature", "coef"])
for ind, item in tree_importance:
    tree_df.loc[len(tree_df)] = item, ind
color = lambda row: "Positive" if row["coef"] >= 0 else "Negative"
tree_df["color"] = tree_df.apply(color, axis=1)

display(tree_df.sort_values(by="coef", ascending=False))
generate_h_bar_fig(tree_df.sort_values(by="coef"), "Decision Tree")

Decision Tree Classifier Model score is: 0.9865935405240707


Unnamed: 0,feature,coef,color
22,dec,0.739307,Positive
21,ra,0.209544,Positive
5,pl_orbsmax,0.010379,Positive
1,sy_pnum,0.006039,Positive
23,sy_dist,0.005133,Positive
2,disc_year,0.004575,Positive
24,sy_vmag,0.004491,Positive
7,pl_radj,0.003228,Positive
18,st_met,0.003031,Positive
17,st_mass,0.002623,Positive


In [133]:
r_tree_model = RandomForestClassifier()
r_tree_model.fit(X_train, y_train)
r_tree_importance = list(zip(r_tree_model.feature_importances_, X_train.columns.tolist()))
r_tree_y_hat = r_tree_model.predict(X_test)
print(f"Decision Tree Classifier Model score is: {accuracy_score(y_test, r_tree_y_hat)}")

r_tree_df = pd.DataFrame(columns=["feature", "coef"])
for ind, item in r_tree_importance:
    r_tree_df.loc[len(r_tree_df)] = item, ind
color = lambda row: "Positive" if row["coef"] >= 0 else "Negative"
r_tree_df["color"] = r_tree_df.apply(color, axis=1)

r_tree_df.sort_values(by="coef", ascending=False)
generate_h_bar_fig(r_tree_df.sort_values(by="coef"), "Random Forest Classifier")

Decision Tree Classifier Model score is: 0.9932967702620353


## 3. Define Habitable Zone
![formula](data/hz_formula_with_e.png)
- Source: https://www.planetarybiology.com/calculating_habitable_zone.html

#### Based on Stellar Luminosity and Radius

In [96]:
solar_lum = np.log(3.828 * (10 ** 26))
earth_rad = 6371000

def revert_to_km_rade(value):
    km_converted = value * 1.496 * (10 ** 8)
    earth_converted = km_converted / earth_rad
    return earth_converted

def calculate_lum(df):
    df["r_inner"] = revert_to_km_rade((df["st_lum"] / 1.1) ** (1/2))
    df["r_outer"] = revert_to_km_rade((df["st_lum"] / 0.53) ** (1/2))
    return df

def identify_habitability(rade, r_inner, r_outer):
    return True if (rade > r_inner) & (rade < r_outer) else False

exo = calculate_lum(exo)
exo["habitability"] = exo.apply(lambda row: identify_habitability(row.pl_rade, row.r_inner, row.r_outer), axis=1)
display(exo.groupby(["habitability"]).size().to_frame())
exo.head()

Unnamed: 0_level_0,0
habitability,Unnamed: 1_level_1
False,5224
True,246


Unnamed: 0,pl_name,hostname,sy_snum,sy_pnum,discoverymethod,disc_year,disc_facility,pl_controv_flag,pl_orbper,pl_orbsmax,...,sy_dist,sy_vmag,sy_kmag,sy_gaiamag,generalized_spectype,rad_classification,koi_classification,r_inner,r_outer,habitability
0,11 Com b,11 Com,2,1,Radial Velocity,2007,Xinglong Station,0,326.03,1.29,...,93.1846,4.72307,2.282,4.44038,G,Undefined,Exoplanet,33.530674,48.305957,False
1,11 UMi b,11 UMi,1,1,Radial Velocity,2009,Thueringer Landessternwarte Tautenburg,0,516.21997,1.53,...,125.321,5.013,1.939,4.56216,K,Undefined,Exoplanet,34.90043,50.279296,False
2,14 And b,14 And,1,1,Radial Velocity,2008,Okayama Astrophysical Observatory,0,185.84,0.83,...,75.4392,5.23133,2.331,4.91781,K,Undefined,Exoplanet,29.727186,42.826463,False
3,14 Her b,14 Her,1,2,Radial Velocity,2002,W. M. Keck Observatory,0,1765.0389,2.774,...,17.9323,6.61935,4.714,6.383,K,Undefined,Exoplanet,,,False
4,16 Cyg B b,16 Cyg B,3,1,Radial Velocity,1996,Multiple Observatories,0,798.5,1.66,...,21.1397,6.215,4.651,6.06428,G,Undefined,Exoplanet,6.972901,10.045508,False


In [100]:
exo_mass = pd.read_csv("data/planetary_data_with_mass.csv", skiprows=50)
exo_merge = exo.merge(exo_mass[["pl_name", "pl_msinie", "pl_masse"]], on="pl_name", how="outer")
print(exo_merge.shape)
exo_merge["pl_msinie"].count()
exo_merge.head()

(34923, 44)


Unnamed: 0,pl_name,hostname,sy_snum,sy_pnum,discoverymethod,disc_year,disc_facility,pl_controv_flag,pl_orbper,pl_orbsmax,...,sy_kmag,sy_gaiamag,generalized_spectype,rad_classification,koi_classification,r_inner,r_outer,habitability,pl_msinie,pl_masse
0,11 Com b,11 Com,2,1,Radial Velocity,2007,Xinglong Station,0,326.03,1.29,...,2.282,4.44038,G,Undefined,Exoplanet,33.530674,48.305957,False,5434.7,
1,11 Com b,11 Com,2,1,Radial Velocity,2007,Xinglong Station,0,326.03,1.29,...,2.282,4.44038,G,Undefined,Exoplanet,33.530674,48.305957,False,6165.6,
2,11 UMi b,11 UMi,1,1,Radial Velocity,2009,Thueringer Landessternwarte Tautenburg,0,516.21997,1.53,...,1.939,4.56216,K,Undefined,Exoplanet,34.90043,50.279296,False,3432.4,
3,11 UMi b,11 UMi,1,1,Radial Velocity,2009,Thueringer Landessternwarte Tautenburg,0,516.21997,1.53,...,1.939,4.56216,K,Undefined,Exoplanet,34.90043,50.279296,False,4684.8142,
4,11 UMi b,11 UMi,1,1,Radial Velocity,2009,Thueringer Landessternwarte Tautenburg,0,516.21997,1.53,...,1.939,4.56216,K,Undefined,Exoplanet,34.90043,50.279296,False,3337.07,


In [112]:
exo_masse_non=exo_merge[exo_merge["pl_masse"].notna()]
habitability_fig = px.scatter(exo_masse_non, x="st_lum", y="pl_rade", color="habitability", size="pl_masse", hover_data=["pl_name"],
title="Relationship of Radius, Luminosity and Mass(in Earth Unit) in Determining Planet Habitability",
labels={"st_lum":"Stellar Luminosity",
        "pl_rade": "Planet Radius (in Earth Unit)",
        "pl_masse": "Planet Mass (in Earth Unit)",
        "habitability": "Habitability"
                 })
habitability_fig.update_layout(
    title={
        'y':0.9,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'},
        height=600)
habitability_fig.show()

In [99]:
def apply_rocky(row):
    return True if ((row["pl_rade"] > 0.5) & (row["pl_rade"] <= 1.6)) or ((row["pl_msinie"] > 0.1) & (row["pl_msinie"] <= 3)) else False

exo_merge["rocky"] = exo_merge.apply(apply_rocky, axis=1)
display(exo_merge.groupby(["rocky"]).size().to_frame())
exo_merge[(exo_merge["rocky"] == True)]

Unnamed: 0_level_0,0
rocky,Unnamed: 1_level_1
False,26459
True,8464


Unnamed: 0,pl_name,hostname,sy_snum,sy_pnum,discoverymethod,disc_year,disc_facility,pl_controv_flag,pl_orbper,pl_orbsmax,...,sy_kmag,sy_gaiamag,generalized_spectype,rad_classification,koi_classification,r_inner,r_outer,habitability,pl_msinie,rocky
341,DMPP-3 A b,DMPP-3 A,2,1,Radial Velocity,2019,La Silla Observatory,0,6.67320,0.06620,...,7.061,8.854298,K,Undefined,Exoplanet,,,False,2.58,True
342,DMPP-3 A b,DMPP-3 A,2,1,Radial Velocity,2019,La Silla Observatory,0,6.67320,0.06620,...,7.061,8.854298,K,Undefined,Exoplanet,,,False,2.58,True
348,EPIC 201170410.02,EPIC 201170410,1,1,Transit,2020,K2,0,6.79870,0.03490,...,12.619,16.438600,Undefined,Undefined,Exoplanet,,,False,,True
351,EPIC 201427007 b,EPIC 201427007,1,1,Transit,2021,K2,0,0.72091,0.00000,...,12.581,14.147000,Undefined,Undefined,Exoplanet,,,False,,True
352,EPIC 201497682 b,EPIC 201497682,1,1,Transit,2019,K2,0,2.13174,0.00000,...,11.438,13.736800,Undefined,Undefined,Exoplanet,,,False,,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34810,YZ Cet d,YZ Cet,1,3,Radial Velocity,2017,La Silla Observatory,0,4.65626,0.02851,...,6.420,10.429400,M,Undefined,Exoplanet,,,False,1.21,True
34811,YZ Cet d,YZ Cet,1,3,Radial Velocity,2017,La Silla Observatory,0,4.65626,0.02851,...,6.420,10.429400,M,Undefined,Exoplanet,,,False,1.09,True
34812,YZ Cet d,YZ Cet,1,3,Radial Velocity,2017,La Silla Observatory,0,4.65626,0.02851,...,6.420,10.429400,M,Undefined,Exoplanet,,,False,1.14,True
34897,tau Cet g,tau Cet,1,4,Radial Velocity,2017,Multiple Observatories,0,20.00000,0.13300,...,1.794,3.248540,G,Undefined,Exoplanet,,,False,1.75,True
