# LOGISTIC REGRESSION MODEL 1: ALL SCRUBBED VARIABLES

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import os
import numpy as np

### Exoplanet Identification
#### The variables in this dataset have been selected based on their ability to predict whether a Kepler Object of Interest or "koi" is a potential exoplanet.

#### There is a data dictionary that explains the meaning of each of the variables contained here in.
#### koi_disposition is the "label" that identifies each koi as either a FALSE POSITIVE, CONFIRMED, or CANDIDATE


In [2]:
df = pd.read_csv("data/clean_data.csv")
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df = df.dropna()
df.head()


Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_time0bk,koi_impact,koi_duration,koi_depth,koi_prad,koi_teq,koi_model_snr,koi_tce_plnt_num,koi_srad
0,CONFIRMED,0,0,0,0,54.418383,162.51384,0.586,4.507,874.8,2.83,443,25.8,2,0.927
1,FALSE POSITIVE,0,1,0,0,19.89914,175.850252,0.969,1.7822,10829.0,14.6,638,76.3,1,0.868
2,FALSE POSITIVE,0,1,0,0,1.736952,170.307565,1.276,2.40641,8079.2,33.46,1395,505.6,1,0.791
3,CONFIRMED,0,0,0,0,2.525592,171.59555,0.701,1.6545,603.3,2.75,1406,40.9,1,1.046
4,CONFIRMED,0,0,0,0,4.134435,172.97937,0.762,3.1402,686.0,2.77,1160,40.2,2,0.972


In [3]:
# Assign X (data) and y (target)
X = df.drop("koi_disposition", axis=1)
y = df['koi_disposition']
print(X.shape, y.shape)

(6991, 14) (6991,)


##### Split data into training and testing

In [4]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

#### Create a Logistic Regression Model

In [5]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier

LogisticRegression()

#### Train the model

In [6]:
classifier.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression()

#### Validate the model using the test data

In [7]:
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

Training Data Score: 0.6061415220293725
Testing Data Score: 0.6046910755148741


#### Make predictions

In [8]:
predictions = classifier.predict(X_test)


In [9]:
results=pd.DataFrame({"Prediction": predictions, "Actual": y_test}).reset_index(drop=True)
results.head(10)

Unnamed: 0,Prediction,Actual
0,CANDIDATE,CONFIRMED
1,CANDIDATE,FALSE POSITIVE
2,FALSE POSITIVE,FALSE POSITIVE
3,CONFIRMED,CONFIRMED
4,FALSE POSITIVE,FALSE POSITIVE
5,FALSE POSITIVE,FALSE POSITIVE
6,CONFIRMED,FALSE POSITIVE
7,CONFIRMED,CONFIRMED
8,FALSE POSITIVE,CANDIDATE
9,FALSE POSITIVE,FALSE POSITIVE


In [10]:
actuals = results['Actual'].value_counts()
actuals

FALSE POSITIVE    909
CONFIRMED         435
CANDIDATE         404
Name: Actual, dtype: int64

In [11]:
actual_confirmed = actuals[1]
actual_candidate = actuals[2]
actual_FPs = actuals[0]
actuals = [actual_confirmed, actual_candidate, actual_FPs]

In [12]:
## Finding the True Positives
conditions = [
    (results["Prediction"]=="CANDIDATE") & (results['Actual']=="CANDIDATE"),
    (results["Prediction"]=="CONFIRMED") & (results['Actual']=="CONFIRMED"),
    (results["Prediction"]=="FALSE POSITIVE") & (results['Actual']=="FALSE POSITIVE"),
    ]

In [13]:
values = ['TP_Candidate', 'TP_Confirmed','TP_False']

In [14]:
results['True_Positives']=np.select(conditions,values)
results.head()

Unnamed: 0,Prediction,Actual,True_Positives
0,CANDIDATE,CONFIRMED,0
1,CANDIDATE,FALSE POSITIVE,0
2,FALSE POSITIVE,FALSE POSITIVE,TP_False
3,CONFIRMED,CONFIRMED,TP_Confirmed
4,FALSE POSITIVE,FALSE POSITIVE,TP_False


In [15]:
True_Positives = results['True_Positives'].value_counts()
True_Positives

TP_False        736
0               691
TP_Confirmed    284
TP_Candidate     37
Name: True_Positives, dtype: int64

In [16]:
TP_confirmed = True_Positives[2]
TP_falsepositive = True_Positives[0]
TP_candidate = True_Positives[3]
TPs = [TP_confirmed, TP_candidate, TP_falsepositive]

In [17]:
d = {'Label':['Confirmed','Candidate','False Positive'],'Actuals': actuals, 'True_Positives': TPs}
d

{'Label': ['Confirmed', 'Candidate', 'False Positive'],
 'Actuals': [435, 404, 909],
 'True_Positives': [284, 37, 736]}

In [18]:
summary_table = pd.DataFrame(d)
summary_table

Unnamed: 0,Label,Actuals,True_Positives
0,Confirmed,435,284
1,Candidate,404,37
2,False Positive,909,736


In [19]:
summary_table['Recall'] = summary_table['True_Positives']/summary_table['Actuals']
summary_table

Unnamed: 0,Label,Actuals,True_Positives,Recall
0,Confirmed,435,284,0.652874
1,Candidate,404,37,0.091584
2,False Positive,909,736,0.809681


# EVALUATION OF MODEL

#### This model has a much better performance (81%) identifying "False Positives" than identifying Confirmed or Candidate Exoplanets.  This model performs the worst identifying "Candidate" Exoplanets.