In [None]:
# Update sklearn to prevent version mismatches
!pip install sklearn --upgrade

In [None]:
# install joblib. This will be used to save your model. 
# Restart your kernel after installing 
!pip install joblib

In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# import LogisticRegression for model1
from sklearn.linear_model import LogisticRegression
# import GridSearchCV for hyperparameter tuning
from sklearn.model_selection import GridSearchCV
# import train_test_split
from sklearn.model_selection import train_test_split
# import KNeighborsClassifier for model2
from sklearn.neighbors import KNeighborsClassifier
# import MinMaxScaler to scale data
from sklearn.preprocessing import MinMaxScaler

# Read the CSV and Perform Basic Data Cleaning

In [None]:
# Load the raw table, discard columns that are entirely null, then discard
# every row that still contains a missing value.
df = (
    pd.read_csv("exoplanet_data.csv")
    .dropna(axis='columns', how='all')
    .dropna()
)
# Preview the first 30 cleaned rows.
df.head(30)

# Select your features (columns)

In [None]:
# Columns carried forward: the target label (`koi_disposition`) plus the
# candidate predictors. The label is separated out when X and y are assigned.
feature_columns = [
    'koi_disposition',
    'koi_fpflag_nt',
    'koi_fpflag_ss',
    'koi_fpflag_co',
    'koi_fpflag_ec',
    'koi_period',
    'koi_period_err1',
    'koi_period_err2',
    'ra',
    'dec',
    'koi_kepmag',
]
koi_ft = df[feature_columns]

# Create a Train Test Split

Use `koi_disposition` for the y values

In [None]:
# Target: the disposition label. Predictors: every other selected column.
y = koi_ft['koi_disposition']
X = koi_ft.drop(columns=['koi_disposition'])

In [None]:
# Hold out a test set; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=12,
)

In [None]:
# Preview the first five rows of the training features.
X_train.head(n=5)

# Pre-processing

Scale the data using `MinMaxScaler` and perform some feature selection.

In [None]:
# Fit the MinMaxScaler on the training features only, then apply the same
# fitted transform to both splits.
X_scaler = MinMaxScaler()
X_train_scaled = X_scaler.fit_transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Train the Model



### Logistic Regression

In [None]:
classifier = LogisticRegression()
classifier

In [None]:
classifier.fit(X_train,y_train)

In [None]:
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

In [None]:
predictions = classifier.predict(X_test)

print(f"First 10 Predictions:   {predictions[:10]}")
print(f"First 10 Actual labels: {y_test[:10].tolist()}")

In [None]:
model1 = pd.DataFrame({"Prediction": predictions, "Actual": y_test}).reset_index(drop=True)

### K Nearest Neighbors

In [None]:
# Sweep odd values of k and record train/test accuracy for each one.
k_values = range(1, 30, 2)
train_scores = []
test_scores = []
for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train_scaled, y_train)
    train_scores.append(knn.score(X_train_scaled, y_train))
    test_scores.append(knn.score(X_test_scaled, y_test))

# Both curves share one axis, so label each line and keep the y-label generic
# instead of calling the whole axis "Testing".
plt.plot(k_values, train_scores, label="Training")
plt.plot(k_values, test_scores, label="Testing")
plt.xlabel("K Neighbors")
plt.ylabel("Accuracy Score")
plt.legend()
plt.show()

In [None]:
# `model2` was never defined, so build it here from the k sweep: take the k
# with the highest test accuracy (np.argmax returns the first maximum, i.e.
# the smallest such k) and refit on the scaled training data.
# NOTE(review): choosing k on the test split makes the reported test score
# optimistic; the cross-validated grid search is the less biased estimate.
best_k = range(1, 30, 2)[int(np.argmax(test_scores))]
model2 = KNeighborsClassifier(n_neighbors=best_k)
model2.fit(X_train_scaled, y_train)

print(f"Selected k: {best_k}")
print(f"Training Data Score: {model2.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {model2.score(X_test_scaled, y_test)}")

# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [None]:
# Create the GridSearchCV model

In [None]:
# Train the model with GridSearch

In [None]:
print(grid2.best_params_)
print(grid2.best_score_)

# Save the Model

In [None]:
# Persist a trained model to disk with joblib.
# TODO: replace "your_name" in the filename with your name before submitting.
# be sure to turn this in to BCS
# if joblib fails to import, try running the command to install in terminal/git-bash
import joblib
filename = 'your_name.sav'
# The template's `your_model` placeholder is not a defined name and raised a
# NameError. Save the fitted logistic-regression model instead; swap in a
# different fitted estimator here if that is the one you want to submit.
joblib.dump(classifier, filename)