## Original Astronomy Dataset #
### Stars, Galaxies and Quasars ##
https://www.sdss.org/dr17/
http://skyserver.sdss.org/dr17/SearchTools/sql

# Predictive Data Analytics

In [1]:
# Import library
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from PIL import Image

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
import joblib

In [4]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier 
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.svm import SVC

In [5]:
# Load Dataset
df=pd.read_csv('clean_Skyserver_SQL1_5_2022 11_26_53 PM.csv', low_memory=False)
df.head(2)

Unnamed: 0,objid,ra,dec,u,g,r,i,z,redshift,class
0,1237678598097404025,0.044032,0.035457,0.393903,0.352869,0.337332,0.331551,0.330089,0.000495,STAR
1,1237678598076366970,0.910465,0.030273,0.387076,0.348962,0.337407,0.333596,0.332157,0.000466,STAR


In [6]:
df.drop(['objid'], axis=1, inplace =True)
df.head(2)

Unnamed: 0,ra,dec,u,g,r,i,z,redshift,class
0,0.044032,0.035457,0.393903,0.352869,0.337332,0.331551,0.330089,0.000495,STAR
1,0.910465,0.030273,0.387076,0.348962,0.337407,0.333596,0.332157,0.000466,STAR


In [7]:
# categorical type - pandas
df['class']=pd.Categorical(df['class'])

In [8]:
# label encoder
label_class = LabelEncoder()
df['label_class'] = label_class.fit_transform(df['class'])
df.head(3)

Unnamed: 0,ra,dec,u,g,r,i,z,redshift,class,label_class
0,0.044032,0.035457,0.393903,0.352869,0.337332,0.331551,0.330089,0.000495,STAR,2
1,0.910465,0.030273,0.387076,0.348962,0.337407,0.333596,0.332157,0.000466,STAR,2
2,0.030819,0.132237,0.44728,0.406822,0.392707,0.386791,0.384882,0.00045,STAR,2


In [9]:
# train_test_split
X = df.drop(['class', 'label_class'], axis=1, inplace =False).values
y = df['label_class'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

In [10]:
y_train

array([0, 2, 1, ..., 0, 2, 0])

In [11]:
model_names=[KNeighborsClassifier(n_neighbors=5,weights="uniform"),
             RandomForestClassifier(),
             GradientBoostingClassifier(),
             DecisionTreeClassifier(),
             SVC(kernel='rbf'),
             GaussianNB(),
             BernoulliNB()
            ]

In [12]:
acc=[]
eval_acc={}
for classification_model in model_names:   
    print(classification_model)
    classification_model.fit(X_train,y_train)
    pred=classification_model.predict(X_test)
    acc.append(accuracy_score(pred,y_test))
eval_acc={'Modelling Algorithm':model_names,'Accuracy':acc}

KNeighborsClassifier()
RandomForestClassifier()
GradientBoostingClassifier()
DecisionTreeClassifier()
SVC()
GaussianNB()
BernoulliNB()


In [13]:
eval_acc={'Modelling Algorithm':model_names,'Accuracy':acc}
pd.DataFrame(eval_acc)

Unnamed: 0,Modelling Algorithm,Accuracy
0,KNeighborsClassifier(),0.901
1,"(DecisionTreeClassifier(max_features='auto', r...",0.9892
2,([DecisionTreeRegressor(criterion='friedman_ms...,0.986167
3,DecisionTreeClassifier(),0.985
4,SVC(),0.961033
5,GaussianNB(),0.973967
6,BernoulliNB(),0.517033


In [14]:
# search best parameters
reg = RandomForestClassifier()

parametros = {
    'n_estimators' : range(4,16),
    'criterion' : ['gini'],
    'max_depth' : range(2,16)
}

rand_est = RandomizedSearchCV(reg, parametros , n_iter=10, cv=3, scoring='neg_mean_absolute_error').fit(X_train,y_train)

print(rand_est.best_estimator_)
print(rand_est.best_params_)
print(rand_est.predict(X_test)[0])

RandomForestClassifier(max_depth=12, n_estimators=10)
{'n_estimators': 10, 'max_depth': 12, 'criterion': 'gini'}
0


In [15]:
# search best parameters
classifiers = {'SVC' : SVC(),
       'GRADIENT' : GradientBoostingClassifier(),
      'RandomForest':RandomForestClassifier()}

params = {
    'SVC' : {'kernel' : ['linear', 'poly', 'rbf'],
             'gamma' : ['auto', 'scale'],
             'C' : [1,5,10]},
    'GRADIENT' : {'loss' : ['deviance'],
                  'learning_rate' : [0.01, 0.05, 0.1]},
    'RandomForest' : {'n_estimators' : range(4,16),
                      'criterion' : ['gini'],
                      'max_depth' : range(2,16)}
    }

In [16]:
best_score = 999
best_model = None

for name, cls in classifiers.items():

    grid_search = GridSearchCV(cls, params[name], cv=3).fit(X_train, y_train)
    score = np.abs(grid_search.best_score_)

    if score < best_score:
        best_score = score
        best_model = grid_search.best_estimator_

In [17]:
print(best_model)
print(best_score)

SVC(C=10, kernel='poly')
0.9844142865242391


In [18]:
joblib.dump(best_model, 'best_model.pkl')

['best_model.pkl']

In [22]:
# prediction with best model
pred=best_model.predict(X_test)
pred

array([0, 1, 0, ..., 0, 2, 0])

In [23]:
accuracy_score(pred,y_test)

0.9843666666666666