# Sloan Digital Sky Survey (SDSS) Galaxy Classification using ML



# I. Data Collection and Preparation

### 1. Reading The Dataset

In [None]:
# Importing the required libraries
import numpy as np
import pandas as pd

In [None]:
# Load the downloaded dataset into a DataFrame.
#
# About the dataset:
# The Sloan Digital Sky Survey (SDSS) has searched about one-third of the sky and
# found around 1 billion objects, almost 3 million of which are galaxies.
# This dataset contains 100,000 rows of photometric image data, and the galaxy
# subclass is limited to two types: 'STARFORMING' or 'STARBURST'.
# Dataset link: https://www.kaggle.com/datasets/bryancimo/sdss-galaxy-classification-dr18/
df = pd.read_csv(filepath_or_buffer='sdss_100k_galaxy_form_burst.csv')

In [None]:
# Show the first five rows of the dataset
df.head(n=5)

In [None]:
# Dimensions of the dataset as a (rows, columns) tuple:
#   - first value  -> number of rows
#   - second value -> number of columns
df.shape

In [None]:
# Print general per-column information about the dataset (DataFrame.info)
df.info()

### 2. Handling Missing Values

In [None]:
# Count the missing (null) values in each column
df.isna().sum()

### 3. Changing Datatype of "subclass" from Object to Int

In [None]:
# Number of rows belonging to each galaxy subclass
subclass_column = df["subclass"]
subclass_column.value_counts()

In [None]:
# Encode the "subclass" labels as integers: STARFORMING -> 0, STARBURST -> 1.
mapping = {"STARFORMING":0, "STARBURST":1}
# Assign the result back to the column. Calling replace(..., inplace=True) on
# df["subclass"] acts on a selection of df, which does not update df itself once
# pandas Copy-on-Write is active.
# Labels missing from `mapping` are passed through unchanged, so re-running this
# cell on already-encoded values is harmless; astype then makes the integer
# dtype explicit and raises if an unexpected label is present.
df["subclass"] = (
    df["subclass"]
    .map(lambda label: mapping.get(label, label))
    .astype("int64")
)
print("Completed")

In [None]:
# Remove the unnecessary columns from the dataset (in place)
unused_columns = ["objid", "specobjid", "class"]
df.drop(columns=unused_columns, inplace=True)
print("Completed")

# II. Exploratory Data Analysis

### 1. Descriptive Statistics

In [None]:
# Descriptive statistics of the dataset's columns (DataFrame.describe)
df.describe()

### 2. Univariate Analysis

In [None]:
# Importing the necessary plotting libraries with the required settings
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's theme to subsequent plots
sns.set()
# IPython magic: set the inline backend's figure format to 'svg'
%config InlineBackend.figure_format = 'svg'

In [None]:
# Row count per subclass, with the integer codes turned back into their names
label_names = {0:"STARFORMING",1:"STARBURST"}
named_subclass = df["subclass"].replace(label_names)
sub_class = named_subclass.value_counts()
sub_class

In [None]:
# Depicting the univariate distribution of the "subclass" column using a pie plot.
# The wedge labels are taken from sub_class.index rather than a hard-coded list:
# sub_class comes from value_counts(), so a fixed label order would mislabel the
# wedges whenever the counts are not in that same order.
plt.pie(sub_class, labels = sub_class.index, autopct = '%.2f%%')
plt.show()

In [None]:
# Collect the names of the numerical and categorical columns in separate lists
numeric_dtypes = ["float64","int64"]
categorical_dtypes = ["object", "bool"]
numerical_vars = list(df.select_dtypes(include=numeric_dtypes).columns)
categorical_vars = list(df.select_dtypes(include=categorical_dtypes).columns)

In [None]:
# Depicting the univariate distribution of every numerical column using box
# plots, one subplot row per column.
count = len(numerical_vars)
# squeeze=False makes plt.subplots always return a 2-D array of Axes; without it
# a single numerical column yields one bare Axes object and axes[i] would fail.
fig, axes = plt.subplots(count, 1, figsize = (15,200), squeeze=False)
axes = axes.ravel()  # back to a flat sequence, one Axes per column
for ax, var in zip(axes, numerical_vars):
  sns.boxplot(x=df[var], ax=ax)
plt.show()

### 3. Bivariate Analysis

In [None]:
# Box plot of each listed feature, split by galaxy subclass
mapping = {0: "STARFORMING", 1: "STARBURST"}
numerical_vars2 = [
    'ra', 'dec',
    'u', 'g', 'r', 'i', 'z',
    'modelFlux_u', 'modelFlux_g', 'modelFlux_r', 'modelFlux_i', 'modelFlux_z',
    'petroRad_u', 'petroRad_g', 'petroRad_i', 'petroRad_r', 'petroRad_z',
    'petroFlux_u', 'petroFlux_g', 'petroFlux_i', 'petroFlux_r', 'petroFlux_z',
    'petroR50_u', 'petroR50_g', 'petroR50_i', 'petroR50_r', 'petroR50_z',
    'psfMag_u', 'psfMag_r', 'psfMag_g', 'psfMag_i', 'psfMag_z',
    'expAB_u', 'expAB_g', 'expAB_r', 'expAB_i', 'expAB_z',
    'redshift', 'redshift_err',
]
# The x-axis grouping is the same for every plot, so build it once
subclass_labels = df['subclass'].map(mapping)
for feature in numerical_vars2:
  sns.boxplot(data=df, x=subclass_labels, y=df[feature])
  print()
  plt.show()

### 4. Multivariate Analysis

In [None]:
# Heatmap of the pairwise correlations between the numerical columns
correlation_matrix = df[numerical_vars].corr()
plt.figure(figsize=(40, 40))
sns.heatmap(data=correlation_matrix, annot=True, annot_kws={'size': 10})
plt.show()

### 5. Handling Outliers - IQR Method

In [None]:
# IQR method
def handle_outliers(column, data=None):
    """Cap the outliers of one column in place using the 1.5 * IQR rule.

    Values above ``Q3 + 1.5 * IQR`` are replaced by that upper bound and values
    below ``Q1 - 1.5 * IQR`` by that lower bound, where Q1 and Q3 are the 25th
    and 75th percentiles of the column and ``IQR = Q3 - Q1``.

    Args:
        column: Label of the column to cap.
        data: DataFrame to modify. Defaults to the notebook-level ``df``, so
            existing ``handle_outliers(column)`` calls behave as before.

    Returns:
        ``[lower_bound, upper_bound]``: the limits that were applied.
    """
    # Only fall back to the global when no frame is passed explicitly.
    frame = df if data is None else data
    q1, q3 = frame[column].quantile(q=[0.25, 0.75])
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    # Cap both tails in a single pass; values inside the bounds are untouched.
    frame[column] = np.clip(frame[column].to_numpy(), lower_bound, upper_bound)
    return [lower_bound, upper_bound]

In [None]:
# Cap the outliers of every numerical feature column and plot the result.
for i in numerical_vars:
  # "subclass" is the encoded 0/1 target, not a measurement, so it must not be
  # capped: if one class holds fewer than 25% of the rows, Q1 == Q3, the IQR is
  # 0 and every label of the smaller class would be overwritten.
  if i == "subclass":
    continue
  x= handle_outliers(i)
  print("Column: ",i)
  print("Upper Bound: ", x[1])
  print("Lower Bound: ", x[0])
  sns.boxplot(x=df[i])
  plt.show()

### 6. Selecting Best Features using Select K Best

In [None]:
# Selecting Best Features
from sklearn.feature_selection import SelectKBest, f_classif

# Target: only the "subclass" column; features: every other column
y = df["subclass"]
X = df.drop(columns=["subclass"])

# Keep the 10 features with the highest f_classif score
selector = SelectKBest(score_func=f_classif, k=10)
X_selected = selector.fit_transform(X, y)
selected_features = X.columns[selector.get_support()]

print("Selected Features: ")
print(*selected_features, sep="\n")

### 7. Balancing Value Counts using SMOTE

In [None]:
# Class counts of the target before resampling
y.value_counts()

In [None]:
from imblearn.over_sampling import SMOTE
# Oversample with SMOTE, then show the class counts of the resampled target
smote = SMOTE(random_state=42)
resampled = smote.fit_resample(X, y)
X_resampled, y_resampled = resampled
pd.Series(y_resampled).value_counts()

### 8. Splitting Data Into Train and Test

In [None]:
from sklearn.model_selection import train_test_split
# Feature columns used for modelling (presumably the ten printed by the
# SelectKBest cell; verify if the selection is re-run).
feature_cols = ['r', 'i', 'z', 'petroRad_g', 'petroRad_r', 'petroR50_u', 'petroR50_g', 'petroR50_i', 'petroR50_r', 'petroR50_z']
train_df = df[feature_cols + ['subclass']]
X = train_df[feature_cols]
y = train_df["subclass"]
# stratify keeps the class ratio identical in both splits; random_state makes
# the split reproducible between runs.
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.20,stratify=y,random_state=42)
# Balance the classes on the training split only, reusing the SMOTE instance
# created in the balancing cell. The test split keeps the real class ratio and
# contains no synthetic samples, so the reported metrics stay honest.
X_train, y_train = smote.fit_resample(X_train, y_train)
print("Completed")

### 9. Scaling The Feature Variables Using The StandardScaler Method

In [None]:
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the training features, then transform those same features.
# NOTE(review): scaled_data is not referenced by the model cells below, which
# fit on X_train directly; confirm whether the scaled values were meant to be used.
sc = StandardScaler()
sc.fit(X_train)
scaled_data = sc.transform(X_train)
print("Completed")

# III. Model Building

### 1. Decision Tree Classifier

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report,precision_score, recall_score, f1_score
from sklearn.tree import DecisionTreeClassifier

# Train a decision tree on the training split and evaluate it on the test split
clf = DecisionTreeClassifier().fit(X_train, y_train)
y_pred = clf.predict(X_test)
report = classification_report(y_test, y_pred)

print("Classification Report:\n",report)
print("Accuracy Score: ",accuracy_score(y_pred,y_test))

### 2. Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
import warnings
warnings.filterwarnings('ignore')

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Logistic regression is sensitive to feature scale, so standardise inside a
# pipeline: the scaler is fitted on the training split only and the same
# transformation is applied automatically when predicting on the test split.
lg = make_pipeline(StandardScaler(), LogisticRegression())
log = lg.fit(X_train, y_train)
y_pred = lg.predict(X_test)

# scikit-learn metrics take (y_true, y_pred) in that order
print("Confusion Matrix:\n",confusion_matrix(y_test,y_pred))
print("Classification report:\n",classification_report(y_test,y_pred))
print("Accuracy Score: ",accuracy_score(y_test,y_pred))

### 3. Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report,precision_score, recall_score, f1_score

# Train a random forest and evaluate it on both the training and test splits
RF=RandomForestClassifier()
RF.fit(X_train,y_train)
RFtrain=RF.predict(X_train)
RFtest=RF.predict(X_test)

# scikit-learn metrics expect (y_true, y_pred). Passing the predictions first
# transposes the confusion matrix and swaps precision with recall in the report.
print("Confusion Matrix (Train):\n",confusion_matrix(y_train,RFtrain))
print("Confusion Matrix (Test):\n",confusion_matrix(y_test,RFtest))
print("Classification report (Train):\n",classification_report(y_train,RFtrain))
print("Classification report (Test):\n",classification_report(y_test,RFtest))
print("Accuracy Score (Train):", accuracy_score(y_train,RFtrain))
print("Accuracy Score: (Test)",accuracy_score(y_test,RFtest))

# IV. Model Deployment

## 1. Saving the Model

In [None]:
import joblib

# Persist the trained random forest to disk
filename = 'RFmodel.sav'
joblib.dump(value=RF, filename=filename)


## 2. Testing the Model

In [None]:
import joblib
# Load the saved model. joblib files are pickle-based: only load files you
# created yourself or otherwise trust.
model = joblib.load('RFmodel.sav')

# One sample; the values must follow the order of the columns the model was
# trained on.
sample = [12.458694,16.708910,203.597861,4.180779,4.096248,194.731000,1.999653,2.149080,2.047596,2.055798]
# Wrap the sample in a DataFrame carrying the training column names so the
# input matches what the model was fitted on (feature_names_in_ is set by
# scikit-learn when the model is fitted on a DataFrame).
features = pd.DataFrame([sample], columns=model.feature_names_in_)
output = model.predict(features)
# predict returns one label per row; read the single prediction explicitly
# instead of relying on the truth value of a whole array.
if output[0] == 0:
    print("Predicted output is: STARFORMING")
else:
    print("Predicted output is: STARBURST")

## 3. Flask Deployment

The content for this section is in the run.py file.