In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sea
from imblearn.over_sampling import SMOTE
from collections import Counter
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.metrics import confusion_matrix, classification_report, balanced_accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
from pandas.plotting import scatter_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier

In [2]:
def filter_outliers(x2):
    columns = x2.columns.tolist()

    print('Shape before filtering: ', x2.shape)

    for col in columns:
        if col == 'class':continue
        q1 = x2[col].quantile(0.25)
        q3 = x2[col].quantile(0.75)
        iqr = q3 - q1

        filter = (x2[col] >= q1 - 1.5 * iqr) & (x2[col] <= q3 + 1.5 * iqr)
        x2 = x2.loc[filter]

    print('Shape after filtering: ', x2.shape)

    return x2

In [3]:
# Load data from csv file
data = pd.read_csv("star_classification.csv")
data = data.drop(['obj_ID','alpha','delta','run_ID','rerun_ID','cam_col','field_ID','spec_obj_ID','plate','MJD','fiber_ID', 'u', 'z'], axis = 1)

# Convert Class from string to int
data["class"] = [0 if i == "GALAXY" else 1 if i == "STAR" else 2 for i in data["class"]]

# Remove outliers
data = filter_outliers(data)

# Before balancing the features remove class label
x = data.drop(['class'], axis = 1)
y = data.loc[:,'class'].values

# Generate examples for QSO and Star class
sm = SMOTE(random_state=42)
print('Original dataset shape %s' % Counter(y))
x, y = sm.fit_resample(x, y)
print('Resampled dataset shape %s' % Counter(y))

# Scaling
scaler = StandardScaler()
scaler.fit(x)
x = scaler.transform(x)

# Train and Test split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.30, random_state = 42)

Shape before filtering:  (100000, 5)
Shape after filtering:  (90765, 5)
Original dataset shape Counter({0: 59236, 1: 21544, 2: 9985})
Resampled dataset shape Counter({0: 59236, 2: 59236, 1: 59236})


In [4]:
def parameter_tunning(X_train, X_test, y_train, y_test, clf, param_grid):
    # Perform grid search cross-validation to find the best hyperparameters
    grid_search = GridSearchCV(clf, param_grid=param_grid, cv=5)
    grid_search.fit(X_train, y_train)

    # Print the best hyperparameters and the corresponding score
    best_params = grid_search.best_params_
    print("Best hyperparameters: ", best_params)
    print("Best score: ", grid_search.best_score_)

    # Classification report
    y_pred = grid_search.predict(X_test)
    print("Classification Report: ")
    print()
    target_names = ['GALAXY', 'STAR', 'QUASAR']
    print(classification_report(y_test, y_pred, target_names=target_names))

    return best_params

## Logistic Regression without regularization

In [10]:
clf = LogisticRegression()

param_grid = {
    'solver': ['saga', 'lbfgs', 'liblinear'], 
    'max_iter': [200, 500, 1000, 1500],
    'C': [1, 10, 100, 1000, 10000], 
}

best_log_params = parameter_tunning(x_train, x_test, y_train, y_test, clf, param_grid)



Best hyperparameters:  {'C': 10000, 'max_iter': 200, 'solver': 'lbfgs'}
Best score:  0.9363318461352949
Classification Report: 

              precision    recall  f1-score   support

      GALAXY       0.89      0.92      0.90     17707
        STAR       0.99      1.00      1.00     17742
      QUASAR       0.93      0.88      0.91     17864

    accuracy                           0.94     53313
   macro avg       0.94      0.94      0.94     53313
weighted avg       0.94      0.94      0.94     53313



## Logistic Regression with regularization

In [5]:
clf = LogisticRegression()

param_grid = {
    'solver': ['saga', 'liblinear'], 
    'max_iter': [1000, 5000, 10000],
    'C': [1, 10, 100, 1000, 5000],
    'penalty': ['l1'] 
}

best_log_params = parameter_tunning(x_train, x_test, y_train, y_test, clf, param_grid)



Best hyperparameters:  {'C': 100, 'max_iter': 5000, 'penalty': 'l1', 'solver': 'saga'}
Best score:  0.9360424454359098
Classification Report: 

              precision    recall  f1-score   support

      GALAXY       0.89      0.92      0.90     17707
        STAR       0.99      1.00      0.99     17742
      QUASAR       0.93      0.88      0.91     17864

    accuracy                           0.93     53313
   macro avg       0.94      0.93      0.93     53313
weighted avg       0.94      0.93      0.93     53313



In [6]:
clf = LogisticRegression()

param_grid = {
    'solver': ['saga', 'liblinear', 'lbfgs'], 
    'max_iter': [500, 1000, 1500],
    'C': [1, 10, 100, 1000, 5000],
    'penalty': ['l2'] 
}

best_log_params = parameter_tunning(x_train, x_test, y_train, y_test, clf, param_grid)



Best hyperparameters:  {'C': 5000, 'max_iter': 500, 'penalty': 'l2', 'solver': 'lbfgs'}
Best score:  0.936307729410346
Classification Report: 

              precision    recall  f1-score   support

      GALAXY       0.89      0.92      0.90     17707
        STAR       0.99      1.00      1.00     17742
      QUASAR       0.93      0.88      0.91     17864

    accuracy                           0.94     53313
   macro avg       0.94      0.94      0.94     53313
weighted avg       0.94      0.94      0.94     53313



## SVM

In [None]:
clf = svm()

param_grid = {
    'C': [1, 10, 100, 1000], 
    'gamma': [0.001, 0.01, 0.1, 1],
    'kernel': ['rbf', 'sigmoid']
}

best_log_params = parameter_tunning(x_train, x_test, y_train, y_test, clf, param_grid)

## XGBoost

In [7]:
clf = XGBClassifier()

param_grid = {
    'learning_rate': [0.01, 0.1, 0.3],
    'max_depth': [3, 5],
    'n_estimators': [100, 1000],
    'gamma': [0, 0.1, 0.3]
}

best_log_params = parameter_tunning(x_train, x_test, y_train, y_test, clf, param_grid)

Best hyperparameters:  {'gamma': 0, 'learning_rate': 0.3, 'max_depth': 5, 'n_estimators': 1000}
Best score:  0.9684553237670324
Classification Report: 

              precision    recall  f1-score   support

      GALAXY       0.95      0.96      0.95     17707
        STAR       1.00      1.00      1.00     17742
      QUASAR       0.96      0.95      0.95     17864

    accuracy                           0.97     53313
   macro avg       0.97      0.97      0.97     53313
weighted avg       0.97      0.97      0.97     53313



## Random Forest

In [8]:
clf = RandomForestClassifier()

param_grid = {
    'n_estimators': [100, 1000],
    'max_depth': [5, 10, 12, 15],
    'min_samples_split': [2, 10, 14],
    'min_samples_leaf': [2, 4 , 6],
    'max_features': ['sqrt', 'log2'],
}

best_log_params = parameter_tunning(x_train, x_test, y_train, y_test, clf, param_grid)

Best hyperparameters:  {'max_depth': 15, 'max_features': 'log2', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 1000}
Best score:  0.96991840508059
Classification Report: 

              precision    recall  f1-score   support

      GALAXY       0.94      0.97      0.96     17707
        STAR       1.00      1.00      1.00     17742
      QUASAR       0.97      0.94      0.96     17864

    accuracy                           0.97     53313
   macro avg       0.97      0.97      0.97     53313
weighted avg       0.97      0.97      0.97     53313



## Neural Network

In [None]:
clf = MLPClassifier()

param_grid = {
    'hidden_layer_sizes': [(8,), (8,6)],
    'activation': ['relu', 'tanh'],
    'solver': ['sgd', 'adam'],
    'alpha': [0.0001, 0.001, 0.01],
    'learning_rate': ['constant', 'adaptive'],
}

best_log_params = parameter_tunning(x_train, x_test, y_train, y_test, clf, param_grid)