In [1]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pylab as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import mutual_info_regression, f_regression, SelectKBest
from imblearn.over_sampling import SMOTE
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from scipy.stats import pearsonr
from sklearn.model_selection import cross_val_score, learning_curve
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.neighbors import LocalOutlierFactor
from sklearn.model_selection import GridSearchCV
import pickle
from sklearn.model_selection import StratifiedKFold

In [2]:
# dfStars = pd.read_csv('Skyserver_12_15_2020 3 45 07 AM.csv', na_values="?")
dfStars = pd.read_csv('FileCSV/star_classification.csv', na_values="?")

In [3]:
# Encoding the classed-class with numerical data
dfStars['class'] = dfStars['class'].replace({'GALAXY': 0, 'STAR': 1, 'QSO': 2})

In [4]:
# Train and Test
X = dfStars.drop('class', axis=1)
y = dfStars['class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)


In [5]:
X_train.drop(['objid', 'ra', 'run','rerun','camcol','field','fiberid'], axis = 1, inplace=True) # type: ignore

In [7]:
#DBSCAN

import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score

# Selezionare le colonne di interesse
cols = ['u', 'g', 'r', 'i', 'z', 'specobjid', 'redshift']
df_merged = pd.concat([X_train, y_train], axis=1) # type: ignore
data = df_merged[cols]

# Normalizzare i dati
scaler = StandardScaler()
data = scaler.fit_transform(data)
data = pd.DataFrame(data, columns=['u', 'g', 'r', 'i', 'z', 'specobjid', 'redshift'])

# Definire i parametri di ingresso
eps_list = [0.1, 0.5, 1]
min_samples_list = [2, 4, 8, 10]

for eps_i in eps_list:
    for min_samples_i in min_samples_list:

        # Eseguire DBSCAN
        dbscan = DBSCAN(eps=eps_i, min_samples=min_samples_i)
        dbscan.fit(data)

        # Identificare gli outlier
        outliers = data[dbscan.labels_ == -1]

        # Calcolare la silhouette score
        score = silhouette_score(data, dbscan.labels_)

        # Stampa del risultato
        print(f"Silhouette score for EPS = {eps_i} e MIN_SAMPLES = {min_samples_i}: ", score)

Silhouette score for EPS = 0.1 e MIN_SAMPLES = 2:  -0.6382729104157495
Silhouette score for EPS = 0.1 e MIN_SAMPLES = 4:  -0.6851897386920022
Silhouette score for EPS = 0.1 e MIN_SAMPLES = 8:  -0.569159589777764
Silhouette score for EPS = 0.1 e MIN_SAMPLES = 10:  -0.4949190923878875
Silhouette score for EPS = 0.5 e MIN_SAMPLES = 2:  -0.5459116711181403
Silhouette score for EPS = 0.5 e MIN_SAMPLES = 4:  -0.38431007899298303
Silhouette score for EPS = 0.5 e MIN_SAMPLES = 8:  -0.25374005594808696
Silhouette score for EPS = 0.5 e MIN_SAMPLES = 10:  -0.2927925892207605
Silhouette score for EPS = 1 e MIN_SAMPLES = 2:  -0.11432325995002039
Silhouette score for EPS = 1 e MIN_SAMPLES = 4:  0.18087522784821394
Silhouette score for EPS = 1 e MIN_SAMPLES = 8:  0.4141007483057308
Silhouette score for EPS = 1 e MIN_SAMPLES = 10:  0.4204837299300606


In [40]:
# Definire i parametri di ingresso
eps_list = 4
min_samples_list = 5000


# Eseguire DBSCAN
dbscan = DBSCAN(eps=eps_list, min_samples=min_samples_list)
dbscan.fit(data)

# Identificare gli outlier
outliers = data[dbscan.labels_ == -1]

# Calcolare la silhouette score
score = silhouette_score(data, dbscan.labels_)

# Stampa del risultato
print(f"Silhouette score for EPS = {eps_list} e MIN_SAMPLES = {min_samples_list}: ", score)

Silhouette score for EPS = 4 e MIN_SAMPLES = 5000:  0.62214534699581


In [32]:
df_filtered = dfStars[dfStars['u'] < 0]
print("u: " + str(df_filtered.shape))

df_filtered = dfStars[dfStars['g'] < 0]
print("g: " + str(df_filtered.shape))

df_filtered = dfStars[dfStars['z'] < 0]
print("z: " + str(df_filtered.shape))

u: (1, 18)
g: (1, 18)
z: (1, 18)


In [None]:
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.neighbors import LocalOutlierFactor

# Definizione del modello LOF
lof = LocalOutlierFactor()

# Definizione del range di valori per i parametri
param_grid = {'n_neighbors': [1, 5, 10, 20, 30], 'contamination': [0.001, 0.01, 0.1, 'auto']}

# Creazione della griglia di parametri
grid_lof = GridSearchCV(lof, param_grid, cv=5, scoring='neg_mean_squared_error')

# Fit del modello sui dati e ricerca dei migliori parametri
grid_lof.fit(X)

# Stampa dei migliori parametri
print("I migliori parametri sono: ", grid_lof.best_params_)


In [7]:
#OUTLIER 

df_merged = pd.concat([X_train, y_train], axis=1) # type: ignore
print(df_merged.shape)

# Creates the LocalOutlierFactor object with n_neighbors and contamination parameters
lof = LocalOutlierFactor(n_neighbors=1, contamination=0.01) # type: ignore

# Select the DataFrame for the GALAXY class 
# Extract the features from the DataFrame
# Get the outliers with the fit_predict method
XX = df_merged[['u', 'g', 'r', 'i', 'z', 'specobjid', 'redshift']]
outliers = lof.fit_predict(XX)
df_after_outlier = df_merged[outliers == 1]

print(df_after_outlier.shape)

X_train = df_after_outlier.drop('class', axis=1)
y_train = df_after_outlier['class']

(70000, 11)
(69300, 11)


In [8]:
# REBALANCING
# using oversampling with SMOTE to deal with imbalanced data
sm = SMOTE(random_state=42)
X_train_after_balancing, y_train_after_balancing = sm.fit_resample(X_train, y_train) # type: ignore

print(X_train_after_balancing.shape) # type: ignore
y_train_after_balancing.value_counts() # type: ignore

(123768, 10)


0    41256
1    41256
2    41256
Name: class, dtype: int64

In [9]:
# Split the dataset into features and classs
X_train_class = X_train_after_balancing
y_train_class = y_train_after_balancing

X_test_drop = X_test.drop(['objid', 'ra', 'run', 'rerun', 'camcol', 'field', 'fiberid'], axis = 1) # type: ignore

In [10]:
from sklearn.preprocessing import StandardScaler

# Normalizzare i dati
scaler = StandardScaler()
X_train_class_npArr = scaler.fit_transform(X_train_class)

X_train_class_norm = pd.DataFrame(X_train_class_npArr, columns=['dec', 'u', 'g', 'r', 'i', 'z', 'specobjid', 'redshift', 'plate', 'mjd'])