# SDSS QUERY

# Import

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from astropy.io import fits
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import requests
import time


# funzioni per visualizzazione

In [None]:
# Scatter plot of predicted vs. true redshift, plus printed summary metrics.
def scatter_plot(y_true, y_pred, title, *, outlier_threshold=0.15, z_max=6):
    """Plot predicted against true redshift and print residual statistics.

    The statistics are computed on the normalized residual

        delta_z_norm = (z_pred - z_true) / (1 + z_true)

    and an object counts as an outlier when
    ``|delta_z_norm| > outlier_threshold``.

    The figure shows the identity line (red) and the two outlier
    boundaries ``z_pred = z_true +/- outlier_threshold * (1 + z_true)``
    (blue), so points outside the blue lines are exactly the objects
    counted as outliers.

    Args:
        y_true: True redshifts (any 1-D array-like, e.g. a pandas Series).
        y_pred: Predicted redshifts, same length as ``y_true``.
        title: Figure title; also used as prefix of the printed summary.
        outlier_threshold: Outlier cut on ``|delta_z_norm|``.
        z_max: Upper limit of both plot axes (the lower limit is 0).

    Raises:
        ValueError: If the inputs are empty or have different lengths.
    """
    z_true = np.asarray(y_true, dtype=float).ravel()
    z_pred = np.asarray(y_pred, dtype=float).ravel()
    if z_true.size == 0:
        raise ValueError("scatter_plot needs at least one object")
    if z_true.shape != z_pred.shape:
        raise ValueError(
            f"y_true and y_pred differ in length: {z_true.size} vs {z_pred.size}"
        )

    residuals = (z_pred - z_true) / (1 + z_true)
    mean_res = np.mean(residuals)
    std_res = np.std(residuals)
    outlier_mask = np.abs(residuals) > outlier_threshold
    outlier_percentage = np.mean(outlier_mask) * 100

    # Every guide line is linear in z_true, so its two end points suffice.
    ends = np.array([z_true.min(), z_true.max()])
    margin = outlier_threshold * (1 + ends)

    plt.figure(figsize=(6, 6))
    sns.scatterplot(x=z_true, y=z_pred, alpha=0.8, s=0.1)
    plt.plot(ends, ends, linestyle='--', color='red')  # identity line
    plt.plot(ends, ends + margin, linestyle='--', color='blue')  # upper outlier boundary
    plt.plot(ends, ends - margin, linestyle='--', color='blue')  # lower outlier boundary
    plt.xlabel("z Reale")
    plt.ylabel("z Predetto")
    plt.title(title)
    plt.ylim(0, z_max)
    plt.xlim(0, z_max)
    plt.show()
    print(f"{title} - Mean Residual: {mean_res}, Std Residual: {std_res}, Outliers > {outlier_threshold}: {outlier_percentage:.2f}%")

    
# Histogram grid showing the distribution of every feature.
def plot_feature_distributions(df, features, cols=5):
    """Draw one log-scaled step histogram per feature on a grid of subplots.

    Args:
        df: DataFrame holding the feature columns.
        features: Names of the columns to plot.
        cols: Number of subplot columns; the number of rows is derived
            from ``len(features)``.

    Raises:
        ValueError: If ``cols`` is smaller than 1.

    Nothing is drawn when ``features`` is empty.
    """
    features = list(features)
    if cols < 1:
        raise ValueError("cols must be at least 1")
    if not features:
        return

    rows = int(np.ceil(len(features) / cols))
    # squeeze=False always returns a 2-D array of axes, even for a 1x1 grid,
    # so flatten() below is valid for any number of features.
    fig, axes = plt.subplots(rows, cols, figsize=(cols * 4, rows * 4), squeeze=False)
    axes = axes.flatten()

    for ax, feature in zip(axes, features):
        sns.histplot(df[feature], ax=ax, bins=30, kde=False, element="step")
        ax.set_yscale("log")
        ax.set_title(feature)

    # Remove the unused cells of the grid.
    for ax in axes[len(features):]:
        fig.delaxes(ax)

    plt.tight_layout()
    plt.show()

# Scarica i dati, legge il file, splitta in train e test

In [None]:
start_time = time.time()

# Download the FITS catalogue unless a local copy already exists
url = "http://dame.na.astro.it/fileShare/catania_cavuoti.fit"
file_name = "catania_cavuoti.fit"
if not os.path.exists(file_name):
    response = requests.get(url, timeout=60)
    # Fail loudly on an HTTP error: otherwise the error page would be saved
    # under the catalogue name and reused on every later run.
    response.raise_for_status()
    # Write to a temporary name first so an interrupted write never leaves
    # a truncated file that passes the existence check above.
    partial_name = file_name + ".part"
    with open(partial_name, 'wb') as file:
        file.write(response.content)
    os.replace(partial_name, file_name)

# Read the table stored in the first FITS extension into a DataFrame
with fits.open(file_name) as hdul:
    data = hdul[1].data
    df = pd.DataFrame(data.tolist(), columns=data.names)


# Skip the columns that are not needed, pick the target column and use all
# the remaining columns as features
ignore = ["specObjID", "objid", "ra", "dec", "targetObjID", "zErr"]
target = "z"
excluded = {*ignore, target}
features = [col for col in df.columns if col not in excluded]


# Keep only the rows whose feature values are all >= 0
objinthecatalog = df.shape[0]
df = df[(df[features] >= 0).all(axis=1)]
remainingobj = df.shape[0]

plot_feature_distributions(df, features, 5)


print("from ", objinthecatalog, "intial objects we have now", remainingobj)
print("object discarded:", objinthecatalog - remainingobj)

# Train-test split
seed = 42
train_size = 0.2

# Only 20% of the data is used for training, purely to keep the experiments
# short; as a real training fraction it makes little sense.

X_train, X_test, y_train, y_test = train_test_split(
    df[features], df[target], train_size=train_size, random_state=seed
)
print([f"--- {round(time.time() - start_time, 3)} seconds ---"])



In [None]:
start_time = time.time()

# Baseline experiment: a 500-tree Random Forest trained on all the features
rf = RandomForestRegressor(n_estimators=500, random_state=seed, n_jobs=10)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

# Rank the features by the importance assigned by the fitted forest
importances = rf.feature_importances_
feature_importance_df = pd.DataFrame(
    {"Feature": features, "Importance": importances}
).sort_values(by="Importance", ascending=False)

# Horizontal bar chart of the ranking, most important feature first
plt.figure(figsize=(10, 20))
sns.barplot(
    x=feature_importance_df["Importance"],
    y=feature_importance_df["Feature"],
)
plt.xlabel("Importanza")
plt.ylabel("Feature")
plt.title("Importanza delle Feature - Random Forest")
plt.show()

# Predicted vs. true redshift on the test set
scatter_plot(y_test, y_pred_rf, "Random Forest: z Reale vs Predetto")

print([f"--- {round(time.time() - start_time, 3)} seconds ---"])

In [None]:
start_time = time.time()

# Hyper-parameter grid search for the Random Forest (3-fold CV, scored by R^2)
param_grid = {
    'max_depth': [10, 30, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
}
# NOTE(review): both the forest (n_jobs=10) and the search (n_jobs=-1) ask
# for parallel workers - confirm this does not oversubscribe the machine.
grid_search_rf = GridSearchCV(
    RandomForestRegressor(random_state=seed, n_estimators=500, n_jobs=10),
    param_grid,
    cv=3,
    scoring='r2',
    n_jobs=-1,
)
grid_search_rf.fit(X_train, y_train)
best_rf = grid_search_rf.best_estimator_
y_pred_best_rf = best_rf.predict(X_test)
print("Migliori parametri per Random Forest:", grid_search_rf.best_params_)

# Feature ranking according to the best forest found by the search
importances = best_rf.feature_importances_
feature_importance_df = pd.DataFrame(
    {"Feature": features, "Importance": importances}
).sort_values(by="Importance", ascending=False)

# Horizontal bar chart of the ranking, most important feature first
plt.figure(figsize=(10, 20))
sns.barplot(
    x=feature_importance_df["Importance"],
    y=feature_importance_df["Feature"],
)
plt.xlabel("Importanza")
plt.ylabel("Feature")
plt.title("Importanza delle Feature - Random Forest")
plt.show()

# Baseline forest first, tuned forest second, for a side-by-side comparison
scatter_plot(y_test, y_pred_rf, "Random Forest: z Reale vs Predetto")
scatter_plot(y_test, y_pred_best_rf, "Random Forest: z Reale vs Predetto Best Hyper")

print([f"--- {round(time.time() - start_time, 3)} seconds ---"])

In [None]:
start_time = time.time()

# KNN experiment: the same 10-neighbour regressor on raw, min-max scaled and
# standardized features. Both scalers are fitted on the training set only.
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

scaler2 = StandardScaler()
X_train_standard_scaled = scaler2.fit_transform(X_train)
X_test_standard_scaled = scaler2.transform(X_test)

# Raw features
knn = KNeighborsRegressor(n_neighbors=10)
knn.fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)

# Min-max scaled features
knn_scaled = KNeighborsRegressor(n_neighbors=10)
knn_scaled.fit(X_train_scaled, y_train)
y_pred_knn_scaled = knn_scaled.predict(X_test_scaled)

# Standardized features
knn_standard_scaled = KNeighborsRegressor(n_neighbors=10)
knn_standard_scaled.fit(X_train_standard_scaled, y_train)
y_pred_knn_standard_scaled = knn_standard_scaled.predict(X_test_standard_scaled)

scatter_plot(y_test, y_pred_knn, "KNN senza normalizzazione")
scatter_plot(y_test, y_pred_knn_scaled, "KNN con normalizzazione")
scatter_plot(y_test, y_pred_knn_standard_scaled, "KNN con standard scaler")

print([f"--- {round(time.time() - start_time, 3)} seconds ---"])

In [None]:
start_time = time.time()

# MLP experiment: the same network on raw, min-max scaled and standardized
# features. Both scalers are fitted on the training set only.
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

scaler2 = StandardScaler()
X_train_standard_scaled = scaler2.fit_transform(X_train)
X_test_standard_scaled = scaler2.transform(X_test)

# Raw features
mlp = MLPRegressor(random_state=seed, max_iter=1000)
mlp.fit(X_train, y_train)
y_pred_mlp = mlp.predict(X_test)

# Min-max scaled features
mlp_scaled = MLPRegressor(random_state=seed, max_iter=1000)
mlp_scaled.fit(X_train_scaled, y_train)
y_pred_mlp_scaled = mlp_scaled.predict(X_test_scaled)

# Standardized features
mlp_standard_scaled = MLPRegressor(random_state=seed, max_iter=1000)
mlp_standard_scaled.fit(X_train_standard_scaled, y_train)
y_pred_mlp_standard_scaled = mlp_standard_scaled.predict(X_test_standard_scaled)

scatter_plot(y_test, y_pred_mlp, "mlp senza normalizzazione")
scatter_plot(y_test, y_pred_mlp_scaled, "mlp con normalizzazione")
scatter_plot(y_test, y_pred_mlp_standard_scaled, "mlp con standard scaler")

print([f"--- {round(time.time() - start_time, 3)} seconds ---"])




In [None]:
start_time = time.time()

# MLP experiment restricted to the 10 most important features
top_10_features = feature_importance_df.head(10)["Feature"].tolist()
X_train_top10, X_test_top10 = X_train[top_10_features], X_test[top_10_features]

# A dedicated scaler is used here: refitting the shared `scaler2` on a
# 10-column subset would leave it unusable for the full feature matrices.
scaler_top10 = StandardScaler()
X_train_scaled_top10 = scaler_top10.fit_transform(X_train_top10)
X_test_scaled_top10 = scaler_top10.transform(X_test_top10)

# Raw top-10 features
mlp_top10 = MLPRegressor(random_state=seed, max_iter=1000)
mlp_top10.fit(X_train_top10, y_train)
y_pred_mlp_top10 = mlp_top10.predict(X_test_top10)

scatter_plot(y_test, y_pred_mlp, "mlp all feat")
scatter_plot(y_test, y_pred_mlp_top10, "mlp top 10 features")

# Standardized top-10 features, on a separate model so that `mlp_top10`
# keeps matching `y_pred_mlp_top10`
mlp_top10_scaled = MLPRegressor(random_state=seed, max_iter=1000)
mlp_top10_scaled.fit(X_train_scaled_top10, y_train)
y_pred_mlp_scaled_top10 = mlp_top10_scaled.predict(X_test_scaled_top10)

# The title previously read "KNN ..." although these are MLP predictions
scatter_plot(y_test, y_pred_mlp_scaled_top10, "mlp top 10 features scaled")
print([f"--- {round(time.time() - start_time, 3)} seconds ---"])

In [None]:
start_time = time.time()

# KNN experiment restricted to the 10 most important features
top_10_features = feature_importance_df.head(10)["Feature"].tolist()
X_train_top10, X_test_top10 = X_train[top_10_features], X_test[top_10_features]

# A dedicated scaler is used here: refitting the shared `scaler2` on a
# 10-column subset would leave it unusable for the full feature matrices.
scaler_top10 = StandardScaler()
X_train_scaled_top10 = scaler_top10.fit_transform(X_train_top10)
X_test_scaled_top10 = scaler_top10.transform(X_test_top10)

# Fresh estimators (same n_neighbors=10 as the all-features `knn`) so the
# all-features model is not overwritten by the top-10 fits.
knn_top10 = KNeighborsRegressor(n_neighbors=10)
knn_top10.fit(X_train_top10, y_train)
y_pred_knn_top10 = knn_top10.predict(X_test_top10)

scatter_plot(y_test, y_pred_knn, "KNN all feat")
scatter_plot(y_test, y_pred_knn_top10, "KNN top 10 features")

knn_top10_scaled = KNeighborsRegressor(n_neighbors=10)
knn_top10_scaled.fit(X_train_scaled_top10, y_train)
y_pred_knn_scaled_top10 = knn_top10_scaled.predict(X_test_scaled_top10)

# The scaled prediction was computed but never shown; plot it
scatter_plot(y_test, y_pred_knn_scaled_top10, "KNN 10 features scaled")
print([f"--- {round(time.time() - start_time, 3)} seconds ---"])

In [None]:
start_time = time.time()

# KNN, MLP and RF on the 10 most important features
mlp = MLPRegressor(random_state=seed, max_iter=1000)
# NOTE(review): these RF settings are presumably the best parameters found
# by the grid search - verify against its printed output.
rf_top10 = RandomForestRegressor(
    random_state=seed,
    max_depth=30,
    min_samples_leaf=2,
    min_samples_split=5,
    n_estimators=500,
    n_jobs=10,
)
# NOTE(review): default n_neighbors here, while the other KNN cells use
# n_neighbors=10 - confirm this is intended.
knn_top10 = KNeighborsRegressor()

# MLP and KNN are evaluated on the standardized test matrix, so they must be
# trained on the standardized training matrix too; they were previously
# fitted on the raw features. The forest uses raw features on both sides.
mlp.fit(X_train_scaled_top10, y_train)
rf_top10.fit(X_train_top10, y_train)
knn_top10.fit(X_train_scaled_top10, y_train)

y_pred_mlp = mlp.predict(X_test_scaled_top10)
y_pred_rf_top10 = rf_top10.predict(X_test_top10)
y_pred_knn_top10 = knn_top10.predict(X_test_scaled_top10)

scatter_plot(y_test, y_pred_mlp, "MLP top 10 features")
scatter_plot(y_test, y_pred_rf_top10, "RF top 10 features")
scatter_plot(y_test, y_pred_knn_top10, "KNN top 10 features")
print([f"--- {round(time.time() - start_time, 3)} seconds ---"])