In [None]:
import csv
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
#!pip install wget
import wget
from astropy.io import fits
from astropy.table import Table
from IPython.display import display # Allows the use of display() for displaying DataFrames
from tqdm.notebook import tqdm

# IMPORT PLATE NAMES

In [None]:
#These files are provided by the data survey and contain the names and parameters of the stars.
#Each catalogue is fetched only when it is not already in the working directory, so that
#re-running the notebook does not download the (large) files again; wget.download does not
#overwrite an existing file but stores an additional numbered copy next to it.
catalog_urls = [
    'https://data.sdss.org/sas/dr16/sdss/sspp/ssppOut-dr12.fits', #optic data
    'https://data.sdss.org/sas/dr16/apogee/spectro/aspcap/r12/l33/allStar-r12-l33.fits', #IR data
]

for catalog_url in catalog_urls:
    # The local file name is the last component of the URL.
    if not os.path.exists(os.path.basename(catalog_url)):
        wget.download(catalog_url)


In [None]:
#This notebook preprocesses the IR data only: load the first table extension of the
#IR catalogue and show its first row to check which columns are available.
catalog_file = 'allStar-r12-l33.fits'
df = Table.read(catalog_file, hdu=1)
df[0]

In [None]:
#We are going to download the spectra here:
save_path = 'Downloaded_IR/'

#The cells below download into this folder and np.save their bookkeeping arrays there,
#so it has to exist before they run.
os.makedirs(save_path, exist_ok=True)

# DOWNLOAD SPECTRA

In [None]:
# Download one spectrum file per catalogue row.
#   used_plates  -> catalogue indices whose file is available locally
#   error_plates -> catalogue indices for which every download attempt failed
used_plates = []
error_plates = []

base_url = 'https://data.sdss.org/sas/dr16/apogee/spectro/aspcap/r12/l33/'

for i in tqdm(range(0,50000)):

    star_row = df[i]
    TELESCOPE = str(star_row['TELESCOPE'])
    FIELD = str(star_row['FIELD'])
    FILE = str(star_row['FILE'])

    # Names to try, in order: the catalogue name as-is, then the same name with its
    # first two characters replaced by 'aspcap'.
    candidate_files = [FILE, 'aspcap' + FILE[2:]]

    # Resume support: skip stars whose file is already on disk instead of downloading
    # it again (a second download would only add a numbered duplicate).
    if any(os.path.exists(os.path.join(save_path, name)) for name in candidate_files):
        used_plates.append(i)
        continue

    for attempt, FILE in enumerate(candidate_files):
        url = base_url+TELESCOPE+'/'+FIELD+'/'+FILE
        try:
            wget.download(url, out = save_path)
        except Exception:
            # Best effort: a failed request just moves on to the next candidate name.
            # Only Exception is caught so that interrupting the kernel still stops the loop.
            continue
        if attempt > 0:
            print('SUCCESS')
        used_plates.append(i)
        break
    else:
        # No candidate could be downloaded; report the last URL that was tried.
        error_plates.append(i)
        print('ERROR:')
        print(url)

np.save(save_path+'used_plates', used_plates) 
np.save(save_path+'error_plates', error_plates)

# VISUALIZE DATA

In [None]:
# Look at a single catalogue entry: print its labels and list the HDUs of its spectrum file.
i = 10
star = df[i]

ID = str(star["APOGEE_ID"])
FeH = str(star["FE_H"])
TEff = str(star["TEFF"])
SpType = str(star["ASPCAP_CLASS"])
LogG = str(star["LOGG"])
print([ID,FeH,TEff,SpType,LogG])

# Local file name: catalogue name with its first two characters replaced by 'aspcap'.
FILE = 'aspcap' + str(star['FILE'])[2:]

# `data` is left open on purpose: the plotting cells that follow read from it.
data = fits.open('Downloaded_IR/' + FILE)
data.info()

In [None]:
#HDU 1 holds the real data that we are interested in.
fig, ax = plt.subplots()
ax.plot(data[1].data)
ax.set_ylabel('Flux')
ax.set_xlabel('Wavelength [log]')
plt.show()

In [None]:
#HDU 3 holds simulated data, not real observations.
fig, ax = plt.subplots()
ax.plot(data[3].data)
ax.set_ylabel('Flux')
ax.set_xlabel('Wavelength [log]')
plt.show()

# FROM DOWNLOADED TO .csv

In [None]:
#The final data file uses the channel numbers as column names. This cell builds the vector
#with the number of each channel, taking the spectrum of the first catalogue row as reference.
#The resolution of the spectra is reduced to 1/3 to keep the weight of the file down, so
#there is one channel number per group of three flux samples.
row = 0
star = df[row]

ID = str(star["APOGEE_ID"])
FeH = str(star["FE_H"])
TEff = str(star["TEFF"])
SpType = str(star["ASPCAP_CLASS"])
LogG = str(star["LOGG"])

# Local file name: catalogue name with its first two characters replaced by 'aspcap'.
FILE = 'aspcap' + str(star['FILE'])[2:]

print([FILE,ID,FeH,TEff,SpType,LogG])

data = fits.open(save_path + FILE)
flux = data[1].data

# Channel numbers start at 1.
number_compressed_entries = len(flux)//3
compressed_wave = list(range(1, number_compressed_entries + 1))

In [None]:
# Write one CSV row per star: the five labels followed by the flux averaged over
# consecutive groups of three samples (same 1/3 compression used for the header).
first_index = 0
last_index = 50000

with open('data_IR.csv', 'w', newline='') as file:

    writer = csv.writer(file)
    writer.writerow(np.concatenate((np.asarray(["ID", "Fe/H", "T", "Spec_Type", "logG"]), np.asarray(compressed_wave))))

    for row in tqdm(range(first_index,last_index)):

        star_row = df[row]
        # Local file name: catalogue name with its first two characters replaced by 'aspcap'.
        FILE = 'aspcap' + str(star_row['FILE'])[2:]

        try:

            # Close every FITS file as soon as its flux has been copied out; leaving
            # them open leaks one file handle per star over the whole loop.
            with fits.open(save_path + FILE) as hdul:
                # np.array(..., dtype=float) makes an independent float64 copy, so the
                # values stay valid after the file is closed.
                flux = np.array(hdul[1].data, dtype=float)

            number_compressed_entries = len(flux)//3

            # Mean of each consecutive triple; trailing samples that do not fill a
            # complete triple are dropped.
            usable = flux[:3*number_compressed_entries]
            compressed_flux = (usable[0::3] + usable[1::3] + usable[2::3])/3

            ID = str(star_row["APOGEE_ID"])
            FeH = str(star_row["FE_H"])
            TEff = str(star_row["TEFF"])
            SpType = str(star_row["ASPCAP_CLASS"])
            LogG = str(star_row["LOGG"])
            print([FILE,ID,FeH,TEff,SpType,LogG])

            array_to_save = np.concatenate((np.asarray([ID, FeH, TEff, SpType, LogG]), np.asarray(compressed_flux)))
            writer.writerow(array_to_save)

        except Exception as exc:
            # Best effort: a star whose file is missing or unreadable is skipped, but
            # the reason is reported. Only Exception is caught so that interrupting
            # the kernel still stops the loop.
            print('ERROR', FILE, repr(exc))
            continue


# EDA AND PREPROCESSING

In [None]:
# Load the exported spectra table back as a pandas DataFrame and preview its first rows.
# NOTE: this rebinds `df`, which in the cells above referred to the astropy catalogue table;
# the download/export cells cannot be re-run after this one without reloading the catalogue.
df = pd.read_csv('data_IR.csv')
df.head()

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Print the DataFrame summary (columns, dtypes, non-null counts).
df.info()

Remove zeros:

In [None]:
# Descriptive statistics of df, then transposed so that each original column
# becomes a row and each statistic ('mean', 'std', ...) a column.
estadisticas = df.describe()
est_tras = estadisticas.transpose()

In [None]:
# Columns whose mean is exactly zero; their names are the index of `boss`.
has_zero_mean = est_tras['mean'] == 0
boss = est_tras.loc[has_zero_mean]
boss.index

In [None]:
# Working copy of the table without the zero-mean columns found above.
df_1 = df.drop(columns=boss.index)
df_1.head()

In [None]:
# Frame holding only the label columns, taken from df.
label_columns = ['T', 'Fe/H', 'Spec_Type', 'logG']
etiquetas = pd.DataFrame(df, columns=label_columns)
etiquetas

Remove outliers:

In [None]:
# Histogram of the 'T' column before any filtering of that column.
df_1['T'].hist()

In [None]:
# Keep only the rows with a strictly positive 'T' and plot the histogram again.
has_positive_T = df_1['T'] > 0
df_1 = df_1.loc[has_positive_T]
df_1['T'].hist()

In [None]:
# Histogram of the 'Fe/H' column before filtering that column.
df_1['Fe/H'].hist()

In [None]:
# Keep only the rows whose 'Fe/H' is above -2000 and plot the histogram again.
has_valid_FeH = df_1['Fe/H'] > -2000
df_1 = df_1.loc[has_valid_FeH]
df_1['Fe/H'].hist()

In [None]:
# Histogram (20 bins) of the 'logG' column before filtering that column.
df_1['logG'].hist(bins=20)

In [None]:
# Keep only the rows whose 'logG' is above -2000 and plot the histogram again.
has_valid_logG = df_1['logG'] > -2000
df_1 = df_1.loc[has_valid_logG]
df_1['logG'].hist(bins=20)

In [None]:
# Spectral type of the remaining rows that have a negative 'logG'.
tipo_esp = df_1.loc[df_1['logG'] < 0, 'Spec_Type']
tipo_esp

In [None]:
#save clean data set
# NOTE(review): `index` is left at its default here, so the row index is presumably written
# as an extra first column — confirm how the file is read back before changing this.
df_1.to_csv('data_IR_clean.csv')

Explore labels:

In [None]:
# Pairwise relations between the numeric labels, coloured by spectral type.
# sns.pairplot is a figure-level function that creates its own figure, so a preceding
# plt.figure(figsize=...) call does not size it and only leaves an extra empty figure
# behind; use pairplot's `height` / `aspect` arguments to change the panel size.
sns.pairplot(df_1,vars=['T','Fe/H','logG'],hue='Spec_Type')

In [None]:
# Box plot of the 'T' column of the cleaned table.
sns.catplot(data=df_1, y='T', kind='box')

In [None]:
# Box plot of the 'Fe/H' column of the cleaned table.
sns.catplot(data=df_1, y='Fe/H', kind='box')

In [None]:
# One-hot encode the labels for the correlation analysis.
# `etiquetas` was built from the unfiltered table, so it is restricted here to the rows
# that are still present in the cleaned df_1 (same index labels); otherwise the rows
# removed as outliers above would be included in the correlations.
dfdummies = pd.get_dummies(etiquetas.loc[df_1.index])
dfdummies

In [None]:
# Correlation between the label columns for the stars flagged as spectral class B.
# pd.get_dummies names indicator columns '<source column>_<value>', so the indicator
# derived from 'Spec_Type' is 'Spec_Type_B'; 'Spec_B' is still tried first in case
# dfdummies was built with a custom prefix.
spec_b_col = next(
    (col for col in ('Spec_B', 'Spec_Type_B') if col in dfdummies.columns),
    None,
)
if spec_b_col is None:
    # Fail with the list of available indicators instead of an opaque KeyError.
    available = [col for col in dfdummies.columns if str(col).startswith('Spec')]
    raise KeyError(
        "No indicator column for spectral class 'B' in dfdummies; "
        "available spectral-type columns: {}".format(available)
    )

correlation = dfdummies[dfdummies[spec_b_col]==1].corr()
plt.figure(figsize=(10, 10))
sns.heatmap(correlation, annot=True)