In [1]:
# importing packages
import pandas as pd
import matplotlib.pyplot as plt
import os
import numpy as np
from astropy.coordinates import search_around_sky, SkyCoord
from astropy import units as u
from sklearn.model_selection import train_test_split
# show (up to 999) columns when a wide DataFrame is displayed instead of eliding them
pd.options.display.max_columns = 999

In [6]:
# matching LC and mass data
dr14_BH = pd.read_csv('/Users/SnehPandya/Desktop/DeepLearningAGN/data/dr14_BH.csv')
dr14_LC = pd.read_csv('/Users/SnehPandya/Desktop/DeepLearningAGN/data/dr14_LC.csv')

# quasar catalog redshift more reliable, so drop the LC table's own 'z'
# and let the BH table's 'z' be the only redshift column after the merge
dr14_LC = dr14_LC.drop(columns=['z'])

# Match data attributes in the 2 data sets using astropy's SkyCoord.
# search_around_sky returns parallel arrays: IDX1[k] is a row position in
# dr14_BH and IDX2[k] the row position in dr14_LC of a pair lying within the
# separation limit; OTHER1 / OTHER2 are the remaining two return values
# (separation information) and are not used below.
COORD1 = SkyCoord(dr14_BH['ra'], dr14_BH['dec'], frame='icrs', unit='deg')
COORD2 = SkyCoord(dr14_LC['ra'], dr14_LC['dec'], frame='icrs', unit='deg')
IDX1, IDX2, OTHER1, OTHER2 = search_around_sky(COORD1, COORD2, seplimit=0.5 * u.arcsec)

# Build one row per matched pair: BH columns first, then LC columns.
# Both sides are selected positionally in a single vectorised step and glued
# together column-wise. This replaces the per-row `Series.append` loop, which
# was removed in pandas 2.0, was slow for tens of thousands of matches, and
# forced every column to `object` dtype via the transpose.
# The indexes are reset so the two halves align row-by-row and the result
# carries a clean 0..n-1 index.
bh_matched = dr14_BH.iloc[IDX1].reset_index(drop=True)
lc_matched = dr14_LC.iloc[IDX2].reset_index(drop=True)
X_TRAIN = pd.concat([bh_matched, lc_matched], axis=1)

# drop identifier columns and any stray index columns ('Unnamed: N')
# carried over from the CSV files
X_TRAIN = X_TRAIN.drop(columns=['SDSS_Name', 'train_id'])
X_TRAIN = X_TRAIN.loc[:, ~X_TRAIN.columns.str.contains('^Unnamed')]

In [7]:
# Inspect the merged column labels to look for repeated columns.
# NOTE: nothing is removed in this cell -- the drop below is commented out,
# and the recorded output still lists 'ra' and 'dec' twice (once from each
# input table).
# X_TRAIN = X_TRAIN.drop(columns=['MJD'])
X_TRAIN.columns

Index(['ID', 'MJD', 'ra', 'dec', 'Mass', 'z', 'ERR', 'M_i', 'spec_mjd', 'ra',
       'dec', 'u_band', 'g_band', 'r_band', 'i_band', 'z_band', 'ug', 'gr',
       'ri', 'iz', 'zu'],
      dtype='object')

In [8]:
# convert to numeric
def _to_numeric_or_keep(column):
    """Return `column` converted to a numeric dtype, or unchanged if it cannot be parsed.

    Explicit replacement for `pd.to_numeric(column, errors='ignore')`, whose
    'ignore' mode is deprecated in recent pandas releases.
    """
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        # non-numeric content (e.g. string identifiers): leave the column as it is
        return column


X_TRAIN = X_TRAIN.apply(_to_numeric_or_keep)
X_TRAIN.dtypes

ID           object
MJD           int64
ra          float64
dec         float64
Mass        float64
z           float64
ERR         float64
M_i         float64
spec_mjd    float64
ra          float64
dec         float64
u_band      float64
g_band      float64
r_band      float64
i_band      float64
z_band      float64
ug          float64
gr          float64
ri          float64
iz          float64
zu          float64
dtype: object

In [9]:
# cleaning: report the shape before and after so the number of removed rows is visible
print(X_TRAIN.shape)

# discard rows with any missing value
X_TRAIN = X_TRAIN.dropna()

# Keep only rows with a strictly positive Mass. A boolean mask selects rows by
# position; dropping by index label instead would also remove every other row
# that happens to share a label with a rejected row when the index is not unique.
# Mass has no NaN left after dropna(), so `> 0` is the exact complement of `<= 0`.
X_TRAIN = X_TRAIN[X_TRAIN['Mass'] > 0]

print(X_TRAIN.shape)

(28444, 21)
(28055, 21)


In [10]:
# split data: 85% train / 15% test.
# A fixed seed makes the shuffle -- and therefore the saved TRAIN/TEST files --
# identical on every Restart & Run All instead of changing from run to run.
SPLIT_SEED = 42
train, test = train_test_split(X_TRAIN, test_size=0.15, random_state=SPLIT_SEED)
# check: the two partitions together must account for every row
test.shape[0] + train.shape[0] == X_TRAIN.shape[0]

True

In [11]:
# persist the full matched table (row index is written as the first CSV column)
matched_csv_path = '/Users/SnehPandya/Desktop/DeepLearningAGN/data/matched_dr14.csv'
X_TRAIN.to_csv(matched_csv_path)

In [12]:
# persist each partition of the split to its own CSV (row index included)
for split_frame, split_csv_path in (
    (train, '/Users/SnehPandya/Desktop/DeepLearningAGN/data/TRAIN_dr14.csv'),
    (test, '/Users/SnehPandya/Desktop/DeepLearningAGN/data/TEST_dr14.csv'),
):
    split_frame.to_csv(split_csv_path)