# Pre-Processing Dataset

Here I generate `gal_fname1`, which contains the columns needed for this study: `Mr, R, R/R200, v_los, v_circ, orbital, infall, interloper`.


In [1]:
%cd ../

/Users/jesteves/Documents/GitHub/galaxyFormation


In [2]:
import sys
sys.path.append('./scripts')
from file_loc import FileLocs

In [3]:
import numpy as np
import matplotlib.pyplot as plt
from astropy.io.fits import getdata
import astropy.table as Table
import astropy.io.ascii as at

# Reading Initial Data

In [None]:
# File-location helper configured for the SDSS dataset.
fl = FileLocs(dataset='sdss')

# Cluster catalog and raw galaxy catalog, both loaded through the helper.
cat = fl.load_catalogs('cluster/main')
gal = fl.load_catalogs('galaxy/raw')

Loading Catalog: ./data/catalogs/SDSS/groupCatalog_Yang_deCarvalho2017.csv


In [None]:
cat

In [None]:
gal

# Tasks to perform

1. Create New Variables
2. Create Masks
3. Assign Dynamical Classification

## New Variables

In this section we compute the distance from the cluster center and the normalized phase-space variables (`R/R_200`, `v_los/v_circ`). We also assign the k-correction and absolute magnitudes.


In [None]:
# Copy the cluster-catalog columns used below into standalone numpy arrays.
cid = np.array(cat['Yang'])  # cluster identifier

# Cluster sky position and redshift.
ra_c, de_c, zcls = (np.array(cat[name]) for name in ('RA', 'DEC', 'redshift'))

# Angular radii, in degrees.
theta200m, theta200c = (
    np.array(cat[name]) for name in ('thetaR200m', 'thetaR200')
)

# Radii and (log) mass of each cluster.
r200m, r200c, m200c = (
    np.array(cat[name]) for name in ('R200m', 'R200c', 'logM200c')
)

# Circular velocities used to normalise the line-of-sight velocity.
vcirc_m, vcirc_c = (np.array(cat[name]) for name in ('vcirc_m', 'vcirc_c'))

In [None]:
import esutil
# Cluster identifier carried by each galaxy row and by each cluster row.
gcid = np.array(gal['Yang'])
cid  = np.array(cat['Yang'])

# Match the two identifier arrays; the result holds one index array per input.
key = esutil.numpy_util.match(cid,gcid)

In [None]:
# key[0] indexes the cluster arrays (cid), key[1] indexes the galaxy table (gcid).
into_cls, into_gal = key[0], key[1]

### Assigning new variables

In [None]:
# Working table: the matched galaxy rows, copied so `gal` is left untouched.
data = gal[into_gal].copy()

# Attach the properties of the matched cluster to every galaxy row.
data['Yang'] = cid[into_cls]

data['R200c'] = r200c[into_cls]
data['R200m'] = r200m[into_cls]

# M200c is filled from the catalog's 'logM200c' column.
data['M200c'] = m200c[into_cls]
# 'redshift' is the cluster redshift; the galaxy redshift stays in 'z'.
data['redshift'] = zcls[into_cls]

In [None]:
# Galaxy-cluster separation converted to degrees by the /60 factor
# (presumably the catalog stores arcmin -- TODO confirm).
# Select the matched galaxies first: `data` is gal[into_gal], so an array
# taken straight from `gal` would not be row-aligned with it.
distance = np.array(gal['distance'])[into_gal]/60. # degrees
data['distance'] = distance
# Radius normalised by the angular R200m / R200c of the host cluster.
data['Rm'] = distance/theta200m[into_cls]
data['Rc'] = distance/theta200c[into_cls]

In [None]:
from astropy.constants import G, c
def get_los_velocity(z_gal,z_BCG,c_kms = c.value/1000):
    """Return c_kms * (z_gal - z_BCG) / (1 + z_BCG) as a numpy array.

    z_gal is the galaxy redshift and z_BCG the reference redshift; c_kms
    defaults to the astropy constant ``c`` divided by 1000.
    """
    delta_z = z_gal - z_BCG
    velocity = c_kms*delta_z/(1 + z_BCG)
    return np.array(velocity)

In [None]:
# Galaxy redshifts, and the velocity offset relative to data['redshift'].
zgal = np.array(data['z'])
vlos = get_los_velocity(zgal,data['redshift'])

# Line-of-sight velocity as returned by get_los_velocity.
data['vlos'] = vlos
# data['vlos'] = data['vlos']/vcirc_c[into_cls]
# Velocity normalised by the vcirc_m value of the matched cluster.
data['vlosn'] = data['vlos']/vcirc_m[into_cls]

## Plotting Distributions

### Radial Distribution

In [None]:
# Distribution of the radius normalised by the angular R200m.
_ = plt.hist(data['Rm'],bins=31)
# _ = plt.hist(data['Rc'],bins=31)

In [None]:
# Distribution of the radius normalised by the angular R200c.
_ = plt.hist(data['Rc'],bins=31)

#### Cluster Example

In [None]:
# mask1 flags the entry pointing at galaxy row 0; it is not used in the cells shown here.
mask1 = into_gal==0
# mask2 selects the galaxies matched to the first cluster row (index 0).
mask2 = into_cls==0

In [None]:
# Sky positions of the example cluster's galaxies, coloured by data['distance'].
plt.scatter(data['ra'][mask2],data['dec'][mask2],c=data['distance'][mask2],alpha=0.2)
plt.colorbar()

### Line of Sight Velocity

In [None]:
# Median of the velocities with |v_los| <= 3000, drawn on top of the histogram.
vmask = np.abs(vlos)<=3000.
zm = np.median(vlos[vmask])
_ = plt.hist(vlos,bins=np.linspace(-3000.,3000.))
plt.axvline(zm,ls='--',lw=3,color='k',label='Median: %i km/s'%zm)
plt.legend()

In [None]:
# Same diagnostic for the normalised velocity v_los / v_circ.
vm_mask = np.abs(data['vlosn'])<=3.
# The median must use the mask built on the normalised velocity (vm_mask),
# not the km/s mask of the previous cell.
zm = np.median(data['vlosn'][vm_mask])
_ = plt.hist(data['vlosn'],bins=np.linspace(-3.,3.))
# The label reports the median as a percentage of v_circ.
plt.axvline(zm,ls='--',lw=3,color='k',label='Median: %.1f %%'%(zm*100))
plt.legend()

In [None]:
## initial l.o.s. velocity cut of 10,000 km/s
# Both cuts act on the absolute value, so galaxies with a large *negative*
# velocity offset are rejected as well. z > 0 drops rows without a
# positive galaxy redshift.
velocity_mask = (np.abs(data['vlos'])<10*1e3)&(data['z']>0.)

# Same selection on the normalised velocity: |v_los / v_circ| < 3.
vlos_mask = (np.abs(data['vlosn'])<3.0)&(data['z']>0.)

In [None]:
# Number of rows passing each of the two velocity masks.
print(f'Total of good galaxies: {np.count_nonzero(velocity_mask)}')
print(f'Galaxies within vlos/vcirc < 3: {np.count_nonzero(vlos_mask)}')

## Create Masks

I use a volume-limited sample and a magnitude-threshold sample.

In [None]:
from utils import calc_kcor
from preProcessCluster import AngularDistance

In [None]:
# Linear stellar mass: the 'mass' column is exponentiated, i.e. treated as log10.
smass = 10**np.array(data['mass'])
mg = np.array(data['mg'])
mr = np.array(data['mr'])
# g - r colour, used as input of the k-correction.
gr = mg-mr

In [None]:
# Redshift grid from 0 to 0.3 in steps of 0.01, and AngularDistance evaluated on it.
# The grid is interpolated at the galaxy redshifts later on.
zbins = np.arange(0.,0.3+0.01,0.01)
dmbins = AngularDistance(zbins)

In [None]:
# r-band k-correction from the g-r colour.
kr_gr = calc_kcor('r',zgal,'gr',gr)
# Distance modulus 5*log10(d) - 5; the 1e6 factor presumably converts Mpc to pc
# (TODO confirm the units returned by AngularDistance). 1e-9 avoids log10(0) at z = 0.
# NOTE(review): a distance modulus normally uses the luminosity distance --
# confirm that AngularDistance returns the intended distance here.
DM = 5*np.log10(np.interp(zgal,zbins,dmbins)*1e6+1e-9)-5
# Absolute magnitude: apparent magnitude minus distance modulus and k-correction.
Mr = mr-DM-kr_gr

In [None]:
def get_percentile_curve(x,y,xbins,alpha=5,Npoints=15):
    """Return the ``alpha``-th percentile of ``y`` in bins of ``x``.

    Parameters
    ----------
    x, y : array-like
        Samples of equal length; ``y`` is binned according to ``x``.
    xbins : array-like
        Bin edges. Every bin is half-open, ``[low, high)``.
    alpha : float, optional
        Percentile (0-100) evaluated inside each bin.
    Npoints : int, optional
        Unused; kept so that existing calls passing it keep working.

    Returns
    -------
    ycurve : numpy.ndarray
        Percentile of ``y`` per bin; ``nan`` for bins without samples.
    xcenters : numpy.ndarray
        Mid-point of each bin.
    """
    x, y, xbins = np.asarray(x), np.asarray(y), np.asarray(xbins)
    ycurve = np.full(max(len(xbins) - 1, 0), np.nan)
    for i, (xl, xh) in enumerate(zip(xbins[:-1], xbins[1:])):
        in_bin = (x >= xl) & (x < xh)
        # An empty selection has no percentile: keep nan instead of failing.
        if in_bin.any():
            ycurve[i] = np.percentile(y[in_bin], alpha)
    return ycurve, 0.5*(xbins[1:]+xbins[:-1])

In [None]:
# Galaxies used to trace the magnitude limit as a function of redshift.
zmask       = velocity_mask
# Redshift bin edges: from 0.01 to just above the 99.9th percentile, step 0.005.
zinterp =  np.arange(0.01,np.percentile(zgal[zmask],99.9)+0.01,0.005)
# 98th percentile of Mr in every redshift bin (Npoints is not used by the helper).
ylow, zbins2 = get_percentile_curve(zgal[zmask], Mr[zmask], zinterp, Npoints=51, alpha=98)


# Single magnitude threshold: the percentile curve evaluated at z = 0.1.
Mr_thrshold = np.interp(0.1,zbins2,ylow)
Mr_th_mask = Mr<=Mr_thrshold
print(f'Mr thrshold: {Mr_thrshold:.2f} mag')

# Redshift-dependent limit: the curve interpolated at each galaxy redshift,
# replaced by max(ylow) for z < 0.015.
Mr_lim = np.interp(zgal,zbins2,ylow)
Mr_lim = np.where(zgal<0.015, np.max(ylow), Mr_lim)
Mr_mask = Mr<Mr_lim


In [None]:
# Selected galaxies in the (z, Mr) plane, coloured by the 'mass' column.
plt.scatter(zgal[Mr_mask&zmask],Mr[Mr_mask&zmask],c=(data['mass'][Mr_mask&zmask]),vmin=8.75,vmax=12.,s=30,cmap='jet')
# White dashed curve: percentile limit; black dashed line: fixed threshold.
plt.plot(zbins2, ylow, ls='--',lw=4,color='w')
plt.axhline(Mr_thrshold,ls='--',lw=3,color='k')
# Inverted y axis so that brighter (more negative) magnitudes are at the top.
plt.ylim(-15.,-25)
plt.colorbar()

In [None]:
# Stellar mass versus redshift; galaxies failing Mr_mask are over-plotted in light gray.
plt.scatter(zgal[zmask],smass[zmask],s=10,alpha=0.3)
plt.scatter(zgal[zmask&~Mr_mask],smass[zmask&~Mr_mask],s=10,alpha=0.3,color='lightgray')
plt.ylim(1e7,3e12)
plt.yscale('log')

### Assigning new variables

In [None]:
# Store the absolute magnitude, the k-correction and both magnitude selections.
data['Mr'] = Mr
data['kcorr_r_gr'] = kr_gr 
# Redshift-dependent limit (Mr < Mr_lim) and fixed threshold (Mr <= Mr_thrshold).
data['mag_lim_mask'] = Mr_mask
data['mag_th_mask'] = Mr_th_mask

## Assign Dynamical Classification

Assign probability for three dynamical classes: `interloper`, `orbital`, `infall`.


In [None]:
from sklearn.ensemble import RandomForestClassifier
import joblib
# Pre-trained classifier read from a hard-coded, machine-specific path.
# NOTE(review): joblib.load unpickles the file -- only load files from a trusted source.
loaded_rf = joblib.load("/Users/jesteves/Downloads/phase_space/classification_2d_rf.joblib")

In [None]:
# Normalised radii (R200c and R200m versions).
rnorm_c = np.array(data['Rc'])
rnorm_m = np.array(data['Rm'])

# Normalised velocities. The "_c" version is divided by vcirc_c of the host
# cluster so that it is dimensionless like its "_m" counterpart (data['vlosn']
# is data['vlos'] / vcirc_m), instead of carrying the raw velocity.
vlosn_c = np.array(data['vlos'])/vcirc_c[into_cls]
vlosn_m = np.array(data['vlosn'])

In [None]:
import pandas as pd
# Feature table for the classifier: R200m-normalised radius and normalised velocity.
# presumably the column names must match those used at training time -- TODO confirm.
df2   = pd.DataFrame ({ 'r2d':rnorm_m, 'vlos':vlosn_m})
# Predicted class per galaxy and the per-class probabilities.
tags2 = loaded_rf.predict(df2) 
tag_probability2 = loaded_rf.predict_proba(df2) 

### Plot Distributions

In [None]:
# Probability-weighted radial distributions of the three classes, shown for
# both radius normalisations (left: R200c, right: R200m).
fig = plt.figure(figsize=(11,4))
labels = ['orbital','infall','interlopers']

# Selections: |v_los/v_circ| < 3 and normalised radius < 3.
cut = (np.abs(vlosn_m)<3.)&(rnorm_c<3.)
cut2 = (np.abs(vlosn_m)<3.)&(rnorm_m<3.)

_panels = (
    (1, rnorm_c, cut, r'$R/R_{200c}$'),
    (2, rnorm_m, cut2, r'$R/R_{200m}$'),
)
for _panel, _radius, _sel, _xlabel in _panels:
    plt.subplot(1, 2, _panel)
    for i in range(3):
        # Column i of the probability matrix weights the histogram of class i.
        plt.hist(_radius[_sel],bins=51,weights=tag_probability2[_sel,i],
                 histtype='step', lw=3, label=labels[i],density=False)
    plt.legend(fontsize=14)
    plt.xlabel(_xlabel,fontsize=16)

plt.tight_layout()
# plt.savefig('./plots/radial_distribtution_prob_weighted.png',dpi=100,facecolor='w',transparent=False)


In [None]:
# Phase-space diagram (R200c-normalised radius) coloured by class probability.
cut = (np.abs(vlosn_m)<3.)&(rnorm_c<3.)
plt.figure(figsize=(8,4))
# `c` needs one scalar per point for `cmap` and the colorbar to apply; the
# full (N, 3) probability matrix would be read as RGB colours. Column 0 is
# used here ('orbital' in the label order used for the histograms above --
# NOTE(review): confirm against loaded_rf.classes_).
plt.scatter(rnorm_c[cut],vlosn_m[cut],c=tag_probability2[cut,0],alpha=0.7,s=30,cmap='coolwarm_r')
plt.xlabel(r'$R/R_{200c}$',fontsize=16)
plt.ylabel(r'$v_{los}/ v_{circ} $',fontsize=16)
plt.colorbar(label='class probability (column 0)')


In [None]:
# Phase-space diagram (R200m-normalised radius) coloured by the predicted class.
cut = (np.abs(vlosn_m)<3.)&(rnorm_m<3.)
plt.figure(figsize=(8,4))
plt.scatter(rnorm_m[cut],vlosn_m[cut],c=tags2[cut],alpha=0.7,s=30,cmap='coolwarm_r')
plt.xlabel(r'$R/R_{200m}$',fontsize=16)
plt.ylabel(r'$v_{los}/ v_{circ} $',fontsize=16)
plt.colorbar()


In [None]:
# One boolean column and one probability column ('p_<label>') per class.
# NOTE(review): assumes the classifier labels are the integers 0, 1, 2 in the
# same order as `labels` and as the predict_proba columns -- confirm with loaded_rf.classes_.
for i,li in enumerate(labels):
    data[li] = tags2 == i
    data['p_%s'%li] = tag_probability2[:,i]

In [None]:
# Probability-weighted radial histograms; the bin edges of the first call (a[1])
# are reused so that the three classes share the same binning.
a = plt.hist(data['Rm'][cut2],weights=data['p_orbital'][cut2],histtype='step',lw=3,bins=31)
_ = plt.hist(data['Rm'][cut2],weights=data['p_infall'][cut2],histtype='step',lw=3,bins=a[1])
_ = plt.hist(data['Rm'][cut2],weights=data['p_interlopers'][cut2],histtype='step',lw=3,bins=a[1])

# Add Other Public Datasets

## Match with Morphology

We match our sample with the Dominguez et al. 2018 morphology sample.

In [None]:
from astropy.io.fits import getdata
from astropy.table import Table

# Morphology catalog: extension 1 of the FITS file, wrapped in an astropy Table.
morp_fname = fl.data_loc+"DL_morphology_SDSS_DS18.fit"
morph = Table(getdata(morp_fname,1))

In [None]:
import smatch

nside=4096 # healpix nside
maxmatch=1 # return closest match

# Exploratory match with a 3 arcsec radius, used to inspect the separations.
# ra,dec,radius in degrees
matches0 = smatch.match(data['ra'], data['dec'], 3.0/3600,
                        morph['_RAJ2000'], morph['_DEJ2000'], nside=nside, maxmatch=maxmatch)

# Separation of every matched pair in arcsec (degrees * 3600).
# NOTE(review): plain Euclidean difference in (ra, dec), without a cos(dec)
# factor on the ra term.
dist = np.sqrt((data['ra'][matches0['i1'] ]-morph['_RAJ2000'][matches0['i2']])**2+
               (data['dec'][matches0['i1']]-morph['_DEJ2000'][matches0['i2']])**2)*3600

In [None]:
# Histogram of the match separations, with a reference line at 1.95 arcsec.
plt.figure(figsize=(5,4))
_ = plt.hist(dist,bins=np.linspace(0.,2.5))
plt.axvline(1.95,ls='--',color='k',label='cut = %.2f arcsec'%(1.95))
plt.yscale('log')
plt.xlabel('arcsec',fontsize=18)
plt.legend(fontsize=14)
plt.title('distance between matches',fontsize=18)
plt.tight_layout()
# plt.savefig('../plots/'+'match_distance.png')

In [None]:
# Final match with a 2.9 arcsec radius.
# NOTE(review): this cell reads the coordinates from '_RA'/'_DE' while the
# exploratory match uses '_RAJ2000'/'_DEJ2000' -- confirm which columns exist.
matches = smatch.match(data['ra'], data['dec'], 2.9/3600,
                       morph['_RA'], morph['_DE'], nside=nside, maxmatch=maxmatch)

## separation (arcsec) of the pairs returned by the final match
# Index with `matches`, the result computed just above, so that `dist`
# describes the pairs that are actually used afterwards.
dist = np.sqrt((data['ra'][matches['i1']]-morph['_RA'][matches['i2']])**2+
               (data['dec'][matches['i1']]-morph['_DE'][matches['i2']])**2)*3600

In [None]:
# i1 indexes `data`, i2 indexes `morph` (same order as the arguments of smatch.match).
indice1 = matches['i1']
indice2 = matches['i2']
# Fraction of rows in `data` with a counterpart in the morphology catalog.
fraction = len(data[indice1])/len(data)

print('Fraction matched: %.3f'%fraction)

In [None]:
# Columns copied from the morphology catalog: all but the first four and last three.
columns = morph.colnames[4:-3]

print('Selected Columns:',columns)
# Flag the rows of `data` that have a morphology match.
data['ZOO_MASK'] = False
data['ZOO_MASK'][indice1] = True

# Unmatched rows keep the -99 sentinel; matched rows receive the catalog value.
for col in columns:
    data[col] = -99.
    data[col][indice1] = morph[col][indice2]

## Match with Bulge+Disk 

We match our sample with the Mendel et al. 2014 sample.

<b> ToDos: </b>
- Save Output



In [None]:
# Bulge+disk catalog read from a hard-coded, machine-specific directory.
# From here on `morph` refers to this catalog, not to the morphology one.
root='/Users/jesteves/Documents/localProjects/Catalogs/'
morp_fname = root+"Mendel_et_al_2014/J_ApJS_210_3_dusty.csv"
morph = Table(at.read(morp_fname))

In [None]:
!cat /Users/jesteves/Documents/localProjects/Catalogs/Mendel_et_al_2014/README.txt

In [None]:
morph

In [None]:
import smatch

nside=4096 # healpix nside
maxmatch=1 # return closest match

# Exploratory match with a 3 arcsec radius, used to inspect the separations.
# ra,dec,radius in degrees
matches0 = smatch.match(data['ra'], data['dec'], 3.0/3600,
                        morph['_RAJ2000'], morph['_DEJ2000'], nside=nside, maxmatch=maxmatch)

# Separation of every matched pair in arcsec (degrees * 3600).
# NOTE(review): plain Euclidean difference in (ra, dec), without a cos(dec)
# factor on the ra term.
dist = np.sqrt((data['ra'][matches0['i1'] ]-morph['_RAJ2000'][matches0['i2']])**2+
               (data['dec'][matches0['i1']]-morph['_DEJ2000'][matches0['i2']])**2)*3600

In [None]:
# Histogram of the match separations with a reference line.
# The legend is built from the same value as the line so the two cannot
# disagree. NOTE(review): 1.05 is the position that was drawn; confirm it is
# the intended cut.
match_cut = 1.05
plt.figure(figsize=(5,4))
_ = plt.hist(dist,bins=np.linspace(0.,2.5))
plt.axvline(match_cut,ls='--',color='k',label='cut = %.2f arcsec'%(match_cut))
plt.yscale('log')
plt.xlabel('arcsec',fontsize=18)
plt.legend(fontsize=14)
plt.title('distance between matches',fontsize=18)
plt.tight_layout()
# plt.savefig('../plots/'+'match_distance.png')

In [None]:
# Final match with a 2.5 arcsec radius.
matches = smatch.match(data['ra'], data['dec'], 2.5/3600,
                       morph['_RAJ2000'], morph['_DEJ2000'], nside=nside, maxmatch=maxmatch)

## separation (arcsec) of the pairs returned by the final match
# Index with `matches`, the result computed just above, so that `dist`
# describes the pairs that are actually used afterwards.
dist = np.sqrt((data['ra'][matches['i1']]-morph['_RAJ2000'][matches['i2']])**2+
               (data['dec'][matches['i1']]-morph['_DEJ2000'][matches['i2']])**2)*3600

In [None]:
# i1 indexes `data`, i2 indexes `morph` (same order as the arguments of smatch.match).
indice1 = matches['i1']
indice2 = matches['i2']

# Keep only pairs whose redshifts agree within 10% of the galaxy redshift.
mmask = np.abs(data['z'][indice1]-morph['z'][indice2])<0.1*data['z'][indice1]

# Index with the boolean mask itself: wrapping it in a list ([[mmask]]) adds
# an extra dimension and is not valid boolean indexing.
fraction = len(indice1[mmask])/len(data)

print('Fraction matched: %.3f'%fraction)

In [None]:
morph.colnames

### Computing B/T ratio

We mainly follow the procedure described in Thanjavur et al. (2016) and Bluck et al. (2014, 2022), with a small modification.

When there is no strong statistical evidence in favour of the Bulge+Disk decomposition, i.e. in the cases where $P_{pS}>0.32$ and the measured B/T is within 0.3 of an extremal value (zero or one), the Sérsic stellar mass is assigned to the dominant component and the other component is deemed negligible (set to zero).


In [None]:
# 'PpS' column of the catalog, compared against 0.32 below.
PpS = np.array(morph['PpS'])

# Bulge+Disk decomposition
massb = np.array(morph['logMb'])
massd = np.array(morph['logMd'])
# massT = np.array(morph['logMt'])
# Total mass rebuilt as log10 of the sum of the linear bulge and disk masses.
massT = np.log10(10**massb+10**massd)

# Sersic model
massS = np.array(morph['logM'])

In [None]:
# Bulge-to-total ratio in linear units: 10**(log Mb - log Mt) = Mb / Mt.
bt = 10**(massb-massT)

In [None]:
# B/T within 0.3 of one (bulge dominated) or of zero (disk dominated).
bulge_mask= (bt>0.7)
disk_mask = (bt<0.3)

# PpS > 0.32 and bulge dominated: the bulge takes the Sersic mass and the
# disk is replaced by the 1e-6 placeholder.
massb_new = np.where((PpS>0.32)&bulge_mask, massS, massb)
massd_new = np.where((PpS>0.32)&bulge_mask,  1e-6, massd)

# PpS > 0.32 and disk dominated: the disk takes the Sersic mass and the
# bulge is replaced by the 1e-6 placeholder.
# NOTE(review): these arrays hold log10 masses, so the placeholder 1e-6 is a
# linear mass of 10**1e-6 ~ 1 (negligible, but not exactly zero) -- confirm intent.
massd_new = np.where((PpS>0.32)&disk_mask, massS, massd_new)
massb_new = np.where((PpS>0.32)&disk_mask,  1e-6, massb_new)

# Corrected total mass and bulge-to-total ratio.
massT_new = np.log10(10**massb_new+10**massd_new)

bt_new = 10**(massb_new-massT_new)

In [None]:
# Corrected versus uncorrected B/T; the red lines mark the 0.3 and 0.7 limits.
plt.scatter(bt,bt_new,color='k',alpha=0.01,s=100)
plt.axvline(0.7,lw=3,ls='--',color='r')
plt.axvline(0.3,lw=3,ls='--',color='r')
plt.xlabel('B/T - wo correction',fontsize=14)
plt.ylabel('B/T - w/ correction',fontsize=14)

In [None]:
# Rows of `data` with an accepted match, and their partner rows in `morph`.
_rows_data = indice1[mmask]
_rows_morph = indice2[mmask]

# Flag the galaxies that have a bulge+disk counterpart.
data['BT_MASK'] = False
data['BT_MASK'][_rows_data] = True

# Every derived column starts at the -99 sentinel and is then filled for the
# matched rows only.
_new_columns = (
    ('BT', bt_new),
    ('massT', massT_new),
    ('massB', massb_new),
    ('massD', massd_new),
    ('massS', massS),
)
for _colname, _values in _new_columns:
    data[_colname] = -99.
    data[_colname][_rows_data] = _values[_rows_morph]

In [None]:
# Selected galaxies in the (z, Mr) plane, coloured by the 'massT' column.
# Rows without a bulge+disk match carry the -99 sentinel in 'massT'.
plt.scatter(zgal[Mr_mask&zmask],Mr[Mr_mask&zmask],c=(data['massT'][Mr_mask&zmask]),vmin=8.75,vmax=12.,s=30,cmap='jet')
plt.plot(zbins2, ylow, ls='--',lw=4,color='w')
plt.axhline(Mr_thrshold,ls='--',lw=3,color='k')
# Inverted y axis so that brighter (more negative) magnitudes are at the top.
plt.ylim(-15.,-25)
plt.colorbar()

In [None]:
# Stellar mass versus redshift for matched galaxies.
# `cut` holds whatever selection the most recently executed cell assigned to it.
plt.scatter(data['z'][cut&data['BT_MASK']],10**data['mass'][cut&data['BT_MASK']],s=50,alpha=0.01)
plt.axhline(1e10)
plt.yscale('log')
plt.ylim(1e8,1e12)

In [None]:
# Comparison of the catalog 'mass' with the bulge+disk total mass 'massT'
# (both exponentiated) on log-log axes, for matched galaxies only.
plt.loglog()
plt.scatter(10**data['mass'][cut&data['BT_MASK']],10**data['massT'][cut&data['BT_MASK']],s=50,alpha=0.01)
plt.ylim(1e8,1e12)
plt.xlim(1e8,1e12)

In [None]:
# Fraction of the velocity-selected galaxies with a bulge+disk match.
np.count_nonzero(data['BT_MASK'][velocity_mask])/len(data[velocity_mask])

In [None]:
# Fraction of the velocity-selected galaxies with a morphology match.
np.count_nonzero(data['ZOO_MASK'][velocity_mask])/len(data[velocity_mask])

# Save Output Samples

In [None]:
len(data)

In [None]:
data[velocity_mask]

In [None]:
# Write the velocity-selected rows to the path in fl.galaxy as CSV,
# overwriting any existing file.
print(f'Saving file: {fl.galaxy}')
print(f'Number of Galaxies: {len(data[velocity_mask])}')
data[velocity_mask].write(fl.galaxy,format='csv',overwrite=True)