#### <font color='purple'> [Define the Dataframe and Filter the Dataset](#1)
#### <font color='purple'> [The Raw Dataset](#2)
#### <font color='purple'> [The Normalized Dataset (or, Dimensionless Dataset)](#3)
#### <font color='purple'> [Datasets by Morphology](#4)

In [1]:
import time
import numpy as np
import pandas as pd

In [7]:
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_colwidth', 100)

### Define functions

In [2]:
def log_mass_size(df):
    '''
    Converts to log10 non-zero size and mass entries (without distinguishing galaxy type)
    output: df with new columns of log mass, log size added to it

    '''
    #first let's remove all galaxy_mass=0 since you are going to at the end anyway
#     df=df[(df['GalpropMstar'] > 0) & (df['HalopropMhot'] > 0)]
#     df=df[df['GalpropMbulge'] >= 0]

#     #subhalo_flag = subhalos[‘SubhaloFlag’]
#     if type_of_galaxy == 'central':
#         df=df[df['subhalo_flag']==True]
#     elif type_of_galaxy == 'satellite':
#         df=df[df['subhalo_flag']==False]
#     else: 
#         print("type_of_galaxy must be set to central or satellite")

    #since we got rid of all zero mass we can just take the log10
    df.loc[:,'GalpropLogMstar']=np.log10(df.loc[:,'GalpropMstar'])
    df.loc[:,'GalpropLogHalfRadius']=np.log10(df.loc[:, 'GalpropHalfRadius'])
    df.loc[:,'HalopropLogMvir']=np.log10(df.loc[:,'HalopropMvir'])
    df.loc[:,'HalopropLogMhot']=np.log10(df.loc[:,'HalopropMhot'])
    df.loc[:,'GalpropLogMbulge']=df.loc[:,'GalpropMbulge'].apply(lambda x: np.log10(x) if x>0 else 0)

    return df

### <font color='blue'> <a id =1> </a> <br> Define data frame

In [8]:
# latest version, downloaded on Feb 13, 2023. This is the vdisk, vvir corrected version of Santa Cruz SAM
df=pd.read_hdf('/Users/festabu/Desktop/ML_galaxy_size_project/Codes/TNG_Data_SAM_and_SIM/data/tng300-sam.h5')

In [9]:
# There are a total of 1,183,265 galaxies in the TNG300-SAM (the one corrected for vdisk)
df

Unnamed: 0,GalpropMBH,GalpropMH2,GalpropMHI,GalpropMHII,GalpropMaccdot,GalpropMaccdot_radio,GalpropMbulge,GalpropMcold,GalpropMstar,GalpropMstar_merge,GalpropMstrip,GalpropMu_merger,GalpropMvir,GalpropOutflowRate_Mass,GalpropOutflowRate_Metal,GalpropRbulge,GalpropRdisk,GalpropRfric,GalpropRhalo,GalpropSatType,GalpropSfr,GalpropSfrave100myr,GalpropSfrave1gyr,GalpropSfrave20myr,GalpropSigmaBulge,GalpropTmerger,GalpropTmerger_major,GalpropTsat,GalpropVdisk,GalpropZcold,GalpropZstar,GalpropX,GalpropVx,GalpropY,GalpropVy,GalpropZ,GalpropVz,HalopropC_nfw,HalopropMaccdot_metal,HalopropMaccdot_pristine,HalopropMaccdot_radio,HalopropMaccdot_reaccreate,HalopropMaccdot_reaccreate_metal,HalopropMass_ejected,HalopropMcooldot,HalopropMdot_eject,HalopropMdot_eject_metal,HalopropMetal_ejected,HalopropMhot,HalopropMstar_diffuse,HalopropMvir,HalopropSpin,HalopropZhot,GalpropVvir,GalpropMdisk,GalpropHalfmassRadius
3,70113.099355,5.334980e+06,8.410620e+07,8.798940e+07,0.0,1.041410e-08,4.956340e+07,2.401300e+08,1.498290e+08,1.260270e+06,2.258640e+10,0.139166,2.258640e+10,0.273785,0.016982,1.285890,1.489360,0.000000,74.467804,0.0,0.004072,0.003734,0.004101,0.002496,29.518999,3.092340,113.702003,-99.00000,56.500000,0.014896,0.011260,37.243801,281.679993,15.855300,-270.540009,23.517500,-112.489998,30.196501,0.0,0.0,1.041410e-08,0.312146,0.0,6.264690e+09,3.124880e+08,0.0,0.0,0.294780,3.334920e+03,3.150670e+05,2.258640e+10,0.01981,2.068510e+02,36.113822,1.002656e+08,2.172346
4,40086.601075,5.297840e+06,1.181020e+08,1.186890e+08,0.0,5.833360e-09,4.106860e+07,3.281030e+08,2.677310e+08,1.462520e+06,3.417480e+10,0.123353,3.417480e+10,0.524337,0.032891,1.609260,1.709830,0.000000,85.491302,0.0,0.009404,0.008752,0.011879,0.005768,33.430000,0.869185,12.606100,-99.00000,62.529999,0.020568,0.020358,37.437302,360.320007,15.644100,-307.429993,23.580400,-16.270000,31.364300,0.0,0.0,5.833360e-09,0.622176,0.0,1.248560e+10,6.242520e+08,0.0,0.0,0.535400,2.018700e+04,8.143430e+05,3.417480e+10,0.01449,9.254700e+02,41.459710,2.266624e+08,2.727576
6,48031.299229,1.542500e+07,2.352650e+08,1.868280e+08,0.0,6.910570e-09,1.530520e+07,5.911500e+08,1.552730e+08,8.551790e+06,4.377520e+10,0.105218,4.377520e+10,0.496575,0.026464,2.337160,1.856920,0.000000,92.845901,0.0,0.009393,0.008679,0.010228,0.005760,34.869999,7.639330,10.537300,-99.00000,64.290001,0.031502,0.007912,37.627800,-248.869995,15.829200,-535.789978,23.960800,-0.690000,20.742901,0.0,0.0,6.910570e-09,0.490248,0.0,9.848680e+09,4.925720e+08,0.0,0.0,0.295355,2.264280e+04,2.137950e+06,4.377520e+10,0.03836,1.206710e+03,45.026369,1.399678e+08,3.067694
7,253278.994933,1.498190e+07,2.066300e+08,1.830710e+08,0.0,2.488670e-08,2.456220e+08,5.479110e+08,3.689190e+08,7.245730e+07,5.105800e+10,0.533221,5.105800e+10,0.580681,0.049870,2.783750,1.954660,0.000000,97.733002,0.0,0.015327,0.014238,0.018177,0.009401,36.725300,1.755980,1.755980,-99.00000,71.239998,0.047015,0.033617,37.528301,-224.550003,15.825300,-370.500000,24.082001,44.070000,23.954399,0.0,0.0,2.488670e-08,0.688583,0.0,1.381770e+10,6.927810e+08,0.0,0.0,0.744319,4.085450e+04,1.819430e+07,5.105800e+10,0.01631,3.090210e+03,47.396416,1.232970e+08,3.013642
22,40070.401155,1.809950e+07,2.270450e+08,1.901650e+08,0.0,4.324690e-09,1.621790e+07,5.877880e+08,1.770030e+08,3.641170e+06,4.838100e+10,0.349188,4.838100e+10,0.384356,0.027619,1.426350,1.919880,0.000000,95.994194,0.0,0.011067,0.010234,0.012381,0.006787,35.989300,12.606100,12.606100,-99.00000,71.040001,0.042284,0.013952,38.245399,145.789993,14.436200,470.290009,24.617201,-97.000000,24.825500,0.0,0.0,4.324690e-09,0.340443,0.0,6.844610e+09,3.426380e+08,0.0,0.0,0.324327,2.138570e+04,9.198190e+05,4.838100e+10,0.03210,1.536710e+03,46.553160,1.607851e+08,3.093642
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5985032,389436.987462,2.649780e+07,3.684750e+07,2.741310e+07,0.0,4.309700e-08,1.229680e+08,1.175010e+08,5.210760e+08,2.649820e+07,2.581930e+10,0.132893,2.581930e+10,1.220130,0.113940,0.858562,1.557270,0.000000,77.863701,0.0,0.019080,0.009538,0.011731,0.009593,42.851101,8.523450,12.464400,-99.00000,60.759998,0.011068,0.062515,302.384003,36.540001,296.097992,366.660004,294.838989,47.419998,37.033699,0.0,0.0,4.309700e-08,0.702343,0.0,1.410790e+10,7.030710e+08,0.0,0.0,1.056100,2.411770e+04,6.643380e+06,2.581930e+10,0.02224,1.819180e+03,37.760604,3.981080e+08,2.238148
5985270,10019.500223,0.000000e+00,4.336810e-10,2.065750e+06,0.0,1.544280e-09,1.236160e+08,2.791030e+06,1.255120e+08,0.000000e+00,3.340320e+10,0.000000,3.340320e+10,0.286403,0.017819,0.127728,1.696860,0.000000,84.843002,0.0,0.005694,0.005409,0.008143,0.003496,23.390200,113.702003,113.702003,-99.00000,38.509998,0.000177,0.041280,298.363007,72.849998,277.878998,215.789993,289.779999,-37.189999,6.808350,0.0,0.0,1.544280e-09,0.276727,0.0,5.560120e+09,2.863500e+08,0.0,0.0,0.222301,9.328610e+04,0.000000e+00,3.340320e+10,0.06907,1.172070e+03,41.145301,1.896001e+06,0.131286
5987488,10008.699974,0.000000e+00,0.000000e+00,0.000000e+00,0.0,0.000000e+00,5.473510e+08,0.000000e+00,5.475710e+08,0.000000e+00,4.762650e+09,0.000000,9.607320e+09,117.511002,0.000000,0.688557,0.961089,0.016262,48.054501,1.0,0.163359,0.153028,0.195078,0.100241,16.902100,113.702003,113.702003,2.96912,26.639999,0.001911,0.086052,275.682007,466.476013,293.325989,338.010986,269.873993,-281.791992,9.361450,0.0,0.0,1.622200e-08,0.292089,0.0,5.850320e+09,1.009390e+09,0.0,0.0,0.043318,6.268910e+11,2.653420e+04,2.796030e+10,0.07894,1.069610e+07,29.320294,2.200007e+05,0.688895
5987834,824788.003229,1.857910e+06,2.691140e+07,6.153290e+07,0.0,5.423410e-08,8.339530e+08,1.219330e+08,8.375820e+08,3.927690e+06,3.513580e+10,0.346080,3.513580e+10,0.273052,0.034979,0.387833,1.725710,0.000000,86.285301,0.0,0.001116,0.001038,0.001250,0.000684,46.439400,4.730840,4.730840,-99.00000,37.849998,0.015619,1.060400,292.904999,73.849998,280.065002,293.750000,291.777008,-273.070007,13.299800,0.0,0.0,5.423410e-08,0.262849,0.0,5.281840e+09,2.637140e+08,0.0,0.0,0.648673,8.409520e+03,9.819220e+05,3.513580e+10,0.06677,1.033070e+03,41.844727,3.628969e+06,0.390753


In [10]:
# Rename GalpropR50 as GalpropHalfRadius because that's how it was named in the old SAM (hence all code uses this name)
# Similarly for HalopropMvir -> GalpropMvir. Update(2/10/23): Don't rename HalopropMvir to GalpropMvir because the two are 
# identical for centrals, but not for satellites. Moreover, in the rest of the old SAM analysis I have used
# HalopropMvir to normalize all masses, and not GalpropMvir. However, Ari suggests it would be better to keep
# GalpropMVir for the sake of consistency with the other names for halo properties, like GalpropRhalo, GalpropVvir, etc.
df_new = df.rename(columns={'GalpropHalfmassRadius': 'GalpropHalfRadius'}) #, 'HalopropMvir' : 'GalpropMvir'})
df_new.head(2)

Unnamed: 0,GalpropMBH,GalpropMH2,GalpropMHI,GalpropMHII,GalpropMaccdot,GalpropMaccdot_radio,GalpropMbulge,GalpropMcold,GalpropMstar,GalpropMstar_merge,GalpropMstrip,GalpropMu_merger,GalpropMvir,GalpropOutflowRate_Mass,GalpropOutflowRate_Metal,GalpropRbulge,GalpropRdisk,GalpropRfric,GalpropRhalo,GalpropSatType,GalpropSfr,GalpropSfrave100myr,GalpropSfrave1gyr,GalpropSfrave20myr,GalpropSigmaBulge,GalpropTmerger,GalpropTmerger_major,GalpropTsat,GalpropVdisk,GalpropZcold,GalpropZstar,GalpropX,GalpropVx,GalpropY,GalpropVy,GalpropZ,GalpropVz,HalopropC_nfw,HalopropMaccdot_metal,HalopropMaccdot_pristine,HalopropMaccdot_radio,HalopropMaccdot_reaccreate,HalopropMaccdot_reaccreate_metal,HalopropMass_ejected,HalopropMcooldot,HalopropMdot_eject,HalopropMdot_eject_metal,HalopropMetal_ejected,HalopropMhot,HalopropMstar_diffuse,HalopropMvir,HalopropSpin,HalopropZhot,GalpropVvir,GalpropMdisk,GalpropHalfRadius
3,70113.099355,5334980.0,84106200.0,87989400.0,0.0,1.04141e-08,49563400.0,240130000.0,149829000.0,1260270.0,22586400000.0,0.139166,22586400000.0,0.273785,0.016982,1.28589,1.48936,0.0,74.467804,0.0,0.004072,0.003734,0.004101,0.002496,29.518999,3.09234,113.702003,-99.0,56.5,0.014896,0.01126,37.243801,281.679993,15.8553,-270.540009,23.5175,-112.489998,30.196501,0.0,0.0,1.04141e-08,0.312146,0.0,6264690000.0,312488000.0,0.0,0.0,0.29478,3334.919938,315067.009069,22586400000.0,0.01981,206.850999,36.113822,100265600.0,2.172346
4,40086.601075,5297840.0,118102000.0,118689000.0,0.0,5.83336e-09,41068600.0,328103000.0,267731000.0,1462520.0,34174800000.0,0.123353,34174800000.0,0.524337,0.032891,1.60926,1.70983,0.0,85.491302,0.0,0.009404,0.008752,0.011879,0.005768,33.43,0.869185,12.6061,-99.0,62.529999,0.020568,0.020358,37.437302,360.320007,15.6441,-307.429993,23.5804,-16.27,31.3643,0.0,0.0,5.83336e-09,0.622176,0.0,12485600000.0,624252000.0,0.0,0.0,0.5354,20187.0007,814342.987724,34174800000.0,0.01449,925.469976,41.45971,226662400.0,2.727576


In [11]:
# this df contains only central galaxies; features are from TNG300-SAM
df_centrals = df_new[df_new.loc[:, 'GalpropSatType'] == 0]

In [12]:
# There are 813,838  central galaxies in TNG300-SAM 
df_centrals.shape

(813838, 56)

### <font color='blue'> Filter the dataset

In [13]:
# Remove all galaxies with stellar mass log10Mstar<=8 and re-define centrals as such:
df_centrals = df_centrals.loc[df_centrals['GalpropMstar']>1e8, :]

# 813,838 central galaxies with stellar mass >1e8 (for comparison, there were ~39,000 centrals with Mstar>1e8 in TNG100-NewSAM)

In [14]:
# All centrals in this dataset are already more massive than 1e8 because Ari made the mass cut before uploading the dataset
df_centrals.shape

(813838, 56)

## Add HalopropVvir: halo velocity

In [15]:
from astropy.cosmology import FlatLambdaCDM
from astropy import units as u
from astropy import constants as cons


def Delta(z,Om0=0.279,Ob0=0.046,h=0.7):
    #Bryan & Norman 99
    cosmo = FlatLambdaCDM(H0=100*h, Om0=Om0, Ob0=Ob0)
    x=cosmo.Om(z)-1
    return 18*np.pi**2+82*x-39*x**2

def rho_crit(z,Om0=0.279,Ob0=0.046,h=0.7):
    cosmo = FlatLambdaCDM(H0=100*h, Om0=Om0, Ob0=Ob0)
    return cosmo.critical_density(z).to(u.Msun/u.kpc**3).value

def rho_vir(z,Om0=0.279,Ob0=0.046,h=0.7):
    D=Delta(z,Om0=Om0,Ob0=Ob0,h=h)
    cosmo = FlatLambdaCDM(H0=100*h, Om0=Om0, Ob0=Ob0)
    rho_c = cosmo.critical_density(z).to(u.Msun/u.kpc**3).value
    return D*rho_c

def Rvir_to_Mvir(Rvir,z,Om0=0.279,Ob0=0.046,h=0.7):
    rho_v = rho_vir(z,Om0=Om0,Ob0=Ob0,h=h)
    return rho_v*4./3.*np.pi*Rvir**3   #Msun

def Mvir_to_Rvir(Mvir,z,Om0=0.279,Ob0=0.046,h=0.7):
    rho_v = rho_vir(z,Om0=Om0,Ob0=Ob0,h=h)
    return  (Mvir/(rho_v*4./3.*np.pi))**(1./3.)  #kpc

def Mvir_to_Vvir(Mvir,z,Om0=0.279,Ob0=0.046,h=0.7):
    Rvir = Mvir_to_Rvir(Mvir,z,Om0=Om0,Ob0=Ob0,h=h)
    return (np.sqrt(cons.G*Mvir*u.Msun/(Rvir*u.kpc))).to(u.km/u.s)

In [16]:
# Define Halo property Vvir (as it does not exist in the dataset above)
v= Mvir_to_Vvir (np.array(df_centrals.loc[:,'HalopropMvir']),0) # has units

In [17]:
# add Halo property Vvir
df_centrals.loc[:, 'HalopropVvir']=v.value # shape of df at this point: df_log_mass_filtered.shape (35390, 36)

In [18]:
df_centrals.shape

(813838, 57)

Note: Ari made one number a little more accurate for GalpropVvir in the recent catalog, but there is a very small difference between GalpropVvir and HalopropVvir calculated above. I have used HalopropVvir in all analysis.

### Calculate the percentage of zeros each column has

In [19]:
# 1) replace 0s with NaN 2) .isnull() gives boolean (whether NaN is true or false)
# 3) .sum() sums the true instances (that is, all 0s in a column)
# 4) dividing by shape[0] divides by the number of total entries (1,346,899) 5) *100 to convert to percentages
100*df_centrals.replace(0, np.nan).isnull().sum()/df.shape[0] 

GalpropMBH                           0.000000
GalpropMH2                           0.307201
GalpropMHI                           0.234436
GalpropMHII                          0.059581
GalpropMaccdot                      68.066663
GalpropMaccdot_radio                 0.000000
GalpropMbulge                        0.365345
GalpropMcold                         0.059496
GalpropMstar                         0.000000
GalpropMstar_merge                   0.395262
GalpropMstrip                        0.000000
GalpropMu_merger                     7.062746
GalpropMvir                          0.000000
GalpropOutflowRate_Mass              0.074201
GalpropOutflowRate_Metal             0.102513
GalpropRbulge                        0.365345
GalpropRdisk                         0.000000
GalpropRfric                        68.779014
GalpropRhalo                         0.000000
GalpropSatType                      68.779014
GalpropSfr                           0.074201
GalpropSfrave100myr               

In [20]:
# calculate the percentage of 0s in each column in order to 1) remove columns of all 0s;
# 2) decide what strategy to use for columns with mostly 0s
df_percentage=pd.DataFrame(100*df_centrals.replace(0, np.nan).isnull().sum()/df_centrals.shape[0] )

In [21]:
df_percentage

Unnamed: 0,0
GalpropMBH,0.0
GalpropMH2,0.446649
GalpropMHI,0.340854
GalpropMHII,0.086627
GalpropMaccdot,98.96429
GalpropMaccdot_radio,0.0
GalpropMbulge,0.531187
GalpropMcold,0.086504
GalpropMstar,0.0
GalpropMstar_merge,0.574684


In [22]:
# Remove all the columns that are mostly 0s
df_centrals = df_centrals.drop(columns = ['GalpropMaccdot', 'GalpropRfric', 'GalpropSatType',
                                       'HalopropMaccdot_metal', 'HalopropMaccdot_reaccreate_metal',
                                       'HalopropMdot_eject', 'HalopropMdot_eject_metal' ])

In [23]:
df_centrals.shape

(813838, 50)

### Remove other columns like position and 3D velocities

In [24]:
df_centrals = df_centrals.drop(columns = ['GalpropX', 'GalpropVx', 'GalpropY', 'GalpropVy',
                                         'GalpropZ', 'GalpropVz'])

In [25]:
df_centrals.shape

(813838, 44)

### <font color='blue'> Logarithmic scale of stellar mass, SFR, Mhalo

In [26]:
df_centrals_log=log_mass_size(df_centrals)

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [27]:
df_centrals_log.head(2)

Unnamed: 0,GalpropMBH,GalpropMH2,GalpropMHI,GalpropMHII,GalpropMaccdot_radio,GalpropMbulge,GalpropMcold,GalpropMstar,GalpropMstar_merge,GalpropMstrip,GalpropMu_merger,GalpropMvir,GalpropOutflowRate_Mass,GalpropOutflowRate_Metal,GalpropRbulge,GalpropRdisk,GalpropRhalo,GalpropSfr,GalpropSfrave100myr,GalpropSfrave1gyr,GalpropSfrave20myr,GalpropSigmaBulge,GalpropTmerger,GalpropTmerger_major,GalpropTsat,GalpropVdisk,GalpropZcold,GalpropZstar,HalopropC_nfw,HalopropMaccdot_pristine,HalopropMaccdot_radio,HalopropMaccdot_reaccreate,HalopropMass_ejected,HalopropMcooldot,HalopropMetal_ejected,HalopropMhot,HalopropMstar_diffuse,HalopropMvir,HalopropSpin,HalopropZhot,GalpropVvir,GalpropMdisk,GalpropHalfRadius,HalopropVvir,GalpropLogMstar,GalpropLogHalfRadius,HalopropLogMvir,HalopropLogMhot,GalpropLogMbulge
3,70113.099355,5334980.0,84106200.0,87989400.0,1.04141e-08,49563400.0,240130000.0,149829000.0,1260270.0,22586400000.0,0.139166,22586400000.0,0.273785,0.016982,1.28589,1.48936,74.467804,0.004072,0.003734,0.004101,0.002496,29.518999,3.09234,113.702003,-99.0,56.5,0.014896,0.01126,30.196501,0.0,1.04141e-08,0.312146,6264690000.0,312488000.0,0.29478,3334.919938,315067.009069,22586400000.0,0.01981,206.850999,36.113822,100265600.0,2.172346,36.256929,8.175596,0.336929,10.353847,3.523085,7.695161
4,40086.601075,5297840.0,118102000.0,118689000.0,5.83336e-09,41068600.0,328103000.0,267731000.0,1462520.0,34174800000.0,0.123353,34174800000.0,0.524337,0.032891,1.60926,1.70983,85.491302,0.009404,0.008752,0.011879,0.005768,33.43,0.869185,12.6061,-99.0,62.529999,0.020568,0.020358,31.3643,0.0,5.83336e-09,0.622176,12485600000.0,624252000.0,0.5354,20187.0007,814342.987724,34174800000.0,0.01449,925.469976,41.45971,226662400.0,2.727576,41.62402,8.427699,0.435777,10.533706,4.305072,7.61351


In [28]:
df_centrals_log.shape

(813838, 49)

In [29]:
def log_sfr(df):
    '''
    Converts to log10 non-zero SFR entries (without distinguishing galaxy type)
    output: df with new columns of log SFR

    '''
    df.loc[:,'GalpropLogSfr']=df.loc[:,'GalpropSfr'].apply(lambda x: np.log10(x) if x>0 else 0)
    df.loc[:,'GalpropLogSfrave100myr']=df.loc[:,'GalpropSfrave100myr'].apply(lambda x: np.log10(x) if x>0 else 0)
    df.loc[:,'GalpropLogSfrave1gyr']=df.loc[:,'GalpropSfrave1gyr'].apply(lambda x: np.log10(x) if x>0 else 0)
    df.loc[:,'GalpropLogSfrave20myr']=df.loc[:,'GalpropSfrave20myr'].apply(lambda x: np.log10(x) if x>0 else 0)

    return df

In [30]:
# add a column with the logarithmic value of SFR, setting any SFR=0 to a fixed value of -2
df_centrals_log = log_sfr(df_centrals_log)

In [31]:
df_centrals_log.head(2)

Unnamed: 0,GalpropMBH,GalpropMH2,GalpropMHI,GalpropMHII,GalpropMaccdot_radio,GalpropMbulge,GalpropMcold,GalpropMstar,GalpropMstar_merge,GalpropMstrip,GalpropMu_merger,GalpropMvir,GalpropOutflowRate_Mass,GalpropOutflowRate_Metal,GalpropRbulge,GalpropRdisk,GalpropRhalo,GalpropSfr,GalpropSfrave100myr,GalpropSfrave1gyr,GalpropSfrave20myr,GalpropSigmaBulge,GalpropTmerger,GalpropTmerger_major,GalpropTsat,GalpropVdisk,GalpropZcold,GalpropZstar,HalopropC_nfw,HalopropMaccdot_pristine,HalopropMaccdot_radio,HalopropMaccdot_reaccreate,HalopropMass_ejected,HalopropMcooldot,HalopropMetal_ejected,HalopropMhot,HalopropMstar_diffuse,HalopropMvir,HalopropSpin,HalopropZhot,GalpropVvir,GalpropMdisk,GalpropHalfRadius,HalopropVvir,GalpropLogMstar,GalpropLogHalfRadius,HalopropLogMvir,HalopropLogMhot,GalpropLogMbulge,GalpropLogSfr,GalpropLogSfrave100myr,GalpropLogSfrave1gyr,GalpropLogSfrave20myr
3,70113.099355,5334980.0,84106200.0,87989400.0,1.04141e-08,49563400.0,240130000.0,149829000.0,1260270.0,22586400000.0,0.139166,22586400000.0,0.273785,0.016982,1.28589,1.48936,74.467804,0.004072,0.003734,0.004101,0.002496,29.518999,3.09234,113.702003,-99.0,56.5,0.014896,0.01126,30.196501,0.0,1.04141e-08,0.312146,6264690000.0,312488000.0,0.29478,3334.919938,315067.009069,22586400000.0,0.01981,206.850999,36.113822,100265600.0,2.172346,36.256929,8.175596,0.336929,10.353847,3.523085,7.695161,-2.390215,-2.42787,-2.387158,-2.602757
4,40086.601075,5297840.0,118102000.0,118689000.0,5.83336e-09,41068600.0,328103000.0,267731000.0,1462520.0,34174800000.0,0.123353,34174800000.0,0.524337,0.032891,1.60926,1.70983,85.491302,0.009404,0.008752,0.011879,0.005768,33.43,0.869185,12.6061,-99.0,62.529999,0.020568,0.020358,31.3643,0.0,5.83336e-09,0.622176,12485600000.0,624252000.0,0.5354,20187.0007,814342.987724,34174800000.0,0.01449,925.469976,41.45971,226662400.0,2.727576,41.62402,8.427699,0.435777,10.533706,4.305072,7.61351,-2.026706,-2.057871,-1.925213,-2.238952


In [32]:
df_centrals_log.shape

(813838, 53)

### Gas fraction

In [33]:
df_centrals_log.loc[:,'Galprop_neutral_H_mass']=(df_centrals_log.loc[:,'GalpropMHI']+df_centrals_log.loc[:,'GalpropMH2'])
df_centrals_log.loc[:,'Galprop_baryon_mass']=df_centrals_log.loc[:,'Galprop_neutral_H_mass']+df_centrals_log.loc[:,'GalpropMstar']
df_centrals_log.loc[:,'Galprop_gas_fraction']=df_centrals_log.loc[:,'Galprop_neutral_H_mass']/df_centrals_log.loc[:,'Galprop_baryon_mass']

In [34]:
df_centrals_log.loc[: , 'MstarMvir_ratio'] = df_centrals_log.loc[:, 'GalpropMstar']/df_centrals_log.loc[:, 'HalopropMvir']

# Add BulgeMstar ratio and its corresponding quantities (in order to use them in the physical formula)

df_centrals_log.loc[:, 'BulgeMstar_ratio'] = df_centrals_log.loc[:,'GalpropMbulge']/df_centrals_log.loc[:, 'GalpropMstar']
df_centrals_log.loc[:, 'DiskMstar_ratio']  = df_centrals_log.loc[:, 'GalpropMdisk']/df_centrals_log.loc[:, 'GalpropMstar']

In [35]:
df_centrals_log.shape

(813838, 59)

# <font color='red'> Conclusion: The final dataset has 3 cuts and 1 re-definition:
    
### <font color='red'> 1) remove all galaxies with $log_{10}M_{star}$<9.0, 
                                                                          
### <font color='red'> 2) reomve all galaxies with $f_{disk}$<0.0205 and disk galaxies with $M_{bulge}/M_{star}$<0.4 , 
    
### <font color='red'> 3) remove all non-physical galaxies with $M_{star}/M_{vir}$>0.2,
                                                                                        
### <font color='red'> 4) Re-define spin where $spin_{effective}$ = 0.02 for all spin<=0.02.

## <font color='purple'> <a id =2> </a> <br> The Raw Dataset

#### 1) remove all galaxies with $log_{10}M_{star}$<9.0, 

In [36]:
# Remove all galaxies with stellar mass log10Mstar<9 and re-define centrals as such:
df_centrals_log = df_centrals_log.loc[df_centrals_log['GalpropMstar']>=1e9, :]

# There are 540,072 (~66%) galaxies with log10Mstar=[8.0, 9.0).  
# That is, 273,766 galaxies (~34%) have a log10Mstar>=9.

In [37]:
df_centrals_log.shape

(273766, 59)

#### 2) reomve all galaxies with $f_{disk}$<0.0205 and disk galaxies with $M_{bulge}/M_{star}$<0.4 ,

In [38]:
# Add fdisk field
# From the True Physical formula, fdisk is defined as below:
# mdisk = (mstar_disk + mass_cold_gas) Important Note: Use Mdisk (and NOT Mstar) here!!!
# fdisk = mdisk/m_halo
# In our normalized dataset, NormMstar=Mstar/Mhalo and NormMcold=Mcold/Mhalo, so fdisk = NormMstar + NormMcold
df_centrals_log.loc[:, 'fdisk'] = (df_centrals_log.loc[:, 'GalpropMdisk'] + df_centrals_log.loc[:, 'GalpropMcold'])/df_centrals_log.loc[:, 'HalopropMvir']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


In [39]:
df_centrals_log.shape

(273766, 60)

In [40]:
# Choose all df_centrals_log, but remove (fdisk<0.0205 & Mbulge/Mstar<0.4) galaxies
df_centrals_log = df_centrals_log[~((df_centrals_log.fdisk<0.0205)&(df_centrals_log.BulgeMstar_ratio<0.4))]
df_centrals_log.shape

(207848, 60)

In [41]:
df_centrals_log = df_centrals_log.drop(columns=['fdisk'])
df_centrals_log.shape

(207848, 59)

##### Note: There are ~66,000 galaxies (65,918; 24% of galaxies) with $f_{disk}$<0.0205 AND $M_{bulge}/M_{star}$<0.4 in the  dataset with $log_{10}M_{star}$>9.0 mass cut. Therefore, by imposing the morphology cut together with the low fdisk cut, we are removing 24% of galaxies.

#### 3) remove all non-physical galaxies with $M_{star}/M_{vir}$>0.2,

In [42]:
# Remove non-physical galaxies whose Mstar/Mvir > 0.2
df_centrals_log = df_centrals_log[df_centrals_log.MstarMvir_ratio < 0.2]
df_centrals_log.shape
# There are 381 non-physical galaxies in this dataset

(207467, 59)

The dataset so far is saved as:
df_centrals_log.to_csv ('TNG300-SAM_images/v1_TNG300-SAM_cleanup_normalize_dataset/TNG300-NewSAM_Raw_Dataset_fromv1_wo_nonphys_mstar9_and_diskgals_w_smallfdisk.csv', index=False) 

#### 4) Re-define spin where $spin_{effective}$ = 0.02 for all spin<=0.02.

In [45]:
# There are 28,062 galaxies with Spin<0.02
df_centrals_log[df_centrals_log.HalopropSpin<0.02].shape

(28096, 59)

In [46]:
# Set all Spin<0.02 equal to 0.02
df_centrals_log.loc[:,'HalopropSpin_effective']=df_centrals_log.loc[:,'HalopropSpin'].apply(lambda x: 0.02 if x<0.02 else x)

In [47]:
df_not_normalized = df_centrals_log.drop(columns=['HalopropSpin'])

In [48]:
# Rename Spin_effective back to Spin because all the rest of the code has this name used
df_not_normalized = df_not_normalized.rename(columns={'HalopropSpin_effective': 'HalopropSpin'})
df_not_normalized.shape

(207467, 59)

In [53]:
df_not_normalized[df_not_normalized.HalopropSpin==0.02].shape

(28096, 59)

### <font color='blue'> Save the Raw Dataset and use it for all analysis
This is the final dataset used in all analysis. It is saved as:
    
df_not_normalized.to_csv('TNG300-SAM_images/v6_TNG300-SAM_Morphologies_definition/v6_TNG300-NewSAM_Raw_Dataset_fromv1_wo_nonphys_mstar9_and_diskgals_w_smallfdisk_w_spineff.csv', index=False)

## <font color='purple'> <a id =3> </a> <br> The Normalized Dataset (or, Dimensionless Dataset)

In [55]:
df_normalized_step1 =df_not_normalized.drop(columns =['GalpropTsat',
                                               'GalpropLogMstar', 'GalpropLogHalfRadius', 'HalopropLogMvir',
                                               'HalopropLogMhot', 'GalpropLogMbulge', 'GalpropLogSfr',
                                               'GalpropLogSfrave100myr', 'GalpropLogSfrave1gyr', 
                                               'GalpropLogSfrave20myr', 'Galprop_neutral_H_mass',
                                               'Galprop_baryon_mass', 'Galprop_gas_fraction',
                                               'GalpropRbulge',  'GalpropRdisk', 'GalpropVvir', 'GalpropMvir'])
df_normalized_step1.shape

(207467, 42)

In [56]:
def normalization_func(df):
    '''
    Normalizes the dataset by dividing all masses by halo mass, and galaxy size by halo size

    '''
#     could also use the method below
#     df_log_mass_filtered.loc[:,'GalpropNormMstar']=df_log_mass_filtered.loc[:,'GalpropMstar'].div(df_log_mass_filtered.HalopropMvir, axis=0)
    
    df =df.copy()
    
    halomass=df.loc[:,'HalopropMvir']
    halorad= df.loc[:,'GalpropRhalo']
    
    df.loc[:,'GalpropNormHalfRadius']=df.loc[:,'GalpropHalfRadius']/halorad
    df.loc[:,'GalpropNormRhalo']=df.loc[:,'GalpropRhalo']/halorad

    df.loc[:,'GalpropNormMstar']=df.loc[:,'GalpropMstar']/halomass
#     df.loc[:,'GalpropNormMvir']=df.loc[:,'GalpropMvir']/halomass
    df.loc[:,'HalopropNormMhot']=df.loc[:,'HalopropMhot']/halomass
    df.loc[:,'GalpropNormMbulge']=df.loc[:,'GalpropMbulge']/halomass
    df.loc[:,'GalpropNormMdisk']=df.loc[:,'GalpropMdisk']/halomass

    
    df.loc[:,'GalpropNormMBH']=df.loc[:,'GalpropMBH']/halomass
    df.loc[:,'GalpropNormMH2']=df.loc[:,'GalpropMH2']/halomass
    df.loc[:,'GalpropNormMHI']=df.loc[:,'GalpropMHI']/halomass
    df.loc[:,'GalpropNormMHII']=df.loc[:,'GalpropMHII']/halomass
    df.loc[:,'GalpropNormMcold']=df.loc[:,'GalpropMcold']/halomass
    df.loc[:,'GalpropNormMstar_merge']=df.loc[:,'GalpropMstar_merge']/halomass
#     df.loc[:,'GalpropNormMstrip']=df.loc[:,'GalpropMstrip']/halomass
    
    df.loc[:,'HalopropNormMass_ejected']=df.loc[:,'HalopropMass_ejected']/halomass
    df.loc[:,'HalopropNormMstar_diffuse']=df.loc[:,'HalopropMstar_diffuse']/halomass
    df.loc[:,'HalopropNormMvir']=df.loc[:,'HalopropMvir']/halomass
    
    df.loc[:,'GalpropNormSigmaBulge']=df.loc[:,'GalpropSigmaBulge']/df.loc[:,'HalopropVvir']
    df.loc[:,'GalpropNormVdisk']=df.loc[:,'GalpropVdisk']/df.loc[:,'HalopropVvir']
    
    return df

In [57]:
df_normalized_step2 = normalization_func(df_normalized_step1)

In [58]:
df_normalized_step2.shape

(207467, 59)

In [59]:
# drop the original columns of normalized features + columns used for normalization + MstarMvir_ratio(bec same as NormMstar now)
df_normalized=df_normalized_step2.drop(columns=['GalpropHalfRadius',  'GalpropMstar','HalopropMhot',
                                                  'GalpropMbulge','GalpropMdisk', 'GalpropMBH','GalpropMH2',
                                                  'GalpropMHI','GalpropMHII','GalpropMcold',
                                                  'GalpropMstar_merge','HalopropMass_ejected',
                                                  'HalopropMstar_diffuse','HalopropMvir', 'HalopropNormMvir', 
                                                  'GalpropRhalo','GalpropNormRhalo', 'HalopropVvir',
                                                  'GalpropSigmaBulge', 'GalpropVdisk', 'MstarMvir_ratio'])

In [60]:
df_normalized.shape

(207467, 38)

### <font color='blue'><a id =1> </a> <br> Save the Normalized Dataset and use it for all analysis
This is the final normalized dataset used in all analysis. It is saved as:
This is the Normalized Dataset used in all analysis. It is saved as:

df_normalized.to_csv('TNG300-SAM_images/v6_TNG300-SAM_Morphologies_definition/v6_TNG300-NewSAM_Normalized_Dataset_fromv1_wo_mstar9_nonphys_and_diskgals_w_smallfdsik_w_spineff.csv', index=False)


## <font color='purple'> <a id =4> </a> <br> Datasets by Morphology

In [63]:
# 52,053 galaxies in df_1
df_1 = df_normalized.loc[df_normalized.loc[:, 'BulgeMstar_ratio']<=0.10] 
df_1_raw = df_not_normalized.loc[df_not_normalized.loc[:, 'BulgeMstar_ratio']<=0.10] 
# df_1.to_csv('TNG300-SAM_images/v6_TNG300-SAM_Morphologies_definition/df_1_Normalized_as_defined_in_TNG300notebook_v6', index=False)
# df_1_raw.to_csv('TNG300-SAM_images/v6_TNG300-SAM_Morphologies_definition/df_1_Raw_as_defined_in_TNG300notebook_v6', index=False)
print("df_1.shape", df_1.shape)
print("df_1_raw.shape", df_1_raw.shape)

# 64,494 galaxies in df_2
df_2 = df_normalized.loc[(df_normalized.BulgeMstar_ratio > 0.10) & (df_normalized.BulgeMstar_ratio <= 0.20)] 
df_2_raw = df_not_normalized.loc[(df_not_normalized.BulgeMstar_ratio > 0.10) & (df_not_normalized.BulgeMstar_ratio <= 0.20)] 
# df_2.to_csv('TNG300-SAM_images/v6_TNG300-SAM_Morphologies_definition/df_2_Normalized_as_defined_in_TNG300notebook_v6', index=False)
# df_2_raw.to_csv('TNG300-SAM_images/v6_TNG300-SAM_Morphologies_definition/df_2_Raw_as_defined_in_TNG300notebook_v6', index=False)
print("df_2.shape", df_2.shape)
print("df_2_raw.shape", df_2_raw.shape)

# 28,372 galaxies in df_3
df_3 = df_normalized.loc[(df_normalized.BulgeMstar_ratio > 0.20) & (df_normalized.BulgeMstar_ratio <= 0.30)] 
df_3_raw = df_not_normalized.loc[(df_not_normalized.BulgeMstar_ratio > 0.20) & (df_not_normalized.BulgeMstar_ratio <= 0.30)] 
# df_3.to_csv('TNG300-SAM_images/v6_TNG300-SAM_Morphologies_definition/df_3_Normalized_as_defined_in_TNG300notebook_v6', index=False)
# df_3_raw.to_csv('TNG300-SAM_images/v6_TNG300-SAM_Morphologies_definition/df_3_Raw_as_defined_in_TNG300notebook_v6', index=False)
print("df_3.shape", df_3.shape)
print("df_3_raw.shape", df_3_raw.shape)

# 13,582 galaxies in df_4
df_4 = df_normalized.loc[(df_normalized.BulgeMstar_ratio > 0.30) & (df_normalized.BulgeMstar_ratio <= 0.40)] 
df_4_raw = df_not_normalized.loc[(df_not_normalized.BulgeMstar_ratio > 0.30) & (df_not_normalized.BulgeMstar_ratio <= 0.40)] 
# df_4.to_csv('TNG300-SAM_images/v6_TNG300-SAM_Morphologies_definition/df_4_Normalized_as_defined_in_TNG300notebook_v6', index=False)
# df_4_raw.to_csv('TNG300-SAM_images/v6_TNG300-SAM_Morphologies_definition/df_4_Raw_as_defined_in_TNG300notebook_v6', index=False)
print("df_4.shape", df_4.shape)
print("df_4_raw.shape", df_4_raw.shape)

# 13,539 galaxies in df_5
df_5 = df_normalized.loc[(df_normalized.BulgeMstar_ratio > 0.40) & (df_normalized.BulgeMstar_ratio <= 0.50)] 
df_5_raw = df_not_normalized.loc[(df_not_normalized.BulgeMstar_ratio > 0.40) & (df_not_normalized.BulgeMstar_ratio <= 0.50)] 
# df_5.to_csv('TNG300-SAM_images/v6_TNG300-SAM_Morphologies_definition/df_5_Normalized_as_defined_in_TNG300notebook_v6', index=False)
# df_5_raw.to_csv('TNG300-SAM_images/v6_TNG300-SAM_Morphologies_definition/df_5_Raw_as_defined_in_TNG300notebook_v6', index=False)
print("df_5.shape", df_5.shape)
print("df_5_raw.shape", df_5_raw.shape)

# 10,167 galaxies in df_6
df_6 = df_normalized.loc[(df_normalized.BulgeMstar_ratio > 0.50) & (df_normalized.BulgeMstar_ratio <= 0.60)] 
df_6_raw = df_not_normalized.loc[(df_not_normalized.BulgeMstar_ratio > 0.50) & (df_not_normalized.BulgeMstar_ratio <= 0.60)] 
# df_6.to_csv('TNG300-SAM_images/v6_TNG300-SAM_Morphologies_definition/df_6_Normalized_as_defined_in_TNG300notebook_v6', index=False)
# df_6_raw.to_csv('TNG300-SAM_images/v6_TNG300-SAM_Morphologies_definition/df_6_Raw_as_defined_in_TNG300notebook_v6', index=False)
print("df_6.shape", df_6.shape)
print("df_6_raw.shape", df_6_raw.shape)

# 8,367 galaxies in df_7
df_7 = df_normalized.loc[(df_normalized.BulgeMstar_ratio > 0.60) & (df_normalized.BulgeMstar_ratio <= 0.70)] 
df_7_raw = df_not_normalized.loc[(df_not_normalized.BulgeMstar_ratio > 0.60) & (df_not_normalized.BulgeMstar_ratio <= 0.70)] 
# df_7.to_csv('TNG300-SAM_images/v6_TNG300-SAM_Morphologies_definition/df_7_Normalized_as_defined_in_TNG300notebook_v6', index=False)
# df_7_raw.to_csv('TNG300-SAM_images/v6_TNG300-SAM_Morphologies_definition/df_7_Raw_as_defined_in_TNG300notebook_v6', index=False)
print("df_7.shape", df_7.shape)
print("df_7_raw.shape", df_7_raw.shape)

# 6,853 galaxies in df_8
df_8 = df_normalized.loc[(df_normalized.BulgeMstar_ratio > 0.70) & (df_normalized.BulgeMstar_ratio <= 0.80)] 
df_8_raw = df_not_normalized.loc[(df_not_normalized.BulgeMstar_ratio > 0.70) & (df_not_normalized.BulgeMstar_ratio <= 0.80)] 
# df_8.to_csv('TNG300-SAM_images/v6_TNG300-SAM_Morphologies_definition/df_8_Normalized_as_defined_in_TNG300notebook_v6', index=False)
# df_8_raw.to_csv('TNG300-SAM_images/v6_TNG300-SAM_Morphologies_definition/df_8_Raw_as_defined_in_TNG300notebook_v6', index=False)
print("df_8.shape", df_8.shape)
print("df_8_raw.shape", df_8_raw.shape)

# 10,040 galaxies in df_9
df_9 = df_normalized.loc[df_normalized.loc[:, 'BulgeMstar_ratio']>0.80] 
df_9_raw = df_not_normalized.loc[df_not_normalized.loc[:, 'BulgeMstar_ratio']>0.80] 
# df_9.to_csv('TNG300-SAM_images/v6_TNG300-SAM_Morphologies_definition/df_9_Normalized_as_defined_in_TNG300notebook_v6', index=False)
# df_9_raw.to_csv('TNG300-SAM_images/v6_TNG300-SAM_Morphologies_definition/df_9_Raw_as_defined_in_TNG300notebook_v6', index=False)
print("df_9.shape", df_9.shape)
print("df_9_raw.shape", df_9_raw.shape)

# Combined Disk morphologies 1 to 4 in order to fit one physical model to all of them; 
# 158,501 galaxies in the 1-4 morphologies, out of a total of ~207,000
df_14 = df_normalized.loc[df_normalized.BulgeMstar_ratio <= 0.40] #normalized
df_14_raw = df_not_normalized.loc[df_not_normalized.BulgeMstar_ratio <= 0.40] 
# df_14.to_csv('TNG300-SAM_images/v6_TNG300-SAM_Morphologies_definition/df_14_Normalized_as_defined_in_TNG300notebook_v6', index=False)
# df_14_raw.to_csv('TNG300-SAM_images/v6_TNG300-SAM_Morphologies_definition/df_14_Raw_as_defined_in_TNG300notebook_v6', index=False)
print("df_14.shape", df_14.shape)
print("df_14_raw.shape", df_14_raw.shape)

# Combined Elliptical morphologies 5 to 8 in order to fit one physical model to all of them; 
# 38,926 galaxies in the 5-8 morphologies, out of a total of ~207,000
df_58 = df_normalized.loc[(df_normalized.BulgeMstar_ratio > 0.40) & (df_normalized.BulgeMstar_ratio <= 0.80)] #normalized
df_58_raw = df_not_normalized.loc[(df_not_normalized.BulgeMstar_ratio > 0.40) & (df_not_normalized.BulgeMstar_ratio <= 0.80)] 
# df_58.to_csv('TNG300-SAM_images/v6_TNG300-SAM_Morphologies_definition/df_58_Normalized_as_defined_in_TNG300notebook_v6', index=False)
# df_58_raw.to_csv('TNG300-SAM_images/v6_TNG300-SAM_Morphologies_definition/df_58_Raw_as_defined_in_TNG300notebook_v6', index=False)
print("df_58.shape", df_58.shape)
print("df_58_raw.shape", df_58_raw.shape)

df_1.shape (52053, 38)
df_1_raw.shape (52053, 59)
df_2.shape (64494, 38)
df_2_raw.shape (64494, 59)
df_3.shape (28372, 38)
df_3_raw.shape (28372, 59)
df_4.shape (13582, 38)
df_4_raw.shape (13582, 59)
df_5.shape (13539, 38)
df_5_raw.shape (13539, 59)
df_6.shape (10167, 38)
df_6_raw.shape (10167, 59)
df_7.shape (8367, 38)
df_7_raw.shape (8367, 59)
df_8.shape (6853, 38)
df_8_raw.shape (6853, 59)
df_9.shape (10040, 38)
df_9_raw.shape (10040, 59)
df_14.shape (158501, 38)
df_14_raw.shape (158501, 59)
df_58.shape (38926, 38)
df_58_raw.shape (38926, 59)
