In [1]:
%load_ext line_profiler
import numpy as np
import pandas as pd
from astropy.io import ascii, fits
from scipy.stats import anderson_ksamp, ks_2samp
from halotools.sim_manager import HaloTableCache, CachedHaloCatalog
from halotools.empirical_models import PrebuiltSubhaloModelFactory
import random
import time

In [2]:
n_loops = 1

print n_loops, 'loops'

1 loops


In [3]:
def find_nearest_index(array,value):
    idx = (np.abs(np.array(array)-value)).argmin()

    return idx


def find_nearest_indices(array,value, n):
    idx = (np.abs(np.array(array)-value)).argsort()[:n]

    return idx

def find_nearest_indices_pandas(pd_array,value, n):
    idx = (np.abs(pd_array-value)).argsort()[:n]

    return idx

def find_nearest_indices_for_multiple_values(array,values, n):
    
    idxs = [(np.abs(np.array(array)-value)).argsort()[:n] for value in values]

    return idxs

def find_nearest_indices_pandas(pd_array,value, n):
    nearest_rows = np.abs(np.log10(pd_array['stellar_mass']) - value).sort_values()[:n]

    return pd_array.loc[nearest_rows.index]

In [4]:
#Bol-Planck model
simname = 'bolshoi-planck'
halo_finder = 'rockstar'
version_name = 'bplanck_dwarfs'
ptcl_version_name='bplanck_dwarfs_downsampled2'
ptcl_version_name='bplanck_dwarfs'

redshift = 0.278625 #(1/0.78209)-1 ; a=0.78209
Lbox, particle_mass = 250, 1.5e8

#read in halocat
cache = HaloTableCache()
halocat = CachedHaloCatalog(simname = simname, halo_finder = halo_finder,
                            version_name = version_name, redshift = redshift, ptcl_version_name=ptcl_version_name) # doctest: +SKIP

print(halocat.redshift) # doctest: +SKIP
print(halocat.ptcl_version_name) # doctest: +SKIP


model = PrebuiltSubhaloModelFactory('behroozi10', redshift=redshift,
                                    scatter_abscissa=[12, 15], scatter_ordinates=[0.4, 0.2])
model.populate_mock(halocat)

0.2786
bplanck_dwarfs


In [5]:
#COSMOS data
dwarf_sample_file = '/Users/fardila/Documents/GitHub/dwarf_lensing/Data/cosmos/dwarf_sample_for_paper.fits'

dwarf_sample_data  = fits.open(dwarf_sample_file)[1].data
dwarf_masses = dwarf_sample_data['mass_med']

In [7]:
min(dwarf_masses)

7.73324

In [8]:
max(dwarf_masses)

8.99998

In [6]:
#galaxy table limited to mass range of dwarfs
mock_galaxies = model.mock.galaxy_table
mock_galaxies = mock_galaxies['galid', 'x', 'y', 'z', 'stellar_mass']
mock_galaxies = mock_galaxies[(np.log10(mock_galaxies['stellar_mass'])>=min(dwarf_masses)) & \
                              (np.log10(mock_galaxies['stellar_mass'])<9.0)]

In [165]:
init_time=time.time()

# create subsample with same distribution
subsample=[]
copy_mock_galaxies = [list(a) for a in mock_galaxies]
gal_masses=np.array([np.log10(g[-1]) for g in copy_mock_galaxies])

#number of times to sample COSMOS catalog
n_loops = 1

for dwarf in dwarf_masses:

    #reduce search space
    gal_masses_reduced = gal_masses[(gal_masses<dwarf+0.1) & \
                          (gal_masses>dwarf-0.1)]


    #find index of mock galaxy with mass closes to dwarf
    indices = find_nearest_indices(gal_masses_reduced, dwarf, n=n_loops)

    # append to subsample
    subsample += [copy_mock_galaxies[i] for i in indices]

    #do not replace in array of mock galaxy masses
#         del copy_mock_galaxies[index]
    gal_masses = np.delete(gal_masses, indices)

subsample_masses= np.array([np.log10(s[-1]) for s in subsample])

print 'subsample: ' + str(len(subsample))
print 'total mock galaxies: ' + str(len(mock_galaxies))
print 'remaining mock galaxies: ' + str(len(copy_mock_galaxies))

#check that they are indistinguishable
#anderson_ksamp([subsample,dwarf_masses])
print ks_2samp(subsample_masses,dwarf_masses)

# #save subsample
# outfile='/Users/fardila/Documents/GitHub/dwarf_lensing/bplanck_dwarfs_'+str(n_loops)+'.npy'
# np.save(outfile,subsample)

print time.time() - init_time, ' seconds'

subsample: 343800
total mock galaxies: 1190786
remaining mock galaxies: 1190786
Ks_2sampResult(statistic=0.356803374054683, pvalue=0.0)
187.736755848  seconds


In [229]:
# gal_masses=np.array([np.log10(g[-1]) for g in mock_galaxies])
gal_masses=np.log10(mock_galaxies['stellar_mass'])
def create_dwarf_catalog_with_matched_mass_distribution(dwarf_masses, mock_galaxies, gal_masses, n_loops =1):
    
    subsample=[]

    # n_loops: number of times to sample COSMOS catalog
    

    for dwarf in dwarf_masses:

        #reduce search space
        gal_masses_reduced = gal_masses[(gal_masses<dwarf+0.001) & \
                              (gal_masses>dwarf-0.001)]


        #find index of mock galaxy with mass closes to dwarf
        indices = find_nearest_indices(gal_masses_reduced, dwarf, n=n_loops)

        # append to subsample
        subsample += [mock_galaxies[i] for i in indices]

        #do not replace in array of mock galaxy masses
    #         del copy_mock_galaxies[index]
#         gal_masses = np.delete(gal_masses, indices)
        
    return subsample

In [169]:
subsample_masses= np.array([np.log10(s[-1]) for s in subsample])

print 'subsample: ' + str(len(subsample))
print 'total mock galaxies: ' + str(len(mock_galaxies))
print 'remaining mock galaxies: ' + str(len(copy_mock_galaxies))

#check that they are indistinguishable
#anderson_ksamp([subsample,dwarf_masses])
print ks_2samp(subsample_masses,dwarf_masses)

subsample: 343800
total mock galaxies: 1190786
remaining mock galaxies: 1190786
Ks_2sampResult(statistic=0.356803374054683, pvalue=0.0)


In [230]:
%lprun -f create_dwarf_catalog_with_matched_mass_distribution create_dwarf_catalog_with_matched_mass_distribution(dwarf_masses, mock_galaxies, gal_masses, n_loops =1)

In [305]:
%lprun -f create_dwarf_catalog_with_matched_mass_distribution2 create_dwarf_catalog_with_matched_mass_distribution2(dwarf_masses, np.array(mock_galaxies), n_loops =1)


In [7]:
def create_dwarf_catalog_with_matched_mass_distribution2(dwarf_masses, mock_galaxies, n_loops = 1):
    
    subsample=[]
    # n_loops: number of times to sample COSMOS catalog
    
    mock_galaxies_pd = pd.DataFrame(mock_galaxies.as_array())
   
    for dwarf in dwarf_masses[:500]:   

        #reduce search space
        gal_masses=np.log10(mock_galaxies_pd['stellar_mass'])
        mock_galaxies_reduced = mock_galaxies_pd[(gal_masses<dwarf+0.001) & \
                              (gal_masses>dwarf-0.001)]


        #find index of mock galaxy with mass closes to dwarf
        matched_galaxies = find_nearest_indices_pandas(mock_galaxies_reduced,dwarf, n=n_loops)

        # append to subsample
        subsample += [matched_galaxies]

        #do not replace in array of mock galaxy masses
#         mock_galaxies_pd.drop(matched_galaxies.index)
        
        mock_galaxies_pd['stellar_mass'].loc[matched_galaxies.index] = 0
    
#         exclusion_mask = mock_galaxies_pd.index.isin(matched_galaxies.index)
#         mock_galaxies_pd = mock_galaxies_pd[~exclusion_mask]

    return subsample

In [62]:
a = create_dwarf_catalog_with_matched_mass_distribution2(dwarf_masses, mock_galaxies, n_loops = 50)

In [85]:
%lprun -f create_dwarf_catalog_with_matched_mass_distribution2 create_dwarf_catalog_with_matched_mass_distribution2(dwarf_masses, mock_galaxies, n_loops = 50)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [86]:
mock_galaxies

galid,x,y,z,stellar_mass
int64,float32,float32,float32,float32
15,35.9465,42.8824,18.21089,756989700.0
47,36.10529,43.19333,18.27194,184989900.0
51,36.71859,42.76758,18.07091,62423230.0
59,35.3761,43.28727,18.47236,69486370.0
80,35.27654,43.45219,18.13028,728674750.0
90,35.16328,42.64965,17.97724,119375730.0
154,35.28469,43.02029,18.01919,87116850.0
170,36.96092,42.4537,18.58637,173842610.0
174,36.0785,43.09545,18.29132,59855444.0
176,35.47102,43.3781,18.2542,90208610.0


In [87]:
import random

In [92]:
random.sample(np.arange(len(mock_galaxies)),500000)

[15362,
 113109,
 409974,
 544464,
 473197,
 184260,
 1661,
 719013,
 66748,
 729779,
 1008675,
 911193,
 765805,
 32581,
 184131,
 1160261,
 720642,
 642577,
 540302,
 471951,
 521059,
 923445,
 136560,
 884612,
 310900,
 992392,
 1148885,
 1148806,
 416559,
 216495,
 319974,
 379831,
 1018806,
 1004751,
 520753,
 496054,
 1021733,
 737716,
 727346,
 469081,
 238247,
 556257,
 843990,
 779231,
 745863,
 937347,
 945918,
 179084,
 335693,
 583452,
 321501,
 145457,
 202241,
 276708,
 767789,
 848085,
 617943,
 166522,
 100334,
 339245,
 990669,
 310578,
 859637,
 653012,
 719000,
 695472,
 409942,
 731941,
 199700,
 130233,
 224442,
 610812,
 88098,
 452152,
 711577,
 253780,
 1000564,
 269924,
 810823,
 927394,
 881453,
 1102551,
 932928,
 403808,
 162784,
 168213,
 97064,
 123568,
 111103,
 196106,
 1088730,
 489380,
 1104354,
 773569,
 487823,
 1177654,
 974252,
 1087657,
 180340,
 515168,
 75118,
 402022,
 91453,
 439418,
 789185,
 71867,
 432600,
 763877,
 636772,
 300693,
 720556

In [8]:
mock_galaxies = mock_galaxies[random.sample(np.arange(len(mock_galaxies)),500000)]

In [9]:
%lprun -f create_dwarf_catalog_with_matched_mass_distribution2 create_dwarf_catalog_with_matched_mass_distribution2(dwarf_masses, mock_galaxies, n_loops = 50)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [11]:
dir()

['CachedHaloCatalog',
 'HaloTableCache',
 'In',
 'Lbox',
 'Out',
 'PrebuiltSubhaloModelFactory',
 '_',
 '_10',
 '__',
 '___',
 '__builtin__',
 '__builtins__',
 '__doc__',
 '__name__',
 '__package__',
 '_dh',
 '_i',
 '_i1',
 '_i10',
 '_i11',
 '_i2',
 '_i3',
 '_i4',
 '_i5',
 '_i6',
 '_i7',
 '_i8',
 '_i9',
 '_ih',
 '_ii',
 '_iii',
 '_oh',
 '_sh',
 'anderson_ksamp',
 'ascii',
 'cache',
 'create_dwarf_catalog_with_matched_mass_distribution2',
 'dwarf_masses',
 'dwarf_sample_data',
 'dwarf_sample_file',
 'exit',
 'find_nearest_index',
 'find_nearest_indices',
 'find_nearest_indices_for_multiple_values',
 'find_nearest_indices_pandas',
 'fits',
 'get_ipython',
 'halo_finder',
 'halocat',
 'ks_2samp',
 'mock_galaxies',
 'model',
 'n_loops',
 'np',
 'particle_mass',
 'pd',
 'ptcl_version_name',
 'quit',
 'random',
 'redshift',
 'simname',
 'time',
 'version_name']