In [None]:
# Step the working directory up one level; presumably so that the relative
# "data/" and "figures/" paths used further down resolve — TODO confirm.
# NOTE(review): the path is relative, so every re-run of this cell climbs
# one more directory level.
import os
os.chdir("..")

In [None]:
import dask.dataframe as dd
import dask.array as da
from matplotlib import pyplot as plt


In [None]:
# Open the "alldata" key of data/allhosts4.h5 as a dask DataFrame, read in
# pieces of 100000 rows (chunksize). Nothing is computed until .compute().
df = dd.read_hdf("data/allhosts4.h5", key="alldata", chunksize=100000)

In [None]:
def getRmin(df):
	"""Return the smallest ``r3d`` inside the projected selection window.

	The window keeps rows with ``rproj < 10`` and ``|vproj| < 3``.

	Parameters
	----------
	df : DataFrame with at least the columns "rproj", "vproj" and "r3d"
		(used below on each partition via ``map_partitions``).

	Returns
	-------
	The minimum of ``r3d`` over the selected rows; NaN when no row falls
	inside the window.
	"""
	# Plain Series operators: no dependency on numpy being imported yet.
	mask = (df["rproj"] < 10) & (df["vproj"].abs() < 3)
	# The function (and the dask cell further down) is about R*min*; the
	# previous implementation returned the maximum instead.
	return df.loc[mask, "r3d"].min()


In [None]:
# Apply getRmin to every partition of the dask DataFrame and materialise
# the per-partition results.
result = df.map_partitions(getRmin).compute()

In [None]:
# One point per entry of `result`, plotted against its position.
# range() is used instead of np.arange: numpy is only imported much further
# down, so the np-based call failed with NameError on a fresh kernel.
plt.scatter(range(len(result)), result)

In [None]:
# Same selection window as getRmin, but built lazily on the whole dask frame:
# rproj below 10 and |vproj| below 3.
inside_rproj = df["rproj"] < 10
inside_vproj = da.abs(df["vproj"]) < 3
mask = inside_rproj & inside_vproj
within_window = df.loc[mask]
# Lazy reduction; evaluated in the next cell with .compute().
Rmin = within_window["r3d"].min()

In [None]:
# Evaluate the lazy Rmin reduction defined above and display its value.
Rmin.compute()

In [None]:
# Render the task graph behind Rmin to tmp.png.
Rmin.visualize(filename="tmp.png")

In [None]:
# Keep only the columns used by the rest of the notebook, then restrict to
# objects with r3d below 30.
keep_columns = ["r3d","rproj","vproj","Vradnorm","spin","galrow","hostrow"]
relevant = df[keep_columns]
closer_than_30 = relevant["r3d"] < 30
midranged = relevant.loc[closer_than_30]

In [None]:
# Persist the reduced table to data/midranged.h5 under the key "galaxies".
# mode="w" starts from a fresh file rather than appending to an existing one.
midranged.to_hdf("data/midranged.h5", key="galaxies", mode="w")

In [None]:
import h5py
import pandas as pd

In [None]:
# Reload the reduced table with pandas. From here on `df` is an in-memory
# pandas DataFrame and no longer the dask DataFrame of the same name.
df = pd.read_hdf("data/midranged.h5", key="galaxies")

In [None]:
# Display the DataFrame that was just loaded.
df

In [None]:
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sb
import numpy as np
from numpy.lib import recfunctions
from matplotlib.patches import Rectangle
import h5py

In [None]:
# Relative folders for input data and saved figures; both are prepended to
# file names by plain string concatenation, hence the trailing slash.
# NOTE(review): nothing in this notebook creates imagefolder — confirm it
# exists before the savefig calls run.
datafolder = "data/"
imagefolder = "figures/"

In [None]:
# Single pass over the text catalogue that
#   * takes the column names from the first line (dropping a leading "#"
#     token if present), and
#   * counts the data rows, i.e. lines whose first token is not "#".
# Blank lines are skipped instead of raising IndexError on split()[0].
names = []
tot_lines = 0
with open(datafolder+"allhosts4.dat") as infile:
	for lineno, line in enumerate(infile):
		fields = line.split()
		if lineno == 0:
			names = fields[1:] if fields[:1] == ["#"] else fields
		if fields and fields[0] != "#":
			tot_lines += 1
if not names:
	# Previously an empty file left `names` unbound (NameError) and a blank
	# first line raised IndexError; fail with an explicit message instead.
	raise ValueError(datafolder+"allhosts4.dat has no header line with column names")
# Every column is stored as a 64-bit float in a structured dtype.
thedtype = np.dtype([(name, "f8") for name in names])

In [None]:
# Number of data rows counted in the text catalogue.
tot_lines

In [None]:
# Structured dtype built from the header names (all fields "f8").
thedtype

In [None]:
# Peek at the first line of the text catalogue: print its first
# whitespace-separated token, then the raw line. Prints nothing for an
# empty file.
with open(datafolder+"allhosts4.dat") as infile:
	first_line = next(infile, None)
	if first_line is not None:
		tokens = first_line.split()
		print(tokens[0])
		print(first_line)

In [None]:
# Stream the text catalogue into the "alldata" dataset of newallhost4.h5.
# The dataset is pre-allocated with tot_lines rows of dtype thedtype, stored
# in gzip-compressed chunks, and filled chunksize rows at a time so the
# whole file never has to sit in memory as Python objects.
with open(datafolder+"allhosts4.dat") as infile:
	# "a" keeps any other content of the HDF5 file; a previous "alldata"
	# dataset is removed so it can be recreated.
	with h5py.File(datafolder+"newallhost4.h5", "a") as outfile:
		if "alldata" in outfile:
			del outfile["alldata"]
		chunksize = 5000
		dataset = outfile.create_dataset("alldata", shape=(tot_lines,), dtype=thedtype, chunks=(chunksize,), compression="gzip", compression_opts=9)
		rows = []
		idx = 0  # next free row in the dataset
		for i,line in enumerate(infile):
			fields = line.split()
			# Skip blank lines (previously an IndexError) and "#" comment/header lines.
			if not fields or fields[0] == "#":
				continue
			rows.append(tuple(float(val) for val in fields))
			if len(rows) == chunksize:
				dataset[idx:idx+chunksize] = np.array(rows, dtype=thedtype)
				idx += chunksize
				rows = []
			if i % 10000 == 0:
				# Progress is based on the raw line index, so it is approximate.
				print(f"{i*100/tot_lines:.2f}%", end="\r", flush=True)
		# Flush the final, partially filled chunk.
		if rows:
			dataset[idx:idx+len(rows)] = np.array(rows, dtype=thedtype)
			idx += len(rows)

In [None]:
# Derived columns on df:
#   R3Dbins  - r3d cut into bins of width `binswidth`, labelled by bin centre
#   vradcorr - Vradnorm minus r3d/10 (the same linear relation drawn as
#              "Hubble flow" further down) plus r3d**-0.5
binswidth = 1
r3d = df["r3d"]
# The small epsilon makes sure the last edge lies beyond the maximum r3d.
bins = np.arange(0, r3d.max()+binswidth+1e-6, binswidth)
bin_centres = (bins[1:]+bins[:-1])/2
df["R3Dbins"] = pd.cut(r3d, bins=bins, labels=bin_centres)
df["vradcorr"] = df["Vradnorm"] - 1/10*r3d + r3d**(-0.5)

In [None]:
# Display df, now including the R3Dbins and vradcorr columns.
df

### Check spins are all equally represented

In [None]:
# Histogram of the "spin" column with unit-wide bins whose edges run from
# 0.5 to 50.5, saved as spins.png.
fig, ax = plt.subplots()
spin_edges = np.arange(0.5,51)
ax.hist(df["spin"], bins=spin_edges, color="black", edgecolor="white")
ax.set(title="Point of View", xlabel="Direction", ylabel="Counts", xlim=[0,51])
fig.tight_layout()
fig.savefig(imagefolder+"spins.png")

### Check mass distribution

In [None]:
# Count how often each distinct value occurs in `masses` and scatter the
# counts against the values.
# NOTE(review): `masses` is not defined anywhere in the visible notebook, so
# this cell raises NameError on Restart & Run All — confirm where it should
# come from.
values,counts = np.unique(masses, return_counts=True)
print(f"There are {len(values)} clusters")
plt.scatter(values, counts)
plt.xlabel("Cluster mass")
plt.ylabel("Counts")
### Check distributions of input data

In [None]:
# Three 2D histograms (100 bins each) of the pairwise combinations of
# rproj, vproj and r3d, side by side; saved as 2d_distributions.png.
fig, axs = plt.subplots(1, 3, figsize=(12, 4))
column_pairs = [("rproj", "vproj"), ("r3d", "vproj"), ("rproj", "r3d")]
for panel, (xcol, ycol) in zip(axs, column_pairs):
	sb.histplot(data=df, x=xcol, y=ycol, bins=100, ax=panel)
fig.tight_layout()
fig.savefig(imagefolder+"2d_distributions.png")

In [None]:
# Normalised 1D histograms (50 bins) of rproj, vproj and r3d, one panel
# each; saved as marginals.png.
fig, axs = plt.subplots(1, 3, figsize=(12,4))

# (column name, LaTeX symbol used in the axis labels)
panels = [
	("rproj", r"r_\mathrm{proj}"),
	("vproj", r"v_\mathrm{proj}"),
	("r3d", r"R_\mathrm{3D}"),
]

fig.suptitle("Marginal distributions conditioning variables")
for ax, (column, symbol) in zip(axs, panels):
	df[column].hist(bins=50, density=True, alpha=0.7, ax=ax)
	ax.set_xlabel(f"${symbol}$")
	ax.set_ylabel(f"$P({symbol})$")
	ax.set_ylim(bottom=0)
	# Series.hist switches the grid on; turn it back off.
	ax.grid(False)
# Only the r3d panel gets its x-axis pinned to start at zero.
axs[2].set_xlim(left=0)
fig.tight_layout()
fig.savefig(imagefolder+"marginals.png")

### Checking Hubble Flow deviation

In [None]:
# 2D histogram of Vradnorm against r3d with two guides on top: the straight
# line y = r3d/10 (labelled "Hubble flow") and a vertical marker at r3d = 2.
# Saved as hubble_flow.png.
fig, ax = plt.subplots()
sb.histplot(data=df, x="r3d", y="Vradnorm", bins=100, label="data", ax=ax)
radii = np.linspace(0, 30, 101)
flow_style = dict(color="k", linestyle="--", label="Hubble flow")
ax.plot(radii, 1/10*radii, **flow_style)
marker_style = dict(color="red", linestyle=":", label="2 Mpc")
ax.axvline(2, **marker_style)
ax.legend()
fig.tight_layout()
fig.savefig(imagefolder+"hubble_flow.png")

In [None]:
# Horizontal violins of vradcorr, one per R3Dbins category, with a dashed
# reference line at zero; saved as radialvelocitycorrection.png.
fig, ax = plt.subplots(figsize=(4,8))
violin_options = dict(
	orient="h",
	split=True,
	cut=0,
	density_norm="area",
	fill=False,
	palette="dark:#5A9_r",
	inner="quart",
	saturation=0.4,
)
sb.violinplot(data=df, x="vradcorr", y="R3Dbins", ax=ax, **violin_options)
ax.axvline(0, color="darkred", linestyle="--")
ax.set_xlabel("Radial velocity correction")
ax.set_ylabel("3D distance")
fig.tight_layout()
fig.savefig(imagefolder+"radialvelocitycorrection.png")

### Checking output distribution

In [None]:
# Find the finest square binning of the (rproj, vproj) window
# [0, 10] x [-3, 3]: starting from 2 bins per axis, keep refining while
# every cell still holds at least `min_count` objects, and settle on the
# last binning before the first failure (which can be 1 if even 2 bins per
# axis is too fine).
min_count = 200
window = [[0,10],[-3,3]]
n_bins = 2
while True:
    counts,_,_ = np.histogram2d(df["rproj"], df["vproj"], bins=n_bins, range=window)
    print(f"Using {n_bins} bins, with min {counts.min()}", end="\r", flush=True)
    if counts.min() < min_count:
        n_bins -= 1
        break
    n_bins += 1
# Recompute the occupancy for the binning actually selected: `counts` from
# the loop belongs to the rejected, finer binning, so the summary used to
# report a minimum that did not match the printed n_bins.
counts,_,_ = np.histogram2d(df["rproj"], df["vproj"], bins=n_bins, range=window)
print(f"Using {n_bins} bins, with min {counts.min()}")

In [None]:
# Grid of normalised r3d histograms, one panel per (rproj, vproj) cell:
# columns run over rproj bins (left to right, 0 -> 10), rows over vproj bins
# (top to bottom, 3 -> -3). Saved as histograms.png.
# squeeze=False keeps axs two-dimensional even when n_bins == 1.
fig, axs = plt.subplots(n_bins, n_bins, figsize=(n_bins*4,)*2, squeeze=False)

vbins = np.linspace(3, -3, n_bins+1)
rbins = np.linspace(0, 10, n_bins+1)
binwidth = 1
# The column is called "r3d" in df (the old "# r3d" name raised KeyError).
r3dbins = np.arange(0, df["r3d"].max()+binwidth, binwidth)

for i in range(n_bins):
	rmask = np.logical_and(df["rproj"]>=rbins[i], df["rproj"]<=rbins[i+1])
	for j in range(n_bins):
		# vbins is descending, so the lower edge of row j is vbins[j+1].
		vmask = np.logical_and(df["vproj"]>=vbins[j+1], df["vproj"]<=vbins[j])
		subset = df.loc[np.logical_and(rmask, vmask)]
		ax = axs[j,i]
		ax.hist(subset["r3d"], bins=r3dbins, density=True)
		# Axis labels only on the bottom row / left column; ticks hidden elsewhere.
		if j == n_bins - 1:
			ax.set_xlabel(r"$r_\mathrm{3D}$")
		else:
			ax.set_xticks([])
		if i == 0:
			ax.set_ylabel(r"$P(r_\mathrm{3D})$")
		else:
			ax.set_yticks([])
		ax.set_ylim([0,0.2])
		ax.set_xlim(left=0)
		# Bin centres: average both edges (i:i+2); i:i+1 only gave the first edge.
		ax.legend(title=f"r={rbins[i:i+2].mean():.1f}, v={vbins[j:j+2].mean():.1f}")
		print(f"{int(100*(i*n_bins+j)/n_bins**2)}%", flush=True, end="\r")

fig.tight_layout()
fig.savefig(imagefolder+"histograms.png")
plt.close(fig)

In [None]:
# Inspect the r3d bin edges used for the histograms above.
r3dbins

In [None]:
# Same (rproj, vproj) grid of normalised r3d histograms, zoomed in on
# r3d <= r3drange with 50 linear bins. Saved as histograms_cluster.png.
# squeeze=False keeps axs two-dimensional even when n_bins == 1.
fig, axs = plt.subplots(n_bins, n_bins, figsize=(n_bins*4,)*2, squeeze=False)

vbins = np.linspace(3, -3, n_bins+1)
rbins = np.linspace(0, 10, n_bins+1)
r3drange = 5
r3dbins = np.linspace(0, r3drange, 51)

# The column is called "r3d" in df (the old "# r3d" name raised KeyError).
df_slice = df.loc[df["r3d"]<=r3drange]

for i in range(n_bins):
	rmask = np.logical_and(df_slice["rproj"]>=rbins[i], df_slice["rproj"]<=rbins[i+1])
	for j in range(n_bins):
		# vbins is descending, so the lower edge of row j is vbins[j+1].
		vmask = np.logical_and(df_slice["vproj"]>=vbins[j+1], df_slice["vproj"]<=vbins[j])
		subset = df_slice.loc[np.logical_and(rmask, vmask)]
		ax = axs[j,i]
		ax.hist(subset["r3d"], bins=r3dbins, density=True)
		# Axis labels only on the bottom row / left column; ticks hidden elsewhere.
		if j == n_bins - 1:
			ax.set_xlabel(r"$r_\mathrm{3D}$")
		else:
			ax.set_xticks([])
		if i == 0:
			ax.set_ylabel(r"$P(r_\mathrm{3D})$")
		else:
			ax.set_yticks([])
		# The y-axis limits are left automatic in this zoomed view.
		ax.set_xlim([0,r3drange])
		# Bin centres: average both edges (i:i+2); i:i+1 only gave the first edge.
		ax.legend(title=f"r={rbins[i:i+2].mean():.1f}, v={vbins[j:j+2].mean():.1f}")
		print(f"{int(100*(i*n_bins+j)/n_bins**2)}%", flush=True, end="\r")

fig.tight_layout()
fig.savefig(imagefolder+"histograms_cluster.png")
plt.close(fig)

In [None]:
# Same (rproj, vproj) grid of normalised r3d histograms, with 100
# logarithmically spaced bin edges from 0.1 to max(r3d) and a log x-axis.
# Saved as loghistograms.png.
# squeeze=False keeps axs two-dimensional even when n_bins == 1.
fig, axs = plt.subplots(n_bins, n_bins, figsize=(n_bins*4,)*2, squeeze=False)

vbins = np.linspace(3, -3, n_bins+1)
rbins = np.linspace(0, 10, n_bins+1)
# The column is called "r3d" in df (the old "# r3d" name raised KeyError).
r3dbins = np.geomspace(0.1, df["r3d"].max(), 100)

for i in range(n_bins):
	rmask = np.logical_and(df["rproj"]>=rbins[i], df["rproj"]<=rbins[i+1])
	for j in range(n_bins):
		# vbins is descending, so the lower edge of row j is vbins[j+1].
		vmask = np.logical_and(df["vproj"]>=vbins[j+1], df["vproj"]<=vbins[j])
		subset = df.loc[np.logical_and(rmask, vmask)]
		ax = axs[j,i]
		ax.hist(subset["r3d"], bins=r3dbins, density=True)
		ax.set_xscale("log")
		# Axis labels only on the bottom row / left column; ticks hidden elsewhere.
		if j == n_bins - 1:
			ax.set_xlabel(r"$r_\mathrm{3D}$")
		else:
			ax.set_xticks([])
		if i == 0:
			ax.set_ylabel(r"$P(r_\mathrm{3D})$")
		else:
			ax.set_yticks([])
		ax.set_ylim([0,0.2])
		# Bin centres: average both edges (i:i+2); i:i+1 only gave the first edge.
		ax.legend(title=f"r={rbins[i:i+2].mean():.1f}, v={vbins[j:j+2].mean():.1f}")
		print(f"{int(100*(i*n_bins+j)/n_bins**2)}%", flush=True, end="\r")

fig.tight_layout()
fig.savefig(imagefolder+"loghistograms.png")
plt.close(fig)

### Velocity w.r.t. R3D

In [None]:
# 2D histogram of |vproj| against r3d for objects with r3d <= 50.
# The column is called "r3d" in df (the old "# r3d" name raised KeyError).
# NOTE(review): df was loaded from midranged.h5, which was written with an
# r3d < 30 cut, so this <= 50 selection looks like a no-op — confirm.
df_slice = df.loc[df["r3d"]<=50]
sb.histplot(x=df_slice["r3d"], y=np.abs(df_slice["vproj"]), bins=101)