# Data Visualization exploration

In [None]:
###### load packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# from sklearn.preprocessing import binarize
from sklearn.preprocessing import MinMaxScaler # min/max scaling
import pickle
import seaborn as sns
from matplotlib.colors import LogNorm, Normalize
from mapseq_fxns import * # import all functions from this script
# from math import log10
from scipy import stats



# keep random sampling consistent between runs
import random



%matplotlib inline


In [None]:

# Input/output folders; both live under the same combined-analysis root.
analysis_root = '/Volumes/Data/Emily/MAPseq/combined_analysis_M194_M220/'
in_path = analysis_root + 'processed_data/'   # pickled inputs are read from here
out_path = analysis_root + 'output/'          # destination used by the (commented-out) savefig calls

# import metadata
from M194_M220_metadata import *

# import colormaps
from colormaps import blue_cmp, orange_cmp


# binarized data

In [None]:

#### load the binarized OMC data
# NOTE: pickle.load can execute arbitrary code from the file; only open pickles you trust.
omc_bin_file = in_path + 'M194_M220_OMC_bin4.pkl'
with open(omc_bin_file, 'rb') as f:
    omc_bin = pickle.load(f)

    


In [None]:
# Separate each sample's cells by cell type, then group the samples by species.

def _cells_of_type(labelled, type_code):
    """Rows of `labelled` whose 'type' equals `type_code`, minus the 'type' column, re-indexed from 0."""
    subset = labelled[labelled['type'] == type_code]
    return subset.drop(["type"], axis=1).reset_index(drop=True)


# Per-sample frames: everything (with 'type' labels), PT only (type 1000), IT only (type 10).
omc_type, omc_pt, omc_it = [], [], []
for i in range(num_samples):
    labelled = sort_by_celltype(omc_bin[i])
    omc_type.append(labelled)
    omc_pt.append(_cells_of_type(labelled, 1000))
    omc_it.append(_cells_of_type(labelled, 10))

# Species buckets: *_mm receives samples whose species is 'MMus', *_st receives every other sample.
omc_mm, omc_st = [], []
omc_pt_mm, omc_pt_st = [], []
omc_it_mm, omc_it_st = [], []
mice_mm, mice_st = [], []
dataset_mm, dataset_st = [], []

for i in range(num_samples):
    if metadata.species[i] == 'MMus':
        buckets = (omc_mm, omc_pt_mm, omc_it_mm, mice_mm, dataset_mm)
    else:
        buckets = (omc_st, omc_pt_st, omc_it_st, mice_st, dataset_st)
    entries = (omc_type[i], omc_pt[i], omc_it[i], mice[i], dataset[i])
    for bucket, entry in zip(buckets, entries):
        bucket.append(entry)

# Pool all cells per species (original per-sample row labels are kept, so labels repeat).
omc_mm_all = pd.concat(omc_mm)
omc_st_all = pd.concat(omc_st)


In [None]:


# Run clean_up_data over every per-sample IT/PT frame, replacing the list entries in place.
# Re-running this cell applies clean_up_data again to already-cleaned frames.
for frames in (omc_it_mm, omc_pt_mm, omc_it_st, omc_pt_st):
    frames[:] = [clean_up_data(frame) for frame in frames]

# Pool the cleaned frames per cell type and species.
omc_it_mm_all = pd.concat(omc_it_mm)
omc_it_st_all = pd.concat(omc_it_st)

omc_pt_mm_all = pd.concat(omc_pt_mm)
omc_pt_st_all = pd.concat(omc_pt_st)

In [None]:
# The integer cell-type codes are recoded to fractions so the 'type' column
# can be visualized on the heatmap alongside the other columns.

def _recode_type(type_col):
    """Replace type codes 1000 -> 0.75, 100 -> 0.5, 10 -> 0.25; any other value passes through."""
    recoded = type_col
    for code, shade in ((1000, 0.75), (100, 0.5), (10, 0.25)):
        recoded = np.where(recoded == code, shade, recoded)
    return recoded


# MMus cells
mm_type_col = omc_mm_all['type']
mm_replace = _recode_type(mm_type_col)
omc_mm_all['type'] = mm_replace

# STeg cells
st_type_col = omc_st_all['type']
st_replace = _recode_type(st_type_col)
omc_st_all['type'] = st_replace

for pooled in (omc_mm_all, omc_st_all):
    print(pooled.shape)
omc_mm_all.columns

In [None]:
# Heatmap of all pooled MMus cells, rows ordered by the recoded 'type' column.
# (Append .sample(1000, random_state=10) to omc_mm_all to plot a random subset instead.)
plot = omc_mm_all.sort_values(by='type').reset_index(drop=True)

sns.heatmap(data=plot, cmap=blue_cmp, cbar=False)
# plt.gca().get_yaxis().set_visible(False)
# plt.savefig(out_path+"mm_single_heatmap.jpeg", dpi=300)


In [None]:
# Heatmap of all pooled STeg cells, rows ordered by the recoded 'type' column.
# (Append .sample(1000, random_state=10) to omc_st_all to plot a random subset instead.)
plot = omc_st_all.sort_values(by='type').reset_index(drop=True)

sns.heatmap(data=plot, cmap=orange_cmp, cbar=False)
# plt.gca().get_yaxis().set_visible(False)
# Use a STeg-specific file name so the mouse figure is not overwritten:
# plt.savefig(out_path+"st_single_heatmap.jpeg", dpi=300)


# count data


In [None]:
#### load the OMC count data
# NOTE: pickle.load can execute arbitrary code from the file; only open pickles you trust.
omc_count_file = in_path + 'M194_M220_OMC_count4.pkl'
with open(omc_count_file, 'rb') as f:
    omc_count = pickle.load(f)

In [None]:
# Apply clean_up_data to every sample's count matrix (rebinding omc_count),
# then preview the first sample. Re-running this cell cleans the cleaned data again.
omc_count = list(map(clean_up_data, omc_count))

omc_count[0]

In [None]:
# Flatten each sample's count matrix into a single 1-D array, zeros included,
# so the distribution of count values can be histogrammed per sample.
# (The stray literal `1` that used to end this cell only produced a meaningless output.)
omc_values = [omc_count[i].values.flatten() for i in range(num_samples)]


In [None]:
# Per-sample histograms of the flattened count values, zeros included.
# The 2x6 grid has 12 panels, so entries 0..11 of omc_values and mice are used.
fig, axs = plt.subplots(2, 6, figsize=(25, 9))
for i, ax in enumerate(axs.flat):
    sns.histplot(omc_values[i], bins=50, legend=False, ax=ax)
    ax.set_title(mice[i])
fig.suptitle("With zeros")

In [None]:
# Per-sample histograms of the flattened count values, keeping only non-zero entries.
# The 2x6 grid has 12 panels, so entries 0..11 of omc_values and mice are used.
fig, axs = plt.subplots(2, 6, figsize=(25, 9))
for i, ax in enumerate(axs.flat):
    sample_vals = omc_values[i]
    plot = sample_vals[sample_vals.nonzero()]  # drop the zeros
    sns.histplot(plot, bins=50, legend=False, ax=ax)
    ax.set_title(mice[i])
fig.suptitle("Without zeros")



In [None]:
# Put every sample on a common scale: divide by that sample's largest count, then
# multiply by 1000 so the numbers stay readable.
# Per the original note every minimum is 0, so min/max scaling
# (x - xmin) / (xmax - xmin) reduces to x / xmax.
# Zeros are part of the data being scaled (the original note flags this as an open question).
omc_scaled = []
for i in range(num_samples):
    counts = omc_count[i]
    max_c = counts.values.max()  # largest value in this brain's matrix
    omc_scaled.append(counts / max_c * 1000)

In [None]:
# Per-sample histograms of the scaled values, keeping only non-zero entries.
# The 2x6 grid has 12 panels, so entries 0..11 of omc_scaled and mice are used.
fig, axs = plt.subplots(2, 6, figsize=(25, 9))
for i, ax in enumerate(axs.flat):
    vals = omc_scaled[i].values.flatten()
    plot = vals[vals.nonzero()]  # drop the zeros
    sns.histplot(plot, bins=50, legend=False, ax=ax)
    ax.set_title(mice[i])
fig.suptitle("Scaled - Without zeros")



# combine datasets and plot
Not perfect yet: the samples may also need to be scaled by their median value (open question), but decided to move on for now.

In [None]:
# Regroup the scaled count matrices by species and pool each group.
# NOTE: this rebinds omc_mm / omc_st / omc_mm_all / omc_st_all, which the
# binarized-data cells assigned earlier in the notebook.
omc_mm = [omc_scaled[i] for i in range(num_samples) if species[i] == "MMus"]
omc_st = [omc_scaled[i] for i in range(num_samples) if species[i] == "STeg"]

omc_mm_all = pd.concat(omc_mm)
omc_st_all = pd.concat(omc_st)

In [None]:
# One log-scaled heatmap per species for the pooled, scaled counts.
for pooled, species_cmap in ((omc_mm_all, blue_cmp), (omc_st_all, orange_cmp)):
    sns.heatmap(pooled, cmap=species_cmap, norm=LogNorm())
    plt.show()

In [None]:
# Order the pooled cells with sort_by_celltype (using "BS" in the PT area list),
# then redraw the log-scaled heatmaps.
pt_areas_bs = ["AMY","SNr","SCm","PG","PAG","BS"]
omc_mm_sort = sort_by_celltype(omc_mm_all, pt_areas=pt_areas_bs)
omc_st_sort = sort_by_celltype(omc_st_all, pt_areas=pt_areas_bs)
for sorted_cells, species_cmap in ((omc_mm_sort, blue_cmp), (omc_st_sort, orange_cmp)):
    sns.heatmap(sorted_cells, cmap=species_cmap, norm=LogNorm())
    plt.show()


In [None]:
# Sorted heatmap for each individual sample: blue colormap for MMus, orange otherwise.
# The 2x6 grid has 12 panels, so entries 0..11 of omc_scaled, species and mice are used.
fig, axs = plt.subplots(2, 6, figsize=(30, 10))
for i, ax in enumerate(axs.flat):
    sort_df = sort_by_celltype(omc_scaled[i], pt_areas=["AMY","SNr","SCm","PG","PAG","BS"])
    sample_cmap = blue_cmp if species[i] == "MMus" else orange_cmp
    sns.heatmap(sort_df, cmap=sample_cmap, norm=LogNorm(), ax=ax)
    ax.set_title(mice[i])
plt.suptitle("sorted heatmaps")
plt.show()

# covariance

In [None]:


# Covariance matrix (DataFrame.cov) of each sample's scaled counts:
# blue colormap for MMus, orange otherwise.
# NOTE(review): covariances can be negative while LogNorm only handles positive
# values - confirm how such cells are rendered.
fig, axs = plt.subplots(2, 6, figsize=(30, 10))
for i, ax in enumerate(axs.flat):
    sample_cov = omc_scaled[i].cov()
    sample_cmap = blue_cmp if species[i] == "MMus" else orange_cmp
    sns.heatmap(sample_cov, cmap=sample_cmap, norm=LogNorm(), ax=ax)
    ax.set_title(mice[i])
plt.suptitle("Covariance matrices", size=24)
plt.show()

In [None]:
# Covariance matrices of the pooled cells, one panel per species.
# NOTE(review): covariances can be negative while LogNorm only handles positive
# values - confirm how such cells are rendered.
fig, axs = plt.subplots(1, 2, figsize=(15, 5))

panels = (
    (omc_mm_all, blue_cmp, "MMus all"),
    (omc_st_all, orange_cmp, "STeg all"),
)
for ax, (pooled, species_cmap, panel_title) in zip(axs, panels):
    sns.heatmap(pooled.cov(), cmap=species_cmap, norm=LogNorm(), ax=ax)
    ax.set_title(panel_title)

plt.suptitle("Covariance Matrix on Counts", size=24)
plt.show()


# PCA

In [None]:
# Compare the shape of the sorted frame with the unsorted pooled frame.
for frame in (omc_mm_sort, omc_mm_all):
    print(frame.shape)

In [None]:
# Display omc_st_all for inspection.
omc_st_all

In [None]:
def sort_by_celltype(proj, it_areas=["OMCc", "AUD", "STR"], ct_areas=["TH"], pt_areas=["AMY","SNr","SCm","PG","PAG","RN"]):
    """
    Label every cell (row) of a projection matrix with one of 3 major cell types
    and return the rows grouped by type:
    - IT = intratelencephalic, type = 10: every cell that is neither PT nor CT
    - CT = corticothalamic, type = 100: non-PT cell with any value > 0 summed over ct_areas
    - PT = pyramidal tract, type = 1000: cell with any value > 0 summed over pt_areas

    Parameters
    - proj: DataFrame, one row per cell and one column per area. It is not modified.
      Its index may contain repeated labels (e.g. the result of pd.concat without
      ignore_index); rows are selected by position, never by label.
    - it_areas: columns used to order the IT cells (descending).
    - ct_areas: columns that define and order the CT cells (descending).
    - pt_areas: columns that define the PT cells. PT cells are ordered by the
      'PAG' and 'AMY' columns (descending), which must exist in proj.

    Returns a new DataFrame with a fresh 0..n-1 index and an added 'type' column,
    rows ordered IT, then CT, then PT. Every input row appears exactly once.

    default areas:
    - it_areas=["OMCc", "AUD", "STR"]
    - ct_areas=["TH"]
    - pt_areas=["AMY","SNr","SCm","PG","PAG","RN"]
    """
    ds = proj
    it_cols = list(it_areas)
    ct_cols = list(ct_areas)
    pt_cols = list(pt_areas)

    # Boolean masks are converted to plain arrays so selection is purely positional.
    # Selecting/dropping by index label would duplicate or discard rows whenever
    # labels repeat in proj.

    # 1. isolate PT cells: any projection to a PT area
    is_pt = (ds[pt_cols].sum(axis=1) > 0).to_numpy()
    ds_pt = ds[is_pt].sort_values(['PAG','AMY'], ascending=False)
    ds_pt['type'] = 1000

    # 2. remaining non-PT cells
    ds_npt = ds[~is_pt]

    # 3. CT cells: non-PT cells projecting to any CT area
    is_ct = (ds_npt[ct_cols].sum(axis=1) > 0).to_numpy()
    ds_ct = ds_npt[is_ct].sort_values(ct_cols, ascending=False)
    ds_ct['type'] = 100

    # 4. IT cells: everything left over (non-PT, non-CT)
    ds_it = ds_npt[~is_ct].sort_values(it_cols, ascending=False)
    ds_it['type'] = 10

    # 5. stack IT, CT, PT and renumber the rows
    return pd.concat([ds_it, ds_ct, ds_pt], ignore_index=True)

In [None]:
# Sort the pooled STeg cells again (with "BS" in the PT area list) and display the result.
st_pt_areas = ["AMY", "SNr", "SCm", "PG", "PAG", "BS"]
omc_st_sort = sort_by_celltype(omc_st_all, pt_areas=st_pt_areas)
omc_st_sort

In [None]:
# Sort each sample on its own, then pool the sorted samples by species
# (MMus samples go to sorted_mm, every other sample to sorted_st).
sorted_mm, sorted_st = [], []
for i in range(num_samples):
    int_sort = sort_by_celltype(omc_scaled[i], pt_areas=["AMY", "SNr", "SCm", "PG", "PAG", "BS"])
    (sorted_mm if species[i] == "MMus" else sorted_st).append(int_sort)

sorted_mm_all = pd.concat(sorted_mm)
sorted_st_all = pd.concat(sorted_st)
for pooled in (sorted_mm_all, sorted_st_all):
    print(pooled.shape)

In [None]:
# Column labels 'C0' ... 'C11' for the principal-component table.
names = ['C' + str(k) for k in range(12)]
names

In [None]:
from sklearn.decomposition import PCA

# PCA on the pooled, sorted MMus cells. The 'type' label is split off first so
# it is not used as a feature.
pca = PCA()
mm_type = sorted_mm_all['type']
mm_plot = sorted_mm_all.drop("type", axis=1)
components = pca.fit_transform(mm_plot)
print(mm_plot.shape)
print(components.shape)

# One column per principal component, labelled C0, C1, ...
# The labels follow the actual number of components returned, so the table no
# longer breaks when that number is not 12.
component_names = ['C' + str(k) for k in range(components.shape[1])]
plot = pd.DataFrame(components, columns=component_names)
# NOTE(review): sorted_mm_all comes from pd.concat without ignore_index, so its
# index labels repeat; attach the labels by position rather than by index:
# plot['type'] = mm_type.to_numpy()
plot
# sns.scatterplot(plot, x="C0", y="C1", hue="type")

# clustering?
