# Create one DataFrame out of the split participant 8

__Data Saved: (same as in 1v but only for participant 8)__

uid == "19a33ac1-149b-407c-a506-a2c7f4f3fea1"
- __Timestamps_try_uid.json__ --> json with the start time subtracted from most behavioral timestamps
- __Timestamps_new_uid.json__ --> json with sorted timestamps (to match ETW)
- __Timestamps_misses_uid.json__ --> all the timestamps not matching ETW (mostly to check for errors)
- __Timestamps_overall_uid.json__ --> one timestream all other streams share
- __Behavior_new_uid.csv__ --> csv with behavioral columns, all with the same timestamps 
- __HitInfo_new_uid_raw.csv__ --> save all HitInfo with the same timestamp (for each timepoint save 30 rows, most are nulls)
- __HitInfo_new_uid.csv__ --> save HitInfo same as before but only rows that are not null
- __HitDistance_new_uid.csv__ --> save for each entry the smallest hit distance 
- __HitsSorted_new_uid.csv__ --> save the closest hits together with more HitInfo (i.e. the HitInfo_new df sorted by distance)
- __Timestamps_overall_uid.json__ --> one timestream all other streams share
- __recordings_village.csv__ --> we save the ids + length of all recordings in one csv (with corrected participant 8)

In [None]:
import copy  # copy big/deep objects by value
import datetime  # datetime operations
import itertools  # operate with iterators
import json  # read/write from/into json format
import os  # OS operations (read/write files/folders)
import uuid
import warnings  # hide warnings
# process parallelization
from multiprocessing import Manager, Pool, RawArray, cpu_count

import matplotlib.pyplot as plt  # mother of plots focr Python
import numpy as np  # array/matrix operations (e.g. linear algebra)
import pandas as pd  # operate with dataframes
import pyxdf  # read XDF files (LSL streams recordings)
import seaborn as sns  # matplotlib plotting nice with shortcuts
from IPython.display import Markdown, display  # print nicely
from tqdm.notebook import tqdm  # mother of progressbars

# from matplotlib.ticker import FormatStrFormatter  # tick formatter

In [None]:
# keep FutureWarning messages out of the notebook output
warnings.simplefilter("ignore", category=FutureWarning)

# folders holding the raw recordings and the processed outputs
PATH_RAW = "./data/raw"
PATH_PROC = "./data/processed"

# explicit dtypes for the boolean columns, passed to pd.read_csv so that it
# does not have to guess them (avoids the dtype guessing warning)
CUSTOM_DTYPES = dict.fromkeys(("valid", "leftBlink", "rightBlink"), "boolean")

def create_concat_df(df1, df2, gap=0.011):
    """
    Concatenate two time-indexed DataFrames into one with a continuous time axis.

    The rows of ``df1`` come first, followed by the rows of ``df2``. The index
    values of ``df1`` are kept as they are; every index value of ``df2`` is
    shifted by ``df1.index[-1] + gap`` and rounded to 3 decimals.

    Parameters
    ----------
    df1, df2 : pd.DataFrame
        Frames whose index holds the timestamps. NOTE(review): the shift only
        yields a gap of exactly ``gap`` if ``df2`` starts at 0 -- confirm
        against the input files.
    gap : float, default 0.011
        Offset added after the last timestamp of ``df1`` before the timestamps
        of ``df2`` continue (presumably about one sample interval -- TODO
        confirm).

    Returns
    -------
    pd.DataFrame
        All rows of ``df1`` and ``df2`` with the combined timestamps as index,
        named ``"new_time"``.

    Raises
    ------
    IndexError
        If ``df1`` has no rows (there is no last timestamp to continue from).
    """
    first_times = df1.index.tolist()
    offset = first_times[-1] + gap
    shifted_times = [round(t + offset, 3) for t in df2.index.tolist()]

    combined = pd.concat([df1, df2], ignore_index=True)
    # Assign the index directly instead of going through a temporary
    # "new_time" column, so a data column with that name is never overwritten.
    combined.index = pd.Index(first_times + shifted_times, name="new_time")

    return combined

In [None]:
# Identifier under which the combined participant-8 data is saved.
# It is hard-coded so that re-running the notebook keeps writing to the same
# files; presumably it was generated once with the commented-out call below.
# new_uid = str(uuid.uuid4()) # only run this once!!
new_uid = "19a33ac1-149b-407c-a506-a2c7f4f3fea1"
new_uid

In [None]:
# uids of the two partial recordings of participant 8, in recording order:
# "08_v_100621.xdf" (uid1) followed by "08_v2_100621.xdf" (uid2)
uid1 = "6aebc6c2-6a6a-4038-b729-fddfdd0418c0"
uid2 = "ee9dac3c-a7e5-48b7-8187-6d9038651352"

# both uids as a list, used to recognise the split recordings
idd = [uid1, uid2]

__Create Combined DataFrames__

In [None]:
# Tables that exist once per partial recording and have to be combined.
# For reference, this is how they were written:
# beh_data.to_csv(f"{PATH_PROC}/Behavior_new_{uid}.csv", index=True)
# hit_data.to_csv(f"{PATH_PROC}/HitInfo_new_{uid}_raw.csv", index=True)
# h_res.to_csv(f"{PATH_PROC}/HitInfo_new_{uid}.csv", index=True)
# hits.to_csv(f"{PATH_PROC}/HitDistance_new_{uid}.csv", index=False)
# hits_sorted.to_csv(f"{PATH_PROC}/HitsSorted_new_{uid}.csv", index=True)
# NOTE(review): HitDistance is listed with index=False above but is read with
# index_col=0 like the others -- confirm its first column holds the timestamps.


def _read_processed(stem, uid, suffix=""):
    """Read ``{PATH_PROC}/{stem}_{uid}{suffix}.csv`` using its first column as index."""
    return pd.read_csv(
        f"{PATH_PROC}/{stem}_{uid}{suffix}.csv", index_col=0, dtype=CUSTOM_DTYPES
    )


# Behavior_new:
beh1 = _read_processed("Behavior_new", uid1)
beh2 = _read_processed("Behavior_new", uid2)
beh_new = create_concat_df(beh1, beh2)

# HitInfo_new..._raw:
hit_raw1 = _read_processed("HitInfo_new", uid1, suffix="_raw")
hit_raw2 = _read_processed("HitInfo_new", uid2, suffix="_raw")
hit_raw_new = create_concat_df(hit_raw1, hit_raw2)

# HitInfo_new:
hit1 = _read_processed("HitInfo_new", uid1)
hit2 = _read_processed("HitInfo_new", uid2)
hit_new = create_concat_df(hit1, hit2)

# HitDistance_new:
hit_dist1 = _read_processed("HitDistance_new", uid1)
hit_dist2 = _read_processed("HitDistance_new", uid2)
hit_dist_new = create_concat_df(hit_dist1, hit_dist2)

# HitsSorted_new:
hit_sort1 = _read_processed("HitsSorted_new", uid1)
hit_sort2 = _read_processed("HitsSorted_new", uid2)
hit_sort_new = create_concat_df(hit_sort1, hit_sort2)



__Check that the created combined DataFrame (hit_raw_new) is correct__

In [None]:
# Sanity check: compare the time span of the combined raw HitInfo table with
# the spans of its two parts.
# NOTE(review): seconds_to_minutes_seconds is neither defined nor imported in
# the visible part of this notebook -- it has to be available in the session
# before this cell runs.
ts = hit_raw_new.index.tolist()
t1 = hit_raw1.index.tolist()
t2 = hit_raw2.index.tolist()
# note the naming: len1 is the span of the SECOND part (t2) ...
len1 = seconds_to_minutes_seconds(t2[-1] - t2[0])
print(len1)
# ... and len2 is the span of the FIRST part (t1)
len2 = seconds_to_minutes_seconds(t1[-1] - t1[0])
print(len2)
# span of the combined table (last minus first timestamp)
length = seconds_to_minutes_seconds(ts[-1] - ts[0])
length

__Save the new DataFrames__

In [None]:
# Write every combined table under the new uid; the combined time axis is
# stored as the index column of each file.
# NOTE(review): the per-recording HitDistance file is listed as written with
# index=False, whereas the combined one is written with index=True -- confirm
# that downstream readers expect this.
combined_outputs = [
    (beh_new, f"Behavior_new_{new_uid}.csv"),
    (hit_raw_new, f"HitInfo_new_{new_uid}_raw.csv"),
    (hit_new, f"HitInfo_new_{new_uid}.csv"),
    (hit_dist_new, f"HitDistance_new_{new_uid}.csv"),
    (hit_sort_new, f"HitsSorted_new_{new_uid}.csv"),
]
for frame, filename in combined_outputs:
    frame.to_csv(f"{PATH_PROC}/{filename}", index=True)

__Create Timestamps_overall__

In [None]:
# Reload the combined behavioral table from disk (presumably so the shared
# time stream is exactly what was stored in the CSV).
behavior_path = f"{PATH_PROC}/Behavior_new_{new_uid}.csv"
beh_new = pd.read_csv(behavior_path, index_col=0, dtype=CUSTOM_DTYPES)

# Its index is the one time stream all other streams share; store it as JSON.
times_overall = beh_new.index.tolist()
overall_path = f"{PATH_PROC}/Timestamps_overall_{new_uid}.json"
with open(overall_path, "w") as f:
    json.dump(times_overall, f, indent=4)

__Adjust the recording information__

In [None]:
# Load the previous recordings overview, using its "new_id" column as index.
recordings = pd.read_csv("./recordings_village_old.csv", index_col="new_id")
# index labels in file order; iterated further below to rebuild the table
ids = recordings.index.tolist()
recordings

In [None]:
# Time span of the combined behavioral table (last minus first timestamp).
ts = beh_new.index.tolist()
length = seconds_to_minutes_seconds(ts[-1] - ts[0])

# Recording-info entry for the combined recording, keyed by the new uid.
# The file name, creation time and start value are fixed by hand here.
rec = {
    new_uid: pd.Series(
        {
            "file": "08_v_comb.xdf",
            "created": "2021-06-10 14:12:00",
            "length": length,
            "start": 1.106542e06,
        }
    )
}
print(rec)

In [None]:
# Rebuild the recordings overview: every recording that is not one of the two
# split parts is copied over unchanged, the first part (uid1) is dropped and
# the second part (uid2) is replaced in place by the combined entry.
split_ids = set(idd)  # built once instead of copying the list per iteration
recordings_new = []
idds = []
for i in ids:
    if i not in split_ids:
        idds.append(i)
        recordings_new.append(recordings.loc[i])
    elif i == uid2:  # exact match; a substring test is not what is meant here
        recordings_new.append(rec[new_uid])
        idds.append(new_uid)


recordings_new = pd.DataFrame(recordings_new, index=idds)
# Keep the index name of the input table ("new_id" when read as above) so the
# written CSV has the same index column header as the file it replaces.
recordings_new.index.name = recordings.index.name

# store recordings info as CSV
recordings_new.to_csv("./recordings_village.csv", index=True)
recordings_new