In [None]:
from h3 import h3
import numpy as np
import cv2
import glob
from matplotlib import pyplot as plt
import os
import pandas as pd
from osgeo import gdal
from osgeo import osr
import geopandas as gpd
from shapely.geometry import Point, Polygon, LineString
def imshow(image, show_axes = False, quiet = False):
    """Draw an OpenCV image with matplotlib.

    Parameters
    ----------
    image : array
        Image array. A 3-dimensional array is converted with
        ``COLOR_BGR2RGB``; any other array is converted with
        ``COLOR_GRAY2RGB``.
    show_axes : bool
        When False (default) the axes and tick marks are hidden.
    quiet : bool
        When True, ``plt.show()`` is not called.
    """
    # matplotlib expects RGB. A BGR image needs its channels reordered, and a
    # grayscale image is expanded to RGB so it is not drawn with a colormap.
    is_colour = len(image.shape) == 3
    conversion = cv2.COLOR_BGR2RGB if is_colour else cv2.COLOR_GRAY2RGB
    plt.imshow(cv2.cvtColor(image, conversion))
    if not show_axes:
        # It is an image, not a chart: hide axes and tick marks
        plt.axis('off')
    if not quiet:
        # Make sure it outputs
        plt.show()

# Goal: H3 aggregation of the modern videos
1. Plot a time-series chart of the observed people.
2. Aggregate to ground level per frame and per H3 cell (count of people):
`sum(count_of_people_per_frame)/total_frames`
3. Build a table of total observed people by frame, year, and H3 cell.

In [None]:
# Video groups used in the study (same naming as the `video_group_update` column).
selvideogroups = [
    'Chestnut20100519-083343',
    'Downtown20100521-115755',
    'Met20100612-120118',
    'bryant_park20081008-141944',
]

In [None]:
from tqdm import tqdm
# Input/output folders used throughout the notebook.
outputfolder = "../../_data/05_tracking_result_projected/step0_attr_prj"
frame_folder = "../../_data/02_siteplan/sample_frames/current_sample"
tiff_folder = "../../_data/02_siteplan/geo_tiff"
ref_folder = "../../_data/02_siteplan/gcp_pt/"
# export data by location
staginging_folder2 = "../../_data/05_tracking_result_projected/step1_speed_vector/current"
# makedirs also creates missing parent folders and does nothing when the
# folder already exists, so the cell is safe to re-run (os.mkdir raises when
# the parent folder is absent).
os.makedirs(staginging_folder2, exist_ok=True)
frames = os.listdir(frame_folder)
refs = os.listdir(ref_folder)

# Time-of-day label for each video group.
# A location can have several groups, shot in the morning or at noon.
timedict = {
    'Chestnut20100519-083343': "morning",
    'Downtown20100521-074701': "morning",
    'Downtown20100521-115755': "noon",
    'Met20100612-082221': "morning",
    'Met20100612-112553': "noon",
    'Met20100612-120118': "noon",
    'bryant_park20081008-072238': "morning",
    'bryant_park20081008-141944': "noon",
}

def get_frame_num(time_str, fps = 29.97002997002997):
    """Convert an elapsed-time label into a frame index.

    The label is read positionally: the text before the first space is taken,
    its first three characters are skipped, and the rest is split on ":" into
    minutes and seconds, e.g. ``"12:05:30 AM"`` -> 5 min 30 s.
    NOTE(review): the leading field is ignored; presumably it is always the
    "12" of a "12:MM:SS AM" label — confirm no label runs past one hour.

    Parameters
    ----------
    time_str : str
        Time label such as ``"12:05:30 AM"``.
    fps : float
        Frames per second used for the conversion.

    Returns
    -------
    int
        ``minute*60*fps + second*fps`` truncated to an integer, or 0 when the
        label is missing (e.g. NaN/None) or cannot be parsed.
    """
    try:
        time = time_str.split(" ")[0][3:].split(":")
        minute = int(time[0])
        second = int(time[1])
        frame = minute*60*fps + second*fps
        return int(frame)
    except Exception:
        # Best effort: an unusable label maps to frame 0. Catching Exception
        # rather than everything keeps KeyboardInterrupt/SystemExit working.
        return 0

In [None]:
# Video metadata table.
videopath = pd.read_csv("../../_data/00_raw/_video_meta/video_path_0509.csv")
# video_id = file name without its extension
videopath['video_id'] = videopath['video_name'].apply(lambda x: x.split(".")[0])
# group key = first word of the location + the video group label
videopath['video_group_update'] = videopath['video_location'].apply(lambda x: x.split(" ")[0])+videopath['video_group']

# videopath_sel = videopath[videopath['scene'].isin([2,3])]
# keep only the videos that have a reference path
videopath_sel = videopath[~videopath['ref_path'].isna()].reset_index(drop = True)
# a missing start label means "start at the beginning"
videopath_sel['first_effective_time'] = videopath_sel['first_effective_time'].fillna("12:00:00 AM")
videopath_sel['frame_start'] = videopath_sel['first_effective_time'].apply(get_frame_num)
# get_frame_num returns 0 (never NaN) for a missing label, so a fillna on its
# result can never fire. Choose on the raw label instead: parsed frame where
# the label exists, `length` otherwise.
# NOTE(review): assumes `length` is expressed in frames — TODO confirm.
videopath_sel['frame_end'] = videopath_sel['last_effective_time'].apply(get_frame_num).where(
    videopath_sel['last_effective_time'].notna(), videopath_sel['length'])
videopath_sel['ref_epsg'] = videopath_sel['ref_epsg'].astype(int)
# date part of the group start, combined with the section's own start time
videopath_sel['video_date'] = videopath_sel['video_group_started_at'].apply(lambda x: x.split(" ")[0])
videopath_sel['video_section_started_at'] = pd.to_datetime(videopath_sel['video_date']+ " " +videopath_sel['video_section_started_at'])
videopath_sel.columns

# 1. Load full hex locations (overlapping with the historical ones)

In [None]:
# Folder with per-video tracking CSVs; `load_video` reads `{videoname}.csv` from
# `stagingfolder` (the variable is reassigned in a later cell before that happens).
stagingfolder = "../../_data/05_tracking_result_projected/step1_speed_vector"
# Folder with the site-plan shapefiles.
# NOTE(review): absolute, machine-specific path; most other folders here are relative.
foldershp = '/Users/yuan/Dropbox (MIT)/whyte_CV/_data/02_siteplan/site_plan_geometry'

In [None]:
# load all shapes
foldershp = '/Users/yuan/Dropbox (MIT)/whyte_CV/_data/02_siteplan/site_plan_geometry'
# Shapefiles to load, in a fixed order. The former directory listing was
# overwritten by this literal straight away, so it has been removed.
geofiles = [
    'chestnutstreet.shp',
    'downtowncrossing.shp',
    'bryant_park1.shp',
    'MET.shp',
]


def geth3(zones, res):
    """Cover the convex hull of `zones` with H3 cells at resolution `res`.

    Parameters
    ----------
    zones : GeoDataFrame
        Geometries to cover; reprojected to EPSG:4326 and dissolved first.
    res : int
        H3 resolution passed to ``h3.polyfill``.

    Returns
    -------
    GeoDataFrame
        EPSG:4326 frame indexed by cell id, with a ``geometry`` column (cell
        boundary polygon) and a ``hex_id`` column (the cell id).
    """
    # Merge all features into one geometry (in lon/lat) and take its convex hull.
    dissolved = zones.to_crs("EPSG:4326").dissolve()
    hull = dissolved.convex_hull[0].__geo_interface__
    # Keep only the first two ordinates of every vertex of the outer ring.
    hull["coordinates"] = (tuple((pt[0], pt[1]) for pt in hull["coordinates"][0]),)

    cell_ids = h3.polyfill(hull, res, geo_json_conformant = True)

    def cell_polygon(cell_id):
        # Boundary of one cell as a shapely polygon.
        return Polygon(h3.h3_to_geo_boundary(cell_id, geo_json=True))

    cell_geoms = gpd.GeoSeries(
        [cell_polygon(cell_id) for cell_id in cell_ids],
        index=cell_ids,
        crs="EPSG:4326",
    )

    return gpd.GeoDataFrame(
        {"geometry": cell_geoms, "hex_id": cell_geoms.index},
        crs=cell_geoms.crs,
    )

# Display name for each shapefile stem. An explicit mapping replaces the former
# positional zip against `unique()`, which would silently shift every label if
# one shapefile contributed no rows.
loc_map2 = {
    'chestnutstreet': 'Chestnut Street',
    'downtowncrossing': 'Downtown Crossing',
    'bryant_park1': 'Bryant Park',
    'MET': 'MET',
}

# Build the resolution-15 H3 grid for every site plan.
shp_ls = []
for f in geofiles:
    shp = gpd.read_file(os.path.join(foldershp, f))
    shp = shp.to_crs(epsg=4326)
    df_h3 = geth3(shp, 15)
    df_h3['video_location'] = f.split(".")[0]
    # geth3 fills the convex hull; keep only the cells that intersect the site geometry.
    # NOTE(review): newer geopandas releases rename `op=` to `predicate=` — switch when upgrading.
    df_h3 = gpd.sjoin(df_h3, shp[['geometry']], how="inner", op='intersects')
    shp_ls.append(df_h3)

shpdf = pd.concat(shp_ls).reset_index(drop = True)
shpdf['location_name'] = shpdf['video_location'].apply(lambda x: loc_map2[x])
shpdf.to_file(os.path.join(foldershp, "final_h3_15_four_city.geojson"), driver = "GeoJSON")

In [None]:
# NOTE(review): this path starts with "../" while the other data paths use "../../";
# `h3folder` is reassigned further down before it is used — confirm which one is intended.
h3folder = "../_data/05_tracking_result_projected/step5_agg_h3"
# stagingfolder = "../../_data/05_tracking_result_projected/step1_speed_vector"
# Folder `load_video` reads the per-video tracking CSVs from.
stagingfolder = "../../_data/05_tracking_result_projected/step1_speed_vector_05"
# NOTE(review): absolute, machine-specific path.
foldershp = '/Users/yuan/Dropbox (MIT)/whyte_CV/_data/02_siteplan/site_plan_geometry'
# Reload the hexagon grid that the grid-building cell wrote to GeoJSON.
shpdf = gpd.read_file(os.path.join(foldershp, "final_h3_15_four_city.geojson"))
shpdf.head()

# 2. Load predicted results

In [None]:
# aggregate unique number of people appeared for each frame on a time series
# load a sample video first
def load_video(videoname):
    """Load the tracking CSV of one video and attach timing/location columns.

    Reads ``{videoname}.csv`` from ``stagingfolder`` (this data already holds
    the speed update), drops rows whose ``frame_id`` is not greater than the
    video's ``frame_start``, and adds ``timestamp``, ``video_group`` and
    ``video_location`` from the matching row of ``videopath_sel``.

    Raises IndexError when ``videoname`` has no row in ``videopath_sel``.
    """
    # Metadata row(s) of this video, looked up once.
    meta = videopath_sel[videopath_sel['video_id']==videoname]
    loc_name = meta['video_location'].values[0]
    video_start_frame = meta['frame_start'].values[0]
    video_start_at = meta['video_section_started_at'].values[0]
    video_group = meta['video_group'].values[0]

    tracks = pd.read_csv(os.path.join(stagingfolder, f"{videoname}.csv"))
    tracks = tracks[tracks['frame_id']>video_start_frame].reset_index(drop = True)
    # Wall-clock time = section start + frame offset at 29.97 frames per second.
    elapsed = tracks['frame_id'].apply(lambda x: pd.Timedelta(seconds = x/29.97))
    tracks['timestamp'] = video_start_at + elapsed
    tracks['video_group'] = video_group
    tracks['video_location'] = loc_name
    return tracks


In [None]:
# Video ids to leave out of the loading loop.
finished = [
    '20100612-120118b01',
    '20100612-120118b02',
    '20100612-120118b03',
    '20100612-120118b04',
]
# Every remaining video id listed in the metadata.
videols = [x for x in videopath_sel['video_id'].unique().tolist() if x not in finished]


In [None]:

print("Total to process: ", len(videols))
# Load every video; a failing video is reported and skipped (best effort).
fullGDF = []
for videoname in tqdm(videols):
    try:
        # load_video already sets video_location, so it is not assigned again here
        traceGDF = load_video(videoname)
        fullGDF.append(traceGDF)
    except Exception as err:
        # Exception (not a bare except) so the loop can still be interrupted,
        # and the reason for the failure is shown.
        print(videoname, ": failed", repr(err))
fullGDF = pd.concat(fullGDF).reset_index(drop = True)
# fullGDF.head()

In [None]:
# Drop the last per-video frame left over from the loading loop and reclaim memory.
import gc

del traceGDF
gc.collect()

In [None]:

# calendar date of each observation
fullGDF['date'] = fullGDF['timestamp'].apply(lambda x: x.date())
fullGDF['video_group_update'] = fullGDF['video_location'].apply(lambda x: x.split(" ")[0])+fullGDF['video_group']
# seconds elapsed since the earliest observation of each video group
fullGDF['second_from_start'] = fullGDF['timestamp'].apply(lambda x: x.hour*3600 + x.minute*60 + x.second)
fullGDF['second_start'] = fullGDF.groupby(['video_group_update'])['second_from_start'].transform('min')
fullGDF['second_from_start'] = fullGDF['second_from_start'] - fullGDF['second_start']
# this group has no entry in `timedict`, so it is removed before the lookup below
fullGDF = fullGDF[fullGDF['video_group_update']!='Met20100612-124938'].reset_index(drop = True)
fullGDF['time_group'] = fullGDF['video_group_update'].apply(lambda x: timedict[x])
# resolution-15 H3 cell of each observation
fullGDF['hex_id'] = fullGDF.apply(lambda x: h3.geo_to_h3(x['lat'], x['lon'], 15), axis = 1)
# True when the cell belongs to the site grid. `isin` gives the same result as
# the former per-row `x in list(...)`, which rebuilt the list for every row.
fullGDF['inside'] = fullGDF['hex_id'].isin(shpdf['hex_id'])

In [None]:
# Columns to export, in this order; those absent from fullGDF are skipped.
selcols = [
    'track_id', 'frame_id', 'side', 'glasses', 'hat',
    'hold_objects_in_front', 'bag', 'upper', 'lower', 'boots',
    'loc_x', 'loc_y', 'x_3857', 'y_3857', 'video_id',
    'moving_x', 'moving_y', 'lat', 'lon', 'lat_moving', 'lon_moving',
    'gender', 'age', 'timestamp', 'video_group', 'videoname',
    'move_m_0.5s', 'speed_0.5s', 'dist_x_0.5s', 'dist_y_0.5s',
    'speed_x_0.5s', 'speed_y_0.5s', 'video_location', 'date',
    'video_group_update', 'second_from_start', 'second_start',
    'time_group', 'hex_id', 'inside', 'track_id_backup',
]
exportcols = [x for x in selcols if x in fullGDF.columns]
# One CSV per video group: "<video_group_update>_full.csv"
for video_group in fullGDF['video_group_update'].unique():
    temp = fullGDF.loc[fullGDF['video_group_update']==video_group].reset_index(drop = True)
    temp[exportcols].to_csv(
        os.path.join(staginging_folder2, f"{video_group}_full.csv"),
        index = False,
    )

In [None]:
# Video groups whose exported "<group>_full.csv" files are reloaded below.
selvideogroups = [
    'Chestnut20100519-083343',
    'Downtown20100521-115755',
    'Met20100612-120118',
    'bryant_park20081008-141944',
]

In [None]:
# load from results above directly: one "<group>_full.csv" per selected group
fullGDF = pd.concat(
    [
        pd.read_csv(os.path.join(staginging_folder2, f"{video_group}_full.csv"))
        for video_group in tqdm(selvideogroups)
    ]
).reset_index(drop = True)

# 3. Overall summary

In [None]:
# Plot colour for each location display name.
color_dict = {
    'Bryant Park': '#f77189',
    'Chestnut Street': '#97a431',
    'Downtown Crossing': '#36ada4',
    'MET': '#a48cf4',
}
# Raw `video_location` label -> location display name.
namedict = {
    'Chestnut Street videos (NEW)': 'Chestnut Street',
    'Downtown Crossing videos (NEW)': 'Downtown Crossing',
    'Met Steps videos (NEW)': 'MET',
    'bryant_park': 'Bryant Park',
}

In [None]:
# count h3 hexagon by each location
shpagg = (
    shpdf.groupby('video_location')['hex_id']
    .nunique()
    .reset_index()
    .rename(columns = {'hex_id':'hexagon_count'})
)
# NOTE(review): 0.889 is presumably the area of one resolution-15 cell — confirm value and unit.
shpagg['area'] = shpagg['hexagon_count']*0.889
# 'bryant_park1' -> 'bryant_park', then title-case every label
shpagg['video_location'] = shpagg['video_location'].replace('bryant_park1', 'bryant_park').str.title()

In [None]:
fps = 29.97
# Per video (inside the grid only): distinct tracks, total detections, distinct frames.
summary_video = (
    fullGDF[fullGDF['inside']==True]
    .groupby(['video_group_update','video_id'])
    .agg(
        pedestrian_count_unique = ('track_id', 'nunique'),
        pedestrian_count_total = ('track_id', 'count'),
        frame_count = ('frame_id', 'nunique'),
    )
    .reset_index()
)
# whole seconds covered by the counted frames, then minutes
summary_video['second_count'] = (summary_video['frame_count']/fps).astype(int)
summary_video['minute_count'] = summary_video['second_count']/60

# summary_video['pedestrian_per_min'] = summary_video['pedestrian_count']/summary_video['minute_count']
# Totals per video group.
summary_loc = (
    summary_video
    .groupby('video_group_update')[
        ['pedestrian_count_unique', 'pedestrian_count_total', 'minute_count', 'frame_count']
    ]
    .sum()
    .reset_index()
)
summary_loc['pedestrian_per_min'] = summary_loc['pedestrian_count_unique']/summary_loc['minute_count']
summary_loc['time_group'] = [timedict[g] for g in summary_loc['video_group_update']]
# drop the trailing 15 characters of the group key and title-case what is left
summary_loc['video_location'] = summary_loc['video_group_update'].str[:-15].str.title()
# summary_loc['area'] = [136.017, 388.493, 443.611, 241.808]
# summary_loc = summary_loc.merge(shpagg, on = 'video_location', how = 'left')
summary_loc



In [None]:
# NOTE(review): absolute, machine-specific output folder.
overallsum_f = '/Users/yuan/Dropbox (MIT)/whyte_CV/_data/10_clean/01_timeseries'
# Export the per-group summary, leaving out group 'Met20100612-112553'.
(
    summary_loc[summary_loc['video_group_update']!='Met20100612-112553']
    .to_csv(os.path.join(overallsum_f, 'pedestrian_overall_2010s.csv'), index = False)
)

# 4. Summary time series

In [None]:
# Distinct pedestrians per frame, counting only detections inside the grid.
pedes_temp = fullGDF[fullGDF['inside']==True].groupby(['video_location',
                              'video_group_update',
                              'video_id','date',
                              'second_from_start',
                              'timestamp',
                              'frame_id']).agg({
                                  'track_id': 'nunique'}).reset_index().rename(columns = {'track_id':'pedestrian_count'}).sort_values(
                                      ['video_id','timestamp']
                                  )
pedes_temp['time_group'] = pedes_temp['video_group_update'].apply(lambda x: timedict[x])

# Rolling mean of the per-frame count over a `window`-second span.
# The span in rows is int(window*fps); the former window*int(fps) truncated
# fps to 29 and shortened the 60 s window to 1740 rows.
# NOTE(review): the window counts rows, and a frame only has a row when at
# least one pedestrian was inside, so sparse footage covers more than `window` seconds.
window = 60
pedes_temp[f'pedestrian_count_rolling_{window}s'] = pedes_temp.groupby(['video_group_update'])['pedestrian_count']\
    .transform(lambda x: x.rolling(int(window*fps),1).mean())

# Average the rolling value within each minute since the start of the group.
pedes_temp['minute_from_start'] = pedes_temp['second_from_start'].apply(lambda x: int(x//60))
pedes_temp_min = pedes_temp.groupby(['time_group',
                                     'video_location',
                                     'video_group_update',
                                     'minute_from_start'])[f'pedestrian_count_rolling_{window}s'].mean().reset_index()
pedes_temp_min.head()

In [None]:
# Smooth the per-minute series with a trailing rolling mean over 10 rows
# (min_periods=1) within each video group. The source column name is built
# from `window`, matching how the column was created.
pedes_temp_min['pedestrian_count_rolling_10m'] = \
    pedes_temp_min.groupby(['video_group_update'])[f'pedestrian_count_rolling_{window}s'].transform(lambda x: x.rolling(10, 1).mean())


In [None]:
# Video groups shown in the chart.
vizsel = [
    'Downtown20100521-115755',
    # 'Met20100612-112553',
    'Chestnut20100519-083343',
    'Met20100612-120118',
    'bryant_park20081008-141944',
]
# Display name of each location, then keep only the selected groups.
pedes_temp_min['location_name'] = [namedict[loc] for loc in pedes_temp_min['video_location']]
vizdata = pedes_temp_min.loc[pedes_temp_min['video_group_update'].isin(vizsel)].reset_index(drop = True)
# vizdata.columns

In [None]:
import seaborn as sns
# Set the theme before the figure exists: axes take their style when they are
# created, so setting it afterwards only affected the next run of this cell.
sns.set_theme(style="whitegrid")
fig, ax = plt.subplots(figsize=(15, 5))
# 10-minute rolling pedestrian count per minute, one line per location.
sns.pointplot(data = vizdata, 
             x="minute_from_start", 
             y="pedestrian_count_rolling_10m", 
             hue='location_name', 
             ci = 95,
             palette=color_dict, 
             linewidth=2.5, 
             scale = 0.5,
             ax = ax)
# add xlabel
ax.set_xlabel("Minute from start",fontsize=15)
ax.set_ylabel("Observed pedestrian",fontsize=15)
# put legend outside the plot
ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
# one tick every 10 minutes
ax.set_xticks(np.arange(0, pedes_temp['minute_from_start'].max(), 10))

In [None]:
# save the data for visualization cross time
cleanfolder = '../../_data/10_clean/01_timeseries'
pedes_temp_min.to_csv(
    os.path.join(cleanfolder, 'pedestrian_per_min_2010s.csv'),
    index = False,
)

# 5. H3 aggregation per frame

In [None]:
# Output folder for the per-video, per-frame H3 counts.
h3folder = "../../_data/05_tracking_result_projected/step5_agg_h3_frame"

# Every video id present in `videopath_sel`.
videols = videopath_sel['video_id'].unique().tolist()
# videols = ['20100612-120118b01',
#  '20100612-120118b02',
#  '20100612-120118b03',
#  '20100612-120118b04']

In [None]:

# video_todo = [x for x in videols if '20100612-120118' in x]
video_todo = videols.copy()
res = 15
# Create the output folder up front; without it every to_csv below fails and
# each video is merely reported as failed.
os.makedirs(h3folder, exist_ok=True)
# Step 1: per video, count distinct tracks per (H3 cell, frame) and save to CSV.
for videoname in tqdm(video_todo):
    try:
        loc_name = videopath_sel[videopath_sel['video_id']==videoname]['video_location'].values[0]
        destfolder = os.path.join(outputfolder, loc_name)
        traceGDF = pd.read_csv(os.path.join(destfolder, f"{videoname}_projected.csv"))

        traceGDF[f"h3_{res}"] = traceGDF.apply(lambda row: h3.geo_to_h3(row["lat"], row["lon"], res), axis = 1)
        countpeople = traceGDF.groupby([f"h3_{res}","frame_id"])["track_id"].nunique().reset_index()
        countpeople = countpeople.rename(columns = {"track_id":"pedestrian_count"})
        countpeople.to_csv(os.path.join(h3folder, f"{videoname}_h3_frame.csv"), index = False)
    except Exception as err:
        # best effort: report the video and the reason, then continue
        print(videoname, ": failed", repr(err))

# Step 2: read the per-video files back into one table.
h3folder = "../../_data/05_tracking_result_projected/step5_agg_h3_frame"
videols = videopath_sel['video_id'].unique().tolist()
countdf = []
for videoname in tqdm(videols):
    try:
        countpeople = pd.read_csv(os.path.join(h3folder, f"{videoname}_h3_frame.csv"))
        countpeople['videoname'] = videoname
        countdf.append(countpeople)
    except Exception as err:
        # name the video whose file could not be read instead of an anonymous "Error"
        print(videoname, ": failed to load", repr(err))
countdf = pd.concat(countdf).reset_index(drop = True)
countdf.head()

In [None]:
# video_id -> raw location label, taken from the metadata table
location_map = dict(zip(videopath_sel['video_id'], videopath_sel['video_location']))
countdf['video_location'] = [location_map[name] for name in countdf['videoname']]
# first word of the location + the video name
countdf['video_group_update'] = countdf['video_location'].str.split(" ").str[0] + countdf['videoname']
countdf = countdf.rename(columns = {'track_id':'pedestrian_count'})

## Load the shapefile and construct all valid H3 cells so that cells with zero observations are kept

In [None]:
# Inspect the raw location labels present in `countdf`.
countdf['video_location'].unique()

In [None]:
# Raw location label -> display name, taken from the explicit `namedict`
# lookup. The former positional zip against `unique()` depended on the row
# order of the metadata CSV and could mislabel locations silently; a label
# missing from `namedict` now raises KeyError instead.
loc_map3 = {loc: namedict[loc] for loc in countdf['video_location'].unique()}
countdf['location_name'] = countdf['video_location'].apply(lambda x: loc_map3[x])
countdf.head()

In [None]:
# videols = countdf['videoname'].unique()
# Videos to expand to the full hexagon x frame grid.
videols = [
    '20100612-120118b01',
    '20100612-120118b02',
    '20100612-120118b03',
    '20100612-120118b04',
]

In [None]:

# For every video, expand the observed counts to the full (hexagon x frame) grid
# of its location so that hexagons with nobody in a frame get an explicit 0.
# Only frames that appear in `temp` (i.e. with at least one counted row) are part of the grid.
for videoname in videols:
    temp = countdf[countdf['videoname']==videoname].reset_index(drop = True).rename(columns = {'h3_15':'hex_id'})
    loc = temp['location_name'].unique()[0]
    # hexagons of this video's location
    tempshp = shpdf[shpdf['location_name']==loc].reset_index(drop = True)
    # construct a dataframe with each frame and hex_id
    frame_shp = tempshp[['hex_id']].merge(temp[['frame_id']].drop_duplicates(), how = "cross")
    print(frame_shp.shape)
    # Left join keeps every grid row; unmatched rows are NaN and then set to 0.
    # NOTE(review): fillna(0) applies to every column, so the text columns of
    # unmatched rows (videoname, video_location, ...) are written as 0 too.
    mdf = frame_shp[['hex_id','frame_id']].merge(temp,how = "left", on = ["hex_id", "frame_id"]).fillna(0)
    mdf.to_csv(os.path.join(h3folder, f"{videoname}_h3_frame_full.csv"), index = False)
    # fulldf_hx.append(mdf)


In [None]:
# only select the videos used in the study
# group key = video_group_update without its last three characters
countdf['video_group'] = countdf['video_group_update'].str[:-3]
countdf = countdf.loc[countdf['video_group'].isin(selvideogroups)].reset_index(drop = True)
# countdf.shape

In [None]:
# Load the full dataset and aggregate by each hex_id
h3folder = "../../_data/05_tracking_result_projected/step5_agg_h3_frame"
h3_clean = "../../_data/10_clean/02_h3_agg"

# NOTE(review): `allloc` is not used below; the loop only processes "MET"
# with a hard-coded list of its videos.
allloc = ['Bryant Park', 'Chestnut Street',
       'Downtown Crossing', 'MET']
for location_name in ["MET"]:
    # videols = countdf[countdf['location_name']==location_name]['videoname'].unique().tolist()
    videols = ['20100612-120118b01',
 '20100612-120118b02',
 '20100612-120118b03',
 '20100612-120118b04']
    aggdf = []
    for videoname in tqdm(videols):
        # per-video grid written by the expansion cell ("<video>_h3_frame_full.csv")
        filename = f"{videoname}_h3_frame_full.csv"
        temp = pd.read_csv(os.path.join(h3folder, filename))
        # per hexagon: total pedestrian count and number of distinct frames in this video
        temp_summary = temp.groupby('hex_id').agg({
            'pedestrian_count':'sum',
            'frame_id':'nunique'
        }).reset_index().rename(columns = {'frame_id':'frame_count',
                                        'pedestrian_count':'pedestrian_count_sum'})
        aggdf.append(temp_summary)
    # combine the videos: add up counts and frame totals per hexagon
    aggdf = pd.concat(aggdf).reset_index(drop = True)
    aggdf = aggdf.groupby('hex_id').agg({
        'pedestrian_count_sum':'sum',
        'frame_count':'sum'
    }).reset_index()
    # average number of pedestrians per frame in each hexagon
    aggdf['pedestrian_count_frame'] = aggdf['pedestrian_count_sum']/aggdf['frame_count']
    aggdf['location_name'] = location_name
    # aggdf.to_csv(os.path.join(h3_clean, f"{location_name}_h3_agg.csv"), index = False)
    # merge the geometry and check result
    # NOTE(review): if shpdf also has a location_name column (it does when built by
    # this notebook), the merge yields location_name_x / location_name_y.
    viz = shpdf.merge(aggdf, on = "hex_id", how = "inner")
    viz.plot(column = "pedestrian_count_frame", legend = True)
    # save with geometry (GeoJSON) and without (CSV)
    viz.to_file(os.path.join(h3folder, f"{location_name}_2010s_h3_agg.geojson"), driver = "GeoJSON")
    viz.drop("geometry", axis = 1).to_csv(os.path.join(h3folder, f"{location_name}_2010s_h3_agg.csv"), index = False)


In [None]:
# Stack the per-location H3 aggregates saved as "<location>_2010s_h3_agg.geojson".
overall = pd.concat(
    [
        gpd.read_file(os.path.join(h3folder, f"{location_name}_2010s_h3_agg.geojson"))
        for location_name in ['Bryant Park', 'Chestnut Street', 'Downtown Crossing', 'MET']
    ]
).reset_index(drop = True)

In [None]:
# distinct track_id per location and hexagon
fullGDF_agg_sec = (
    fullGDF
    .groupby(['video_location', 'hex_id'])['track_id']
    .nunique()
    .reset_index(name = 'pedestrian_unique_count')
)
# largest second_from_start per location
fullGDF_time = (
    fullGDF
    .groupby('video_location')['second_from_start']
    .max()
    .reset_index(name = 'total_second')
)
fullGDF_agg_sec = fullGDF_agg_sec.merge(fullGDF_time, on = "video_location", how = "left")

# attach both figures to the per-hexagon aggregates and keep a single location_name column
gdfsummary = (
    overall
    .merge(fullGDF_agg_sec[['hex_id', 'pedestrian_unique_count', 'total_second']],
           on = ["hex_id"], how = "left")
    .drop(columns = 'location_name_y')
    .rename(columns = {'location_name_x':'location_name'})
)
gdfsummary.head()

In [None]:
import seaborn as sns
# Distinct pedestrians per hexagon against the average count per frame.
sns.scatterplot(data = gdfsummary, x = "pedestrian_unique_count", y = "pedestrian_count_frame")

In [None]:
# Save the combined per-hexagon summary as GeoJSON.
gdfsummary.to_file(os.path.join(h3_clean, "2010s_h3_agg.geojson"), driver = "GeoJSON")


# Plot timeseries change of people observed (Archived)

In [None]:
# section number = last two characters of the video id
videopath_sel['video_section'] = videopath_sel['video_id'].str[-2:]
# drop ids ending in "T)", which have no numeric section suffix
videopath_sel = videopath_sel.loc[videopath_sel['video_section']!="T)"].reset_index(drop= True)
videopath_sel['video_section'] = videopath_sel['video_section'].astype(int)
videopath_sel['video_section'].unique()

In [None]:
videopath_sel = videopath_sel.sort_values(["video_location", "video_section"], ascending = True)
# text before the first "b" of the file name, e.g. "20100612-120118b01..." -> "20100612-120118"
videopath_sel['starting_time'] = videopath_sel['video_name'].apply(lambda x: x.split("b")[0])
# parse with an explicit format rather than relying on format inference
videopath_sel['starting_time'] = pd.to_datetime(videopath_sel['starting_time'], format = "%Y%m%d-%H%M%S")
# the starting_time is only valid for the first video in each location


# Compare Gender Distribution among four videos

In [None]:
# Individual analysis, use overall data per place to generate
import seaborn as sns
# one sample video per location
videols = ["20081008-141944b02", "20100519-083343b03", "20100521-074701b03", "20100612-120118b01"]
for videoname in videols:
    loc_name = videopath_sel.loc[videopath_sel['video_id']==videoname, 'video_location'].values[0]
    print(f"{loc_name = }")
    destfolder = os.path.join(outputfolder, loc_name)
    traceGDF = pd.read_csv(os.path.join(destfolder, f"{videoname}_projected.csv"))
    # distinct tracks per gender label
    gendersummary = traceGDF.groupby("gender")["track_id"].nunique().reset_index()

    fig, ax = plt.subplots(figsize = (5,5))
    sns.barplot(data = gendersummary, x = "gender", y = "track_id",
                palette = ["#ef5c43", "#3bc0cf"], ax = ax)
    sns.despine()
    ax.set_xlabel("Gender")
    ax.set_ylabel("Number of pedestrians")
    # light dashed horizontal guide lines
    ax.grid(axis = "y", color = "grey", linestyle = "--", linewidth = 0.5)
    ax.xaxis.set_ticks_position('bottom')
    ax.yaxis.set_ticks_position('left')
    ax.tick_params(axis = 'x', rotation = 0)
    plt.xticks(fontsize = 10)