In [None]:
import numpy as np
import cv2
import glob
from matplotlib import pyplot as plt
import os
import pandas as pd

import seaborn as sns
from mpl_toolkits import mplot3d
from shapely.geometry import Point, Polygon, MultiPoint, LineString
from scipy.spatial import ConvexHull, convex_hull_plot_2d
from itertools import combinations

from datetime import datetime, timezone, tzinfo
import geopandas as gpd
from tqdm import tqdm
import random
import networkx as nx
import itertools
from fiona.crs import from_epsg
from sklearn.cluster import DBSCAN
from matplotlib.font_manager import FontProperties
import networkx as nx

# Notebook-wide display configuration.
# `font` is a matplotlib FontProperties object (serif / Times New Roman) that can be
# passed to text artists; it is not applied globally by these lines.
font = FontProperties()
font.set_family('serif')
font.set_name('Times New Roman')
# Show every column when a DataFrame is displayed.
pd.options.display.max_columns = None

sns.set(style="white")
# NOTE(review): the return value is discarded here; presumably `sns.set_palette` was
# intended if the "hls" palette should become the default -- confirm.
sns.color_palette("hls", 4)
sns.set_context("paper", rc={"font.size":10,"axes.titlesize":12,"axes.labelsize":12})
# `Point` is already imported at the top of this cell; this repeat import is redundant.
from shapely.geometry import Point
# CRS string kept as a module-level constant (not referenced in the visible cells).
globalcrs = "EPSG:3857"
def imshow(image, show_axes = False, quiet = False):
    """Display an OpenCV image through matplotlib.

    Parameters
    ----------
    image : array
        A 3-D array is assumed to be BGR (height, width, channels); any other
        rank is treated as a single-channel grayscale image.
    show_axes : bool
        When False (default) the axes and tick marks are hidden.
    quiet : bool
        When False (default) ``plt.show()`` is called so the figure is rendered.
    """
    # matplotlib expects RGB; grayscale is expanded to RGB so it is not drawn
    # with matplotlib's default colormap.
    conversion = cv2.COLOR_BGR2RGB if len(image.shape) == 3 else cv2.COLOR_GRAY2RGB
    plt.imshow(cv2.cvtColor(image, conversion))
    if not show_axes:
        plt.axis('off')
    if not quiet:
        plt.show()

# Goal
1. Simplify the process of identifying people in groups
2. Export a sample file to verify the code

In [None]:
# REVISED 2023-06-24
# Output folder for the per-video results. `makedirs(..., exist_ok=True)` also
# creates missing parent folders and is safe to re-run.
resultfolder = "../../_data/10_clean/03_individual/current/20230711"
os.makedirs(resultfolder, exist_ok = True)

# One "<video_group_name>_<suffix>.csv" file per video group is expected here.
root = "../../_data/05_tracking_result_projected/step1_speed_vector/current"
files = glob.glob(root + "/*.csv")
# EPSG code associated with each filming location.
projectionls = {
    "bryant_park":2263,
   'Met':3857,
 'Chestnut':3857,
 'Downtown':3857
}
pathdf = pd.DataFrame(files, columns=["path"])
# File name without extension and without its last "_"-separated token.
# os.path.basename is used instead of splitting on "/" so the parsing does not
# depend on the path separator returned by glob.
pathdf['video_group_name'] = pathdf['path'].apply(
    lambda x: "_".join(os.path.basename(x).split(".")[0].split("_")[:-1]))
# The last 15 characters of the group name are stripped to obtain the location
# (the names listed later in this notebook end in a "YYYYMMDD-HHMMSS" stamp).
pathdf['loc'] = pathdf['video_group_name'].apply(lambda x: x[:-15])
# Raises KeyError for a location that is missing from `projectionls`.
pathdf['coord'] = pathdf['loc'].apply(lambda x: projectionls[x])
pathdf.head()

In [None]:
# Collect every .avi file located one or two folder levels below `videofolder`.
videofolder = "../../_data/00_raw/videos_current_highres"
videos = glob.glob(os.path.join(videofolder, "*/*.avi"))
videos2 = glob.glob(os.path.join(videofolder, "*/*/*.avi"))
videos = videos+videos2
# file_path = "../data/00_raw/videos_current_highres/"+videoname+".avi"
# video, fps, size = getbasics(file_path)
# Lookup table: file name, full path, and file name up to the first "." as the id.
video_df = pd.DataFrame({
    "video_name": [os.path.basename(x) for x in videos],
    "video_path": videos,
    "video_id": [os.path.basename(x).split(".")[0] for x in videos],
})
video_df.shape

## Method 1
1. Two or more people stay in the same DBSCAN cluster for more than 3 seconds

In [None]:
def generatecluster(df, dis, epsg):
    """Run DBSCAN separately on every frame and return one label per row of ``df``.

    Parameters
    ----------
    df : DataFrame
        Must contain ``frame_id`` and the coordinate columns ``x_<epsg>`` /
        ``y_<epsg>``.
    dis : float
        DBSCAN ``eps`` (maximum distance between two samples of a cluster),
        expressed in the units of the coordinate columns.
        Background on interpersonal distances:
        https://en.wikipedia.org/wiki/Proxemics
    epsg : int or str
        Suffix selecting which coordinate columns to read.

    Returns
    -------
    numpy.ndarray
        Cluster label for each row, in the row order of ``df``; ``-1`` marks a
        point that is in no cluster of at least two samples. Labels are only
        meaningful within a single frame.

    Notes
    -----
    Labels are written back by row position, so the result lines up with ``df``
    even when ``df`` is not sorted by ``frame_id``. An empty ``df`` yields an
    empty array. Rows whose ``frame_id`` is missing are not clustered and keep
    the label ``-1``.
    """
    coords = df[[f"x_{epsg}", f"y_{epsg}"]].to_numpy()
    labels = np.full(len(df), -1, dtype=np.intp)

    # One pass over the frame: positional row indices of every frame.
    rows_per_frame = df.groupby('frame_id', sort=False).indices
    for rows in tqdm(rows_per_frame.values()):
        clustering = DBSCAN(eps = dis, min_samples = 2).fit(coords[rows])
        labels[rows] = clustering.labels_

    return labels


def generate_social(traceGDF, epsg, dis = 1.9):
    """Attach a per-frame DBSCAN cluster label to every observation.

    Parameters
    ----------
    traceGDF : DataFrame
        Observations with ``frame_id``, ``track_id`` and the coordinate columns
        read by ``generatecluster``.
    epsg : int or str
        Coordinate-column suffix forwarded to ``generatecluster``.
    dis : float
        Distance threshold forwarded to DBSCAN (the original note describes it
        as metres).

    Returns
    -------
    (DBSocial, DBcluster)
        ``DBcluster`` holds all rows, sorted by frame and track, with ``Social``
        (cluster label, -1 = not clustered) and ``group_id_social``
        ("<frame_id>_<Social>", NaN for unclustered rows). ``DBSocial`` holds
        only the clustered rows, with ``group_id_social`` cast to str.
    """
    ordered = traceGDF.sort_values(['frame_id', 'track_id']).reset_index(drop = True)
    labels = pd.DataFrame(generatecluster(ordered, dis, epsg), columns = ['Social'])
    DBcluster = pd.concat([ordered, labels], axis = 1)

    DBcluster['frame_id'] = DBcluster['frame_id'].astype(int)

    # Spatial cluster id, unique across frames; blanked out for unclustered rows.
    unclustered = DBcluster['Social'] == -1
    frame_cluster_key = DBcluster['frame_id'].astype(str) + '_' + DBcluster['Social'].astype(str)
    DBcluster['group_id_social'] = np.where(unclustered, np.nan, frame_cluster_key)

    DBSocial = DBcluster[~unclustered].reset_index(drop = True)
    DBSocial['group_id_social'] = DBSocial['group_id_social'].astype(str)
    return DBSocial, DBcluster


def valid_link(DBSocial, x, y, thred = 0.1, n = 1):
    """Decide whether tracks ``x`` and ``y`` move together.

    The ``speed_x_<n>s`` and ``speed_y_<n>s`` series of both tracks are aligned
    on ``frame_id`` (frames where either track lacks any of the three speed
    columns are dropped) and correlated.

    Parameters
    ----------
    DBSocial : DataFrame
        Needs ``frame_id``, ``track_id``, ``speed_x_<n>s``, ``speed_y_<n>s`` and
        ``speed_<n>s``, with at most one row per (frame_id, track_id).
    x, y : track ids
    thred : float
        Minimum correlation required on both velocity components.
    n : number
        Window suffix used in the speed column names.

    Returns
    -------
    bool
        True only if both correlations exceed ``thred``. A correlation that
        cannot be computed (NaN) counts as a failure.
    """
    vx, vy, speed = f'speed_x_{n}s', f'speed_y_{n}s', f'speed_{n}s'
    pair = DBSocial[DBSocial['track_id'].isin([x, y])]
    # The scalar speed column stays in the pivot so that the set of frames kept
    # by dropna() is unchanged.
    wide = pair.pivot(index = 'frame_id',
                      columns = 'track_id',
                      values = [vx, vy, speed]).dropna()
    corr_x = wide[(vx, x)].corr(wide[(vx, y)])
    corr_y = wide[(vy, x)].corr(wide[(vy, y)])
    return bool(corr_x > thred and corr_y > thred)


def getuvperframe(testdf, iditem):
    """List every unordered pair of tracks that share a group id.

    Parameters
    ----------
    testdf : DataFrame
        Needs ``track_id`` and the column named by ``iditem``.
    iditem : str
        Name of the grouping column (e.g. ``"group_id_social"``).

    Returns
    -------
    DataFrame
        Columns ``u``, ``v`` (the two track ids, in the order they appear in the
        group) and ``iditem`` (the group id the pair came from).
    """
    U = []
    V = []
    groupid = []
    # Group by the column name itself rather than a one-element list: with a
    # list key, recent pandas versions yield 1-tuples as group keys, which would
    # be stored as tuples in the output column.
    for gid, members in tqdm(testdf.groupby(iditem)['track_id']):
        # all combinations without replacement within the group
        for u, v in itertools.combinations(members, 2):
            U.append(u)
            V.append(v)
            groupid.append(gid)

    dfframe = pd.DataFrame({'u':U,
             'v':V,
              iditem: groupid
             })

    return dfframe

def get_links(traceGDF, epsg=3857, timethred = 4, fps = 29.9):
    """Cluster every frame and keep the track pairs that qualify as a link.

    A pair (u, v) is kept when
      * the two tracks share a DBSCAN cluster in more than ``timethred * fps``
        frames (frames are counted, continuity in time is not checked), and
      * ``valid_link`` accepts the pair (velocity components correlated).

    Parameters
    ----------
    traceGDF : DataFrame
        Observations, passed to ``generate_social``.
    epsg : int
        Coordinate-column suffix.
    timethred : float
        Minimum time together, in seconds.
    fps : float
        Frames per second used to convert ``timethred`` into a frame count.

    Returns
    -------
    (DBSocial, DBcluster, df_links_valid)
        The two frames from ``generate_social`` plus the qualifying pairs with
        columns ``u``, ``v``, ``weight`` (number of shared frames) and ``valid``.
    """
    # create spatial cluster
    DBSocial, DBcluster = generate_social(traceGDF, epsg)

    iditem = 'group_id_social'
    df_links = getuvperframe(DBSocial, iditem)

    # First aggregation: disregard time continuity, only count co-occurrences.
    df_links = df_links.groupby(['u', 'v']).size().reset_index().rename(columns={0: 'weight'})

    # Apply the cheap duration filter first so the correlation is only computed
    # for pairs that can still qualify; the surviving rows are the same.
    candidates = df_links[df_links['weight'] > timethred*fps].reset_index(drop = True)
    verdicts = [valid_link(DBSocial, u, v)
                for u, v in zip(candidates['u'], candidates['v'])]
    # Building the column from a list also works when there are no candidates.
    candidates['valid'] = pd.Series(verdicts, index = candidates.index, dtype = bool)
    df_links_valid = candidates[candidates['valid']].reset_index(drop = True)

    return DBSocial, DBcluster, df_links_valid

In [None]:
from networkx import edge_betweenness_centrality as betweenness
import matplotlib.pyplot as plt
import networkx as nx
from networkx.algorithms.community.centrality import girvan_newman
# set up loops
from datetime import datetime
import json
# use girvan_newman for community detection first
# df_plot = newlinks.reset_index(drop=True)

def most_central_edge(G):
    """Return the edge of ``G`` with the highest weighted edge betweenness.

    Ties are resolved in favour of the edge that comes first in the centrality
    mapping.
    NOTE(review): ``weight`` is handed to the betweenness routine as the edge
    weight, while the callers in this notebook store co-occurrence counts there
    -- confirm this is the intended interpretation.
    """
    scores = betweenness(G, weight="weight")
    best_edge, _ = max(scores.items(), key=lambda item: item[1])
    return best_edge


def getcommunity(df_links):
    """Split the link graph into communities with one Girvan-Newman step.

    Parameters
    ----------
    df_links : DataFrame
        One row per edge with columns ``u``, ``v`` and ``weight``.

    Returns
    -------
    DataFrame
        One row per community: ``communityID`` (0..k-1), ``nodegroup`` (list of
        node ids) and ``groupsize``.
    """
    G_plot = nx.Graph()
    # Iterate over the columns directly: this does not depend on the frame
    # having a 0..n-1 index and keeps each column's own dtype.
    edges = zip(df_links['u'], df_links['v'], df_links['weight'])
    for u, v, weight in tqdm(edges, total = len(df_links)):
        G_plot.add_edge(u, v, weight = weight)

    communities = girvan_newman(G_plot, most_valuable_edge = most_central_edge)

    # Only the first level of the Girvan-Newman hierarchy is used.
    node_groups = [list(com) for com in next(communities)]
    node_groupslen = [len(group) for group in node_groups]
    communitydf = pd.DataFrame({
    'communityID':np.arange(0, len(node_groupslen)),
    'nodegroup':node_groups,
    'groupsize':node_groupslen
    })
    return communitydf


In [None]:

def get_confirm_group(communitydf, DB, example_com):
    """Find the frames in which a community appears as exactly one cluster.

    Only an exact match counts: for a community {1, 2, 3}, a frame where only
    tracks 1 and 2 share a cluster is not reported.

    Parameters
    ----------
    communitydf : DataFrame
        Needs ``communityID`` and ``nodegroup`` (list of track ids).
    DB : DataFrame
        Observations with ``communityID``, ``track_id``, ``frame_id`` and
        ``Social`` (-1 = not clustered).
    example_com : community id to look up.

    Returns
    -------
    DataFrame
        One row per matching (frame_id, Social) with ``track_id`` (array of
        members), ``track_id_str`` ("_"-joined sorted members) and
        ``communityID``.
    """
    members = communitydf[communitydf["communityID"]==example_com]["nodegroup"].values[0]
    tracks = sorted(int(x) for x in members)
    tracksls = "_".join(str(x) for x in tracks)

    # for each frame and social id, get the list of community tracks present
    sel = DB[(DB["communityID"]==example_com)&(DB["track_id"].isin(tracks))&(DB["Social"]!=-1)]
    frame_summary = sel.groupby(["frame_id", "Social"])["track_id"].unique().reset_index()
    # Sort and normalise the ids the same way as `tracksls`, so the comparison
    # does not depend on the row order of DB.
    frame_summary["track_id_str"] = frame_summary["track_id"].apply(
        lambda ids: "_".join(str(int(i)) for i in sorted(ids)))
    # .copy() so the new column is written to an independent frame.
    confirmed = frame_summary[frame_summary["track_id_str"]==tracksls].copy()
    confirmed["communityID"] = example_com
    return confirmed


def get_confirm_group_loose(communitydf, DB, example_com):
    """Looser variant of ``get_confirm_group``: keep every frame cluster that
    contains at least two tracks of the community, not only exact matches.

    Returns the per-(frame_id, Social) summary restricted to clusters with more
    than one community track, with the extra columns ``len`` (number of tracks),
    ``track_ls_intersection`` and ``communityID``.
    NOTE(review): the index/row order of the result comes from
    ``groupby(...).apply`` and may differ between pandas versions -- verify
    before relying on it.
    """

    tracks = communitydf[communitydf["communityID"]==example_com]["nodegroup"].values[0]
    tracks = [int(x) for x in tracks]
    tracks.sort()
    # `tracksls` is built but not used below.
    tracksls = "_".join([str(x) for x in tracks])

    # for each frame and social id, get a list of tracks
    sel = DB[(DB["communityID"]==example_com)&(DB["track_id"].isin(tracks))&(DB["Social"]!=-1)]
    frame_summary = sel.groupby(["frame_id", "Social"])["track_id"].unique().reset_index()
    frame_summary["track_id_str"] = frame_summary["track_id"].apply(lambda x: "_".join([str(i) for i in x]))
    frame_summary["len"] = frame_summary["track_id"].apply(lambda x: len(x))
    def get_inter(temp):
        # `temp` holds all rows sharing one track_id_str; parse that string back
        # into ints and keep the ids that belong to the community.
        # The first assignment is immediately overwritten by the second.
        lst2 = temp["track_id_str"].values[0].split("_")
        lst2 = [int(x) for x in temp["track_id_str"].values[0].split("_")]
        # NOTE(review): `sel` is already restricted to `tracks`, so this
        # intersection presumably reproduces track_id_str -- confirm.
        intersection = [value for value in lst2 if value in tracks]
        temp["track_ls_intersection"] = "_".join([str(x) for x in intersection])
        return temp

    # Clusters with a single community track are discarded.
    reconstruct = frame_summary[frame_summary["len"]>1].groupby("track_id_str").apply(get_inter)
    reconstruct["communityID"] = example_com
    return reconstruct


def get_group(communitydf, DB):
    """Run ``get_confirm_group`` for every community id and stack the results
    into one frame with a fresh 0..n-1 index."""
    per_community = [
        get_confirm_group(communitydf, DB, comID)
        for comID in tqdm(communitydf["communityID"].unique())
    ]
    return pd.concat(per_community).reset_index(drop = True)

def get_group_loose(communitydf, DB):
    """Run ``get_confirm_group_loose`` for every community id and stack the
    results into one frame with a fresh 0..n-1 index."""
    per_community = [
        get_confirm_group_loose(communitydf, DB, comID)
        for comID in tqdm(communitydf["communityID"].unique())
    ]
    return pd.concat(per_community).reset_index(drop = True)

def get_gender_comp(genderls):
    """Summarise a group's gender list: exactly two entries means "Mixed",
    otherwise the first entry is returned."""
    return "Mixed" if len(genderls) == 2 else genderls[0]
    
    
def valid_link_corr(DBSocial, x,y, thred = 0.5, n = 1):
    """Compute the movement-similarity indicators for tracks ``x`` and ``y``.

    The three speed columns (``speed_x_<n>s``, ``speed_y_<n>s``, ``speed_<n>s``)
    of both tracks are aligned on ``frame_id``; frames with any missing value
    are dropped.

    Returns
    -------
    (coor1, coor2, coor3, coor4)
        Correlation of the x component, of the y component and of the scalar
        speed, followed by a stay flag: 1 when both tracks have a mean scalar
        speed below 0.5, else 0. ``thred`` is accepted but not used.
    """
    speed_cols = [f'speed_x_{n}s', f'speed_y_{n}s', f'speed_{n}s']
    pair_rows = DBSocial[DBSocial['track_id'].isin([x,y])]
    wide = (pair_rows
            .pivot(index = 'frame_id', columns = 'track_id', values = speed_cols)
            .reset_index()
            .dropna())

    coor1, coor2, coor3 = [wide[(col, x)].corr(wide[(col, y)]) for col in speed_cols]

    # stay indicator: both tracks are, on average, (almost) not moving
    scalar_speed = speed_cols[2]
    both_staying = wide[(scalar_speed, x)].mean() < 0.5 and wide[(scalar_speed, y)].mean() < 0.5
    coor4 = 1 if both_staying else 0
    return coor1, coor2, coor3, coor4

def get_selfile(framsel, thre=2):
    """Plot the clusters of frame ``framsel`` and of frame ``framsel + thre*2``.

    Reads the module-level ``DBcluster``. For each of the two frames a
    GeoDataFrame (points from ``lon``/``lat``, EPSG:4326) is built and drawn on
    its own figure: clustered points coloured by ``Social``, unclustered points
    (``Social == -1``) in grey.
    This function is defined again, identically, in the "Visualize the finding"
    section of the notebook.

    Returns
    -------
    (seldb, seldb_shift) : the two GeoDataFrames.
    """
    snapshots = []
    for frame in (framsel, framsel + thre*2):
        rows = DBcluster[DBcluster['frame_id']==frame].reset_index(drop = True)
        points = gpd.GeoDataFrame(
            rows,
            geometry=[Point(x, y) for x, y in zip(rows['lon'], rows['lat'])],
            crs = "EPSG:4326")
        clustered = points['Social'] != -1
        ax = points[clustered].plot(column = 'Social', legend = True, cmap = 'tab20', figsize = (8,8))
        points[points['Social']==-1].plot(color = 'grey', ax = ax)
        snapshots.append(points)

    seldb, seldb_shift = snapshots
    return seldb, seldb_shift


# Experiment, add speed similarity

In [None]:
# THESE ARE CONSTANT VARIABLES
# Frame rate used to convert frame counts into seconds (read as a global by
# link_method and by the processing loop).
# NOTE(review): a later cell rebinds `fps` from the opened video file, which
# silently replaces this constant if that cell has been run.
fps = 29.9

selp = 'social'
# selp = 'personal_far'
# Name of the per-frame cluster id column, here 'group_id_social'.
iditem = 'group_id_{}'.format(selp)
# Minimum time (seconds) two tracks must share a cluster.
timethred = 1

# Data dictionary of the exported CSV: column name -> description.
# Only the keys are used by the code (see `selcols`); the descriptions are
# documentation.
# NOTE(review): the description of 'second_from_start' mentions 48 frames per
# second while `fps` above is 29.9 -- confirm which one applies.
metadata = {
    'order':"video order in one location", 
     'video_location':"video location name", 
     'track_id':"reconstructed track id, unique within each video", 
       'video_id':"video id, unique within each location",
       'lat':"prejected latitude",
       'lon':"prejected longitude",
       'track_id_backup':"original track id from the tracking file", 
       'speed_0.5s':"speed in meter per second",
       'speed_x_0.5s':"speed in meter per second in x direction", 
       'speed_y_0.5s':"speed in meter per second in y direction", 
       'hex_id':"h3 level 15 index", 
       'inside':"inside the comparable area (both historical and current) or not",
        'frame_id':"reconstructed frame_id, across videos in a location, unique within one location", 
        'frame_id_original':"original frame_id from the tracking file", 
        'second_from_start':"calculated second from start based on the frame_id, 48 frames per real second",
       'appear_sec':"total second the track appeared in the video", 
       'individual_frame_total':"total number of frames the track appeared in the video", 
       'Social':"spatial cluster id, unique within each frame, disregarding invalid or valid across time", 
       'frame_social_track':"frame_id + Social + track_id",
       'group_id_social':"frame_id + Social, unique within each video",
       'group_size':"number of tracks in the group",
       'is_group':"whether the track is in a group or not",
       'group_first_frame':"first frame_id when the track is in a group",
       'track_first_frame':"first frame_id when the track appear in this video", 
       'group_track_delta':"difference between group_first_frame and track_first_frame", 
       'emerging_group':"whether the group is newly formed or not",
       'cross_frame_group_id':\
         "this is a group id that can be used to identify the group across frames (only available for current videos)",
         "gender":"gender of each pedestrian",
         "age":"age of each pedestrian",
         'timestamp':"timestamp of each frame (Only available for modern videos). use for reference."
}
# Columns to export, in dictionary order; link_method keeps only those that
# exist in its result.
selcols = list(metadata.keys())

In [None]:
# Input folders holding the projected tracking results with speed vectors.
# (`fist_stagingfolder` is only referenced from commented-out code in the visible cells.)
fist_stagingfolder = "../../_data/05_tracking_result_projected/step1_speed_vector"
stagingfolder = "../../_data/05_tracking_result_projected/step1_speed_vector/current"
# resultfolder = '../../_data/10_clean/03_individual/current/20230711c/'
# Rebinds the output folder chosen in the first set-up cell; link_method writes
# its CSV files here. NOTE(review): this cell does not create the folder.
resultfolder = '../../_data/10_clean/03_individual/current/20230711d'

In [None]:
# allvideos = glob.glob(os.path.join(fist_stagingfolder, "*.csv"))
# allvideonames = [os.path.basename(x) for x in allvideos]
# remain = [x for x in allvideonames if x not in finished]
# len(remain)

# videogroup = set([x.split("b")[0] for x in remain])
# Hand-picked video groups (date-time stamps). NOTE: `videogroup` is rebound to a
# single string in the visualisation section further down.
videogroup = [
    '20081008-072238',
 '20100519-083343',
 '20100521-074701',
 '20100612-112553',]
# Location-prefixed group names; not referenced again in the visible cells.
sel = ['Met20100612-120118', 
       'bryant_park20081008-141944',
       'Downtown20100521-115755',
       'Chestnut20100519-083343',]

In [None]:
# Last 15 characters of the group name, i.e. the part that was stripped off to
# obtain `loc` when pathdf was built.
pathdf['video_group'] = pathdf['video_group_name'].apply(lambda x: x[-15:])
# pathdf['video_group'].unique()

In [None]:
def link_method(traceGDF, DBSocial, DBcluster,df_links_valid, videoname, interpolation = True):
    """Mark the observations that belong to a validated group and export them.

    Steps
    -----
    1. For every frame cluster, list all member pairs and keep the pairs that
       appear in ``df_links_valid`` (in either order).
    2. The tracks of the kept pairs form the "true group" of that frame cluster
       (``truegroup`` = "&&"-joined track ids).
    3. With ``interpolation=True`` every member of a true group is considered in
       the group for all frames between the group's first and last frame, even
       in frames where DBSCAN did not cluster them together. With
       ``interpolation=False`` only the frames found in step 2 are used.
    4. The group information is merged back onto all observations, per-track
       summary columns are added, and the columns listed in the module-level
       ``selcols`` are written to ``<resultfolder>/<videoname>.csv``.

    Parameters
    ----------
    traceGDF : DataFrame
        Original observations of the video; only used to check the row count.
    DBSocial, DBcluster : DataFrame
        The two outputs of ``generate_social``.
    df_links_valid : DataFrame
        Validated pairs with columns ``u`` and ``v``.
    videoname : str
        Used as the output file name.
    interpolation : bool
        See step 3.

    Returns
    -------
    DataFrame
        ``DBcluster`` with the group columns added (``is_group``,
        ``cross_frame_group_id``, ``group_size``, ...).

    Notes
    -----
    Reads the module-level ``selcols``, ``resultfolder`` and ``fps``.
    The inputs are copied first, so the caller's frames are left untouched and
    the function can be called again with the same objects.
    """
    DBSocial = DBSocial.copy()
    DBcluster = DBcluster.copy()
    df_links_valid = df_links_valid.copy()

    # ---- 1. all member pairs of every frame cluster ------------------------
    data_link = DBSocial.groupby(['frame_id', 'Social'])['track_id'].unique().reset_index()
    data_link['combination2'] = data_link['track_id'].apply(lambda x: list(combinations(x, 2)))
    data_link_explode = data_link[['frame_id',  'Social',
        'combination2']].explode('combination2').reset_index(drop = True)
    data_link_explode['u_v'] = data_link_explode['combination2'].apply(lambda x: "&&".join([str(i) for i in x]))

    # Pair keys in both orders. Built column-wise so it also works when there
    # are no valid links and so the ids are not affected by row-wise upcasting.
    df_links_valid['u_v'] = df_links_valid['u'].astype(str) + "&&" + df_links_valid['v'].astype(str)
    df_links_valid['v_u'] = df_links_valid['v'].astype(str) + "&&" + df_links_valid['u'].astype(str)
    valid_pairs = set(df_links_valid['u_v']) | set(df_links_valid['v_u'])
    demolinks = data_link_explode[data_link_explode['u_v'].isin(valid_pairs)].reset_index(drop = True)

    # flatten the kept pairs to one row per (frame, cluster, track)
    demolinks = demolinks[['frame_id', 
                        'Social', 
                        'combination2']].explode('combination2')\
    .reset_index(drop = True).sort_values(['frame_id', 'Social'], ascending=False).rename(columns = {'combination2':'track_id'})\
        .drop_duplicates(['frame_id', 'Social', 'track_id']).reset_index(drop = True)
    demolinks['frame_social_track'] = demolinks['frame_id'].astype(str)+"$$"+\
        demolinks['Social'].astype(str)+"$$"+demolinks['track_id'].astype(str)

    # ---- 2. observations that are part of a validated pair -----------------
    DBSocial['frame_social_track'] = DBSocial['frame_id'].astype(str)+ "$$"\
        + DBSocial['Social'].astype(str)+"$$"\
            +DBSocial['track_id'].astype(str)
    DBSocial_update = DBSocial[DBSocial['frame_social_track'].isin(demolinks['frame_social_track'].unique())]\
        .reset_index(drop = True)
    # Social cluster id becomes the group id within each frame
    DBSocial_update['group_id_social'] = DBSocial_update['frame_id'].astype(str) + '_' +DBSocial_update['Social'].astype(str)

    # the true group id: the validated members of the frame cluster
    DBSocial_group = DBSocial_update.groupby(['frame_id', 'Social'])['track_id'].unique().reset_index()
    DBSocial_group['truegroup'] = DBSocial_group['track_id'].apply(lambda x: "&&".join([str(i) for i in x]))

    DBSocial_update = DBSocial_update[[ 'group_id_social','frame_id', 'Social','track_id']]\
                                        .merge(DBSocial_group[['frame_id', 'Social', 'truegroup']], 
                                                on = ['frame_id', 'Social'], how = 'inner')

    # ---- 3. optional interpolation between first and last frame ------------
    def get_largergroup(group_frames, member_rows, all_rows):
        """For every true group, take all observations of its members between
        the group's first and last frame (one row per track and frame)."""
        expanded = []
        for truegroup in group_frames['truegroup'].unique():
            temp = member_rows[member_rows['truegroup']==truegroup]
            trackls = temp['track_id'].unique()
            firstframe = temp['frame_id'].min()
            lastframe = temp['frame_id'].max()
            # .copy() so the label is written to an independent frame
            allvalid = all_rows[(all_rows['track_id'].isin(trackls))
                                &(all_rows['frame_id']<=lastframe)
                                &(all_rows['frame_id']>=firstframe)].copy()
            allvalid['truegroup'] = truegroup
            expanded.append(allvalid)
        if not expanded:
            # no validated group in this video: empty frame with the same columns
            return all_rows.iloc[0:0].assign(truegroup = pd.Series(dtype = object))
        return pd.concat(expanded).reset_index(drop = True)

    if interpolation:
        DBsocial_group_update = get_largergroup(DBSocial_group, DBSocial_update, DBcluster)
    else:
        # one row per track and frame (DBSocial_group holds arrays of tracks and
        # cannot be de-duplicated or merged by track)
        DBsocial_group_update = DBSocial_update.copy()

    # A track may now fall into several true groups in one frame; keep the first
    # so each track has a single truegroup per frame.
    DBsocial_group_update['group_id_social'] = DBsocial_group_update['frame_id'].astype(str) + '_' +DBsocial_group_update['Social'].astype(str)
    DBsocial_group_update['frame_social_track'] = DBsocial_group_update['frame_id'].astype(str)+ "$$"\
        +DBsocial_group_update['Social'].astype(str)+"$$"\
            +DBsocial_group_update['track_id'].astype(str)
    DBsocial_group_update = DBsocial_group_update.drop_duplicates(['frame_id', 'track_id'])

    # ---- 4. merge back onto all observations -------------------------------
    # The validated group id replaces the raw cluster id.
    DBcluster = DBcluster.drop(columns = 'group_id_social', errors = 'ignore')
    DBcluster['frame_social_track'] = DBcluster['frame_id'].astype(str)+ "$$"\
        +DBcluster['Social'].astype(str)+"$$"\
            +DBcluster['track_id'].astype(str)
    DBcluster_update = DBcluster.merge(DBsocial_group_update[['frame_social_track', 'truegroup','group_id_social']],
                                            on = 'frame_social_track', how = 'left')
    DBcluster_update['is_group'] = DBcluster_update['group_id_social'].notnull()

    # first / last frame in which each track is in a group (NaN for rows that
    # are not in a group)
    DBcluster_update['group_first_frame'] = DBcluster_update.groupby(['track_id','is_group'])['frame_id'].transform('min')
    DBcluster_update['group_last_frame'] = DBcluster_update.groupby(['track_id','is_group'])['frame_id'].transform('max')
    DBcluster_update['group_first_frame'] = np.where(DBcluster_update['is_group']==False, 
                                                    np.nan, DBcluster_update['group_first_frame'])
    DBcluster_update['group_last_frame'] = np.where(DBcluster_update['is_group']==False, 
                                                    np.nan, DBcluster_update['group_last_frame'])

    DBcluster_update['track_first_frame'] = DBcluster_update.groupby(['track_id'])['frame_id'].transform('min')
    DBcluster_update['group_track_delta'] = DBcluster_update['group_first_frame'] - DBcluster_update['track_first_frame']
    # A group counts as "emerging" when it forms more than 29.97*5 frames (about
    # five seconds at 29.97 fps) after the track first appears.
    # NOTE(review): the original comment said "1 second" -- confirm the intended value.
    DBcluster_update['emerging_group'] = np.where(DBcluster_update['group_track_delta']>29.97*5, True, False)
    DBcluster_update = DBcluster_update.drop_duplicates(['track_id',  'frame_id'])

    DBcluster_update['appear_sec'] = DBcluster_update.groupby('track_id')['frame_id'].transform('count')/fps
    DBcluster_update['individual_frame_total'] = DBcluster_update.groupby('track_id')['frame_id'].transform('count')
    # rows without a group get "" -> one element -> group_size 1
    DBcluster_update['group_size'] = DBcluster_update['truegroup'].fillna("").apply(lambda x: len(x.split('&&')))
    DBcluster_update = DBcluster_update.rename(columns = {
            'truegroup':'cross_frame_group_id',
        })

    exportcols = [x for x in selcols if x in DBcluster_update.columns]
    assert DBcluster_update.shape[0]==traceGDF.shape[0], \
        "row count changed while attaching group information"
    # make sure the output folder exists before writing
    os.makedirs(resultfolder, exist_ok = True)
    DBcluster_update[exportcols].to_csv(os.path.join(resultfolder, 
                                                            f"{videoname}.csv"), index = False)
    return DBcluster_update

In [None]:
import gc
# Run a garbage-collection pass. Without the (commented-out) `del` below, the
# large frames are still referenced and stay in memory.
# del fullgdf, traceGDF, DBSocial, DBcluster, df_links_valid, df_links, DBcluster_update
gc.collect()

In [None]:
# Process the selected video groups (row labels of `pathdf`): for every video,
# cluster each frame, build the candidate links, validate them and export the
# per-observation result through link_method.
for i in [5,1,2,4]:
    video_group_name = pathdf.loc[i,"video_group_name"]
    video_loc = pathdf.loc[i, 'loc']
    
    fullgdf = pd.read_csv(os.path.join(stagingfolder, f"{video_group_name}_full.csv"), 
                          engine = "python").drop_duplicates(['track_id', 'frame_id'])\
                              .reset_index(drop = True)
    videols = fullgdf['video_id'].unique()
    for videoname in tqdm(videols):
        traceGDF = fullgdf[fullgdf['video_id']==videoname].reset_index(drop = True)
        DBSocial, DBcluster = generate_social(traceGDF, 3857, dis = 1.9)

        # candidate links: pairs sharing a cluster for more than `timethred` seconds
        df_links = getuvperframe(DBSocial, iditem)
        df_links = df_links.groupby(['u', 'v']).size().reset_index().rename(columns={0: 'weight'})
        df_links = df_links[df_links['weight'] > timethred*fps].reset_index(drop = True)

        # (corr_x, corr_y, corr_speed, stay_flag) per pair. Built from a list so
        # that a video without any candidate pair yields an empty column instead
        # of failing in a row-wise apply.
        coor_ls = [valid_link_corr(DBSocial, u, v, n = 0.5)
                   for u, v in zip(df_links['u'], df_links['v'])]
        df_links['coor_ls'] = pd.Series(coor_ls, index = df_links.index, dtype = object)

        # A link is valid if either velocity component is positively correlated,
        # or if both people are staying (stay flag set).
        df_links['valid'] = df_links['coor_ls'].apply(
            lambda c: bool(c[0] > 0.0 or c[1] > 0.0 or c[3] > 0)).astype(bool)
        
        df_links_valid = df_links[(df_links['valid']==True)].reset_index(drop = True) # 20230711c
        # df_links_valid = df_links[(df_links['weight'] > 1*fps)].reset_index(drop = True) # 20230711b
        DBcluster_update = link_method(traceGDF, DBSocial, DBcluster,df_links_valid, videoname, 
                                       interpolation = True)
### For all valid links: if a community cannot be constructed, keep the valid link and its members directly

In [None]:
# Re-runs link_method on the variables left over from the LAST iteration of the
# processing loop above (traceGDF, DBSocial, DBcluster, df_links_valid,
# videoname); it writes that video's CSV again.
DBcluster_update = link_method(traceGDF, DBSocial, DBcluster,df_links_valid, videoname, 
                                       interpolation = True)

# Visualize the finding

In [None]:
def get_selfile(framsel, thre=2):
    """Plot the clusters of frame ``framsel`` and of frame ``framsel + thre*2``.

    Identical re-definition of the ``get_selfile`` declared earlier in this
    notebook; this later definition is the one in effect once the cell has run.
    Reads the module-level ``DBcluster``. Each frame is drawn on its own figure:
    clustered points coloured by ``Social``, unclustered points (-1) in grey.
    Returns the two GeoDataFrames ``(seldb, seldb_shift)``.
    """

    # points of the selected frame (geometry from lon/lat, EPSG:4326)
    seldb = DBcluster[DBcluster['frame_id']==framsel].reset_index(drop = True)
    seldb = gpd.GeoDataFrame(seldb, geometry=[Point(x,y) for x,y in zip(seldb['lon'], seldb['lat'])], crs = f"EPSG:4326")
    ax = seldb[seldb['Social']!=-1].plot(column = 'Social', legend = True, cmap = 'tab20', figsize = (8,8))
    seldb[seldb['Social']==-1].plot(color = 'grey', ax = ax)
    
    # same plot for the frame `thre*2` frames later, on a new figure
    seldb_shift = DBcluster[DBcluster['frame_id']==framsel+thre*2].reset_index(drop = True)
    seldb_shift = gpd.GeoDataFrame(seldb_shift, geometry=[Point(x,y) for x,y in zip(seldb_shift['lon'], 
                                                                                    seldb_shift['lat'])], 
                                   crs = f"EPSG:4326")
    ax = seldb_shift[seldb_shift['Social']!=-1].plot(column = 'Social', legend = True, cmap = 'tab20', figsize = (8,8))
    seldb_shift[seldb_shift['Social']==-1].plot(color = 'grey', ax = ax)
    return seldb, seldb_shift

In [None]:
# visualize groups identified in the MET video
# NOTE: `videogroup` and `files` are rebound here (they held a list of groups and
# the staging CSV paths before).
videogroup = '20100612-120118'
individualfolder = '../../_data/10_clean/03_individual/'
currentfolder = os.path.join(individualfolder, 'current')
files = os.listdir(currentfolder)
files = [x for x in files if videogroup in x]
# videoname = '20100612-120118b01'
# NOTE(review): `videoname` is whatever the processing loop left behind, and the
# CSV is read from `currentfolder`, not from the dated `resultfolder` that
# link_method writes to -- confirm this is the intended file.
df = pd.read_csv(os.path.join(currentfolder, videoname+'.csv')).drop_duplicates(['track_id', 'frame_id'])

# Raw tracker output for the same video: tab-separated, no header, one bounding
# box per row.
# NOTE(review): absolute, machine-specific path.
result_folder = "/Users/yuan/Dropbox (MIT)/whyte_CV/_data/03_tracking_result/_current_video_no_attr"
predpath = os.path.join(result_folder, f'{videoname}.txt')
trace = pd.read_csv(predpath, sep = '\t', header = None)
trace.columns = [ "x1", "y1", "x2", "y2", "track_id", "frame_id"]
# box width, height and aspect ratio
trace['w'] = trace['x2'] - trace['x1']
trace['h'] = trace['y2'] - trace['y1']
trace['ratio'] = trace['w']/trace['h']

In [None]:
# Frame to inspect. NOTE: the following cells use `frame_sel` (same value);
# `selframe` itself is not referenced in the visible cells.
selframe = 12583

In [None]:
# Frame to draw.
frame_sel = 12583
seldb = df[df['frame_id']==frame_sel]


# df_sel.groupby('Social').size()
# Attach the cluster label and group flag to the bounding boxes of that frame
# (default inner join on track_id and frame_id, so only boxes of `frame_sel`
# remain).
data = trace.merge(seldb[[ "frame_id", 'Social','track_id','is_group']], 
                                               left_on = ["track_id", "frame_id"],
                                               right_on = ["track_id", "frame_id"],
                                               suffixes=('', '_new'))
# keep clustered boxes only; the 0..n-1 index is what plot_img iterates over
data = data[data['Social']!=-1].reset_index(drop = True)


In [None]:
def compute_color_for_labels(label):
    """Map a label to a fixed colour.

    Each of three constant multipliers is scaled by ``label**2 - label + 1`` and
    reduced modulo 255, giving a 3-tuple of ints that is always the same for a
    given label.
    """
    scale = label ** 2 - label + 1
    multipliers = (2 ** 11 - 1, 2 ** 15 - 1, 2 ** 20 - 1)
    return tuple(int((m * scale) % 255) for m in multipliers)

def getbasics(file_path):
    """Open a video and report its basic properties.

    Prints the frame rate and the frame size, and returns
    ``(video, fps, size, length)``: the open ``cv2.VideoCapture`` (the caller is
    responsible for releasing it), the frame rate, ``(width, height)`` as ints
    and the frame count as an int.
    """
    capture = cv2.VideoCapture(file_path)
    frame_rate = capture.get(cv2.CAP_PROP_FPS)
    n_frames = int(capture.get(cv2.CAP_PROP_FRAME_COUNT))
    print('frames per second =',frame_rate)
    width = int(capture.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(capture.get(cv2.CAP_PROP_FRAME_HEIGHT))
    frame_size = (width, height)
    print('frames size =',frame_size)
    return capture, frame_rate, frame_size, n_frames

def plot_img(video_name, framsel, data,
             video_dir = "/Users/yuan/Dropbox (MIT)/whyte_CV/_data/00_raw/videos_current_highres/Met Steps videos (NEW)/20100521-115754-01"):
    """Draw one video frame with the bounding boxes of ``data`` on top.

    Parameters
    ----------
    video_name : str
        File name of the video without the ``.avi`` extension.
    framsel : int
        Index of the frame to show.
    data : DataFrame
        One row per box with columns ``x1``, ``y1``, ``x2``, ``y2`` and
        ``Social``; boxes are coloured by ``Social``.
    video_dir : str
        Folder containing the video. Defaults to the folder that was previously
        hard-coded in this function.

    Raises
    ------
    ValueError
        If the requested frame cannot be read from the video.
    """
    video_file = os.path.join(video_dir, f"{video_name}.avi")

    video, fps, size, length = getbasics(video_file)
    try:
        # jump to the requested frame and decode it
        video.set(cv2.CAP_PROP_POS_FRAMES, framsel)
        ok, frame = video.read()
    finally:
        # always free the capture handle
        video.release()
    if not ok or frame is None:
        raise ValueError(f"could not read frame {framsel} from {video_file}")

    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    # itertuples does not rely on `data` having a 0..n-1 index
    for box in data.itertuples(index = False):
        color = compute_color_for_labels(int(box.Social))
        # plain ints: OpenCV expects integer pixel coordinates
        cv2.rectangle(frame,
                      (int(box.x1), int(box.y1)),
                      (int(box.x2), int(box.y2)),
                      color, 2)

    fig, ax = plt.subplots(figsize = (10,10))
    ax.imshow(frame)
    ax.axis('off')

In [None]:
# Draw the clustered bounding boxes of `frame_sel` on the video frame.
plot_img(videoname, frame_sel, data)

In [None]:
# Same call as the previous cell (duplicate cell).
plot_img(videoname, frame_sel, data)

In [None]:
# load the frame from video
# The path is built from `videoname`, the same variable passed to plot_img above
# (the previous `{file}` was never defined in this notebook).
# NOTE(review): absolute, machine-specific path.
videopath = f"/Users/yuan/Dropbox (MIT)/whyte_CV/_data/00_raw/videos_current_highres/Met Steps videos (NEW)/20100521-115754-01/{videoname}.avi"
# Same definition as the getbasics declared in the plotting-helpers cell.
def getbasics(file_path):
    """Open a video, print its frame rate and frame size, and return
    (open capture, fps, (width, height), frame count)."""
    video = cv2.VideoCapture(file_path)
    fps = video.get(cv2.CAP_PROP_FPS)
    length = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
    print('frames per second =',fps)
    size = (int(video.get(cv2.CAP_PROP_FRAME_WIDTH)), int(video.get(cv2.CAP_PROP_FRAME_HEIGHT)))
    print('frames size =',size)
    # video.release()
    return video, fps, size, length

# NOTE: this rebinds the module-level `fps` constant to the video's frame rate.
video, fps, size, length = getbasics(videopath)
video.set(cv2.CAP_PROP_POS_FRAMES, frame_sel)
frame_ok, frame = video.read()
# the capture is no longer needed once the frame is decoded
video.release()
if not frame_ok or frame is None:
    raise ValueError(f"could not read frame {frame_sel} from {videopath}")
frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
fig = plt.subplots(figsize = (10,10))
plt.imshow(frame)

# Summarize group attributes (archive below)

In [None]:
# load original prediction result
# result_folder = "../data/06_attr_result/"
# # predpath = os.path.join(result_folder, f'{videoname}_attr_mot.csv')
# predpath = os.path.join(result_folder, f'{videoname}.csv')
# trace = pd.read_csv(predpath)
# trace.rename(columns = {"x":"bbox0", "y":"bbox1", "w":"bbox2", "h":"bbox3"}, inplace = True)
# for x in ["bbox0", "bbox1", "bbox2", "bbox3"]:
#     trace[x] = trace[x].astype(int)

# for each track, use the most frequent (mode) gender on every row of that track;
# a track whose gender is entirely missing stays NaN instead of raising
traceGDF["gender"] = traceGDF.groupby("track_id")["gender"].transform(
    lambda x: x.mode().iloc[0] if x.notna().any() else np.nan
)
traceGDF["track_id"] = traceGDF["track_id"].astype(int)


seltrack = traceGDF.merge(group_df_l_flat, on = ["frame_id", "track_id"], how = "left")
group_attr = group_df_l_flat[["truegroup", "group_first_frame", "group_last_frame"]].drop_duplicates("truegroup")

# missing one step: fill the inner frames between the first and last frame.
# For every group, take the rows of its member tracks that fall inside the group's
# [first, last] frame window and forward-fill the truegroup label over them.
group_frames = []
# group_attr has one row per truegroup (drop_duplicates above), so iterate rows directly
# instead of re-filtering the frame for every group. Rows without a label are skipped.
for group_row in group_attr.dropna(subset = ["truegroup"]).itertuples(index = False):
    # truegroup is an underscore-joined list of track ids, e.g. "12_15"
    trackls = [int(x) for x in group_row.truegroup.split("_")]
    in_window = seltrack["frame_id"].between(group_row.group_first_frame, group_row.group_last_frame)
    temp = seltrack[in_window & seltrack["track_id"].isin(trackls)].reset_index(drop = True)

    # .ffill() replaces the deprecated fillna(method="ffill") with identical results
    temp["truegroup"] = temp["truegroup"].ffill()
    group_frames.append(temp)

# pd.concat raises on an empty list, so fall back to an empty frame with the same columns
if group_frames:
    newgroup = pd.concat(group_frames).reset_index(drop = True)
else:
    newgroup = seltrack.iloc[0:0].copy()
# use this for visualization

In [None]:
# Drop the video_id column in place. errors="ignore" keeps the cell re-runnable:
# without it a second execution raises KeyError because the column is already gone.
traceGDF.drop(columns = ['video_id'],
              inplace = True,
              errors = "ignore")

In [None]:
# merge back to group lat lon
# Right join: every row of traceGDF is kept, with group columns attached where available.
group_columns = ['track_id', 'frame_id', 'Social', 'truegroup',
                 'group_first_frame', 'group_last_frame']
grouptracedf = newgroup[group_columns].merge(
    traceGDF,
    how = "right",
    on = ["frame_id", "track_id"],
)

# First and last frame in which each track appears, repeated on every row of the track.
frames_per_track = grouptracedf.groupby("track_id")["frame_id"]
first_seen = frames_per_track.transform("min")
last_seen = frames_per_track.transform("max")
grouptracedf["individual_first_frame"] = first_seen
grouptracedf["individual_last_frame"] = last_seen
grouptracedf.head()

# NOT FINISHED

In [None]:
# group size: number of distinct tracks per truegroup
summary = (
    newgroup.groupby("truegroup")["track_id"]
    .nunique()
    .reset_index()
    .rename(columns = {"track_id": "group_size"})
)

# group appearing time and walking speed
# Build one point per row from the lon/lat columns.
point_geoms = [Point(lon, lat) for lon, lat in zip(grouptracedf["lon"], grouptracedf["lat"])]
grouptracegdf = gpd.GeoDataFrame(grouptracedf, geometry = point_geoms)
grouptracegdf.crs = "EPSG:4326"

# select only groups and convert to lines
# Rows with a truegroup label are group members; the rest are treated as singles.
has_no_group = grouptracegdf["truegroup"].isna()
group_gdf = grouptracegdf[~has_no_group].reset_index(drop = True)
single_gdf = grouptracegdf[has_no_group].reset_index(drop = True)



In [None]:
# grouptracedf_update = grouptracedf.merge(summary, on = "truegroup", how = "left").drop_duplicates(['track_id',  'truegroup',
#     'group_first_frame', 'group_last_frame']).reset_index(drop = True)
# grouptracedf_update['delta_frame'] = grouptracedf_update['group_first_frame'] - grouptracedf_update['individual_first_frame']


In [None]:
# convert point to linestrings
def _tracks_to_lines(points_gdf, fps, epsg):
    """
    Turn per-frame track points into one trajectory line per track.

    Tracks observed in only one frame are dropped, because a line needs at
    least two points. Returns ``(points, lines)``: the filtered points with an
    added ``indi_totalFrame`` column, and one row per track with its line
    reprojected to ``EPSG:{epsg}`` plus ``moving_distance``, ``indi_sec`` and
    ``moving_speed`` (distance / seconds, i.e. an average per individual).
    """
    points = points_gdf.assign(
        indi_totalFrame = points_gdf.groupby("track_id")["frame_id"].transform("count")
    )
    points = points[points["indi_totalFrame"]>1].reset_index(drop = True)

    # Sort by frame so each line follows the track in time order.
    lines = points.sort_values("frame_id").reset_index(drop = True)\
    .groupby(['track_id',"gender", "age","indi_totalFrame","individual_last_frame",
              "individual_first_frame"])['geometry'].apply(lambda x: LineString(x.tolist())).reset_index()
    lines.crs = "EPSG:4326"
    lines = lines.to_crs(f"EPSG:{epsg}")

    # Length is measured in the units of the projected CRS.
    lines["moving_distance"] = lines["geometry"].length
    lines["indi_sec"] = lines["indi_totalFrame"]/fps
    # this moving_speed is at individual average level
    lines["moving_speed"] = lines["moving_distance"]/lines["indi_sec"]
    return points, lines


group_gdf, group_gdf_line = _tracks_to_lines(group_gdf, fps, epsg)
single_gdf, single_gdf_line = _tracks_to_lines(single_gdf, fps, epsg)

# Cast both merge keys to int before they are used, so the join below compares like with like.
group_gdf_line["track_id"] = group_gdf_line["track_id"].astype(int)
group_gdf["track_id"] = group_gdf["track_id"].astype(int)
group_attr1 = group_gdf[~group_gdf["group_first_frame"].isna()][['track_id',  'truegroup',
       'group_first_frame', 'group_last_frame']].drop_duplicates()

group_attr2 = group_gdf_line.merge(
    group_attr1,
    on = "track_id"
)\
.merge(summary, on = ["truegroup"])
group_attr2["group_totalFrame"] = group_attr2["group_last_frame"] - group_attr2["group_first_frame"]

# calculate the time difference between individual's first appearing time and group appearing time
group_attr2["delta_frame"] = group_attr2["group_first_frame"] - group_attr2["individual_first_frame"]
# .copy() so the column assignments below write to an independent frame, not a slice of group_attr2
group_attr_valid = group_attr2[group_attr2["moving_speed"]<7].copy() # 3.6 ft/s - 6 ft/s # 2263 at ft, 3857 at meter


group_attr_valid["gender_indi"] = np.where(group_attr_valid["gender"]=="Male",0, 1)
gendersummary = group_attr_valid.drop_duplicates(["truegroup", "track_id"]).groupby("truegroup")["gender"].unique().reset_index()
gendersummary["gender_comp"] = gendersummary["gender"].apply(get_gender_comp)
gendergroup = gendersummary.groupby("gender_comp").size().reset_index().rename(columns = {0:"count"})



## Gender

In [None]:
# Individual analysis, use overall data per place to generate
# Count distinct tracks per gender label.
gendersummary = (
    traceGDF.groupby("gender")["track_id"]
    .nunique()
    .reset_index()
)

fig, ax = plt.subplots(figsize = (5, 5))
sns.barplot(
    x = "gender",
    y = "track_id",
    data = gendersummary,
    palette = ["#ef5c43", "#3bc0cf"],
    ax = ax,
)
sns.despine()
ax.set(xlabel = "Gender", ylabel = "Number of pedestrians")
ax.grid(axis = "y", color = "grey", linestyle = "--", linewidth = 0.5)
for axis_obj, side in ((ax.xaxis, 'bottom'), (ax.yaxis, 'left')):
    axis_obj.set_ticks_position(side)
ax.xaxis.set_tick_params(rotation=0)
plt.xticks(fontsize = 10)

In [None]:
# group level gender comparison
# One row per (group, track), then the set of genders seen in each group.
group_members = group_attr_valid.drop_duplicates(["truegroup", "track_id"])
gendersummary = group_members.groupby("truegroup")["gender"].unique().reset_index()
gendersummary["gender_comp"] = gendersummary["gender"].apply(get_gender_comp)
gendergroup = (
    gendersummary.groupby("gender_comp")
    .size()
    .reset_index()
    .rename(columns = {0: "count"})
)

fig, ax = plt.subplots(figsize = (5, 5))
sns.barplot(
    x = "gender_comp",
    y = "count",
    data = gendergroup,
    palette = ["#ef5c43", "#3bc0cf","#ffdda6"],
    ax = ax,
)
sns.despine()
ax.set(xlabel = "Gender Composition", ylabel = "Number of observed groups")
ax.grid(axis = "y", color = "grey", linestyle = "--", linewidth = 0.5)
for axis_obj, side in ((ax.xaxis, 'bottom'), (ax.yaxis, 'left')):
    axis_obj.set_ticks_position(side)
ax.xaxis.set_tick_params(rotation=0)
plt.xticks(fontsize = 10)

### Emerging interaction 

In [None]:
# Keep only single walkers whose average speed is below 3.
is_below_speed_cap = single_gdf_line["moving_speed"] < 3
single_valid = single_gdf_line[is_below_speed_cap]
single_valid.shape

In [None]:
def set_emerging_thred(group_attr_valid, thred = 5, fps = 29.97):
    """
    Flag group members as "existing" (0) or "emerging" (1) and count them.

    A row is "existing" when its ``delta_frame`` is smaller than
    ``thred * fps`` frames, otherwise "emerging". Rows with a missing
    ``delta_frame`` fail the comparison and are therefore flagged 1.

    Parameters
    ----------
    group_attr_valid : pandas.DataFrame
        Needs ``delta_frame`` and ``track_id`` columns. The frame is modified
        in place: an ``if_emerging`` column is added or overwritten.
    thred : float, optional
        Threshold in seconds.
    fps : float, optional
        Frames per second used to convert the threshold to frames.
        Defaults to 29.97, the value that was previously hard-coded.

    Returns
    -------
    (summary, group_attr_valid)
        ``summary`` has one row per ``if_emerging`` value with the number of
        distinct ``track_id`` and a ``thred`` column; the second item is the
        same (mutated) input frame.
    """
    cutoff_frames = thred * fps
    group_attr_valid["if_emerging"]= np.where(group_attr_valid["delta_frame"]<cutoff_frames,0, 1)
    summary = group_attr_valid.groupby("if_emerging")["track_id"].nunique().reset_index()
    summary["thred"] = thred
    return summary, group_attr_valid

summary, group_attr_valid = set_emerging_thred(group_attr_valid)

# combine the single and group in one df and plot relationship
# .assign returns new frames, so no column is written into a filtered slice
# (which is what triggers pandas' SettingWithCopyWarning).
single_valid = single_gdf_line[single_gdf_line["moving_speed"]<3].assign(in_group = 0, group_size = 1)
group_attr_valid = group_attr_valid.assign(in_group = 1)
df_clean = pd.concat([single_valid, group_attr_valid]).reset_index(drop = True)

# resultfolder = "../data/05_tracking_result_projected/step4_full"
# df_clean.to_parquet(os.path.join(resultfolder, f"{videoname}.parquet"), index = False)

In [None]:

# robust step: repeat the existing/emerging count for thresholds of 2..29 seconds
threshold_summaries = []
for thred in range(2, 30):
    summary, group_attr_valid = set_emerging_thred(group_attr_valid, thred)
    threshold_summaries.append(summary)

# A threshold where every pedestrian falls in one category has no row for the other one.
# reindex + fillna turn that gap into an explicit 0 so "total" is never NaN and neither
# column can be missing.
summaryDF = (
    pd.concat(threshold_summaries).reset_index(drop = True)
    .pivot(columns = "if_emerging", values = "track_id", index = "thred")
    .reindex(columns = [0, 1])
    .fillna(0)
    .astype(int)
    .rename(columns = {0:"existing", 1:"emerging"})
)
summaryDF["total"] = summaryDF["existing"]+summaryDF["emerging"]


In [None]:
# bar plot the changes
# Stacked bars: existing vs. emerging pedestrians for each threshold.
fig, ax = plt.subplots(figsize = (5, 5))
stacked_counts = summaryDF[["existing", "emerging"]]
stacked_counts.plot.bar(
    ax = ax,
    stacked = True,
    color = [ "#ffdda6", "#3bc0cf"],
)
sns.despine()
ax.set_xlabel("Threshold (seconds)")
ax.set_ylabel("Number of pedestrians")
ax.grid(axis = "y", color = "grey", linestyle = "--", linewidth = 0.5)
for axis_obj, side in ((ax.xaxis, 'bottom'), (ax.yaxis, 'left')):
    axis_obj.set_ticks_position(side)
ax.xaxis.set_tick_params(rotation=0)
# handles, labels = ax.get_legend_handles_labels()
# ax.legend(handles, ["Existing", "Emerging"], title = "Group Type")
ax.legend(bbox_to_anchor=(1, 1), loc='upper right', borderaxespad=0., fontsize = 10)
# Limit the x axis to at most 6 major ticks.
ax.xaxis.set_major_locator(plt.MaxNLocator(6))


In [None]:
# NOTE: this rebinds `resultfolder`, which the first cell of the notebook set to a different directory.
resultfolder = "../data/05_tracking_result_projected/step4_full"
# CSV export of df_clean is currently disabled:
# df_clean.to_csv(os.path.join(resultfolder, f"{videoname}.csv"), index = False)

In [None]:
# Mean moving speed per group size, with 98% confidence intervals.
fig, ax = plt.subplots(figsize = (5, 5))
pointplot_style = dict(ci = 98, errwidth = 1, capsize=.2, color = "#3bc0cf")
sns.pointplot(
    x = "group_size",
    y = "moving_speed",
    data = df_clean,
    ax = ax,
    **pointplot_style
)
# sns.regplot(
#     data = group_attr_valid,
#     x = "group_size",
#     y = "moving_speed",
#     ax = ax,
#     color = "#3bc0cf"
# )
ax.set(ylabel = "Moving Speed (ft/s)", xlabel = "Group Size")
sns.despine()

ax.xaxis.set_ticks_position('bottom')
ax.yaxis.set_ticks_position('left')

In [None]:
# Compare the moving speed of pedestrians walking alone with those walking in a group.
fig, ax = plt.subplots(figsize = (5,5))
df_clean["in_group_des"] = np.where(df_clean["in_group"]==0, "Alone", "In Group")

# NOTE: set_palette changes the default palette for every later plot as well.
daynightcolor = ["#ef5c43","#0c7cba"]
sns.set_palette(sns.color_palette(daynightcolor))

sns.boxplot(
    data = df_clean,
    x = "in_group_des",
    y = "moving_speed",
    linewidth = 0.5,
    ax = ax

)
ax.set_ylabel("Moving Speed (ft/s)")
# The x axis shows Alone / In Group, not a group size (label was copied from the previous plot).
ax.set_xlabel("Group Membership")
sns.despine()


ax.xaxis.set_ticks_position('bottom')
ax.yaxis.set_ticks_position('left')

# Create Visualization

In [None]:
# One row per (track, frame): the first occurrence of each pair is kept.
track_frame_key = ["track_id", "frame_id"]
newgroupunique = newgroup.drop_duplicates(subset = track_frame_key)
newgroupunique.shape

In [None]:
# Load the per-video attribute predictions and expose the box as integer bbox0..bbox3 columns.
result_folder = "../data/06_attr_result/"
# predpath = os.path.join(result_folder, f'{videoname}_attr_mot.csv')
predpath = os.path.join(result_folder, f'{videoname}.csv')
bbox_names = {"x":"bbox0", "y":"bbox1", "w":"bbox2", "h":"bbox3"}
trace = pd.read_csv(predpath).rename(columns = bbox_names)
trace = trace.astype({column: int for column in bbox_names.values()})

In [None]:
result_folder = "../data/06_attr_result/"
# predpath = os.path.join(result_folder, f'{videoname}_attr_mot.csv')
predpath = os.path.join(result_folder, f'{videoname}.csv')
trace = pd.read_csv(predpath)
trace.rename(columns = {"x":"bbox0", "y":"bbox1", "w":"bbox2", "h":"bbox3"}, inplace = True)
for x in ["bbox0", "bbox1", "bbox2", "bbox3"]:
    trace[x] = trace[x].astype(int)
# Visualize by comparing the group activity and single activity
def getbasics(file_path):
    """
    Open a video with OpenCV, print its frame rate and size, and return
    ``(capture, fps, (width, height), frame_count)``.

    Returns the same 4-tuple as the earlier definitions of this function in
    the notebook, so running this cell no longer breaks callers that unpack
    four values (e.g. `plot_img`).
    """
    video = cv2.VideoCapture(file_path)
    fps = video.get(cv2.CAP_PROP_FPS)
    length = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
    print('frames per second =',fps)
    size = (int(video.get(cv2.CAP_PROP_FRAME_WIDTH)), int(video.get(cv2.CAP_PROP_FRAME_HEIGHT)))
    print('frames size =',size)
    # video.release()
    return video, fps, size, length

clipfolder = "../data/99_result_sample_export2"
# exist_ok avoids the check-then-create race and keeps the cell re-runnable.
os.makedirs(clipfolder, exist_ok = True)

file_path = video_df[video_df["video_id"]==videoname]["video_path"].values[0]
video, fps, size, length = getbasics(file_path)

In [None]:
# Show the video_df row(s) whose video_id equals the current videoname.
video_df[video_df["video_id"]==videoname]

In [None]:
# sample setup
dur = 120 # second, change here for other options
# startframe = int(newgroup["frame_id"].min())
startframe = 1
endframe = int(startframe + dur * fps)
# endframe = int(newgroup["frame_id"].max()) # fullvideo export

# clip the data

vizdata = newgroup[(newgroup["frame_id"]<=endframe)&(newgroup["frame_id"]>=startframe)].reset_index(drop = True)

# Inner join: only rows that also have a bounding box in `trace` are kept.
vizdata = vizdata.merge(trace[['track_id', 'frame_id',  'bbox0', 'bbox1', 'bbox2', 'bbox3']],
                        on = ['track_id', 'frame_id'], how = 'inner')

# set up color: one random colour triple per group.
# A dedicated, seeded generator makes the exported clip identical on every run
# and leaves the global `random` state untouched.
groups = vizdata['truegroup'].unique()
color_rng = random.Random(0)
colorls = [tuple(color_rng.choices(range(i%255,255), k =3)) for i in range(len(groups))]
colorset = dict(zip(groups, colorls))

In [None]:
# set startframe
video.set(cv2.CAP_PROP_POS_FRAMES, startframe)
fourcc = cv2.VideoWriter_fourcc(*'MP4V')
video_out_name = videoname + f"_{startframe}_{endframe}_group"
video_out_path = os.path.join(clipfolder, f"{video_out_name}.mp4")
video_out = cv2.VideoWriter(video_out_path, fourcc, fps, size)
# VideoWriter does not raise when it cannot open the target (missing codec, bad path);
# without this check every later write() would be silently discarded.
if not video_out.isOpened():
    raise IOError(f"Could not open video writer for {video_out_path}")

In [None]:
# Write frames startframe..endframe (inclusive) to video_out, drawing a labelled box
# around every tracked person that belongs to a group.
# Split the rows by frame once instead of scanning all of vizdata for every frame.
rows_by_frame = {frame_key: rows for frame_key, rows in vizdata.groupby("frame_id")}

try:
    for frame_id in range(startframe, endframe + 1):
        ret, frame = video.read()
        if not ret:
            # End of video or read error: stop before drawing on / writing a None frame.
            break
        # Named frame_rows (not `data`) so the notebook-level `data` frame is not overwritten.
        frame_rows = rows_by_frame.get(frame_id)
        if frame_rows is not None:
            for row in frame_rows.itertuples(index = False):
                gr = row.truegroup
                # only viz the groups (rows without a group label are skipped)
                if gr in groups:
                    # cv2 drawing functions need plain integer pixel coordinates.
                    left, top = int(row.bbox0), int(row.bbox1)
                    box_w, box_h = int(row.bbox2), int(row.bbox3)
                    cv2.rectangle(frame,
                                  (left, top),
                                  (left + box_w, top + box_h),
                                  colorset[gr], 2)
                    cv2.putText(
                            frame,
                            # f-string also works when gender is missing (NaN) rather than a str
                            f"{gr} {row.gender}",
                            (left, top - 10),
                            cv2.FONT_HERSHEY_SIMPLEX,
                            0.5, colorset[gr],
                            2,
                            lineType=cv2.LINE_AA
                        )
        video_out.write(frame)
finally:
    # Always finalise the output file, even if drawing fails part-way through.
    video_out.release()