In [1]:
import sys, os

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import preprocess

# 1. Data Load

In [18]:
orig_df = pd.read_csv('./data/exhibition_behavior_preprocessed.csv', delimiter=',', index_col=False)

In [19]:
orig_df

Unnamed: 0,uid,date,filename,start,duration,A/C,behavior,code,M/F,appearance
0,0,11월 19일,01_20221119085958_part2,56240,2720,Child,물리적거리,Approach,Female,"포니테일, 아이보리색 점퍼, 검정색 바지"
1,0,11월 19일,01_20221119085958_part2,59360,2960,Child,상호작용시도,,Female,"포니테일, 아이보리색 점퍼, 검정색 바지"
2,0,11월 19일,01_20221119085958_part2,71420,5520,Child,물리적거리,Approach,Female,"포니테일, 아이보리색 점퍼, 검정색 바지"
3,0,11월 19일,01_20221119085958_part2,77400,6320,Child,상호작용시도,,Female,"포니테일, 아이보리색 점퍼, 검정색 바지"
4,1,11월 19일,01_20221119085958_part2,48240,9240,Child,물리적거리,Approach,Male,"검정색 점퍼, 청바지, 검정/흰색 운동화"
...,...,...,...,...,...,...,...,...,...,...
2682,371,12월 4일,01_20221204165959_part1,2567280,17280,Child,물리적거리,Pass,Female,"하늘색 롱패딩, 흰색 바지"
2683,371,12월 4일,01_20221204165959_part1,2893760,14160,Child,물리적거리,Pass,Female,"하늘색 롱패딩, 흰색 바지"
2684,372,12월 4일,01_20221204165959_part1,2567280,18080,Adult,물리적거리,Pass,Male,"검은색 롱패딩, 검은색 바지"
2685,373,12월 4일,01_20221204165959_part2,15760,13840,Child,물리적거리,Pass,Female,"하늘색 롱패딩, 흰색 바지"


Remove 'Pass' tag

In [20]:
orig_df = orig_df.drop(orig_df[orig_df['code'] == 'Pass'].index)

In [21]:
orig_df = orig_df.reset_index(drop=True)

# 2. Indexing
Avoid, Follow를 제거하지 않고 Approach와 결합하여 indexing

In [22]:
orig_df = preprocess.index_with_start(orig_df)
orig_df = preprocess.reindex(orig_df, ['uid', 'pid', 'date', 'filename', 'start', 'duration', 'A/C', 'M/F', 'behavior', 'code', 'appearance'])

In [23]:
orig_df

Unnamed: 0,uid,pid,date,filename,start,duration,A/C,M/F,behavior,code,appearance
0,0,0,11월 19일,01_20221119085958_part2,56240,2720,Child,Female,물리적거리,Approach,"포니테일, 아이보리색 점퍼, 검정색 바지"
1,0,0,11월 19일,01_20221119085958_part2,59360,2960,Child,Female,상호작용시도,,"포니테일, 아이보리색 점퍼, 검정색 바지"
2,0,1,11월 19일,01_20221119085958_part2,71420,5520,Child,Female,물리적거리,Approach,"포니테일, 아이보리색 점퍼, 검정색 바지"
3,0,1,11월 19일,01_20221119085958_part2,77400,6320,Child,Female,상호작용시도,,"포니테일, 아이보리색 점퍼, 검정색 바지"
4,1,2,11월 19일,01_20221119085958_part2,48240,9240,Child,Male,물리적거리,Approach,"검정색 점퍼, 청바지, 검정/흰색 운동화"
...,...,...,...,...,...,...,...,...,...,...,...
2362,367,563,12월 4일,01_20221204160000_part1,1746960,11680,Child,Male,상호작용시도,,"검은색 롱패딩, 검은색 바지"
2363,367,563,12월 4일,01_20221204160000_part1,1758640,4800,Child,Male,물리적거리,Approach,"검은색 롱패딩, 검은색 바지"
2364,367,563,12월 4일,01_20221204160000_part1,1763440,16480,Child,Male,상호작용시도,,"검은색 롱패딩, 검은색 바지"
2365,369,564,12월 4일,01_20221204165959_part1,2730640,12640,Child,Male,물리적거리,Approach,"갈색 후리스, 검은색 바지"


# 3. Make Data

In [24]:
def make_data(df):
    df = df.groupby('pid').agg({'code': list, 'start': list, 'duration': list, 'A/C': 'first', 'M/F': 'first', 'appearance': list})
    df['appearance'] = [set(data) for data in df['appearance']]
#     df = df.drop(df[df['code'].apply(lambda x: len(x) <= 1)].index)
    df.reset_index(inplace=True, drop=True)
    return df

In [25]:
df = orig_df[['pid', 'code', 'start', 'duration', 'A/C', 'M/F', 'appearance']]

In [26]:
df = make_data(df)

In [27]:
df

Unnamed: 0,code,start,duration,A/C,M/F,appearance
0,"[Approach, None]","[56240, 59360]","[2720, 2960]",Child,Female,"{포니테일, 아이보리색 점퍼, 검정색 바지}"
1,"[Approach, None]","[71420, 77400]","[5520, 6320]",Child,Female,"{포니테일, 아이보리색 점퍼, 검정색 바지}"
2,"[Approach, None, Approach, None]","[48240, 56640, 71200, 77400]","[9240, 15600, 5920, 6360]",Child,Male,"{검정색 점퍼, 청바지, 검정/흰색 운동화}"
3,"[Approach, None]","[238160, 242320]","[4160, 7760]",Child,Male,"{검정 마스크, 흰색 후리스, 회색 트레이닝 바지}"
4,"[Approach, None, Approach, None]","[47920, 56480, 71200, 77200]","[8640, 16480, 6000, 6640]",Adult,Female,"{검정 핸드백, 흰색 점퍼, 아이보리색 바지}"
...,...,...,...,...,...,...
560,"[None, Approach, None, Approach, None]","[1716560, 1729280, 1735360, 1768880, 1771040]","[12720, 6080, 33520, 2160, 10000]",Adult,Female,"{검은색 패딩, 회색 상의, 회색 바지}"
561,"[Approach, None]","[1573200, 1582960]","[9760, 135360]",Child,Female,"{회색 롱패딩, 흰색 바지}"
562,"[Approach, None, Approach, None, Approach, None]","[1727920, 1738320, 1744880, 1750160, 1755360, ...","[10400, 6560, 5280, 5200, 4560, 20800]",Child,Female,"{회색 롱패딩, 흰색 바지}"
563,"[Approach, None, Gesture, None, Approach, None...","[1566960, 1574640, 1579840, 1582640, 1585040, ...","[7680, 5200, 2800, 2400, 5360, 45360, 1840, 36...",Child,Male,"{검은색 롱패딩, 검은색 바지}"


In [28]:
df.reset_index(inplace=True, drop=True)
df.index

RangeIndex(start=0, stop=565, step=1)

## 3-1) 이상치 제거
1) code list가 상호작용시도로 이루어져 있는 경우
2) code list에 Approach가 없는 경우

In [29]:
print("code list의 길이가 1 이하이고 코드 리스트가 상호작용 코드로 시작하는 데이터를 drop")
count = 0

for i in df.index:
    try:
        if len(df['code'][i]) <= 1 and (df['code'][i][0] == 'None' or df['code'][i][0] == 'Touch' or df['code'][i][0] == 'Gesture'):
            print(df['code'][i-1] ,df['appearance'][i-1])
            print(df['code'][i] ,df['appearance'][i])
            print(df['code'][i+1] ,df['appearance'][i+1])
            print("--------------------------------------")
            df.drop(i, inplace=True)
            count += 1
    except:
        break

print("drop한 데이터 개수:", count)

code list의 길이가 1 이하이고 코드 리스트가 상호작용 코드로 시작하는 데이터를 drop
['Approach'] {'검정색 롱코트, 연두색 후리스 입은 아이와 동행'}
['None'] {'검정색 롱코트, 연두색 후리스 입은 아이와 동행'}
['Approach', 'None', 'Approach', 'None'] {'연두색 후리스, 흰색 바지'}
--------------------------------------
['Approach', 'None', 'Gesture'] {'검정색 점퍼, 분홍색 상의'}
['Gesture'] {'검정색 점퍼, 분홍색 상의'}
['Approach', 'Gesture', 'None', 'Gesture'] {'검정색 후드, 흰색 상의'}
--------------------------------------
['Approach', 'None', 'Approach', 'Touch'] {'빨간 리본, 양갈래 머리, 무지개색 상의'}
['None'] {'빨간 리본, 양갈래 머리, 무지개색 상의'}
['Approach', 'None'] {'갈색 후드, 흰색 바지, 배낭'}
--------------------------------------
['Approach', 'Touch', 'Gesture', 'None', 'Follow'] {'검정색 후드(팔에 줄무늬), 검은 바지, 검정색 운동화'}
['Gesture'] {'하늘색 상의, 검은색 바지'}
['Approach', 'None'] {'회색 상의, 청바지, 흰색 운동화'}
--------------------------------------
['Approach'] {'검정색 상의 안 회색 티, 검정색 바지, 흰색 운동화'}
['None'] {'검정색 상의 안 회색 티, 검정색 바지, 흰색 운동화'}
['Approach', 'None', 'Gesture', 'None', 'Gesture'] {'베이지색 상의, 검정색 바지'}
----------------------------------

In [30]:
print("코드 리스트가 상호작용시도 코드로 시작하고 리스트 내에 approach가 없는 경우 drop")
count = 0

for i in df.index:
    try:
        if (df['code'][i][0] == 'None' or df['code'][i][0] == 'Touch' or df['code'][i][0] == 'Gesture'):
            if 'Approach' not in df['code'][i]:
                print(df['code'][i])
                df.drop(i, inplace=True)
                count += 1
    except: 
        break
        
print("drop한 데이터 개수:", count)

코드 리스트가 상호작용시도 코드로 시작하고 리스트 내에 approach가 없는 경우 drop
['None', 'Touch', 'None', 'Touch']
['None', 'Avoid']
['Touch', 'Avoid']
['Touch', 'None']
['Gesture', 'None', 'Touch', 'None']
['Touch']
['Touch', 'None']
['Touch']
['Touch']
['Touch']
['Touch']
['Touch']
['Touch', 'Touch']
['None']
['None']
['None']
['None']
['None', 'Touch', 'None', 'Touch', 'None']
['None']
['Gesture', 'None', 'Gesture']
['Gesture']
['None']
['None']
['None']
['Gesture']
['None']
['None']
['None']
['None']
['None']
['None']
['None']
['None', 'Avoid']
['None']
['None']
['None', 'Touch', 'None', 'Touch', 'None']
['None']
['None']
drop한 데이터 개수: 38


In [31]:
print("코드 리스트가 상호작용시도 코드로 시작하지만 Approach 코드가 있는 경우 Approach 이전의 상호작용시도 코드들을 제거")
index = 0
count = 0

for i in df.index:
    try:
        index = 0
        if (df['code'][i][0] == 'None' or df['code'][i][0] == 'Touch' or df['code'][i][0] == 'Gesture') \
                and ('Approach' in df['code'][i]):
            for code in df['code'][i]:
                if code != 'Approach' and code != 'Avoid' and code != 'Follow':
                    index += 1
                else:
                    break
            if index != 0:
                print("before:", df['code'][i])
                print("before:", df['start'][i])
                print("before:", df['duration'][i])
                print()
                
                print("after: ", df['code'][i][index:])
                print("after: ", df['start'][i][index:])
                print("after: ", df['duration'][i][index:])
                print("---------------------------------------")
                
                df['code'][i] = df['code'][i][index:]
                df['start'][i] = df['start'][i][index:]
                df['duration'][i] = df['duration'][i][index:]
                
                count += 1
    except: 
        break

print("값을 바꾼 데이터 개수:", count)

코드 리스트가 상호작용시도 코드로 시작하지만 Approach 코드가 있는 경우 Approach 이전의 상호작용시도 코드들을 제거
before: ['Touch', 'Approach']
before: [532360, 553280]
before: [24280, 3360]

after:  ['Approach']
after:  [553280]
after:  [3360]
---------------------------------------
before: ['Gesture', 'Avoid', 'Approach', 'Touch', 'None', 'Touch', 'Avoid', 'Follow', 'Approach', 'Approach', 'Touch', 'None', 'Touch', 'None', 'Touch', 'None']
before: [0, 9280, 18000, 23440, 33360, 34720, 39920, 44480, 51120, 57920, 60480, 69040, 96480, 100320, 107760, 111840]
before: [7920, 8720, 5440, 9920, 1360, 6080, 4560, 7640, 7360, 2320, 8560, 27440, 3840, 7440, 4080, 7200]

after:  ['Avoid', 'Approach', 'Touch', 'None', 'Touch', 'Avoid', 'Follow', 'Approach', 'Approach', 'Touch', 'None', 'Touch', 'None', 'Touch', 'None']
after:  [9280, 18000, 23440, 33360, 34720, 39920, 44480, 51120, 57920, 60480, 69040, 96480, 100320, 107760, 111840]
after:  [8720, 5440, 9920, 1360, 6080, 4560, 7640, 7360, 2320, 8560, 27440, 3840, 7440, 4080, 7200]
----

In [33]:
print("코드 리스트에 Avoid 또는 Follow와 상호작용시도 코드로만 이루어진 data drop")
count = 0

for i in df.index:
    try:
        if (len(df['code'][i]) > 1) and (df['code'][i][0] == 'Avoid' or df['code'][i][0] == 'Follow'):
            if 'Approach' not in df['code'][i]:
                print(df['code'][i])
                df.drop(i, inplace=True)
                count += 1
    except:
        break

print("drop한 데이터 개수:", count)

코드 리스트에 Avoid 또는 Follow와 상호작용시도 코드로만 이루어진 data drop
drop한 데이터 개수: 0


In [34]:
print("code list의 길이가 1 이하인 code data를 drop")
count = 0

for i in df.index:
    try:
        if (len(df['code'][i]) <= 1 or 'Approach' not in df['code'][i]) \
                or (len(df['code'][i]) <= 1 and 'Approach' in df['code'][i]):
            print(df['code'][i])
            df.drop(i, inplace=True)
            count += 1
    except:
        break

print("drop한 데이터 개수:", count)
df.reset_index(inplace=True, drop=True)

code list의 길이가 1 이하인 code data를 drop
['Approach']
['Follow']
['Follow']
['Follow']
['Follow']
['Approach']
['Follow']
['Approach']
['Follow']
['Approach']
['Approach']
['Approach']
['Approach']
['Approach']
['Approach']
['Approach']
['Approach']
['Approach']
['Avoid']
['Follow']
['Approach']
['Follow']
['Approach']
['Approach']
['Approach']
drop한 데이터 개수: 25


In [35]:
df.reset_index(inplace=True, drop=True)
df.index

RangeIndex(start=0, stop=485, step=1)

In [36]:
df

Unnamed: 0,code,start,duration,A/C,M/F,appearance
0,"[Approach, None]","[56240, 59360]","[2720, 2960]",Child,Female,"{포니테일, 아이보리색 점퍼, 검정색 바지}"
1,"[Approach, None]","[71420, 77400]","[5520, 6320]",Child,Female,"{포니테일, 아이보리색 점퍼, 검정색 바지}"
2,"[Approach, None, Approach, None]","[48240, 56640, 71200, 77400]","[9240, 15600, 5920, 6360]",Child,Male,"{검정색 점퍼, 청바지, 검정/흰색 운동화}"
3,"[Approach, None]","[238160, 242320]","[4160, 7760]",Child,Male,"{검정 마스크, 흰색 후리스, 회색 트레이닝 바지}"
4,"[Approach, None, Approach, None]","[47920, 56480, 71200, 77200]","[8640, 16480, 6000, 6640]",Adult,Female,"{검정 핸드백, 흰색 점퍼, 아이보리색 바지}"
...,...,...,...,...,...,...
480,"[Approach, None, Approach, None]","[1729280, 1735360, 1768880, 1771040]","[6080, 33520, 2160, 10000]",Adult,Female,"{검은색 패딩, 회색 상의, 회색 바지}"
481,"[Approach, None]","[1573200, 1582960]","[9760, 135360]",Child,Female,"{회색 롱패딩, 흰색 바지}"
482,"[Approach, None, Approach, None, Approach, None]","[1727920, 1738320, 1744880, 1750160, 1755360, ...","[10400, 6560, 5280, 5200, 4560, 20800]",Child,Female,"{회색 롱패딩, 흰색 바지}"
483,"[Approach, None, Gesture, None, Approach, None...","[1566960, 1574640, 1579840, 1582640, 1585040, ...","[7680, 5200, 2800, 2400, 5360, 45360, 1840, 36...",Child,Male,"{검은색 롱패딩, 검은색 바지}"


In [37]:
df.to_csv('./data/behavior_hmm_data.csv')

## 3-2) One-Hot Encoding (code, duration)

In [38]:
def code_one_hot_encoding(df):
    # 행동 코드 리스트
    actions = ['Avoid', 'Follow', 'Approach', 'None', 'Touch', 'Gesture']

    # 각각의 sublist를 one-hot encoding하여 다차원 리스트로 생성
    one_hot_data = []
    for sublist in df['code']:
        one_hot_sublist = np.zeros((len(sublist), len(actions)))
        for i, code in enumerate(sublist):
            index = actions.index(code)
            one_hot_sublist[i][index] = 1
        one_hot_data.append(one_hot_sublist)
    
    df['encoded_code'] = one_hot_data
    df['encoded_code'] = df[['encoded_code']].apply(lambda x: [np.array(item).astype(int) for item in x])

    return df

In [39]:
def duration_one_hot_encoding(df):
    # 행동 코드 리스트
    actions = ['Avoid', 'Follow', 'Approach', 'None', 'Touch', 'Gesture']

    # 각각의 sublist를 one-hot encoding하여 다차원 리스트로 생성
    one_hot_data = []
    for i, sublist in enumerate(df['code']):
        one_hot_sublist = np.zeros((len(sublist), len(actions)))
        for j, code in enumerate(sublist):
            index = actions.index(code)
            one_hot_sublist[j][index] = df['duration'][i][j]
        one_hot_data.append(one_hot_sublist)
    
    df['encoded_duration'] = one_hot_data
    df['encoded_duration'] = df[['encoded_duration']].apply(lambda x: [np.array(item).astype(int) for item in x])

    return df

In [40]:
df = code_one_hot_encoding(df)
df = duration_one_hot_encoding(df)

In [41]:
df

Unnamed: 0,code,start,duration,A/C,M/F,appearance,encoded_code,encoded_duration
0,"[Approach, None]","[56240, 59360]","[2720, 2960]",Child,Female,"{포니테일, 아이보리색 점퍼, 검정색 바지}","[[0, 0, 1, 0, 0, 0], [0, 0, 0, 1, 0, 0]]","[[0, 0, 2720, 0, 0, 0], [0, 0, 0, 2960, 0, 0]]"
1,"[Approach, None]","[71420, 77400]","[5520, 6320]",Child,Female,"{포니테일, 아이보리색 점퍼, 검정색 바지}","[[0, 0, 1, 0, 0, 0], [0, 0, 0, 1, 0, 0]]","[[0, 0, 5520, 0, 0, 0], [0, 0, 0, 6320, 0, 0]]"
2,"[Approach, None, Approach, None]","[48240, 56640, 71200, 77400]","[9240, 15600, 5920, 6360]",Child,Male,"{검정색 점퍼, 청바지, 검정/흰색 운동화}","[[0, 0, 1, 0, 0, 0], [0, 0, 0, 1, 0, 0], [0, 0...","[[0, 0, 9240, 0, 0, 0], [0, 0, 0, 15600, 0, 0]..."
3,"[Approach, None]","[238160, 242320]","[4160, 7760]",Child,Male,"{검정 마스크, 흰색 후리스, 회색 트레이닝 바지}","[[0, 0, 1, 0, 0, 0], [0, 0, 0, 1, 0, 0]]","[[0, 0, 4160, 0, 0, 0], [0, 0, 0, 7760, 0, 0]]"
4,"[Approach, None, Approach, None]","[47920, 56480, 71200, 77200]","[8640, 16480, 6000, 6640]",Adult,Female,"{검정 핸드백, 흰색 점퍼, 아이보리색 바지}","[[0, 0, 1, 0, 0, 0], [0, 0, 0, 1, 0, 0], [0, 0...","[[0, 0, 8640, 0, 0, 0], [0, 0, 0, 16480, 0, 0]..."
...,...,...,...,...,...,...,...,...
480,"[Approach, None, Approach, None]","[1729280, 1735360, 1768880, 1771040]","[6080, 33520, 2160, 10000]",Adult,Female,"{검은색 패딩, 회색 상의, 회색 바지}","[[0, 0, 1, 0, 0, 0], [0, 0, 0, 1, 0, 0], [0, 0...","[[0, 0, 6080, 0, 0, 0], [0, 0, 0, 33520, 0, 0]..."
481,"[Approach, None]","[1573200, 1582960]","[9760, 135360]",Child,Female,"{회색 롱패딩, 흰색 바지}","[[0, 0, 1, 0, 0, 0], [0, 0, 0, 1, 0, 0]]","[[0, 0, 9760, 0, 0, 0], [0, 0, 0, 135360, 0, 0]]"
482,"[Approach, None, Approach, None, Approach, None]","[1727920, 1738320, 1744880, 1750160, 1755360, ...","[10400, 6560, 5280, 5200, 4560, 20800]",Child,Female,"{회색 롱패딩, 흰색 바지}","[[0, 0, 1, 0, 0, 0], [0, 0, 0, 1, 0, 0], [0, 0...","[[0, 0, 10400, 0, 0, 0], [0, 0, 0, 6560, 0, 0]..."
483,"[Approach, None, Gesture, None, Approach, None...","[1566960, 1574640, 1579840, 1582640, 1585040, ...","[7680, 5200, 2800, 2400, 5360, 45360, 1840, 36...",Child,Male,"{검은색 롱패딩, 검은색 바지}","[[0, 0, 1, 0, 0, 0], [0, 0, 0, 1, 0, 0], [0, 0...","[[0, 0, 7680, 0, 0, 0], [0, 0, 0, 5200, 0, 0],..."


## 3-3) Add Zero Padding
sequence가 가장 긴 data에 맞춰 zero padding을 더함

In [49]:
def padding_encoding_data(df, col_name, num):
    max_length = max(map(len, df[col_name]))  # 가장 긴 sequence의 길이 구하기
    padded_data = []
    
    for data in df[col_name]:
        if max_length - len(data) == 0:
            padded_data.append(data)
        else:
            # 가장 긴 sequence 길이를 기준으로 padding 적용
            padding_list = [[0 for i in range(num)]] * (max_length - len(data))
            result = np.concatenate((data, padding_list), axis=0)
            padded_data.append(result)
        
    df[col_name] = padded_data

    return df

In [50]:
df = padding_encoding_data(df, "encoded_code", 6)
df = padding_encoding_data(df, "encoded_duration", 6)

In [51]:
df

Unnamed: 0,code,start,duration,A/C,M/F,appearance,encoded_code,encoded_duration,time_series_data
0,"[Approach, None]","[56240, 59360]","[2720, 2960]",Child,Female,"{포니테일, 아이보리색 점퍼, 검정색 바지}","[[0, 0, 1, 0, 0, 0], [0, 0, 0, 1, 0, 0], [0, 0...","[[0, 0, 2720, 0, 0, 0], [0, 0, 0, 2960, 0, 0],...","[[0, 0, 1, 0, 0, 0, 0, 0, 2720, 0, 0, 0], [0, ..."
1,"[Approach, None]","[71420, 77400]","[5520, 6320]",Child,Female,"{포니테일, 아이보리색 점퍼, 검정색 바지}","[[0, 0, 1, 0, 0, 0], [0, 0, 0, 1, 0, 0], [0, 0...","[[0, 0, 5520, 0, 0, 0], [0, 0, 0, 6320, 0, 0],...","[[0, 0, 1, 0, 0, 0, 0, 0, 5520, 0, 0, 0], [0, ..."
2,"[Approach, None, Approach, None]","[48240, 56640, 71200, 77400]","[9240, 15600, 5920, 6360]",Child,Male,"{검정색 점퍼, 청바지, 검정/흰색 운동화}","[[0, 0, 1, 0, 0, 0], [0, 0, 0, 1, 0, 0], [0, 0...","[[0, 0, 9240, 0, 0, 0], [0, 0, 0, 15600, 0, 0]...","[[0, 0, 1, 0, 0, 0, 0, 0, 9240, 0, 0, 0], [0, ..."
3,"[Approach, None]","[238160, 242320]","[4160, 7760]",Child,Male,"{검정 마스크, 흰색 후리스, 회색 트레이닝 바지}","[[0, 0, 1, 0, 0, 0], [0, 0, 0, 1, 0, 0], [0, 0...","[[0, 0, 4160, 0, 0, 0], [0, 0, 0, 7760, 0, 0],...","[[0, 0, 1, 0, 0, 0, 0, 0, 4160, 0, 0, 0], [0, ..."
4,"[Approach, None, Approach, None]","[47920, 56480, 71200, 77200]","[8640, 16480, 6000, 6640]",Adult,Female,"{검정 핸드백, 흰색 점퍼, 아이보리색 바지}","[[0, 0, 1, 0, 0, 0], [0, 0, 0, 1, 0, 0], [0, 0...","[[0, 0, 8640, 0, 0, 0], [0, 0, 0, 16480, 0, 0]...","[[0, 0, 1, 0, 0, 0, 0, 0, 8640, 0, 0, 0], [0, ..."
...,...,...,...,...,...,...,...,...,...
480,"[Approach, None, Approach, None]","[1729280, 1735360, 1768880, 1771040]","[6080, 33520, 2160, 10000]",Adult,Female,"{검은색 패딩, 회색 상의, 회색 바지}","[[0, 0, 1, 0, 0, 0], [0, 0, 0, 1, 0, 0], [0, 0...","[[0, 0, 6080, 0, 0, 0], [0, 0, 0, 33520, 0, 0]...","[[0, 0, 1, 0, 0, 0, 0, 0, 6080, 0, 0, 0], [0, ..."
481,"[Approach, None]","[1573200, 1582960]","[9760, 135360]",Child,Female,"{회색 롱패딩, 흰색 바지}","[[0, 0, 1, 0, 0, 0], [0, 0, 0, 1, 0, 0], [0, 0...","[[0, 0, 9760, 0, 0, 0], [0, 0, 0, 135360, 0, 0...","[[0, 0, 1, 0, 0, 0, 0, 0, 9760, 0, 0, 0], [0, ..."
482,"[Approach, None, Approach, None, Approach, None]","[1727920, 1738320, 1744880, 1750160, 1755360, ...","[10400, 6560, 5280, 5200, 4560, 20800]",Child,Female,"{회색 롱패딩, 흰색 바지}","[[0, 0, 1, 0, 0, 0], [0, 0, 0, 1, 0, 0], [0, 0...","[[0, 0, 10400, 0, 0, 0], [0, 0, 0, 6560, 0, 0]...","[[0, 0, 1, 0, 0, 0, 0, 0, 10400, 0, 0, 0], [0,..."
483,"[Approach, None, Gesture, None, Approach, None...","[1566960, 1574640, 1579840, 1582640, 1585040, ...","[7680, 5200, 2800, 2400, 5360, 45360, 1840, 36...",Child,Male,"{검은색 롱패딩, 검은색 바지}","[[0, 0, 1, 0, 0, 0], [0, 0, 0, 1, 0, 0], [0, 0...","[[0, 0, 7680, 0, 0, 0], [0, 0, 0, 5200, 0, 0],...","[[0, 0, 1, 0, 0, 0, 0, 0, 7680, 0, 0, 0], [0, ..."


## 3-4) Make Time Series Data
code list와 duration list를 병합하여 시계열 데이터로 변환

In [52]:
def make_data_for_hmm(df):
    time_series_data = []

    for encoded_code, encoded_duration in zip(df['encoded_code'], df['encoded_duration']):
        encoded_code = np.squeeze(encoded_code)
        encoded_duration = np.squeeze(encoded_duration)

        # encoded_code와 encoded_duration을 수평으로 결합합니다.
        time_series_data.append(np.hstack([encoded_code, encoded_duration]))

    df['time_series_data'] = time_series_data
    
    return df

In [53]:
df = make_data_for_hmm(df)

In [54]:
df

Unnamed: 0,code,start,duration,A/C,M/F,appearance,encoded_code,encoded_duration,time_series_data
0,"[Approach, None]","[56240, 59360]","[2720, 2960]",Child,Female,"{포니테일, 아이보리색 점퍼, 검정색 바지}","[[0, 0, 1, 0, 0, 0], [0, 0, 0, 1, 0, 0], [0, 0...","[[0, 0, 2720, 0, 0, 0], [0, 0, 0, 2960, 0, 0],...","[[0, 0, 1, 0, 0, 0, 0, 0, 2720, 0, 0, 0], [0, ..."
1,"[Approach, None]","[71420, 77400]","[5520, 6320]",Child,Female,"{포니테일, 아이보리색 점퍼, 검정색 바지}","[[0, 0, 1, 0, 0, 0], [0, 0, 0, 1, 0, 0], [0, 0...","[[0, 0, 5520, 0, 0, 0], [0, 0, 0, 6320, 0, 0],...","[[0, 0, 1, 0, 0, 0, 0, 0, 5520, 0, 0, 0], [0, ..."
2,"[Approach, None, Approach, None]","[48240, 56640, 71200, 77400]","[9240, 15600, 5920, 6360]",Child,Male,"{검정색 점퍼, 청바지, 검정/흰색 운동화}","[[0, 0, 1, 0, 0, 0], [0, 0, 0, 1, 0, 0], [0, 0...","[[0, 0, 9240, 0, 0, 0], [0, 0, 0, 15600, 0, 0]...","[[0, 0, 1, 0, 0, 0, 0, 0, 9240, 0, 0, 0], [0, ..."
3,"[Approach, None]","[238160, 242320]","[4160, 7760]",Child,Male,"{검정 마스크, 흰색 후리스, 회색 트레이닝 바지}","[[0, 0, 1, 0, 0, 0], [0, 0, 0, 1, 0, 0], [0, 0...","[[0, 0, 4160, 0, 0, 0], [0, 0, 0, 7760, 0, 0],...","[[0, 0, 1, 0, 0, 0, 0, 0, 4160, 0, 0, 0], [0, ..."
4,"[Approach, None, Approach, None]","[47920, 56480, 71200, 77200]","[8640, 16480, 6000, 6640]",Adult,Female,"{검정 핸드백, 흰색 점퍼, 아이보리색 바지}","[[0, 0, 1, 0, 0, 0], [0, 0, 0, 1, 0, 0], [0, 0...","[[0, 0, 8640, 0, 0, 0], [0, 0, 0, 16480, 0, 0]...","[[0, 0, 1, 0, 0, 0, 0, 0, 8640, 0, 0, 0], [0, ..."
...,...,...,...,...,...,...,...,...,...
480,"[Approach, None, Approach, None]","[1729280, 1735360, 1768880, 1771040]","[6080, 33520, 2160, 10000]",Adult,Female,"{검은색 패딩, 회색 상의, 회색 바지}","[[0, 0, 1, 0, 0, 0], [0, 0, 0, 1, 0, 0], [0, 0...","[[0, 0, 6080, 0, 0, 0], [0, 0, 0, 33520, 0, 0]...","[[0, 0, 1, 0, 0, 0, 0, 0, 6080, 0, 0, 0], [0, ..."
481,"[Approach, None]","[1573200, 1582960]","[9760, 135360]",Child,Female,"{회색 롱패딩, 흰색 바지}","[[0, 0, 1, 0, 0, 0], [0, 0, 0, 1, 0, 0], [0, 0...","[[0, 0, 9760, 0, 0, 0], [0, 0, 0, 135360, 0, 0...","[[0, 0, 1, 0, 0, 0, 0, 0, 9760, 0, 0, 0], [0, ..."
482,"[Approach, None, Approach, None, Approach, None]","[1727920, 1738320, 1744880, 1750160, 1755360, ...","[10400, 6560, 5280, 5200, 4560, 20800]",Child,Female,"{회색 롱패딩, 흰색 바지}","[[0, 0, 1, 0, 0, 0], [0, 0, 0, 1, 0, 0], [0, 0...","[[0, 0, 10400, 0, 0, 0], [0, 0, 0, 6560, 0, 0]...","[[0, 0, 1, 0, 0, 0, 0, 0, 10400, 0, 0, 0], [0,..."
483,"[Approach, None, Gesture, None, Approach, None...","[1566960, 1574640, 1579840, 1582640, 1585040, ...","[7680, 5200, 2800, 2400, 5360, 45360, 1840, 36...",Child,Male,"{검은색 롱패딩, 검은색 바지}","[[0, 0, 1, 0, 0, 0], [0, 0, 0, 1, 0, 0], [0, 0...","[[0, 0, 7680, 0, 0, 0], [0, 0, 0, 5200, 0, 0],...","[[0, 0, 1, 0, 0, 0, 0, 0, 7680, 0, 0, 0], [0, ..."


In [55]:
df.to_csv("./data/behavior_hmm_preprocessed.csv")

# HMM

In [64]:
import ast
from hmmlearn import hmm

In [87]:
n_samples

485

In [88]:
n_timesteps

30

In [89]:
n_features

12

In [66]:
time_series_data = []

for encoded_code, encoded_duration in zip(df['encoded_code'], df['encoded_duration']):
    encoded_code = np.squeeze(encoded_code)
    encoded_duration = np.squeeze(encoded_duration)

    # encoded_code와 encoded_duration을 수평으로 결합합니다.
    time_series_data.append(np.hstack([encoded_code, encoded_duration]))

time_series_data = np.array(time_series_data)
n_samples, n_timesteps, n_features = time_series_data.shape

In [67]:
X = time_series_data.reshape((n_samples, n_timesteps * n_features))

## Initialize GaussianHMM

In [68]:
n_states = 3

# Define the HMM model
model = hmm.GaussianHMM(n_components=n_states)

## Train

In [69]:
# Train the model
model.fit(X)

## Classify

In [70]:
# Predict the sequence of states for each observed sequence
predicted_states = model.predict(X)

# Decode the predicted states back to engagement levels
results = np.array(['state0' if s == 2 else 'state1' if s == 1 else 'state2' for s in predicted_states])

In [72]:
chunk_size = n_timesteps * n_features  # 하나의 2차원 리스트의 크기
num_chunks = len(results) // chunk_size  # 나눈 후 생성될 리스트의 개수
results_ = [results[i:i+chunk_size] for i in range(0, num_chunks*chunk_size, chunk_size)]

In [73]:
behavior = df[['code', 'duration']]

In [74]:
behavior['predicted_engagement_levels'] = results

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  behavior['predicted_engagement_levels'] = results


In [75]:
behavior['predicted_engagement_levels'].value_counts()

state1    266
state2    181
state0     38
Name: predicted_engagement_levels, dtype: int64

## Save

In [76]:
# 저장할 파일 경로
file_path = './data/new_behavior_hmm_result_state3.xlsx'

# ExcelWriter 객체 생성
writer = pd.ExcelWriter(file_path)

In [77]:
state0_df = behavior[behavior['predicted_engagement_levels'] == 'state0']
# 'code' 칼럼의 데이터 길이
state0_df['code']

16     [Approach, None, Touch, None, Gesture, None, F...
17     [Approach, Gesture, None, Gesture, Touch, None...
48     [Approach, None, Gesture, None, Touch, None, T...
77     [Approach, None, Approach, Touch, None, Touch,...
81     [Approach, None, Gesture, None, None, Approach...
85     [Approach, None, Gesture, None, Approach, None...
87     [Approach, None, Gesture, None, Touch, None, G...
118    [Approach, Touch, Approach, Touch, Approach, A...
170    [Approach, Touch, Touch, None, Touch, None, To...
188    [Approach, Touch, None, Approach, Touch, None,...
189    [Approach, None, Approach, Touch, None, Approa...
191    [Approach, Gesture, Approach, Touch, Approach,...
192    [Approach, None, Touch, None, Touch, None, Tou...
194    [Approach, Touch, None, Touch, None, Touch, No...
209    [Approach, Touch, None, Approach, None, Touch,...
213    [Approach, None, Touch, None, Touch, None, App...
218    [Approach, None, Gesture, None, Gesture, None,...
228    [Approach, Touch, None, 

In [78]:
state1_df = behavior[behavior['predicted_engagement_levels'] == 'state1']
# 'code' 칼럼의 데이터 길이
state1_df['code']

0                      [Approach, None]
1                      [Approach, None]
2      [Approach, None, Approach, None]
3                      [Approach, None]
4      [Approach, None, Approach, None]
                     ...               
473                    [Approach, None]
475                    [Approach, None]
478                    [Approach, None]
480    [Approach, None, Approach, None]
481                    [Approach, None]
Name: code, Length: 266, dtype: object

In [79]:
state2_df = behavior[behavior['predicted_engagement_levels'] == 'state2']
# 'code' 칼럼의 데이터 길이
state2_df['code']

10      [Approach, None, Approach, Touch, None, Gesture]
11     [Approach, None, None, Approach, None, Approac...
18                             [Approach, Gesture, None]
22                                   [Approach, Gesture]
23                                   [Approach, Gesture]
                             ...                        
476                 [Approach, Touch, None, Touch, None]
477                                  [Approach, Gesture]
479    [Approach, None, Touch, None, Approach, Touch,...
482     [Approach, None, Approach, None, Approach, None]
484                                    [Approach, Touch]
Name: code, Length: 181, dtype: object

In [80]:
# 각 DataFrame을 다른 시트에 저장
state0_df.to_excel(writer, sheet_name='state0', index=False)
state1_df.to_excel(writer, sheet_name='state1', index=False)
state2_df.to_excel(writer, sheet_name='state2', index=False)

In [81]:
# 저장 및 파일 닫기
writer.save()
writer.close()

  writer.save()


## Analyze Trained Parameters

In [82]:
# Print trained parameters and plot
print(">> Transition matrix")
print(np.round(model.transmat_, 2))
print()

print(">> Initial Probability")
print(np.round(model.startprob_, 2))
print()

>> Transition matrix
[[0.42 0.5  0.08]
 [0.35 0.59 0.06]
 [0.34 0.5  0.16]]

>> Initial Probability
[0. 1. 0.]



In [84]:
len(model.means_[0])

360

In [86]:
len(model.covars_[0])

360

In [83]:
print(">> Means and vars of each hidden state")
for i in range(model.n_components):
    print("{0}th hidden state".format(i))
    print("mean = ", np.round(model.means_[i],2))
    print("var = ", np.round(np.diag(model.covars_[i]),2))
    print()

>> Means and vars of each hidden state
0th hidden state
mean =  [2.00000e-02 2.00000e-02 9.60000e-01 0.00000e+00 0.00000e+00 0.00000e+00
 1.25970e+02 1.04310e+02 6.80365e+03 0.00000e+00 0.00000e+00 0.00000e+00
 3.00000e-02 2.00000e-02 3.00000e-02 2.50000e-01 4.30000e-01 2.40000e-01
 2.19670e+02 3.04970e+02 1.48510e+02 3.33834e+03 7.18431e+03 1.47547e+03
 3.00000e-02 2.00000e-02 1.90000e-01 3.20000e-01 9.00000e-02 7.00000e-02
 1.80770e+02 4.70280e+02 8.56800e+02 4.36221e+03 6.96570e+02 3.53150e+02
 2.00000e-02 2.00000e-02 6.00000e-02 2.20000e-01 2.20000e-01 3.00000e-02
 5.52500e+01 1.97570e+02 1.43200e+02 3.32530e+03 2.42652e+03 2.31600e+02
 1.00000e-02 1.00000e-02 1.30000e-01 2.10000e-01 6.00000e-02 4.00000e-02
 5.74600e+01 1.43650e+02 7.16460e+02 2.05657e+03 6.29390e+02 2.57680e+02
 2.00000e-02 0.00000e+00 3.00000e-02 1.60000e-01 1.40000e-01 3.00000e-02
 9.59100e+01 0.00000e+00 1.12270e+02 3.69481e+03 8.23430e+02 1.29500e+02
 0.00000e+00 2.00000e-02 3.00000e-02 1.60000e-01 4.00000e-02