In [1]:
import sys, os
sys.path.append('../..')

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import preprocess

# 1. Data Load

In [390]:
orig_df = pd.read_csv('../../data/exhibition_behavior_preprocessed.csv', delimiter=',', index_col=False)

In [391]:
orig_df

Unnamed: 0,uid,date,filename,start,duration,A/C,behavior,code,M/F,appearance
0,0,11월 19일,01_20221119085958_part2,56240,2720,Child,물리적거리,Approach,Female,"포니테일, 아이보리색 점퍼, 검정색 바지"
1,0,11월 19일,01_20221119085958_part2,59360,2960,Child,상호작용시도,,Female,"포니테일, 아이보리색 점퍼, 검정색 바지"
2,0,11월 19일,01_20221119085958_part2,71420,5520,Child,물리적거리,Approach,Female,"포니테일, 아이보리색 점퍼, 검정색 바지"
3,0,11월 19일,01_20221119085958_part2,77400,6320,Child,상호작용시도,,Female,"포니테일, 아이보리색 점퍼, 검정색 바지"
4,1,11월 19일,01_20221119085958_part2,48240,9240,Child,물리적거리,Approach,Male,"검정색 점퍼, 청바지, 검정/흰색 운동화"
...,...,...,...,...,...,...,...,...,...,...
2682,371,12월 4일,01_20221204165959_part1,2567280,17280,Child,물리적거리,Pass,Female,"하늘색 롱패딩, 흰색 바지"
2683,371,12월 4일,01_20221204165959_part1,2893760,14160,Child,물리적거리,Pass,Female,"하늘색 롱패딩, 흰색 바지"
2684,372,12월 4일,01_20221204165959_part1,2567280,18080,Adult,물리적거리,Pass,Male,"검은색 롱패딩, 검은색 바지"
2685,373,12월 4일,01_20221204165959_part2,15760,13840,Child,물리적거리,Pass,Female,"하늘색 롱패딩, 흰색 바지"


In [392]:
orig_df = orig_df.drop(orig_df[orig_df['code'] == 'Pass'].index)

In [393]:
orig_df = orig_df.reset_index(drop=True)

# 2. Indexing
Avoid, Follow를 제거하지 않고 Approach와 결합하여 indexing

In [394]:
def assign_pid(df):
    pids = []
    current_pid = 0

    for i in range(len(df)):
        if i == 0:
            pids.append(current_pid)
            continue
        
        prev_row = df.iloc[i - 1]
        current_row = df.iloc[i]

        if prev_row['appearance'] == current_row['appearance'] and (current_row['start'] - (prev_row['start'] + prev_row['duration'])) < 2000:
            pids.append(current_pid)
        else:
            current_pid += 1
            pids.append(current_pid)

    df['pid'] = pids
    return df

In [395]:
orig_df = assign_pid(orig_df)
orig_df = preprocess.reindex(orig_df)
orig_df.to_csv('test.csv', index=False)

In [396]:
orig_df

Unnamed: 0,uid,pid,date,filename,start,duration,A/C,behavior,code,M/F,appearance
0,0,0,11월 19일,01_20221119085958_part2,56240,2720,Child,물리적거리,Approach,Female,"포니테일, 아이보리색 점퍼, 검정색 바지"
1,0,0,11월 19일,01_20221119085958_part2,59360,2960,Child,상호작용시도,,Female,"포니테일, 아이보리색 점퍼, 검정색 바지"
2,0,1,11월 19일,01_20221119085958_part2,71420,5520,Child,물리적거리,Approach,Female,"포니테일, 아이보리색 점퍼, 검정색 바지"
3,0,1,11월 19일,01_20221119085958_part2,77400,6320,Child,상호작용시도,,Female,"포니테일, 아이보리색 점퍼, 검정색 바지"
4,1,2,11월 19일,01_20221119085958_part2,48240,9240,Child,물리적거리,Approach,Male,"검정색 점퍼, 청바지, 검정/흰색 운동화"
...,...,...,...,...,...,...,...,...,...,...,...
2362,367,563,12월 4일,01_20221204160000_part1,1746960,11680,Child,상호작용시도,,Male,"검은색 롱패딩, 검은색 바지"
2363,367,563,12월 4일,01_20221204160000_part1,1758640,4800,Child,물리적거리,Approach,Male,"검은색 롱패딩, 검은색 바지"
2364,367,563,12월 4일,01_20221204160000_part1,1763440,16480,Child,상호작용시도,,Male,"검은색 롱패딩, 검은색 바지"
2365,369,564,12월 4일,01_20221204165959_part1,2730640,12640,Child,물리적거리,Approach,Male,"갈색 후리스, 검은색 바지"


In [480]:
orig_df.to_csv('temp.csv')

# 3. Make Data

In [397]:
def make_data(df):
    df = df.groupby('pid').agg({'code': list, 'start': list, 'duration': list, 'A/C': 'first', 'M/F': 'first', 'appearance': list})
    df['appearance'] = [set(data) for data in df['appearance']]
#     df = df.drop(df[df['code'].apply(lambda x: len(x) <= 1)].index)
    df.reset_index(inplace=True, drop=True)
    return df

In [398]:
df = orig_df[['pid', 'code', 'start', 'duration', 'A/C', 'M/F', 'appearance']]

In [399]:
df = make_data(df)

In [400]:
df

Unnamed: 0,code,start,duration,A/C,M/F,appearance
0,"[Approach, None]","[56240, 59360]","[2720, 2960]",Child,Female,"{포니테일, 아이보리색 점퍼, 검정색 바지}"
1,"[Approach, None]","[71420, 77400]","[5520, 6320]",Child,Female,"{포니테일, 아이보리색 점퍼, 검정색 바지}"
2,"[Approach, None, Approach, None]","[48240, 56640, 71200, 77400]","[9240, 15600, 5920, 6360]",Child,Male,"{검정색 점퍼, 청바지, 검정/흰색 운동화}"
3,"[Approach, None]","[238160, 242320]","[4160, 7760]",Child,Male,"{검정 마스크, 흰색 후리스, 회색 트레이닝 바지}"
4,"[Approach, None, Approach, None]","[47920, 56480, 71200, 77200]","[8640, 16480, 6000, 6640]",Adult,Female,"{검정 핸드백, 흰색 점퍼, 아이보리색 바지}"
...,...,...,...,...,...,...
560,"[None, Approach, None, Approach, None]","[1716560, 1729280, 1735360, 1768880, 1771040]","[12720, 6080, 33520, 2160, 10000]",Adult,Female,"{검은색 패딩, 회색 상의, 회색 바지}"
561,"[Approach, None]","[1573200, 1582960]","[9760, 135360]",Child,Female,"{회색 롱패딩, 흰색 바지}"
562,"[Approach, None, Approach, None, Approach, None]","[1727920, 1738320, 1744880, 1750160, 1755360, ...","[10400, 6560, 5280, 5200, 4560, 20800]",Child,Female,"{회색 롱패딩, 흰색 바지}"
563,"[Approach, None, Gesture, None, Approach, None...","[1566960, 1574640, 1579840, 1582640, 1585040, ...","[7680, 5200, 2800, 2400, 5360, 45360, 1840, 36...",Child,Male,"{검은색 롱패딩, 검은색 바지}"


In [401]:
df.reset_index(inplace=True, drop=True)
df.index

RangeIndex(start=0, stop=565, step=1)

## 3-1) 이상치 제거
1) code list가 상호작용시도로 이루어져 있는 경우
2) code list에 Approach가 없는 경우

In [402]:
print("code list의 길이가 1 이하이고 상호작용시도의 code data를 drop")
count = 0

for i in df.index:
    try:
        if len(df['code'][i]) <= 1 and (df['code'][i][0] == 'None' or df['code'][i][0] == 'Touch' or df['code'][i][0] == 'Gesture'):
            print(df['code'][i-1] ,df['appearance'][i-1])
            print(df['code'][i] ,df['appearance'][i])
            print(df['code'][i+1] ,df['appearance'][i+1])
            print("--------------------------------------")
            df.drop(i, inplace=True)
            count += 1
    except:
        break

print("drop한 데이터 개수:", count)

code list의 길이가 1 이하이고 상호작용시도의 code data를 drop
['Approach'] {'검정색 롱코트, 연두색 후리스 입은 아이와 동행'}
['None'] {'검정색 롱코트, 연두색 후리스 입은 아이와 동행'}
['Approach', 'None', 'Approach', 'None'] {'연두색 후리스, 흰색 바지'}
--------------------------------------
['Approach', 'None', 'Gesture'] {'검정색 점퍼, 분홍색 상의'}
['Gesture'] {'검정색 점퍼, 분홍색 상의'}
['Approach', 'Gesture', 'None', 'Gesture'] {'검정색 후드, 흰색 상의'}
--------------------------------------
['Approach', 'None', 'Approach', 'Touch'] {'빨간 리본, 양갈래 머리, 무지개색 상의'}
['None'] {'빨간 리본, 양갈래 머리, 무지개색 상의'}
['Approach', 'None'] {'갈색 후드, 흰색 바지, 배낭'}
--------------------------------------
['Approach', 'Touch', 'Gesture', 'None', 'Follow'] {'검정색 후드(팔에 줄무늬), 검은 바지, 검정색 운동화'}
['Gesture'] {'하늘색 상의, 검은색 바지'}
['Approach', 'None'] {'회색 상의, 청바지, 흰색 운동화'}
--------------------------------------
['Approach'] {'검정색 상의 안 회색 티, 검정색 바지, 흰색 운동화'}
['None'] {'검정색 상의 안 회색 티, 검정색 바지, 흰색 운동화'}
['Approach', 'None', 'Gesture', 'None', 'Gesture'] {'베이지색 상의, 검정색 바지'}
--------------------------------------
['A

In [403]:
print("상호작용시도로만 이루어져있는 code list drop")
count = 0

for i in df.index:
    try:
        if (df['code'][i][0] == 'None' or df['code'][i][0] == 'Touch' or df['code'][i][0] == 'Gesture'):
            if 'Approach' not in df['code'][i]:
                print(df['code'][i])
                df.drop(i, inplace=True)
                count += 1
    except: 
        break
        
print("drop한 데이터 개수:", count)

상호작용시도로만 이루어져있는 code list drop
['None', 'Touch', 'None', 'Touch']
['None', 'Avoid']
['Touch', 'Avoid']
['Touch', 'None']
['Gesture', 'None', 'Touch', 'None']
['Touch']
['Touch', 'None']
['Touch']
['Touch']
['Touch']
['Touch']
['Touch']
['Touch', 'Touch']
['None']
['None']
['None']
['None']
['None', 'Touch', 'None', 'Touch', 'None']
['None']
['Gesture', 'None', 'Gesture']
['Gesture']
['None']
['None']
['None']
['Gesture']
['None']
['None']
['None']
['None']
['None']
['None']
['None']
['None', 'Avoid']
['None']
['None']
['None', 'Touch', 'None', 'Touch', 'None']
['None']
['None']
drop한 데이터 개수: 38


In [404]:
print("상호작용시도로만 이루어져있는 code list drop")
index = 0
count = 0

for i in df.index:
    try:
        index = 0
        if (df['code'][i][0] == 'None' or df['code'][i][0] == 'Touch' or df['code'][i][0] == 'Gesture') \
                and ('Approach' in df['code'][i]):
            for code in df['code'][i]:
                if code != 'Approach' and code != 'Avoid' and code != 'Follow':
                    index += 1
                else:
                    break
            if index != 0:
                print("before:", df['code'][i])
                print("after: ", df['code'][i][index:])
                print("---------------------------------------")
                
                df['code'][i] = df['code'][i][index:]
                
                count += 1
    except: 
        break

print("값을 바꾼 데이터 개수:", count)

상호작용시도로만 이루어져있는 code list drop
before: ['Touch', 'Approach']
after:  ['Approach']
---------------------------------------
before: ['Gesture', 'Avoid', 'Approach', 'Touch', 'None', 'Touch', 'Avoid', 'Follow', 'Approach', 'Approach', 'Touch', 'None', 'Touch', 'None', 'Touch', 'None']
after:  ['Avoid', 'Approach', 'Touch', 'None', 'Touch', 'Avoid', 'Follow', 'Approach', 'Approach', 'Touch', 'None', 'Touch', 'None', 'Touch', 'None']
---------------------------------------
before: ['None', 'Touch', 'None', 'Touch', 'None', 'Approach', 'None', 'Touch', 'None', 'Touch']
after:  ['Approach', 'None', 'Touch', 'None', 'Touch']
---------------------------------------
before: ['Touch', 'None', 'Approach', 'None', 'Touch', 'None', 'Approach', 'None']
after:  ['Approach', 'None', 'Touch', 'None', 'Approach', 'None']
---------------------------------------
before: ['None', 'Approach', 'Touch', 'None', 'Gesture', 'Touch', 'None', 'Gesture', 'Touch', 'None', 'Gesture', 'None', 'Touch', 'None', 'Approac

In [405]:
print("code list에 Aproach가 없는 data drop")
count = 0

for i in df.index:
    try:
        if (len(df['code'][i]) > 1) and (df['code'][i][0] == 'Avoid' or df['code'][i][0] == 'Follow'):
            if 'Approach' not in df['code'][i]:
                print(df['code'][i])
                df.drop(i, inplace=True)
                count += 1
    except:
        break

print("drop한 데이터 개수:", count)

code list에 Aproach가 없는 data drop
['Avoid', 'None']
['Follow', 'None']
['Avoid', 'None']
['Avoid', 'None', 'Touch']
['Follow', 'Avoid', 'Follow']
['Avoid', 'None']
['Follow', 'Avoid']
['Avoid', 'Follow']
['Avoid', 'Follow']
['Avoid', 'Follow']
['Follow', 'Avoid']
drop한 데이터 개수: 11


In [406]:
print("code list의 길이가 1 이하인 code data를 drop")
count = 0

for i in df.index:
    try:
        if (len(df['code'][i]) <= 1 or 'Approach' not in df['code'][i]) \
                or (len(df['code'][i]) <= 1 and 'Approach' in df['code'][i]):
            print(df['code'][i])
            df.drop(i, inplace=True)
            count += 1
    except:
        break

print("drop한 데이터 개수:", count)
df.reset_index(inplace=True, drop=True)

code list의 길이가 1 이하인 code data를 drop
['Approach']
['Follow']
['Follow']
['Follow']
['Follow']
['Approach']
['Follow']
['Approach']
['Follow']
['Approach']
['Approach']
['Approach']
['Approach']
['Approach']
['Approach']
['Approach']
['Approach']
['Approach']
['Avoid']
['Follow']
['Approach']
['Follow']
['Approach']
['Approach']
['Approach']
drop한 데이터 개수: 25


In [407]:
df.reset_index(inplace=True, drop=True)
df.index

RangeIndex(start=0, stop=485, step=1)

In [408]:
df

Unnamed: 0,code,start,duration,A/C,M/F,appearance
0,"[Approach, None]","[56240, 59360]","[2720, 2960]",Child,Female,"{포니테일, 아이보리색 점퍼, 검정색 바지}"
1,"[Approach, None]","[71420, 77400]","[5520, 6320]",Child,Female,"{포니테일, 아이보리색 점퍼, 검정색 바지}"
2,"[Approach, None, Approach, None]","[48240, 56640, 71200, 77400]","[9240, 15600, 5920, 6360]",Child,Male,"{검정색 점퍼, 청바지, 검정/흰색 운동화}"
3,"[Approach, None]","[238160, 242320]","[4160, 7760]",Child,Male,"{검정 마스크, 흰색 후리스, 회색 트레이닝 바지}"
4,"[Approach, None, Approach, None]","[47920, 56480, 71200, 77200]","[8640, 16480, 6000, 6640]",Adult,Female,"{검정 핸드백, 흰색 점퍼, 아이보리색 바지}"
...,...,...,...,...,...,...
480,"[Approach, None, Approach, None]","[1716560, 1729280, 1735360, 1768880, 1771040]","[12720, 6080, 33520, 2160, 10000]",Adult,Female,"{검은색 패딩, 회색 상의, 회색 바지}"
481,"[Approach, None]","[1573200, 1582960]","[9760, 135360]",Child,Female,"{회색 롱패딩, 흰색 바지}"
482,"[Approach, None, Approach, None, Approach, None]","[1727920, 1738320, 1744880, 1750160, 1755360, ...","[10400, 6560, 5280, 5200, 4560, 20800]",Child,Female,"{회색 롱패딩, 흰색 바지}"
483,"[Approach, None, Gesture, None, Approach, None...","[1566960, 1574640, 1579840, 1582640, 1585040, ...","[7680, 5200, 2800, 2400, 5360, 45360, 1840, 36...",Child,Male,"{검은색 롱패딩, 검은색 바지}"


In [345]:
df.to_csv('test.csv')

In [410]:
df.to_csv('../../data/behavior_hmm_data.csv')

## 3-2) One-Hot Encoding (code, duration)

In [411]:
def code_one_hot_encoding(df):
    # 행동 코드 리스트
    actions = ['Avoid', 'Follow', 'Approach', 'None', 'Touch', 'Gesture']

    # 각각의 sublist를 one-hot encoding하여 다차원 리스트로 생성
    one_hot_data = []
    for sublist in df['code']:
        one_hot_sublist = np.zeros((len(sublist), len(actions)))
        for i, code in enumerate(sublist):
            index = actions.index(code)
            one_hot_sublist[i][index] = 1
        one_hot_data.append(one_hot_sublist)
    
    df['encoded_code'] = one_hot_data
    df['encoded_code'] = df[['encoded_code']].apply(lambda x: [np.array(item).astype(int) for item in x])

    return df

In [412]:
def duration_one_hot_encoding(df):
    # 행동 코드 리스트
    actions = ['Avoid', 'Follow', 'Approach', 'None', 'Touch', 'Gesture']

    # 각각의 sublist를 one-hot encoding하여 다차원 리스트로 생성
    one_hot_data = []
    for i, sublist in enumerate(df['code']):
        one_hot_sublist = np.zeros((len(sublist), len(actions)))
        for j, code in enumerate(sublist):
            index = actions.index(code)
            one_hot_sublist[j][index] = df['duration'][i][j]
        one_hot_data.append(one_hot_sublist)
    
    df['encoded_duration'] = one_hot_data
    df['encoded_duration'] = df[['encoded_duration']].apply(lambda x: [np.array(item).astype(int) for item in x])

    return df

In [413]:
df = code_one_hot_encoding(df)
df = duration_one_hot_encoding(df)

In [414]:
df

Unnamed: 0,code,start,duration,A/C,M/F,appearance,encoded_code,encoded_duration
0,"[Approach, None]","[56240, 59360]","[2720, 2960]",Child,Female,"{포니테일, 아이보리색 점퍼, 검정색 바지}","[[0, 0, 1, 0, 0, 0], [0, 0, 0, 1, 0, 0]]","[[0, 0, 2720, 0, 0, 0], [0, 0, 0, 2960, 0, 0]]"
1,"[Approach, None]","[71420, 77400]","[5520, 6320]",Child,Female,"{포니테일, 아이보리색 점퍼, 검정색 바지}","[[0, 0, 1, 0, 0, 0], [0, 0, 0, 1, 0, 0]]","[[0, 0, 5520, 0, 0, 0], [0, 0, 0, 6320, 0, 0]]"
2,"[Approach, None, Approach, None]","[48240, 56640, 71200, 77400]","[9240, 15600, 5920, 6360]",Child,Male,"{검정색 점퍼, 청바지, 검정/흰색 운동화}","[[0, 0, 1, 0, 0, 0], [0, 0, 0, 1, 0, 0], [0, 0...","[[0, 0, 9240, 0, 0, 0], [0, 0, 0, 15600, 0, 0]..."
3,"[Approach, None]","[238160, 242320]","[4160, 7760]",Child,Male,"{검정 마스크, 흰색 후리스, 회색 트레이닝 바지}","[[0, 0, 1, 0, 0, 0], [0, 0, 0, 1, 0, 0]]","[[0, 0, 4160, 0, 0, 0], [0, 0, 0, 7760, 0, 0]]"
4,"[Approach, None, Approach, None]","[47920, 56480, 71200, 77200]","[8640, 16480, 6000, 6640]",Adult,Female,"{검정 핸드백, 흰색 점퍼, 아이보리색 바지}","[[0, 0, 1, 0, 0, 0], [0, 0, 0, 1, 0, 0], [0, 0...","[[0, 0, 8640, 0, 0, 0], [0, 0, 0, 16480, 0, 0]..."
...,...,...,...,...,...,...,...,...
480,"[Approach, None, Approach, None]","[1716560, 1729280, 1735360, 1768880, 1771040]","[12720, 6080, 33520, 2160, 10000]",Adult,Female,"{검은색 패딩, 회색 상의, 회색 바지}","[[0, 0, 1, 0, 0, 0], [0, 0, 0, 1, 0, 0], [0, 0...","[[0, 0, 12720, 0, 0, 0], [0, 0, 0, 6080, 0, 0]..."
481,"[Approach, None]","[1573200, 1582960]","[9760, 135360]",Child,Female,"{회색 롱패딩, 흰색 바지}","[[0, 0, 1, 0, 0, 0], [0, 0, 0, 1, 0, 0]]","[[0, 0, 9760, 0, 0, 0], [0, 0, 0, 135360, 0, 0]]"
482,"[Approach, None, Approach, None, Approach, None]","[1727920, 1738320, 1744880, 1750160, 1755360, ...","[10400, 6560, 5280, 5200, 4560, 20800]",Child,Female,"{회색 롱패딩, 흰색 바지}","[[0, 0, 1, 0, 0, 0], [0, 0, 0, 1, 0, 0], [0, 0...","[[0, 0, 10400, 0, 0, 0], [0, 0, 0, 6560, 0, 0]..."
483,"[Approach, None, Gesture, None, Approach, None...","[1566960, 1574640, 1579840, 1582640, 1585040, ...","[7680, 5200, 2800, 2400, 5360, 45360, 1840, 36...",Child,Male,"{검은색 롱패딩, 검은색 바지}","[[0, 0, 1, 0, 0, 0], [0, 0, 0, 1, 0, 0], [0, 0...","[[0, 0, 7680, 0, 0, 0], [0, 0, 0, 5200, 0, 0],..."


## 3-3) Add Zero Padding
sequence가 가장 긴 data에 맞춰 zero padding을 더함

In [415]:
def padding_encoding_data(df, col_name, num):
    max_length = max(map(len, df[col_name]))  # 가장 긴 sequence의 길이 구하기
    padded_data = []
    
    for data in df[col_name]:
        if max_length - len(data) == 0:
            padded_data.append(data)
        else:
            # 가장 긴 sequence 길이를 기준으로 padding 적용
            padding_list = [[0 for i in range(num)]] * (max_length - len(data))
            result = np.concatenate((data, padding_list), axis=0)
            padded_data.append(result)
        
    df[col_name] = padded_data

    return df

In [416]:
df = padding_encoding_data(df, "encoded_code", 6)
df = padding_encoding_data(df, "encoded_duration", 6)

In [417]:
df

Unnamed: 0,code,start,duration,A/C,M/F,appearance,encoded_code,encoded_duration
0,"[Approach, None]","[56240, 59360]","[2720, 2960]",Child,Female,"{포니테일, 아이보리색 점퍼, 검정색 바지}","[[0, 0, 1, 0, 0, 0], [0, 0, 0, 1, 0, 0], [0, 0...","[[0, 0, 2720, 0, 0, 0], [0, 0, 0, 2960, 0, 0],..."
1,"[Approach, None]","[71420, 77400]","[5520, 6320]",Child,Female,"{포니테일, 아이보리색 점퍼, 검정색 바지}","[[0, 0, 1, 0, 0, 0], [0, 0, 0, 1, 0, 0], [0, 0...","[[0, 0, 5520, 0, 0, 0], [0, 0, 0, 6320, 0, 0],..."
2,"[Approach, None, Approach, None]","[48240, 56640, 71200, 77400]","[9240, 15600, 5920, 6360]",Child,Male,"{검정색 점퍼, 청바지, 검정/흰색 운동화}","[[0, 0, 1, 0, 0, 0], [0, 0, 0, 1, 0, 0], [0, 0...","[[0, 0, 9240, 0, 0, 0], [0, 0, 0, 15600, 0, 0]..."
3,"[Approach, None]","[238160, 242320]","[4160, 7760]",Child,Male,"{검정 마스크, 흰색 후리스, 회색 트레이닝 바지}","[[0, 0, 1, 0, 0, 0], [0, 0, 0, 1, 0, 0], [0, 0...","[[0, 0, 4160, 0, 0, 0], [0, 0, 0, 7760, 0, 0],..."
4,"[Approach, None, Approach, None]","[47920, 56480, 71200, 77200]","[8640, 16480, 6000, 6640]",Adult,Female,"{검정 핸드백, 흰색 점퍼, 아이보리색 바지}","[[0, 0, 1, 0, 0, 0], [0, 0, 0, 1, 0, 0], [0, 0...","[[0, 0, 8640, 0, 0, 0], [0, 0, 0, 16480, 0, 0]..."
...,...,...,...,...,...,...,...,...
480,"[Approach, None, Approach, None]","[1716560, 1729280, 1735360, 1768880, 1771040]","[12720, 6080, 33520, 2160, 10000]",Adult,Female,"{검은색 패딩, 회색 상의, 회색 바지}","[[0, 0, 1, 0, 0, 0], [0, 0, 0, 1, 0, 0], [0, 0...","[[0, 0, 12720, 0, 0, 0], [0, 0, 0, 6080, 0, 0]..."
481,"[Approach, None]","[1573200, 1582960]","[9760, 135360]",Child,Female,"{회색 롱패딩, 흰색 바지}","[[0, 0, 1, 0, 0, 0], [0, 0, 0, 1, 0, 0], [0, 0...","[[0, 0, 9760, 0, 0, 0], [0, 0, 0, 135360, 0, 0..."
482,"[Approach, None, Approach, None, Approach, None]","[1727920, 1738320, 1744880, 1750160, 1755360, ...","[10400, 6560, 5280, 5200, 4560, 20800]",Child,Female,"{회색 롱패딩, 흰색 바지}","[[0, 0, 1, 0, 0, 0], [0, 0, 0, 1, 0, 0], [0, 0...","[[0, 0, 10400, 0, 0, 0], [0, 0, 0, 6560, 0, 0]..."
483,"[Approach, None, Gesture, None, Approach, None...","[1566960, 1574640, 1579840, 1582640, 1585040, ...","[7680, 5200, 2800, 2400, 5360, 45360, 1840, 36...",Child,Male,"{검은색 롱패딩, 검은색 바지}","[[0, 0, 1, 0, 0, 0], [0, 0, 0, 1, 0, 0], [0, 0...","[[0, 0, 7680, 0, 0, 0], [0, 0, 0, 5200, 0, 0],..."


In [418]:
def make_data_for_hmm(df):
    time_series_data = []

    for encoded_code, encoded_duration in zip(df['encoded_code'], df['encoded_duration']):
        encoded_code = np.squeeze(encoded_code)
        encoded_duration = np.squeeze(encoded_duration)

        # encoded_code와 encoded_duration을 수평으로 결합합니다.
        time_series_data.append(np.hstack([encoded_code, encoded_duration]))

    df['time_series_data'] = time_series_data
    
    return df

In [419]:
df = make_data_for_hmm(df)

In [420]:
df.to_csv("../../data/behavior_hmm_preprocessed.csv")

# HMM

In [421]:
import pandas as pd
import numpy as np
import ast
from hmmlearn import hmm

In [422]:
time_series_data = []

for encoded_code, encoded_duration in zip(df['encoded_code'], df['encoded_duration']):
    encoded_code = np.squeeze(encoded_code)
    encoded_duration = np.squeeze(encoded_duration)

    # encoded_code와 encoded_duration을 수평으로 결합합니다.
    time_series_data.append(np.hstack([encoded_code, encoded_duration]))

time_series_data = np.array(time_series_data)
n_samples, n_timesteps, n_features = time_series_data.shape

In [423]:
X = time_series_data.reshape((n_samples, n_timesteps * n_features))

In [424]:
n_states = 3

# Define the HMM model
model = hmm.GaussianHMM(n_components=n_states)

In [425]:
# Train the model
model.fit(X)

Model is not converging.  Current: 307611.09977440926 is not greater than 307611.18203350145. Delta is -0.08225909218890592


In [445]:
# Predict the sequence of states for each observed sequence
predicted_states = model.predict(X)

# Decode the predicted states back to engagement levels
results = np.array(['state0' if s == 2 else 'state1' if s == 1 else 'state2' for s in predicted_states])

In [446]:
results

array(['state1', 'state1', 'state1', 'state1', 'state1', 'state1',
       'state1', 'state1', 'state1', 'state1', 'state0', 'state0',
       'state1', 'state1', 'state1', 'state1', 'state2', 'state0',
       'state0', 'state1', 'state1', 'state1', 'state0', 'state0',
       'state1', 'state0', 'state0', 'state1', 'state1', 'state1',
       'state0', 'state0', 'state0', 'state1', 'state0', 'state1',
       'state1', 'state1', 'state1', 'state1', 'state1', 'state1',
       'state1', 'state1', 'state1', 'state1', 'state0', 'state0',
       'state2', 'state0', 'state0', 'state1', 'state1', 'state2',
       'state0', 'state0', 'state1', 'state0', 'state1', 'state0',
       'state0', 'state1', 'state1', 'state0', 'state1', 'state1',
       'state1', 'state1', 'state0', 'state1', 'state0', 'state0',
       'state1', 'state0', 'state0', 'state1', 'state0', 'state2',
       'state0', 'state0', 'state0', 'state2', 'state1', 'state1',
       'state0', 'state2', 'state1', 'state2', 'state1', 'stat

In [427]:
chunk_size = n_timesteps * n_features  # 하나의 2차원 리스트의 크기
num_chunks = len(results) // chunk_size  # 나눈 후 생성될 리스트의 개수
results_ = [results[i:i+chunk_size] for i in range(0, num_chunks*chunk_size, chunk_size)]

In [428]:
behavior = df[['code', 'duration']]

In [429]:
results

array(['high', 'high', 'high', 'high', 'high', 'high', 'high', 'high',
       'high', 'high', 'low', 'low', 'high', 'high', 'high', 'high',
       'low', 'low', 'low', 'high', 'high', 'high', 'low', 'low', 'high',
       'low', 'low', 'high', 'high', 'high', 'low', 'low', 'low', 'high',
       'low', 'high', 'high', 'high', 'high', 'high', 'high', 'high',
       'high', 'high', 'high', 'high', 'low', 'low', 'low', 'low', 'low',
       'high', 'high', 'low', 'low', 'low', 'high', 'low', 'high', 'low',
       'low', 'high', 'high', 'low', 'high', 'high', 'high', 'high',
       'low', 'high', 'low', 'low', 'high', 'low', 'low', 'high', 'low',
       'low', 'low', 'low', 'low', 'low', 'high', 'high', 'low', 'low',
       'high', 'low', 'high', 'low', 'low', 'low', 'low', 'low', 'low',
       'low', 'high', 'low', 'high', 'high', 'high', 'high', 'high',
       'low', 'low', 'high', 'low', 'high', 'low', 'high', 'low', 'high',
       'low', 'low', 'high', 'high', 'low', 'low', 'low', 'high',

In [447]:
behavior['predicted_engagement_levels'] = results

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  behavior['predicted_engagement_levels'] = results


In [452]:
behavior['predicted_engagement_levels'].value_counts()

state1    260
state0    187
state2     38
Name: predicted_engagement_levels, dtype: int64

In [473]:
# 저장할 파일 경로
file_path = '../../data/new_behavior_hmm_result_state3.xlsx'

# ExcelWriter 객체 생성
writer = pd.ExcelWriter(file_path)

In [474]:
state0_df = behavior[behavior['predicted_engagement_levels'] == 'state0']
# 'code' 칼럼의 데이터 길이
state0_df['code']

10      [Approach, None, Approach, Touch, None, Gesture]
11     [Approach, None, None, Approach, None, Approac...
17     [Approach, Gesture, None, Gesture, Touch, None...
18                             [Approach, Gesture, None]
22                                   [Approach, Gesture]
                             ...                        
476                 [Approach, Touch, None, Touch, None]
477                                  [Approach, Gesture]
479    [Approach, None, Touch, None, Approach, Touch,...
482     [Approach, None, Approach, None, Approach, None]
484                                    [Approach, Touch]
Name: code, Length: 187, dtype: object

In [475]:
state1_df = behavior[behavior['predicted_engagement_levels'] == 'state1']
# 'code' 칼럼의 데이터 길이
state1_df['code']

0                      [Approach, None]
1                      [Approach, None]
2      [Approach, None, Approach, None]
3                      [Approach, None]
4      [Approach, None, Approach, None]
                     ...               
473                    [Approach, None]
475                    [Approach, None]
478                    [Approach, None]
480    [Approach, None, Approach, None]
481                    [Approach, None]
Name: code, Length: 260, dtype: object

In [476]:
state2_df = behavior[behavior['predicted_engagement_levels'] == 'state2']
# 'code' 칼럼의 데이터 길이
state2_df['code']

16     [Approach, None, Touch, None, Gesture, None, F...
48     [Approach, None, Gesture, None, Touch, None, T...
53     [Approach, None, Touch, None, Touch, None, Tou...
77     [Approach, None, Approach, Touch, None, Touch,...
81     [Approach, None, Gesture, None, None, Approach...
85     [Approach, None, Gesture, None, Approach, None...
87     [Approach, None, Gesture, None, Touch, None, G...
145    [Approach, None, Approach, None, Approach, Non...
170    [Approach, Touch, Touch, None, Touch, None, To...
188    [Approach, Touch, None, Approach, Touch, None,...
189    [Approach, None, Approach, Touch, None, Approa...
191    [Approach, Gesture, Approach, Touch, Approach,...
192    [Approach, None, Touch, None, Touch, None, Tou...
194    [Approach, Touch, None, Touch, None, Touch, No...
213    [Approach, None, Touch, None, Touch, None, App...
218    [Approach, None, Gesture, None, Gesture, None,...
228    [Approach, Touch, None, Approach, Touch, None,...
233    [Approach, None, Touch, 

In [477]:
# 각 DataFrame을 다른 시트에 저장
state0_df.to_excel(writer, sheet_name='state0', index=False)
state1_df.to_excel(writer, sheet_name='state1', index=False)
state2_df.to_excel(writer, sheet_name='state2', index=False)

In [478]:
# 저장 및 파일 닫기
writer.save()
writer.close()

  writer.save()


In [440]:
behavior.to_csv('../../data/behavior_hmm_result_state3.csv')