# Data Preprocessing

In [1]:
import pandas as pd
import numpy as np

## Data Refining

In [2]:
# Load the audio results. The CSV carries a leftover 'Unnamed: 0' column,
# which is discarded right after reading.
audio_result_df = pd.read_csv('audio_result.csv').drop(columns=['Unnamed: 0'])
# Append the '.mp4' extension to every entry of the 'file' column; the later
# merge joins this column against 'video_file'.
audio_result_df = audio_result_df.assign(file=audio_result_df['file'] + '.mp4')

In [3]:
# Display the audio results frame (with '.mp4' already appended to 'file').
audio_result_df

Unnamed: 0,file,Silence,Baby laughter,"Baby cry, infant cry"
0,v12_s-1.mp4,0.148438,0.0,0.0
1,v12_s-10.mp4,0.109375,0.0,0.0
2,v12_s-11.mp4,0.109375,0.0,0.0
3,v12_s-12.mp4,0.109375,0.0,0.0
4,v12_s-13.mp4,0.109375,0.0,0.0
...,...,...,...,...
545,v97_ns-5.mp4,0.148438,0.0,0.0
546,v97_ns-6.mp4,0.148438,0.0,0.0
547,v97_ns-7.mp4,0.148438,0.0,0.0
548,v97_ns-8.mp4,0.148438,0.0,0.0


In [4]:
# Load the rebalanced scores, keep only the listed columns, and discard every
# row that has a missing value in any of them.
score_columns = ['video_file', 'blink_score', 'movement_score', 'movement_norm']
rebalanced_scores_df = (
    pd.read_csv('rebalanced_scores.csv')
    .loc[:, score_columns]
    .dropna()
)

In [5]:
# Display the score frame after column selection and null-row removal.
rebalanced_scores_df

Unnamed: 0,video_file,blink_score,movement_score,movement_norm
0,v12_s-1.mp4,0.631505,0.022537,0.168826
1,v12_s-10.mp4,0.538419,0.037090,0.442904
2,v12_s-11.mp4,0.495586,0.017830,0.156351
3,v12_s-12.mp4,0.504354,0.063563,0.640250
4,v12_s-13.mp4,0.485364,0.018498,0.187213
...,...,...,...,...
566,v87_ns-5.mp4,0.012740,0.001331,0.024984
567,v87_ns-6.mp4,0.090649,0.002792,0.017828
568,v87_ns-7.mp4,0.017686,0.004703,0.545787
569,v87_ns-8.mp4,0.236408,0.000899,0.443880


In [6]:
# Left outer join: every score row is kept, and audio columns are attached
# where 'video_file' equals the audio frame's 'file'.
joined_df = rebalanced_scores_df.merge(
    audio_result_df,
    how='left',
    left_on='video_file',
    right_on='file',
)
# Per-column count of missing values; score rows without an audio match show
# up as nulls in the audio columns.
joined_df.isna().sum()

video_file               0
blink_score              0
movement_score           0
movement_norm            0
file                    84
Silence                 84
Baby laughter           84
Baby cry, infant cry    84
dtype: int64

In [7]:
# Remove rows containing nulls (after the left join these are the score rows
# that had no matching audio result), then drop the columns that are not
# used as features:
#   - 'movement_norm': only one of movement_norm / movement_score is kept;
#     for now the normalized variant is the one removed.
#   - 'file': the audio-side join key, equal to 'video_file' on matched rows.
# errors='ignore' keeps this cell re-runnable: on a second run the columns
# are already gone and a plain drop would raise KeyError.
joined_df = (
    joined_df
    .dropna()
    .drop(columns=['movement_norm', 'file'], errors='ignore')
)
# Label column 'is_sleeping':
#   0 (not sleeping) when the file name contains 'ns', otherwise 1 (sleeping).
# NOTE(review): this is a plain substring match anywhere in the name, so any
# file whose name happens to contain 'ns' elsewhere is labelled 0 -- confirm
# the naming scheme guarantees this cannot happen.
joined_df['is_sleeping'] = (
    ~joined_df['video_file'].str.contains('ns', regex=False)
).astype('int64')

In [8]:
# Display the cleaned, labelled frame.
joined_df

Unnamed: 0,video_file,blink_score,movement_score,Silence,Baby laughter,"Baby cry, infant cry",is_sleeping
0,v12_s-1.mp4,0.631505,0.022537,0.148438,0.0,0.0,1
1,v12_s-10.mp4,0.538419,0.037090,0.109375,0.0,0.0,1
2,v12_s-11.mp4,0.495586,0.017830,0.109375,0.0,0.0,1
3,v12_s-12.mp4,0.504354,0.063563,0.109375,0.0,0.0,1
4,v12_s-13.mp4,0.485364,0.018498,0.109375,0.0,0.0,1
...,...,...,...,...,...,...,...
562,v87_ns-5.mp4,0.012740,0.001331,0.148438,0.0,0.0,0
563,v87_ns-6.mp4,0.090649,0.002792,0.109375,0.0,0.0,0
564,v87_ns-7.mp4,0.017686,0.004703,0.199219,0.0,0.0,0
565,v87_ns-8.mp4,0.236408,0.000899,0.109375,0.0,0.0,0


In [9]:
# Class balance check: number of rows for each 'is_sleeping' value.
joined_df.groupby(by='is_sleeping').size()

is_sleeping
0    420
1     63
dtype: int64

## OverSampling

### Random OverSampling

In [10]:
from imblearn.over_sampling import RandomOverSampler

# Features: every column except the file name and the label.
X = joined_df.drop(columns=['video_file', 'is_sleeping'])
# Label vector.
y = joined_df['is_sleeping']

# Random oversampling with a fixed seed so the resampled set is reproducible.
# NOTE(review): the whole dataset is resampled here. If a train/test split is
# made later from the saved file, copies of the same row can land on both
# sides of the split -- confirm how the output is consumed.
oversampler = RandomOverSampler(random_state=42)
X_resampled, y_resampled = oversampler.fit_resample(X, y)

In [11]:
# Label counts after random oversampling.
y_resampled.value_counts()

is_sleeping
1    420
0    420
Name: count, dtype: int64

In [12]:
# Merge features and label into a single 2-D array, with the label as the
# last column.
# y_resampled is deliberately NOT rebound to a 2-D array here: overwriting it
# made this cell fail on a second run (the extra axis no longer lines up with
# X_resampled) and broke re-running the earlier `y_resampled.value_counts()`
# cell. column_stack treats the 1-D label vector as one column.
data_random_oversampled = np.column_stack([X_resampled, y_resampled])

In [13]:
# Display the merged array (feature columns followed by the label column).
data_random_oversampled

array([[0.63150463, 0.02253711, 0.1484375 , 0.        , 0.        ,
        1.        ],
       [0.53841867, 0.03708968, 0.109375  , 0.        , 0.        ,
        1.        ],
       [0.49558617, 0.01783049, 0.109375  , 0.        , 0.        ,
        1.        ],
       ...,
       [0.65134788, 0.00915567, 0.109375  , 0.        , 0.        ,
        1.        ],
       [0.53189416, 0.00283812, 0.109375  , 0.        , 0.        ,
        1.        ],
       [0.32650202, 0.00318804, 0.109375  , 0.        , 0.        ,
        1.        ]])

In [14]:
# Save: wrap the merged array in a DataFrame and write it to CSV.
# NOTE(review): to_csv is called with its defaults, so the row index is
# written as an extra first column and the column labels are whatever the
# DataFrame constructor assigned -- confirm the readers of this file expect
# that layout.
data_random_oversampled = pd.DataFrame(data=data_random_oversampled)
data_random_oversampled.to_csv(path_or_buf='data_random_oversampled.csv')
print('random ver 저장 완료')

random ver 저장 완료


### SMOTE OverSampling

In [15]:
from imblearn.over_sampling import SMOTE

# Label vector and feature matrix (file name and label removed from features).
y = joined_df['is_sleeping']
X = joined_df.drop(columns=['video_file', 'is_sleeping'])

# SMOTE oversampling with a fixed seed.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

print('레이블 카운트')
print(y_resampled.value_counts())

# Merge the data: turn the labels into a single column and append it after
# the feature columns.
y_resampled = np.asarray(y_resampled).reshape(-1, 1)
data_smote_oversampled = np.hstack((X_resampled, y_resampled))

# Save as CSV.
data_smote_oversampled = pd.DataFrame(data=data_smote_oversampled)
data_smote_oversampled.to_csv(path_or_buf='data_smote_oversampled.csv')
print('smote ver 저장 완료')

레이블 카운트
is_sleeping
1    420
0    420
Name: count, dtype: int64
smote ver 저장 완료


### ADASYN OverSampling

In [16]:
from imblearn.over_sampling import ADASYN

# Label vector and feature matrix (file name and label removed from features).
y = joined_df['is_sleeping']
X = joined_df.drop(columns=['video_file', 'is_sleeping'])

# ADASYN oversampling with a fixed seed.
adasyn = ADASYN(random_state=42)
X_resampled, y_resampled = adasyn.fit_resample(X, y)

# The recorded output of this cell shows 420 vs 418, i.e. the two classes do
# not come out exactly equal here.
print('레이블 카운트')
print(y_resampled.value_counts())

# Merge the data: turn the labels into a single column and append it after
# the feature columns.
y_resampled = np.asarray(y_resampled).reshape(-1, 1)
data_adasyn_oversampled = np.hstack((X_resampled, y_resampled))

# Save as CSV.
data_adasyn_oversampled = pd.DataFrame(data=data_adasyn_oversampled)
data_adasyn_oversampled.to_csv(path_or_buf='data_adasyn_oversampled.csv')
print('adasyn ver 저장 완료')

레이블 카운트
is_sleeping
0    420
1    418
Name: count, dtype: int64
adasyn ver 저장 완료
