# Import Libraries

In [1]:
# %pip install plotly  # uncomment and run once if plotly is missing from the kernel environment

In [2]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns',None)
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import warnings
warnings.filterwarnings('ignore')

from statsmodels.imputation import mice
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import *

from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

# Data Load

In [3]:
# Read the training set and preview its first five rows.
train = pd.read_csv('./train.csv')
train.head()

Unnamed: 0.1,Unnamed: 0,timestamp,A_x,A_y,A_z,B_x,B_y,B_z,label
0,0,2019-01-12 00:45:54.450,-0.25913,-0.834869,-0.485499,0.196409,,0.384934,8
1,1,2000-01-01 01:37:06.440,0.37049,0.175042,0.122625,-0.338242,0.358245,0.126491,2
2,2,2019-01-12 00:45:33.900,-0.257837,-0.881947,-0.391895,0.196027,0.894537,0.411221,8
3,3,2000-01-01 00:46:22.680,-0.937753,-0.055961,0.362041,-0.929881,0.087673,0.134609,11
4,4,2000-01-01 00:49:56.620,-0.98832,-0.19039,0.157909,-0.954669,-0.02481,-0.38842,6


In [4]:
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() only appends a stray "None" to the output.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   Unnamed: 0  100000 non-null  int64  
 1   timestamp   100000 non-null  object 
 2   A_x         90000 non-null   float64
 3   A_y         90000 non-null   float64
 4   A_z         90000 non-null   float64
 5   B_x         90000 non-null   float64
 6   B_y         90000 non-null   float64
 7   B_z         90000 non-null   float64
 8   label       100000 non-null  int64  
dtypes: float64(6), int64(2), object(1)
memory usage: 6.9+ MB


None

In [5]:
# Summary statistics of the training frame (rendered as the cell's output).
train.describe()

Unnamed: 0.1,Unnamed: 0,A_x,A_y,A_z,B_x,B_y,B_z,label
count,100000.0,90000.0,90000.0,90000.0,90000.0,90000.0,90000.0,100000.0
mean,49999.5,-0.876639,-0.110566,-0.013331,-0.780543,0.116559,0.167377,5.94563
std,28867.657797,0.464709,0.348463,0.361989,0.817925,0.593224,0.785482,3.333515
min,0.0,-5.132823,-1.94931,-1.28475,-7.483251,-5.466767,-7.20822,1.0
25%,24999.75,-1.016718,-0.181035,-0.28195,-1.050645,-0.190546,-0.243964,3.0
50%,49999.5,-0.958268,-0.047425,-0.063756,-0.918144,0.023954,0.113276,7.0
75%,74999.25,-0.764307,0.060235,0.222074,-0.157112,0.319651,0.52597,9.0
max,99999.0,1.849398,2.413866,2.584467,5.372528,7.182237,6.766558,11.0


# EDA

In [6]:
# Count the rows of each activity label, ordered by label value.
activity_counts = train['label'].value_counts().sort_index()

# Plotly Express bar chart; a one-colour sequence gives every bar the same colour.
fig = px.bar(
    activity_counts,
    x=activity_counts.index,
    y=activity_counts.values,
    labels={'y': 'Count', 'index': 'Activity'},
    color_discrete_sequence=['pink'],
)
fig.update_layout(
    title_text='Activity Counts',
    xaxis_title='Activity',
    yaxis_title='Count',
)

# Render the chart.
fig.show()

In [7]:
# Pie chart with one slice per value of the 'label' column.
fig = px.pie(train, names='label', width=720)
fig.update_layout(
    title=dict(
        text="Activities distribution in the data",
        y=0.95,
        x=0.40,
        xanchor='center',
        yanchor='top',
    ),
    legend_title="Activities",
    font={'family': "Arial", 'size': 18},
)
fig.show()

# Data Pre-Processing

In [8]:
def data_preprocess(data, n_mice_iter=0):
    """Turn a raw sensor frame into the feature table fed to the classifier.

    The rows are ordered by 'Unnamed: 0', time features are derived from
    'timestamp', both of those columns are dropped, missing sensor readings
    are filled, and the magnitude of each sensor vector plus the angle
    between the A and B vectors are appended.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain 'Unnamed: 0', 'timestamp' and the six sensor columns
        A_x, A_y, A_z, B_x, B_y, B_z. Any other column (for example 'label')
        is passed through unchanged. The caller's frame is not modified.
    n_mice_iter : int, default 0
        Number of MICE cycles (``MICEData.update_all``) to run after the
        imputer has been constructed. With the default 0 only the fill that
        ``MICEData`` performs in its constructor is used, which is what this
        function has always produced: each missing entry receives the
        observed value of its column that lies closest to the column mean.
        NOTE(review): the MICE cycles presumably draw from NumPy's global
        random state -- seed it before calling if repeatable output matters.

    Returns
    -------
    pandas.DataFrame
        The input columns minus 'Unnamed: 0' and 'timestamp', followed by
        '10ms', 'second', 'minute', 'hour', 'day', 'A_magnitude',
        'B_magnitude' and 'angle' (radians).
    """
    sensor_cols = ['A_x', 'A_y', 'A_z', 'B_x', 'B_y', 'B_z']
    time_cols = ['10ms', 'second', 'minute', 'hour', 'day']

    # sort_values returns a new frame, so everything below works on a copy.
    # (An earlier extra sort by 'timestamp' was immediately undone by this one.)
    data = data.sort_values('Unnamed: 0')

    # Time features; '10ms' is the hundredths-of-a-second part of the stamp.
    stamp = pd.to_datetime(data['timestamp'])
    data['10ms'] = stamp.dt.microsecond // 10000
    data['second'] = stamp.dt.second
    data['minute'] = stamp.dt.minute
    data['hour'] = stamp.dt.hour
    data['day'] = stamp.dt.day
    data = data.astype({col: 'int' for col in time_cols})

    # Columns that are not used as features.
    data = data.drop(columns=['Unnamed: 0', 'timestamp'])

    # Missing-value imputation.
    # Only the sensor columns plus a few always-populated time columns go to
    # the imputer:
    #   * 'label' is kept out so train and test are treated alike and the
    #     target cannot influence the features;
    #   * the time columns guarantee no row is entirely missing (MICEData
    #     drops such rows);
    #   * '10ms' is kept out because it is not a valid Python identifier and
    #     presumably cannot be used in the formula-based conditional models
    #     that update_all fits -- TODO confirm.
    impute_cols = sensor_cols + ['second', 'minute', 'hour', 'day']
    imp = mice.MICEData(data[impute_cols])
    if n_mice_iter > 0:
        imp.update_all(n_mice_iter)
    # MICEData resets its index, so copy the values back by position rather
    # than by index label; label alignment silently scrambles or blanks the
    # result whenever the input index is not 0..n-1 in 'Unnamed: 0' order.
    data[sensor_cols] = imp.data[sensor_cols].to_numpy()

    # Euclidean length of each sensor vector.
    data['A_magnitude'] = np.sqrt(data['A_x']**2 + data['A_y']**2 + data['A_z']**2)
    data['B_magnitude'] = np.sqrt(data['B_x']**2 + data['B_y']**2 + data['B_z']**2)

    # Angle between the A and B vectors, in radians. The cosine is clipped to
    # [-1, 1] so rounding error cannot push it outside arccos' domain. A
    # zero-length vector gives 0/0 = NaN, which stays NaN in 'angle'.
    dot_product = (data['A_x'] * data['B_x']
                   + data['A_y'] * data['B_y']
                   + data['A_z'] * data['B_z'])
    cos_theta = dot_product / (data['A_magnitude'] * data['B_magnitude'])
    data['angle'] = np.arccos(np.clip(cos_theta, -1.0, 1.0))

    return data

In [9]:
# Build the feature table for the training rows; `train` itself is left as loaded.
train_data = data_preprocess(train)

In [30]:
# Preview the engineered training features (head() defaults to five rows).
train_data.head()

Unnamed: 0,A_x,A_y,A_z,B_x,B_y,B_z,label,10ms,second,minute,hour,day,A_magnitude,B_magnitude,angle
0,-0.25913,-0.834869,-0.485499,0.196409,0.116564,0.384934,8,45,54,45,0,12,0.999932,0.447591,2.416921
1,0.37049,0.175042,0.122625,-0.338242,0.358245,0.126491,2,44,6,37,1,1,0.427714,0.508672,1.788994
2,-0.257837,-0.881947,-0.391895,0.196027,0.894537,0.411221,8,90,33,45,0,12,0.998945,1.003856,3.075883
3,-0.937753,-0.055961,0.362041,-0.929881,0.087673,0.134609,11,68,22,46,0,1,1.00677,0.943655,0.269202
4,-0.98832,-0.19039,0.157909,-0.954669,-0.02481,-0.38842,6,62,56,49,0,1,1.018803,1.03096,0.565361


# Modeling

In [28]:
def get_model(data, y_cols, opt=False):
    """Fit a LightGBM classifier and print its hold-out metrics.

    The frame is split 70/30 (stratified on the target, fixed seed), the
    classifier is fitted on the 70% part, and accuracy, confusion matrix and
    classification report for the 30% part are printed.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature columns plus the target column(s) named by `y_cols`.
    y_cols : str or list
        Target column name(s). The target is shifted by -1 before fitting, so
        predictions of the returned model are one lower than the stored labels.
    opt : bool, default False
        Currently unused.

    Returns
    -------
    LGBMClassifier
        The fitted model (trained on the 70% split only).
    """
    features = data.drop(columns=y_cols)
    target = data[y_cols] - 1

    split = train_test_split(
        features, target, test_size=.3, random_state=530, stratify=target
    )
    X_train, X_valid, y_train, y_valid = split

    # Author's note: learning_rate=0.05, n_estimators=500, num_leaves=150 was
    # the best configuration tried, recorded as 0.9946.
    model = LGBMClassifier(
        learning_rate=0.05,
        n_estimators=500,
        num_leaves=150,
        random_state=530,
    )
    model.fit(X_train, y_train)

    pred = model.predict(X_valid)
    separator = '=' * 60

    print('accuracy :', accuracy_score(y_valid, pred))
    print(separator)
    print(confusion_matrix(y_valid, pred))
    print(separator)
    print(classification_report(y_valid, pred))
    return model

In [29]:
# Train and evaluate the activity classifier with 'label' as the target column.
y_col = 'label'
act_clf_model = get_model(data=train_data, y_cols=y_col)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001097 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2515
[LightGBM] [Info] Number of data points in the train set: 70000, number of used features: 14
[LightGBM] [Info] Start training from score -2.165809
[LightGBM] [Info] Start training from score -2.106196
[LightGBM] [Info] Start training from score -2.488398
[LightGBM] [Info] Start training from score -2.629899
[LightGBM] [Info] Start training from score -2.916657
[LightGBM] [Info] Start training from score -2.923013
[LightGBM] [Info] Start training from score -2.244855
[LightGBM] [Info] Start training from score -2.163072
[LightGBM] [Info] Start training from score -2.377155
[LightGBM] [Info] Start training from score -2.345642
[LightGBM] [Info] Start training from score -2.395641
accuracy : 0.9904
[[3349   32    3   33    0    5    

# Predict

In [14]:
# Load the test set and run it through the same preprocessing as the training data.
test = pd.read_csv('./test.csv')
test_data = data_preprocess(test)

In [15]:
# Preview the raw test rows (head() defaults to five rows).
test.head()

Unnamed: 0.1,Unnamed: 0,timestamp,A_x,A_y,A_z,B_x,B_y,B_z
0,0,2000-01-01 00:00:42.700,-1.000957,-0.170691,0.124889,-0.979561,0.00315,-0.264673
1,1,2000-01-01 00:28:38.540,-0.87483,0.132696,-0.501727,-1.274911,0.045122,0.12127
2,2,2000-01-01 00:07:23.900,-1.219112,0.074678,0.435331,-0.86082,0.22274,0.008689
3,3,2019-01-12 01:58:44.580,-0.907752,-0.171816,0.211507,-0.972017,0.337799,1.013534
4,4,2019-01-12 00:59:32.380,-1.031261,0.00034,-0.091693,-0.217434,-0.323466,0.931614


In [16]:
# Preview the engineered test features (head() defaults to five rows).
test_data.head()

Unnamed: 0,A_x,A_y,A_z,B_x,B_y,B_z,10ms,second,minute,hour,day,A_magnitude,B_magnitude,angle
0,-1.000957,-0.170691,0.124889,-0.979561,0.00315,-0.264673,70,42,0,0,1,1.023058,1.014693,0.422267
1,-0.87483,0.132696,-0.501727,-1.274911,0.045122,0.12127,54,38,28,0,1,1.017185,1.281461,0.620532
2,-1.219112,0.074678,0.435331,-0.86082,0.22274,0.008689,90,23,7,0,1,1.296659,0.889213,0.382107
3,-0.907752,-0.171816,0.211507,-0.972017,0.337799,1.013534,58,44,58,1,12,0.94777,1.44436,0.709393
4,-1.031261,0.00034,-0.091693,-0.217434,-0.323466,0.931614,38,32,59,0,12,1.03533,1.009858,1.437746


In [17]:
# Predict with the fitted classifier. The model was trained on labels shifted
# by -1 (see get_model), so these predictions are one lower than the stored labels.
# NOTE(review): test_data's columns are assumed to be in the same order as the
# training features -- confirm if data_preprocess changes.
y_pred = act_clf_model.predict(test_data)

# Submit

In [18]:
# Fill the sample submission with the predictions; +1 undoes the -1 shift
# applied to the target inside get_model.
# NOTE(review): assumes sample.csv rows are in the same order as test.csv
# sorted by 'Unnamed: 0' -- confirm.
submit = pd.read_csv('./sample.csv')
submit['label'] = y_pred+1

In [19]:
# Write the submission file without the DataFrame index column.
submit.to_csv('./submit12.csv',index=False)