# Library imports & settings

In [3]:
# Primary library import
import os
import warnings
from datetime import datetime

import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from statsmodels.tsa.seasonal import seasonal_decompose
warnings.filterwarnings('ignore')

# Visualization settings.
# seaborn's theme call defaults to style="darkgrid" and rewrites the axes/grid
# rcParams, so calling it without a style after plt.style.use() silently
# replaced the whitegrid look. Request whitegrid explicitly instead.
plt.style.use("seaborn-v0_8-whitegrid")
sns.set_theme(style="whitegrid", font_scale=1.2)
# Figure size is applied after the theme so it sits on top of the theme's rcParams.
plt.rcParams["figure.figsize"] = (15, 8)


# Data loading & basic exploration

In [4]:
# Location of the training CSV. The default is the original local path; set the
# TRAIN_CSV_PATH environment variable to run the notebook on another machine
# without editing this cell.
TRAIN_CSV_PATH = os.environ.get(
    "TRAIN_CSV_PATH",
    "/Users/b._.chan/Documents/DKU_DataScience/Not_Null/data/raw/train.csv",
)
df = pd.read_csv(TRAIN_CSV_PATH)

# Data basic information
print(f"Data size : {df.shape}")
print("\n first 5 rows of data : ")
print(df.head())


Data size : (23587209, 19)

 first 5 rows of data : 
  module(equipment)      timestamp       localtime  operation  voltageR  \
0           14(2호기)  1739918155000  20250218143555          1    216.65   
1          16(호이스트)  1742552385000  20250321031945          1    212.13   
2        11(우측분전반1)  1738245780000  20250130060300          1    219.30   
3          5(좌측분전반)  1740645790000  20250227004310          1    214.15   
4        18(우측분전반2)  1737829500000  20250125102500          1    212.07   

   voltageS  voltageT  voltageRS  voltageST  voltageTR  currentR  currentS  \
0    215.21    216.89     373.99     374.20     375.45     17.76      6.59   
1    216.51    218.88     371.20     377.04     373.26     17.75     12.07   
2    211.36    212.71     372.96     367.24     374.12     27.54     12.88   
3    213.73    210.73     370.54     367.58     367.95     26.78      7.03   
4    215.52    215.66     370.29     373.40     370.41     29.51      8.88   

   currentT  activePower  p

In [5]:
# Data type & memory usage check
# df.info() writes its report to stdout itself and returns None, so wrapping it
# in print() only appended a stray "None" line after the report.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23587209 entries, 0 to 23587208
Data columns (total 19 columns):
 #   Column                Dtype  
---  ------                -----  
 0   module(equipment)     object 
 1   timestamp             int64  
 2   localtime             int64  
 3   operation             int64  
 4   voltageR              float64
 5   voltageS              float64
 6   voltageT              float64
 7   voltageRS             float64
 8   voltageST             float64
 9   voltageTR             float64
 10  currentR              float64
 11  currentS              float64
 12  currentT              float64
 13  activePower           float64
 14  powerFactorR          float64
 15  powerFactorS          float64
 16  powerFactorT          float64
 17  reactivePowerLagging  float64
 18  accumActiveEnergy     int64  
dtypes: float64(14), int64(4), object(1)
memory usage: 3.3+ GB
None


In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) from df.describe(),
# printed under a header line.
print("\n Data description : ", df.describe(), sep="\n")


 Data description : 
          timestamp     localtime   operation      voltageR      voltageS  \
count  2.358721e+07  2.358721e+07  23587209.0  2.358721e+07  2.358721e+07   
mean   1.739519e+12  2.024839e+13         1.0  2.149942e+02  2.149946e+02   
std    3.739740e+09  3.664714e+09         0.0  2.901942e+00  2.902580e+00   
min    1.733040e+12  2.024120e+13         1.0  1.901000e+02  1.901400e+02   
25%    1.736280e+12  2.025011e+13         1.0  2.125000e+02  2.125000e+02   
50%    1.739521e+12  2.025021e+13         1.0  2.150000e+02  2.150000e+02   
75%    1.742757e+12  2.025032e+13         1.0  2.175000e+02  2.175000e+02   
max    1.745996e+12  2.025043e+13         1.0  2.200000e+02  2.200000e+02   

           voltageT     voltageRS     voltageST     voltageTR      currentR  \
count  2.358721e+07  2.358721e+07  2.358721e+07  2.358721e+07  2.358721e+07   
mean   2.149940e+02  3.723703e+02  3.723701e+02  3.723697e+02  1.750025e+01   
std    2.902125e+00  3.572992e+00  3.572323e+00

In [7]:
# Count the missing entries in every column
null_counts = df.isna().sum()
print("\n Missing value check : ")
print(null_counts)


 Missing value check : 
module(equipment)       0
timestamp               0
localtime               0
operation               0
voltageR                0
voltageS                0
voltageT                0
voltageRS               0
voltageST               0
voltageTR               0
currentR                0
currentS                0
currentT                0
activePower             0
powerFactorR            0
powerFactorS            0
powerFactorT            0
reactivePowerLagging    0
accumActiveEnergy       0
dtype: int64


# Time data preprocessing

In [8]:
# Peek at the first raw localtime values before converting them
print("localtime data sample")
print(df.loc[:, 'localtime'].head())


localtime data sample
0    20250218143555
1    20250321031945
2    20250130060300
3    20250227004310
4    20250125102500
Name: localtime, dtype: int64


In [9]:
# timestamp & localtime column type transform
def convert_timestamp(x):
    """Parse one ``YYYYmmddHHMMSS`` value into a pandas ``Timestamp``.

    Parameters
    ----------
    x : int or str
        A single value in the compact ``%Y%m%d%H%M%S`` layout, e.g.
        ``20250218143555``. It is passed through ``str()`` before parsing.

    Returns
    -------
    pandas.Timestamp or pandas.NaT
        The parsed timestamp, or ``pd.NaT`` when the value cannot be parsed
        with that format.
    """
    try:
        return pd.to_datetime(str(x), format="%Y%m%d%H%M%S")
    except (ValueError, TypeError):
        # Only parsing failures become NaT. A bare ``except`` would also
        # swallow KeyboardInterrupt/SystemExit, making a long row-wise run
        # impossible to interrupt cleanly.
        return pd.NaT


In [10]:
# localtime transform
# Parse the whole column in one vectorized call rather than invoking
# convert_timestamp once per row through .apply(); with ~23.5M rows the
# per-row Python call is the dominant cost. errors="coerce" keeps the same
# outcome for bad values: they become NaT.
df['datetime'] = pd.to_datetime(
    df['localtime'].astype(str),
    format="%Y%m%d%H%M%S",
    errors="coerce",
)

# transform results check
print("\nconverted datetime column")
print(df['datetime'].head())


converted datetime column
0   2025-02-18 14:35:55
1   2025-03-21 03:19:45
2   2025-01-30 06:03:00
3   2025-02-27 00:43:10
4   2025-01-25 10:25:00
Name: datetime, dtype: datetime64[ns]


In [12]:
# Time feature extraction: derive calendar parts from the parsed datetime column
dt_parts = df['datetime'].dt
for part in ('year', 'month', 'day', 'hour', 'weekday'):
    df[part] = getattr(dt_parts, part)
# Week number comes from isocalendar(), i.e. the ISO week.
# NOTE(review): ISO week and calendar 'year' can disagree around New Year;
# confirm before grouping by (year, week).
df['week'] = dt_parts.isocalendar().week

# Time feature check
feature_cols = ['datetime', 'year', 'month', 'day', 'hour', 'weekday', 'week']
print("\nTime feature check")
print(df[feature_cols].head())



Time feature check
             datetime  year  month  day  hour  weekday  week
0 2025-02-18 14:35:55  2025      2   18    14        1     8
1 2025-03-21 03:19:45  2025      3   21     3        4    12
2 2025-01-30 06:03:00  2025      1   30     6        3     5
3 2025-02-27 00:43:10  2025      2   27     0        3     9
4 2025-01-25 10:25:00  2025      1   25    10        5     4
