In [None]:
import pandas as pd
import numpy as np
import glob
import os
import re
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Directory that holds the training CSV files. This is a machine-specific
# absolute path; adjust it when running the notebook elsewhere.
base_path = r'C:\Users\dlwks\OneDrive\바탕 화면\VSCode\HD_AI_Challenge\qhstjs\train'
# glob returns matches in arbitrary order, so sort them: the concatenation
# order and the "last file loaded" seen by later cells become reproducible.
train_paths = sorted(glob.glob(os.path.join(base_path, '*.csv')))

In [None]:
# Extract the target value from the file name.
def load_data(file_path):
    """Read one CSV file and add a ``Target`` column parsed from its file name.

    The target is the run of leading digits in a base name shaped like
    ``<digits>kg_<word>.csv`` (for example ``10kg_run1.csv`` gives 10).

    Parameters
    ----------
    file_path : str
        Path of the CSV file. Only its base name is used to derive the target.

    Returns
    -------
    pandas.DataFrame
        The CSV contents with an extra integer ``Target`` column holding the
        same value on every row.

    Raises
    ------
    ValueError
        If the base name does not match the ``<digits>kg_<word>.csv`` pattern.
    """
    file_name = os.path.basename(file_path)
    name_match = re.match(r'(\d+)kg_\w+\.csv', file_name)
    if name_match is None:
        # Fail with a message that names the offending file instead of the
        # opaque AttributeError raised by calling .group() on None.
        raise ValueError(
            f"Cannot extract target from file name {file_name!r}; "
            "expected a name like '<digits>kg_<word>.csv'"
        )

    loaded_data = pd.read_csv(file_path)

    loaded_data['Target'] = int(name_match.group(1))
    return loaded_data

## 전체 데이터 concat해서 진행

In [None]:
# Stack every training file into one frame. ignore_index=True gives the result
# a fresh 0..N-1 index; without it each file's own row labels are repeated,
# leaving duplicate index labels in all_data.
all_data = pd.concat([load_data(f) for f in train_paths], ignore_index=True)

# Missing-value count per column, then summary statistics of the numeric columns.
print(all_data.isnull().sum())
print(all_data.describe())

In [None]:
# Columns drawn in the overview plots: every signal/sensor channel plus the label.
columns_to_plot = [
    'Signal A', 'Signal B', 'Signal C',
    'Sensor A', 'Sensor B', 'Sensor C', 'Sensor D',
    'Target',
]

# One 30-bin histogram per selected column, over all concatenated files.
overview_frame = all_data[columns_to_plot]
overview_frame.hist(figsize=(15, 10), bins=30)
plt.tight_layout()
plt.show()

In [None]:
# Heatmap of the pairwise correlations (DataFrame.corr()) between the plotted
# columns; center=0 puts the neutral colour of the diverging map at zero.
plt.figure(figsize=(10, 8))
sns.heatmap(all_data[columns_to_plot].corr(), annot=True, cmap='coolwarm', center=0)
# The plotted values are correlations, not a confusion matrix.
plt.title('Correlation matrix')
plt.show()

## 개별 데이터

In [None]:
# Channels inspected per file. 'Target' is left out because load_data assigns a
# single value to every row of a file. A dedicated name is used so the global
# `columns_to_plot` list (which includes 'Target') is not overwritten here.
per_file_columns = ['Signal A', 'Signal B', 'Signal C', 'Sensor A', 'Sensor B', 'Sensor C', 'Sensor D']

for file_path in train_paths:
    data = load_data(file_path)
    file_name = os.path.basename(file_path)

    # Distribution of each channel within this file, labelled with the file name
    # so the figures can be told apart.
    data[per_file_columns].hist(bins=30, figsize=(15, 10))
    plt.tight_layout()
    plt.suptitle(f'{file_name}', y=1.02)
    plt.show()

    # Pairwise correlations between the channels of this file.
    plt.figure(figsize=(12, 10))
    sns.heatmap(data[per_file_columns].corr(), annot=True, cmap='coolwarm', center=0)
    plt.title(f'{file_name}')
    plt.tight_layout()
    plt.show()

In [None]:
for file_path in train_paths:
    data = load_data(file_path)
    file_name = os.path.basename(file_path)

    # Plot how the sensor values change along the first column, which is used
    # as the x-axis (assumed to be the time index -- TODO confirm).
    time_col = data.columns[0]
    sensor_cols = ['Signal A', 'Signal B', 'Signal C', 'Sensor A', 'Sensor B', 'Sensor C', 'Sensor D']

    # All channels share a single large axes so they can be compared directly.
    fig, ax = plt.subplots(figsize=(15, 15))
    for channel in sensor_cols:
        ax.plot(data[time_col], data[channel], label=channel)
    ax.set(
        xlabel='Time',
        ylabel='Sensor Values',
        title=f'Sensor Values Over Time for {file_name}',
    )
    ax.legend()
    plt.show()

In [None]:
for file_path in train_paths:
    data = load_data(file_path)
    file_name = os.path.basename(file_path)

    # Plot how each sensor value changes along the first column, which is used
    # as the x-axis (assumed to be the time index -- TODO confirm).
    time_col = data.columns[0]
    sensor_cols = ['Signal A', 'Signal B', 'Signal C', 'Sensor A', 'Sensor B', 'Sensor C', 'Sensor D']

    # Tick positions are the same for every subplot of a file, so compute them
    # once. Series.min()/max() are vectorised and skip NaN, unlike the builtin
    # min()/max() which iterate element by element in Python.
    time_values = data[time_col]
    x_ticks = np.arange(time_values.min(), time_values.max() + 1, step=3)

    fig, axes = plt.subplots(2, 4, figsize=(20, 10))
    axes = axes.flatten()

    for ax, sensor in zip(axes, sensor_cols):
        ax.plot(time_values, data[sensor])
        ax.set_xlabel('Time')
        ax.set_ylabel('Sensor Values')
        ax.set_title(f'{sensor} Over Time')

        # Place a tick every 3 units and rotate the labels.
        ax.set_xticks(x_ticks)
        ax.set_xticklabels(ax.get_xticks(), rotation=45)

    # Remove the grid cells that have no sensor to show (2x4 grid, 7 channels).
    for unused_ax in axes[len(sensor_cols):]:
        fig.delaxes(unused_ax)

    plt.tight_layout()
    # y slightly above 1 places the figure title above the subplot titles.
    fig.suptitle(f'Sensor Values Over Time for {file_name}', y=1.02)
    plt.show()

In [None]:
from statsmodels.graphics.tsaplots import plot_acf

# NOTE: `data` and `sensor_cols` are the variables left behind by the preceding
# per-file loop, so this cell only analyses the last file that loop loaded.
for sensor in sensor_cols:
    # Create the axes explicitly and hand it to plot_acf. When `ax` is omitted,
    # plot_acf opens its own figure, which leaves an empty 8x6 figure behind and
    # ignores the requested size.
    fig, ax = plt.subplots(figsize=(8, 6))
    plot_acf(data[sensor], lags=50, ax=ax)
    ax.set_title(f'Autocorrelation of {sensor}')
    plt.show()