In [1]:
from pathlib import Path

In [None]:
# G:\industry_data

In [2]:
# Root folder that is scanned recursively for data files.
# A raw string is used so the Windows backslashes are never read as escape
# sequences ('\i' is an invalid escape and triggers a warning on recent Python).
# To scan the repository-relative copy of the data, use Path('../data') instead.
DATA_DIR = Path(r'G:\industry_data\industry_data')
all_path = DATA_DIR.rglob('*')
# Keep regular files only; directories yielded by rglob are dropped.
file_only = [i for i in all_path if i.is_file()]

In [13]:
class DataProcessor:
    """Helpers for summarising files on disk."""

    @staticmethod
    def format_bytes(size):
        """Render a byte count as a string in B, KB, MB, GB or TB.

        The value is divided by 1024 until it drops below 1024 or the largest
        unit (TB) is reached, then formatted with one decimal place.
        """
        units = ('B', 'KB', 'MB', 'GB', 'TB')
        step = 1024
        last_index = len(units) - 1

        value = size
        index = 0
        while value >= step and index < last_index:
            value /= step
            index += 1

        return f"{value:.1f} {units[index]}"

    @staticmethod
    def calculate_total_size(file_paths):
        """Sum the sizes of the given paths.

        Paths that do not exist are skipped. Returns a tuple
        ``(total_size_in_bytes, number_of_paths_counted)``.
        """
        sizes = [entry.stat().st_size for entry in file_paths if entry.exists()]
        return sum(sizes), len(sizes)


In [15]:
# Report how many files were found and their combined size
size_and_count = DataProcessor.calculate_total_size(file_only)
total_size, file_count = size_and_count
print(f"\n총 파일 수: {file_count}개")
readable_total = DataProcessor.format_bytes(total_size)
print(f"총 크기: {readable_total}")



총 파일 수: 9개
총 크기: 378.5 MB


In [33]:
# Inspect the raw stat result of the first discovered file
# (st_size is the field the size helpers read).
file_only[0].stat()

os.stat_result(st_mode=33206, st_ino=562949955112197, st_dev=1947047632, st_nlink=1, st_uid=0, st_gid=0, st_size=3156156, st_atime=1750223955, st_mtime=1723502703, st_ctime=1750223955)

In [None]:
def make_polars_dataframe(self, paths):
    """Assemble per-file metadata into a table and return it via ``to_pandas()``.

    The table is first built with ``pl.DataFrame`` and then converted, so the
    caller receives the converted (pandas) frame, not the polars one.

    Columns: full_path, file_id, file_name, folder_name, file_size,
    image_width, image_height.

    ``self`` must provide the ``extract_*`` helpers used below; ``paths`` is
    stored as-is in the ``full_path`` column.
    """
    # Each entry is indexed as (width, height).
    resolutions = self.extract_image_resolutions(paths)

    columns = {
        "full_path": paths,
        "file_id": self.extract_file_id(paths),
        "file_name": self.extract_file_name(paths),
        "folder_name": self.extract_folder_name(paths),
        "file_size": self.extract_file_size(paths),
        "image_width": [resolution[0] for resolution in resolutions],
        "image_height": [resolution[1] for resolution in resolutions],
    }
    return pl.DataFrame(columns).to_pandas()

In [34]:
import polars as pl
from PIL import Image

class DataProcessor:
    """Helpers for summarising files and building a per-file metadata table.

    Every ``extract_*`` helper returns exactly one entry per input path, in
    input order, so the resulting lists can be used side by side as columns.
    """

    @staticmethod
    def format_bytes(size):
        """Render a byte count as a string in B, KB, MB, GB or TB.

        The value is divided by 1024 until it drops below 1024 or the largest
        unit (TB) is reached, then formatted with one decimal place.
        """
        volum = 1024
        n = 0
        volum_labels = {0: 'B', 1: 'KB', 2: 'MB', 3: 'GB', 4: 'TB'}
        original_size = size
        while original_size >= volum and n < len(volum_labels) - 1:
            original_size /= volum
            n += 1
        return f"{original_size:.1f} {volum_labels[n]}"

    @staticmethod
    def calculate_total_size(file_paths):
        """Sum the sizes of the given paths.

        Paths that do not exist are skipped. Returns a tuple
        ``(total_size_in_bytes, number_of_paths_counted)``.
        """
        total_size = 0
        file_count = 0

        for file_path in file_paths:
            if file_path.exists():
                file_size = file_path.stat().st_size
                total_size += file_size
                file_count += 1

        return total_size, file_count

    @staticmethod
    def extract_full_path(file_paths):
        """Return each path converted to ``str``."""
        return [str(path) for path in file_paths]

    @staticmethod
    def extract_file_id(file_paths):
        """Return each file's ID: the file name without its final extension."""
        return [path.stem for path in file_paths]

    @staticmethod
    def extract_file_name(file_paths):
        """Return each file's name, extension included."""
        return [path.name for path in file_paths]

    @staticmethod
    def extract_folder_name(file_paths):
        """Return the name of the directory that directly contains each file."""
        return [path.parent.name for path in file_paths]

    @staticmethod
    def extract_file_size(file_paths):
        """Return each file's size in bytes, or ``None`` if the path is missing.

        A missing path yields ``None`` instead of being dropped: dropping it
        would make this list shorter than the other ``extract_*`` results and
        shift every following size onto the wrong file.
        """
        return [
            path.stat().st_size if path.exists() else None
            for path in file_paths
        ]

    @staticmethod
    def extract_image_resolutions(file_paths):
        """Return a ``(width, height)`` tuple for each path.

        A path that cannot be opened as an image is reported on stdout and
        recorded as ``(0, 0)`` so the result stays aligned with the input.
        """
        resolutions = []
        for path in file_paths:
            try:
                with Image.open(path) as img:
                    width, height = img.size
                    resolutions.append((width, height))
            except Exception as e:
                # Best effort: one unreadable file must not abort the whole scan.
                print(f"이미지 읽기 실패 {path}: {e}")
                resolutions.append((0, 0))  # default on failure
        return resolutions

    @staticmethod
    def make_polars_dataframe(file_paths):
        """Build the metadata table with ``pl.DataFrame``.

        Columns: full_path, file_id, file_name, folder_name, file_size,
        image_width, image_height.

        ``file_paths`` may be any iterable of path objects, including a
        one-shot generator.
        """
        # Materialise once: the input is traversed once per column below, and a
        # generator would be empty after the first traversal.
        file_paths = list(file_paths)
        image_width_height = DataProcessor.extract_image_resolutions(file_paths)

        df = pl.DataFrame({
            "full_path": DataProcessor.extract_full_path(file_paths),
            "file_id": DataProcessor.extract_file_id(file_paths),
            "file_name": DataProcessor.extract_file_name(file_paths),
            "folder_name": DataProcessor.extract_folder_name(file_paths),
            "file_size": DataProcessor.extract_file_size(file_paths),
            "image_width": [size[0] for size in image_width_height],
            "image_height": [size[1] for size in image_width_height],
        })
        return df

    @staticmethod
    def make_pandas_dataframe(file_paths):
        """Build the same metadata table and return it via ``to_pandas()``."""
        df = DataProcessor.make_polars_dataframe(file_paths)
        return df.to_pandas()

In [35]:
# Keep only image files, matched case-insensitively by extension
image_extensions = {'.png', '.jpg', '.jpeg', '.gif', '.bmp', '.tiff'}
image_files = [f for f in file_only if f.suffix.lower() in image_extensions]

# Build the metadata table
df = DataProcessor.make_polars_dataframe(image_files)

# Show the result
print("=== 이미지 파일 정보 ===")
print(df)

# Convert the table built above rather than rebuilding it, which would
# re-open every image a second time.
pandas_df = df.to_pandas()
print("\n=== Pandas DataFrame ===")
print(pandas_df)

=== 이미지 파일 정보 ===
shape: (6, 7)
┌────────────────┬─────────┬───────────────┬─────────────┬───────────┬─────────────┬───────────────┐
│ full_path      ┆ file_id ┆ file_name     ┆ folder_name ┆ file_size ┆ image_width ┆ image_height  │
│ ---            ┆ ---     ┆ ---           ┆ ---         ┆ ---       ┆ ---         ┆ ---           │
│ str            ┆ str     ┆ str           ┆ str         ┆ i64       ┆ i64         ┆ i64           │
╞════════════════╪═════════╪═══════════════╪═════════════╪═══════════╪═════════════╪═══════════════╡
│ ..\data\001000 ┆ 0010002 ┆ 0010002       ┆ data        ┆ 3156156   ┆ 1920        ┆ 1080          │
│ 2 copy.png     ┆ copy    ┆ copy.png      ┆             ┆           ┆             ┆               │
│ ..\data\001000 ┆ 0010002 ┆ 0010002.png   ┆ data        ┆ 3156156   ┆ 1920        ┆ 1080          │
│ 2.png          ┆         ┆               ┆             ┆           ┆             ┆               │
│ ..\data\path1\ ┆ 0010002 ┆ 0010002       ┆ path1       ┆ 