In [1]:
from pyspark.sql import SparkSession

# Create (or reuse) the notebook's Spark session, named "ImageDataProcessing",
# with the driver and executor memory settings both set to "4g".
spark = (
    SparkSession.builder
    .appName("ImageDataProcessing")
    .config("spark.driver.memory", "4g")
    .config("spark.executor.memory", "4g")
    .getOrCreate()
)

In [2]:
from datetime import datetime, date
import pandas as pd
from pyspark.sql import Row

In [None]:
from pyspark.sql.functions import col, split, udf
from pyspark.sql.types import StringType, LongType, StructType, StructField
import os
import glob

In [4]:
# Glob pattern matching every top-level entry under the sand data directory.
# (Alternative location kept for reference: './work/*'.)
SAND_DATA_PATTERN = '../data/sand/*'
paths = glob.glob(SAND_DATA_PATTERN)

In [6]:
# Every entry under the data root that is not a .zip archive is treated as an
# image folder.
img_dir_list = [path for path in paths if not path.endswith('.zip')]

# List the direct children of each folder. glob.escape() makes the folder name
# match literally: without it, a folder whose name contains glob
# metacharacters ('*', '?', '[') would be parsed as a pattern and its files
# silently skipped or mismatched.
img_total_list = [glob.glob(glob.escape(folder) + '/*') for folder in img_dir_list]

# Flatten the per-folder lists into one flat list of image paths.
img_paths = [img_path for sublist in img_total_list for img_path in sublist]

In [7]:
def get_file_name(path):
    """Return the final component of ``path`` (the file name with extension)."""
    _parent, leaf = os.path.split(path)
    return leaf

def get_file_id(path):
    """Return the file name of ``path`` with its last extension removed.

    Only the final suffix is stripped, so ``'a/b.tar.gz'`` yields ``'b.tar'``.
    """
    stem, _extension = os.path.splitext(os.path.basename(path))
    return stem
    
def get_folder_name(path):
    """Return the name of the directory that directly contains ``path``."""
    parent_dir, _leaf = os.path.split(path)
    _grandparent, folder = os.path.split(parent_dir)
    return folder

def get_file_size(path):
    """Return the size of the file at ``path`` in bytes.

    Raises ``OSError`` if the file does not exist or cannot be accessed.
    """
    file_stat = os.stat(path)
    return file_stat.st_size



In [8]:
# Wrap the path helpers as Spark UDFs so they can be applied to DataFrame
# columns. The size helper is declared as returning a long; the three
# name-extracting helpers are declared as returning strings.
get_file_size_udf = udf(get_file_size, returnType=LongType())
get_folder_name_udf = udf(get_folder_name, returnType=StringType())
get_file_name_udf = udf(get_file_name, returnType=StringType())
get_file_id_udf = udf(get_file_id, returnType=StringType())

In [9]:
%%time
# Start from a single-column DataFrame of image paths, then derive each
# image's id, file name, containing folder and size through the UDFs.
df = (
    spark.createDataFrame([(img_path,) for img_path in img_paths], ["full_path"])
    .withColumn("file_id", get_file_id_udf("full_path"))
    .withColumn("file_name", get_file_name_udf("full_path"))
    .withColumn("folder_name", get_folder_name_udf("full_path"))
    .withColumn("file_size", get_file_size_udf("full_path"))
)

# Preview the leading rows without truncating the long path strings.
df.show(truncate=False)

+------------------------------------------------+-------+-----------+-----------------------+---------+
|full_path                                       |file_id|file_name  |folder_name            |file_size|
+------------------------------------------------+-------+-----------+-----------------------+---------+
|../data/sand/TS_1.모래입자크기분류_1/0010067.png|0010067|0010067.png|TS_1.모래입자크기분류_1|3018192  |
|../data/sand/TS_1.모래입자크기분류_1/0010049.png|0010049|0010049.png|TS_1.모래입자크기분류_1|3287386  |
|../data/sand/TS_1.모래입자크기분류_1/0010021.png|0010021|0010021.png|TS_1.모래입자크기분류_1|3286182  |
|../data/sand/TS_1.모래입자크기분류_1/0010004.png|0010004|0010004.png|TS_1.모래입자크기분류_1|2930369  |
|../data/sand/TS_1.모래입자크기분류_1/0010094.png|0010094|0010094.png|TS_1.모래입자크기분류_1|3266231  |
|../data/sand/TS_1.모래입자크기분류_1/0010005.png|0010005|0010005.png|TS_1.모래입자크기분류_1|3293061  |
|../data/sand/TS_1.모래입자크기분류_1/0010006.png|0010006|0010006.png|TS_1.모래입자크기분류_1|3143184  |
|../data/sand/TS_1.모래입자크기분류_1/0010057.png|0010057|0010057.png|

In [10]:
# Count the rows of df, i.e. the number of image paths collected above.
df.count()

490755