In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re # 정규식 사용을 위한 모듈
# from scipy.stats import gaussian_kde

import pyarrow.parquet as pq

from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import types as T
from pyspark.sql import Window
from pyspark.sql.functions import collect_list, to_date, month, year, broadcast, element_at, regexp_extract, udf, col, isnull, count, when, substring, coalesce, from_json, size, avg, expr, concat_ws
from pyspark.sql.types import BooleanType, TimestampType, ArrayType, StringType, StructType, StructField, DoubleType, IntegerType, FloatType

from functools import reduce
from matplotlib.ticker import FuncFormatter
from collections import Counter
from datetime import datetime

import shutil

from concurrent.futures import ThreadPoolExecutor

from sklearn.model_selection import train_test_split

import math

In [2]:
# 가상 환경의 Python 경로를 지정
python_path = "C:/Users/admin/anaconda3/envs/my_conda_01/python.exe"

# Spark 세션 생성
spark = SparkSession.builder \
    .appName("analyze multi voicemetadata") \
    .config("spark.driver.memory", "9g") \
    .config("spark.executor.memory", "9g") \
    .config("spark.driver.maxResultSize", "5g") \
    .config("spark.pyspark.python", python_path) \
    .config("spark.local.dir", "D:/spark_tmp") \
    .getOrCreate()

# SparkContext 가져오기
sc = spark.sparkContext

# Spark 중간 파일이 저장되는 경로 확인
current_spark_local_dir = spark.conf.get("spark.local.dir", "Not Set")
print("Spark local directory:", current_spark_local_dir)

Spark local directory: D:/spark_tmp


# 불러오기

In [5]:
july_train_min_file_path = "D:/DATA_PREPROCESS/FIRESTORE_DATAS/12-08-etri-제출버전/android_VOICE_LABELING_FROM_7_TO_10/july_training_minutescript_labeling/*.json"
july_train_word_file_path = "D:/DATA_PREPROCESS/FIRESTORE_DATAS/12-08-etri-제출버전/android_VOICE_LABELING_FROM_7_TO_10/july_training_wordscript_labeling/*.json"
july_val_min_file_path = "D:/DATA_PREPROCESS/FIRESTORE_DATAS/12-08-etri-제출버전/android_VOICE_LABELING_FROM_7_TO_10/july_val_minutescript_labeling/*.json"
july_val_word_file_path = "D:/DATA_PREPROCESS/FIRESTORE_DATAS/12-08-etri-제출버전/android_VOICE_LABELING_FROM_7_TO_10/july_val_wordscript_labeling/*.json"

july_train_min_df = spark.read.json(july_train_min_file_path)
july_train_word_df = spark.read.json(july_train_word_file_path)
july_val_min_df = spark.read.json(july_val_min_file_path)
july_val_word_df = spark.read.json(july_val_word_file_path)

In [6]:
august_train_min_file_path = "D:/DATA_PREPROCESS/FIRESTORE_DATAS/12-08-etri-제출버전/android_VOICE_LABELING_FROM_7_TO_10/august_training_minutescript_labeling/*.json"
august_train_word_file_path = "D:/DATA_PREPROCESS/FIRESTORE_DATAS/12-08-etri-제출버전/android_VOICE_LABELING_FROM_7_TO_10/august_training_wordscript_labeling/*.json"
august_val_min_file_path = "D:/DATA_PREPROCESS/FIRESTORE_DATAS/12-08-etri-제출버전/android_VOICE_LABELING_FROM_7_TO_10/august_val_minutescript_labeling/*.json"
august_val_word_file_path = "D:/DATA_PREPROCESS/FIRESTORE_DATAS/12-08-etri-제출버전/android_VOICE_LABELING_FROM_7_TO_10/august_val_wordscript_labeling/*.json"

august_train_min_df = spark.read.json(august_train_min_file_path)
august_train_word_df = spark.read.json(august_train_word_file_path)
august_val_min_df = spark.read.json(august_val_min_file_path)
august_val_word_df = spark.read.json(august_val_word_file_path)

In [7]:
september_train_min_file_path = "D:/DATA_PREPROCESS/FIRESTORE_DATAS/12-08-etri-제출버전/android_VOICE_LABELING_FROM_7_TO_10/september_training_minutescript_labeling/*.json"
september_train_word_file_path = "D:/DATA_PREPROCESS/FIRESTORE_DATAS/12-08-etri-제출버전/android_VOICE_LABELING_FROM_7_TO_10/september_training_wordscript_labeling/*.json"
september_val_min_file_path = "D:/DATA_PREPROCESS/FIRESTORE_DATAS/12-08-etri-제출버전/android_VOICE_LABELING_FROM_7_TO_10/september_val_minutescript_labeling/*.json"
september_val_word_file_path = "D:/DATA_PREPROCESS/FIRESTORE_DATAS/12-08-etri-제출버전/android_VOICE_LABELING_FROM_7_TO_10/september_val_wordscript_labeling/*.json"

september_train_min_df = spark.read.json(september_train_min_file_path)
september_train_word_df = spark.read.json(september_train_word_file_path)
september_val_min_df = spark.read.json(september_val_min_file_path)
september_val_word_df = spark.read.json(september_val_word_file_path)

In [8]:
october_train_min_file_path = "D:/DATA_PREPROCESS/FIRESTORE_DATAS/12-08-etri-제출버전/android_VOICE_LABELING_FROM_7_TO_10/october_training_minutescript_labeling/*.json"
october_train_word_file_path = "D:/DATA_PREPROCESS/FIRESTORE_DATAS/12-08-etri-제출버전/android_VOICE_LABELING_FROM_7_TO_10/october_training_wordscript_labeling/*.json"
october_val_min_file_path = "D:/DATA_PREPROCESS/FIRESTORE_DATAS/12-08-etri-제출버전/android_VOICE_LABELING_FROM_7_TO_10/october_val_minutescript_labeling/*.json"
october_val_word_file_path = "D:/DATA_PREPROCESS/FIRESTORE_DATAS/12-08-etri-제출버전/android_VOICE_LABELING_FROM_7_TO_10/october_val_wordscript_labeling/*.json"

october_train_min_df = spark.read.json(october_train_min_file_path)
october_train_word_df = spark.read.json(october_train_word_file_path)
october_val_min_df = spark.read.json(october_val_min_file_path)
october_val_word_df = spark.read.json(october_val_word_file_path)

In [7]:
# november_train_min_file_path = "D:/DATA_PREPROCESS/FIRESTORE_DATAS/android_VOICE_LABELING_FROM_7_TO_10/november_training_minutescript_labeling/*.json"
# november_train_word_file_path = "D:/DATA_PREPROCESS/FIRESTORE_DATAS/android_VOICE_LABELING_FROM_7_TO_10/november_training_wordscript_labeling/*.json"
# november_val_min_file_path = "D:/DATA_PREPROCESS/FIRESTORE_DATAS/android_VOICE_LABELING_FROM_7_TO_10/november_val_minutescript_labeling/*.json"
# november_val_word_file_path = "D:/DATA_PREPROCESS/FIRESTORE_DATAS/android_VOICE_LABELING_FROM_7_TO_10/november_val_wordscript_labeling/*.json"

# november_train_min_df = spark.read.json(november_train_min_file_path)
# november_train_word_df = spark.read.json(november_train_word_file_path)
# november_val_min_df = spark.read.json(november_val_min_file_path)
# november_val_word_df = spark.read.json(november_val_word_file_path)

## 월별 음성의 개수가 5개 이하인 유저의 퍼센트 파악 -> 삭제

In [10]:
july_train_min_df.printSchema()

root
 |-- accuracy_array: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- ad_avg_tries: double (nullable = true)
 |-- ad_duration: string (nullable = true)
 |-- ad_link: string (nullable = true)
 |-- ad_name: string (nullable = true)
 |-- ad_script: string (nullable = true)
 |-- ad_title: string (nullable = true)
 |-- average_accuracy_by_ad: double (nullable = true)
 |-- average_accuracy_by_user: double (nullable = true)
 |-- birth_year: string (nullable = true)
 |-- collection: string (nullable = true)
 |-- created_timestamp_array: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- excepted_age_array: string (nullable = true)
 |-- first_created_date: long (nullable = true)
 |-- first_timestamp: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- is_passed: string (nullable = true)
 |-- job: string (nullable = true)
 |-- language: string (nullable = true)
 |-- level: string (nullable = true)
 |-- local_code: string (n

In [11]:
# DataFrame 리스트
dfs = [
    july_train_min_df, july_train_word_df, july_val_min_df, july_val_word_df,
    august_train_min_df, august_train_word_df, august_val_min_df, august_val_word_df,
    september_train_min_df, september_train_word_df, september_val_min_df, september_val_word_df,
    october_train_min_df, october_train_word_df, october_val_min_df, october_val_word_df
]

# 각 DataFrame에 대해 반복
for df in dfs:
    # 유저별 음성 파일 개수 계산
    count_df = df.groupBy("user_id").agg(count("ad_name").alias("audio_count"))

    # 5개 이하인 행 필터링
    less_than_five = count_df.filter(col("audio_count") <= 5)

    # 필터링된 행의 개수
    count_less_than_five = less_than_five.count()

    # 전체 행 개수
    total_count = df.count()

    # 비율 계산
    percentage = (count_less_than_five / total_count) * 100

    # 결과 출력
    print(f"5개 이하인 행의 개수: {count_less_than_five}, 전체 대비 비율: {percentage:.2f}%")

5개 이하인 행의 개수: 927, 전체 대비 비율: 0.20%
5개 이하인 행의 개수: 1727, 전체 대비 비율: 0.12%
5개 이하인 행의 개수: 2136, 전체 대비 비율: 1.41%
5개 이하인 행의 개수: 1310, 전체 대비 비율: 0.28%
5개 이하인 행의 개수: 1066, 전체 대비 비율: 0.23%
5개 이하인 행의 개수: 2043, 전체 대비 비율: 0.11%
5개 이하인 행의 개수: 2423, 전체 대비 비율: 1.55%
5개 이하인 행의 개수: 1944, 전체 대비 비율: 0.32%
5개 이하인 행의 개수: 2364, 전체 대비 비율: 1.27%
5개 이하인 행의 개수: 1666, 전체 대비 비율: 0.07%
5개 이하인 행의 개수: 3866, 전체 대비 비율: 6.24%
5개 이하인 행의 개수: 1275, 전체 대비 비율: 0.17%
5개 이하인 행의 개수: 2220, 전체 대비 비율: 1.22%
5개 이하인 행의 개수: 1715, 전체 대비 비율: 0.08%
5개 이하인 행의 개수: 4462, 전체 대비 비율: 7.37%
5개 이하인 행의 개수: 1354, 전체 대비 비율: 0.18%


In [13]:
# 각 DataFrame에 대해 반복하며 5개 이하인 음성 파일을 가진 유저들의 행을 삭제
for df in dfs:
    # 유저별 음성 파일 개수 계산
    count_df = df.groupBy("user_id").agg(F.count("ad_name").alias("audio_count"))

    # 5개 이하인 유저 ID 필터링
    users_less_than_five = count_df.filter(F.col("audio_count") <= 5).select("user_id")

    # 해당 유저들의 행을 삭제
    df = df.join(users_less_than_five, "user_id", "left_anti")

### 유저별 JSON 파일로 저장

In [None]:
# 경로 설정을 위한 매핑
path_mapping = {
    'july_train_min_df': r'D:\DATA_PREPROCESS\INTEGRATED_DATASETS\MINUTE_SCRIPT\JULY\Training\AMR\라벨링데이터',
    'july_train_word_df': r'D:\DATA_PREPROCESS\INTEGRATED_DATASETS\WORD_SCRIPT\JULY\Training\AMR\라벨링데이터',
    'july_val_min_df': r'D:\DATA_PREPROCESS\INTEGRATED_DATASETS\MINUTE_SCRIPT\JULY\Validation\AMR\라벨링데이터',
    'july_val_word_df': r'D:\DATA_PREPROCESS\INTEGRATED_DATASETS\WORD_SCRIPT\JULY\Validation\AMR\라벨링데이터',
    'august_train_min_df': r'D:\DATA_PREPROCESS\INTEGRATED_DATASETS\MINUTE_SCRIPT\AUGUST\Training\AMR\라벨링데이터',
    'august_train_word_df': r'D:\DATA_PREPROCESS\INTEGRATED_DATASETS\WORD_SCRIPT\AUGUST\Training\AMR\라벨링데이터',
    'august_val_min_df': r'D:\DATA_PREPROCESS\INTEGRATED_DATASETS\MINUTE_SCRIPT\AUGUST\Validation\AMR\라벨링데이터',
    'august_val_word_df': r'D:\DATA_PREPROCESS\INTEGRATED_DATASETS\WORD_SCRIPT\AUGUST\Validation\AMR\라벨링데이터',
    'september_train_min_df': r'D:\DATA_PREPROCESS\INTEGRATED_DATASETS\MINUTE_SCRIPT\SEPTEMBER\Training\AMR\라벨링데이터',
    'september_train_word_df': r'D:\DATA_PREPROCESS\INTEGRATED_DATASETS\WORD_SCRIPT\SEPTEMBER\Training\AMR\라벨링데이터',
    'september_val_min_df': r'D:\DATA_PREPROCESS\INTEGRATED_DATASETS\MINUTE_SCRIPT\SEPTEMBER\Validation\AMR\라벨링데이터',
    'september_val_word_df': r'D:\DATA_PREPROCESS\INTEGRATED_DATASETS\WORD_SCRIPT\SEPTEMBER\Validation\AMR\라벨링데이터',
    'october_train_min_df': r'D:\DATA_PREPROCESS\INTEGRATED_DATASETS\MINUTE_SCRIPT\OCTOBER\Training\AMR\라벨링데이터',
    'october_train_word_df': r'D:\DATA_PREPROCESS\INTEGRATED_DATASETS\WORD_SCRIPT\OCTOBER\Training\AMR\라벨링데이터',
    'october_val_min_df': r'D:\DATA_PREPROCESS\INTEGRATED_DATASETS\MINUTE_SCRIPT\OCTOBER\Validation\AMR\라벨링데이터',
    'october_val_word_df': r'D:\DATA_PREPROCESS\INTEGRATED_DATASETS\WORD_SCRIPT\OCTOBER\Validation\AMR\라벨링데이터'
}

# DataFrame 이름과 객체를 튜플로 매핑
df_tuples = [
    ('july_train_min_df', july_train_min_df),
    ('july_train_word_df', july_train_word_df),
    ('july_val_min_df', july_val_min_df),
    ('july_val_word_df', july_val_word_df),
    ('august_train_min_df', august_train_min_df),
    ('august_train_word_df', august_train_word_df),
    ('august_val_min_df', august_val_min_df),
    ('august_val_word_df', august_val_word_df),
    ('september_train_min_df', september_train_min_df),
    ('september_train_word_df', september_train_word_df),
    ('september_val_min_df', september_val_min_df),
    ('september_val_word_df', september_val_word_df),
    ('october_train_min_df', october_train_min_df),
    ('october_train_word_df', october_train_word_df),
    ('october_val_min_df', october_val_min_df),
    ('october_val_word_df', october_val_word_df)
]

# 각 DataFrame에 대해 반복하며 유저별로 JSON 파일로 저장
for df_name, df in df_tuples:
    # 해당 DataFrame의 경로 가져오기
    base_path = path_mapping[df_name]

    # 데이터를 JSON 형태로 변환하여 파일에 저장
    df.write.mode('overwrite').partitionBy("user_id").json(base_path)

# '유저id 폴더'/'광고명 파일' 구조에서 'voice_id 파일'로 바꾸기

In [7]:
july_train_min_target_path = f"D:\\DATA_PREPROCESS\\FIRESTORE_DATAS\\android_VOICE_DATA_FROM_7_TO_10\\JULY\\training_minutescript"
july_train_word_target_path = f"D:\\DATA_PREPROCESS\\FIRESTORE_DATAS\\android_VOICE_DATA_FROM_7_TO_10\\JULY\\training_wordscript"
july_val_min_target_path = f"D:\\DATA_PREPROCESS\\FIRESTORE_DATAS\\android_VOICE_DATA_FROM_7_TO_10\\JULY\\validation_minutescript"
july_val_word_target_path = f"D:\\DATA_PREPROCESS\\FIRESTORE_DATAS\\android_VOICE_DATA_FROM_7_TO_10\\JULY\\validation_wordscript"

august_train_min_target_path = f"D:\\DATA_PREPROCESS\\FIRESTORE_DATAS\\android_VOICE_DATA_FROM_7_TO_10\\AUGUST\\training_minutescript"
august_train_word_target_path = f"D:\\DATA_PREPROCESS\\FIRESTORE_DATAS\\android_VOICE_DATA_FROM_7_TO_10\\AUGUST\\training_wordscript"
august_val_min_target_path = f"D:\\DATA_PREPROCESS\\FIRESTORE_DATAS\\android_VOICE_DATA_FROM_7_TO_10\\AUGUST\\validation_minutescript"
august_val_word_target_path = f"D:\\DATA_PREPROCESS\\FIRESTORE_DATAS\\android_VOICE_DATA_FROM_7_TO_10\\AUGUST\\validation_wordscript"

september_train_min_target_path = f"D:\\DATA_PREPROCESS\\FIRESTORE_DATAS\\android_VOICE_DATA_FROM_7_TO_10\\SEPTEMBER\\training_minutescript"
september_train_word_target_path = f"D:\\DATA_PREPROCESS\\FIRESTORE_DATAS\\android_VOICE_DATA_FROM_7_TO_10\\SEPTEMBER\\training_wordscript"
september_val_min_target_path = f"D:\\DATA_PREPROCESS\\FIRESTORE_DATAS\\android_VOICE_DATA_FROM_7_TO_10\\SEPTEMBER\\validation_minutescript"
september_val_word_target_path = f"D:\\DATA_PREPROCESS\\FIRESTORE_DATAS\\android_VOICE_DATA_FROM_7_TO_10\\SEPTEMBER\\validation_wordscript"

october_train_min_target_path = f"D:\\DATA_PREPROCESS\\FIRESTORE_DATAS\\android_VOICE_DATA_FROM_7_TO_10\\OCTOBER\\training_minutescript"
october_train_word_target_path = f"D:\\DATA_PREPROCESS\\FIRESTORE_DATAS\\android_VOICE_DATA_FROM_7_TO_10\\OCTOBER\\training_wordscript"
october_val_min_target_path = f"D:\\DATA_PREPROCESS\\FIRESTORE_DATAS\\android_VOICE_DATA_FROM_7_TO_10\\OCTOBER\\validation_minutescript"
october_val_word_target_path = f"D:\\DATA_PREPROCESS\\FIRESTORE_DATAS\\android_VOICE_DATA_FROM_7_TO_10\\OCTOBER\\validation_wordscript"


In [8]:
def rename_files(df, target_path):
    # DataFrame에서 필요한 정보 추출
    info = df.select("user_id", "ad_name", "voice_id").collect()

    # 각 레코드에 대한 파일명 변경
    for row in info:
        old_path = os.path.join(target_path, row.user_id, row.ad_name)
        new_path = os.path.join(target_path, row.user_id, row.voice_id)

        if os.path.exists(old_path):
            os.rename(old_path, new_path)

In [6]:
rename_files(july_train_min_df, july_train_min_target_path)

In [9]:
# rename_files(july_train_word_df, july_train_word_target_path)
# rename_files(july_val_min_df, july_val_min_target_path)
# rename_files(july_val_word_df, july_val_word_target_path)

rename_files(august_train_min_df, august_train_min_target_path)
rename_files(august_train_word_df, august_train_word_target_path)
rename_files(august_val_min_df, august_val_min_target_path)
rename_files(august_val_word_df, august_val_word_target_path)

rename_files(september_train_min_df, september_train_min_target_path)
rename_files(september_train_word_df, september_train_word_target_path)
rename_files(september_val_min_df, september_val_min_target_path)
rename_files(september_val_word_df, september_val_word_target_path)

rename_files(october_train_min_df, october_train_min_target_path)
rename_files(october_train_word_df, october_train_word_target_path)
rename_files(october_val_min_df, october_val_min_target_path)
rename_files(october_val_word_df, october_val_word_target_path)

## 유저명 폴더 없에고 그 안의 파일들을 상위폴더로 이동

In [10]:
def move_files_and_remove_folder(root_path):
    # root_path 내의 모든 폴더를 순회
    for user_folder in os.listdir(root_path):
        user_folder_path = os.path.join(root_path, user_folder)

        # 폴더인지 확인
        if os.path.isdir(user_folder_path):
            # 해당 폴더 내의 모든 파일에 대해
            for file_name in os.listdir(user_folder_path):
                file_path = os.path.join(user_folder_path, file_name)
                new_path = os.path.join(root_path, file_name)

                # 파일 이동
                shutil.move(file_path, new_path)

            # 빈 폴더 삭제
            os.rmdir(user_folder_path)

In [11]:
move_files_and_remove_folder("D:\\DATA_PREPROCESS\\FIRESTORE_DATAS\\android_VOICE_DATA_FROM_7_TO_10\\JULY\\training_minutescript")

In [13]:
# 7월 데이터에 대한 작업
move_files_and_remove_folder("D:\\DATA_PREPROCESS\\FIRESTORE_DATAS\\android_VOICE_DATA_FROM_7_TO_10\\JULY\\training_wordscript")
move_files_and_remove_folder("D:\\DATA_PREPROCESS\\FIRESTORE_DATAS\\android_VOICE_DATA_FROM_7_TO_10\\JULY\\validation_minutescript")
move_files_and_remove_folder("D:\\DATA_PREPROCESS\\FIRESTORE_DATAS\\android_VOICE_DATA_FROM_7_TO_10\\JULY\\validation_wordscript")

# 8월 데이터에 대한 작업
move_files_and_remove_folder("D:\\DATA_PREPROCESS\\FIRESTORE_DATAS\\android_VOICE_DATA_FROM_7_TO_10\\AUGUST\\training_minutescript")
move_files_and_remove_folder("D:\\DATA_PREPROCESS\\FIRESTORE_DATAS\\android_VOICE_DATA_FROM_7_TO_10\\AUGUST\\training_wordscript")
move_files_and_remove_folder("D:\\DATA_PREPROCESS\\FIRESTORE_DATAS\\android_VOICE_DATA_FROM_7_TO_10\\AUGUST\\validation_minutescript")
move_files_and_remove_folder("D:\\DATA_PREPROCESS\\FIRESTORE_DATAS\\android_VOICE_DATA_FROM_7_TO_10\\AUGUST\\validation_wordscript")

# 9월 데이터에 대한 작업
move_files_and_remove_folder("D:\\DATA_PREPROCESS\\FIRESTORE_DATAS\\android_VOICE_DATA_FROM_7_TO_10\\SEPTEMBER\\training_minutescript")
move_files_and_remove_folder("D:\\DATA_PREPROCESS\\FIRESTORE_DATAS\\android_VOICE_DATA_FROM_7_TO_10\\SEPTEMBER\\training_wordscript")
move_files_and_remove_folder("D:\\DATA_PREPROCESS\\FIRESTORE_DATAS\\android_VOICE_DATA_FROM_7_TO_10\\SEPTEMBER\\validation_minutescript")
move_files_and_remove_folder("D:\\DATA_PREPROCESS\\FIRESTORE_DATAS\\android_VOICE_DATA_FROM_7_TO_10\\SEPTEMBER\\validation_wordscript")

# 10월 데이터에 대한 작업
move_files_and_remove_folder("D:\\DATA_PREPROCESS\\FIRESTORE_DATAS\\android_VOICE_DATA_FROM_7_TO_10\\OCTOBER\\training_minutescript")
move_files_and_remove_folder("D:\\DATA_PREPROCESS\\FIRESTORE_DATAS\\android_VOICE_DATA_FROM_7_TO_10\\OCTOBER\\training_wordscript")
move_files_and_remove_folder("D:\\DATA_PREPROCESS\\FIRESTORE_DATAS\\android_VOICE_DATA_FROM_7_TO_10\\OCTOBER\\validation_minutescript")
move_files_and_remove_folder("D:\\DATA_PREPROCESS\\FIRESTORE_DATAS\\android_VOICE_DATA_FROM_7_TO_10\\OCTOBER\\validation_wordscript")

# 각각의 데이터 뽑기

In [8]:
# 후보 2
def copy_file(source, target):
    try:
        shutil.copy(source, target)
        return 1  # 성공한 경우
    except Exception as e:
        return 0  # 실패한 경우

def find_and_copy_files(source_path, target_path, df):
    # # 결과 저장을 위한 카운터
    # results = []

    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = []

        for row in df.collect():
            user_id = row['user_id']
            ad_name = row['ad_name']
            user_folder_path = os.path.join(source_path, user_id)
            ad_file_path = os.path.join(user_folder_path, ad_name)
            target_dir = os.path.join(target_path, user_id)

            if not os.path.exists(target_dir):
                os.makedirs(target_dir)

            # 병렬 처리를 위한 작업 추가
            if os.path.isfile(ad_file_path):
                futures.append(executor.submit(copy_file, ad_file_path, target_dir))

        for future in futures:
            future.result()


In [9]:
july_source_path = f"D:\\DATA_PREPROCESS\\FIRESTORE_DATAS\\android_VOICE_DATA_FROM_7_TO_10\\voice_files_from_july"

july_train_min_target_path = f"D:\\DATA_PREPROCESS\\FIRESTORE_DATAS\\android_VOICE_DATA_FROM_7_TO_10\\JULY\\training_minutescript"
july_train_word_target_path = f"D:\\DATA_PREPROCESS\\FIRESTORE_DATAS\\android_VOICE_DATA_FROM_7_TO_10\\JULY\\training_wordscript"
july_val_min_target_path = f"D:\\DATA_PREPROCESS\\FIRESTORE_DATAS\\android_VOICE_DATA_FROM_7_TO_10\\JULY\\validation_minutescript"
july_val_word_target_path = f"D:\\DATA_PREPROCESS\\FIRESTORE_DATAS\\android_VOICE_DATA_FROM_7_TO_10\\JULY\\validation_wordscript"

find_and_copy_files(july_source_path, july_train_min_target_path, july_train_min_df)
find_and_copy_files(july_source_path, july_train_word_target_path, july_train_word_df)
find_and_copy_files(july_source_path, july_val_min_target_path, july_val_min_df)
find_and_copy_files(july_source_path, july_val_word_target_path, july_val_word_df)

In [11]:
august_source_path = f"D:\\DATA_PREPROCESS\\FIRESTORE_DATAS\\android_VOICE_DATA_FROM_7_TO_10\\voice_files_from_august"

august_train_min_target_path = f"D:\\DATA_PREPROCESS\\FIRESTORE_DATAS\\android_VOICE_DATA_FROM_7_TO_10\\AUGUST\\training_minutescript"
august_train_word_target_path = f"D:\\DATA_PREPROCESS\\FIRESTORE_DATAS\\android_VOICE_DATA_FROM_7_TO_10\\AUGUST\\training_wordscript"
august_val_min_target_path = f"D:\\DATA_PREPROCESS\\FIRESTORE_DATAS\\android_VOICE_DATA_FROM_7_TO_10\\AUGUST\\validation_minutescript"
august_val_word_target_path = f"D:\\DATA_PREPROCESS\\FIRESTORE_DATAS\\android_VOICE_DATA_FROM_7_TO_10\\AUGUST\\validation_wordscript"

find_and_copy_files(august_source_path, august_train_min_target_path, august_train_min_df)
find_and_copy_files(august_source_path, august_train_word_target_path, august_train_word_df)
find_and_copy_files(august_source_path, august_val_min_target_path, august_val_min_df)
find_and_copy_files(august_source_path, august_val_word_target_path, august_val_word_df)

In [13]:
september_source_path = f"D:\\DATA_PREPROCESS\\FIRESTORE_DATAS\\android_VOICE_DATA_FROM_7_TO_10\\voice_files_from_september"

september_train_min_target_path = f"D:\\DATA_PREPROCESS\\FIRESTORE_DATAS\\android_VOICE_DATA_FROM_7_TO_10\\SEPTEMBER\\training_minutescript"
september_train_word_target_path = f"D:\\DATA_PREPROCESS\\FIRESTORE_DATAS\\android_VOICE_DATA_FROM_7_TO_10\\SEPTEMBER\\training_wordscript"
september_val_min_target_path = f"D:\\DATA_PREPROCESS\\FIRESTORE_DATAS\\android_VOICE_DATA_FROM_7_TO_10\\SEPTEMBER\\validation_minutescript"
september_val_word_target_path = f"D:\\DATA_PREPROCESS\\FIRESTORE_DATAS\\android_VOICE_DATA_FROM_7_TO_10\\SEPTEMBER\\validation_wordscript"

find_and_copy_files(september_source_path, september_train_min_target_path, september_train_min_df)
find_and_copy_files(september_source_path, september_train_word_target_path, september_train_word_df)
find_and_copy_files(september_source_path, september_val_min_target_path, september_val_min_df)
find_and_copy_files(september_source_path, september_val_word_target_path, september_val_word_df)

In [12]:
october_source_path = f"D:\\DATA_PREPROCESS\\FIRESTORE_DATAS\\android_VOICE_DATA_FROM_7_TO_10\\voice_files_from_october"

october_train_min_target_path = f"D:\\DATA_PREPROCESS\\FIRESTORE_DATAS\\android_VOICE_DATA_FROM_7_TO_10\\OCTOBER\\training_minutescript"
october_train_word_target_path = f"D:\\DATA_PREPROCESS\\FIRESTORE_DATAS\\android_VOICE_DATA_FROM_7_TO_10\\OCTOBER\\training_wordscript"
october_val_min_target_path = f"D:\\DATA_PREPROCESS\\FIRESTORE_DATAS\\android_VOICE_DATA_FROM_7_TO_10\\OCTOBER\\validation_minutescript"
october_val_word_target_path = f"D:\\DATA_PREPROCESS\\FIRESTORE_DATAS\\android_VOICE_DATA_FROM_7_TO_10\\OCTOBER\\validation_wordscript"

find_and_copy_files(october_source_path, october_train_min_target_path, october_train_min_df)
find_and_copy_files(october_source_path, october_train_word_target_path, october_train_word_df)
find_and_copy_files(october_source_path, october_val_min_target_path, october_val_min_df)
find_and_copy_files(october_source_path, october_val_word_target_path, october_val_word_df)