In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re # 정규식 사용을 위한 모듈
# from scipy.stats import gaussian_kde

import pyarrow.parquet as pq

from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import types as T
from pyspark.sql import Window
from pyspark.sql.functions import collect_list, to_date, month, year, broadcast, element_at, regexp_extract, udf, col, isnull, count, when, substring, coalesce, from_json, size, avg, expr, concat_ws
from pyspark.sql.types import BooleanType, TimestampType, ArrayType, StringType, StructType, StructField, DoubleType, IntegerType, FloatType

from functools import reduce
from matplotlib.ticker import FuncFormatter
from collections import Counter
from datetime import datetime

import shutil

from concurrent.futures import ThreadPoolExecutor

from sklearn.model_selection import train_test_split

import math

In [2]:
# 가상 환경의 Python 경로를 지정
python_path = "C:/Users/admin/anaconda3/envs/my_conda_01/python.exe"

# Spark 세션 생성
spark = SparkSession.builder \
    .appName("analyze multi voicemetadata") \
    .config("spark.driver.memory", "9g") \
    .config("spark.executor.memory", "9g") \
    .config("spark.driver.maxResultSize", "5g") \
    .config("spark.pyspark.python", python_path) \
    .config("spark.local.dir", "D:/spark_tmp") \
    .getOrCreate()

# SparkContext 가져오기
sc = spark.sparkContext

# Spark 중간 파일이 저장되는 경로 확인
current_spark_local_dir = spark.conf.get("spark.local.dir", "Not Set")
print("Spark local directory:", current_spark_local_dir)

Spark local directory: D:/spark_tmp


# 불러오기

In [None]:
july_train_min_file_path = "D:/DATA_PREPROCESS/INTEGRATED_DATASETS/new_화자별/JULY/Training/AMR/라벨링데이터/*.json"
july_train_word_file_path = "D:/DATA_PREPROCESS/FIRESTORE_DATAS/12-08-etri-제출버전/android_VOICE_LABELING_FROM_7_TO_10/july_training_wordscript_labeling/*.json"
july_val_min_file_path = "D:/DATA_PREPROCESS/INTEGRATED_DATASETS/new_화자별/JULY/Validation/AMR/라벨링데이터/*.json"
july_val_word_file_path = "D:/DATA_PREPROCESS/FIRESTORE_DATAS/12-08-etri-제출버전/android_VOICE_LABELING_FROM_7_TO_10/july_val_wordscript_labeling/*.json"

july_train_min_df = spark.read.json(july_train_min_file_path)
july_train_word_df = spark.read.json(july_train_word_file_path)
july_val_min_df = spark.read.json(july_val_min_file_path)
july_val_word_df = spark.read.json(july_val_word_file_path)

august_train_min_file_path = "D:/DATA_PREPROCESS/INTEGRATED_DATASETS/new_화자별/AUGUST/Training/AMR/라벨링데이터/*.json"
august_train_word_file_path = "D:/DATA_PREPROCESS/FIRESTORE_DATAS/12-08-etri-제출버전/android_VOICE_LABELING_FROM_7_TO_10/august_training_wordscript_labeling/*.json"
august_val_min_file_path = "D:/DATA_PREPROCESS/INTEGRATED_DATASETS/new_화자별/AUGUST/Validation//AMR/라벨링데이터/*.json"
august_val_word_file_path = "D:/DATA_PREPROCESS/FIRESTORE_DATAS/12-08-etri-제출버전/android_VOICE_LABELING_FROM_7_TO_10/august_val_wordscript_labeling/*.json"

august_train_min_df = spark.read.json(august_train_min_file_path)
august_train_word_df = spark.read.json(august_train_word_file_path)
august_val_min_df = spark.read.json(august_val_min_file_path)
august_val_word_df = spark.read.json(august_val_word_file_path)

september_train_min_file_path = "D:/DATA_PREPROCESS/INTEGRATED_DATASETS/new_화자별/SEPTEMBER/Training/AMR/라벨링데이터/*.json"
september_train_word_file_path = "D:/DATA_PREPROCESS/FIRESTORE_DATAS/12-08-etri-제출버전/android_VOICE_LABELING_FROM_7_TO_10/september_training_wordscript_labeling/*.json"
september_val_min_file_path = "D:/DATA_PREPROCESS/INTEGRATED_DATASETS/new_화자별/SEPTEMBER/Validation//AMR/라벨링데이터/*.json"
september_val_word_file_path = "D:/DATA_PREPROCESS/FIRESTORE_DATAS/12-08-etri-제출버전/android_VOICE_LABELING_FROM_7_TO_10/september_val_wordscript_labeling/*.json"

september_train_min_df = spark.read.json(september_train_min_file_path)
september_train_word_df = spark.read.json(september_train_word_file_path)
september_val_min_df = spark.read.json(september_val_min_file_path)
september_val_word_df = spark.read.json(september_val_word_file_path)

october_train_min_file_path = "D:/DATA_PREPROCESS/INTEGRATED_DATASETS/new_화자별/SEPTEMBER/Training/AMR/라벨링데이터/*.json"
october_train_word_file_path = "D:/DATA_PREPROCESS/FIRESTORE_DATAS/12-08-etri-제출버전/android_VOICE_LABELING_FROM_7_TO_10/october_training_wordscript_labeling/*.json"
october_val_min_file_path = "D:/DATA_PREPROCESS/INTEGRATED_DATASETS/new_화자별/SEPTEMBER/Validation//AMR/라벨링데이터/*.json"
october_val_word_file_path = "D:/DATA_PREPROCESS/FIRESTORE_DATAS/12-08-etri-제출버전/android_VOICE_LABELING_FROM_7_TO_10/october_val_wordscript_labeling/*.json"

october_train_min_df = spark.read.json(october_train_min_file_path)
october_train_word_df = spark.read.json(october_train_word_file_path)
october_val_min_df = spark.read.json(october_val_min_file_path)
october_val_word_df = spark.read.json(october_val_word_file_path)

In [3]:
july_train_min_file_path = "D:/DATA_PREPROCESS/FIRESTORE_DATAS/12-08-etri-제출버전/android_VOICE_LABELING_FROM_7_TO_10/july_training_minutescript_labeling/*.json"
july_train_word_file_path = "D:/DATA_PREPROCESS/FIRESTORE_DATAS/12-08-etri-제출버전/android_VOICE_LABELING_FROM_7_TO_10/july_training_wordscript_labeling/*.json"
july_val_min_file_path = "D:/DATA_PREPROCESS/FIRESTORE_DATAS/12-08-etri-제출버전/android_VOICE_LABELING_FROM_7_TO_10/july_val_minutescript_labeling/*.json"
july_val_word_file_path = "D:/DATA_PREPROCESS/FIRESTORE_DATAS/12-08-etri-제출버전/android_VOICE_LABELING_FROM_7_TO_10/july_val_wordscript_labeling/*.json"

july_train_min_df = spark.read.json(july_train_min_file_path)
july_train_word_df = spark.read.json(july_train_word_file_path)
july_val_min_df = spark.read.json(july_val_min_file_path)
july_val_word_df = spark.read.json(july_val_word_file_path)

In [4]:
august_train_min_file_path = "D:/DATA_PREPROCESS/FIRESTORE_DATAS/12-08-etri-제출버전/android_VOICE_LABELING_FROM_7_TO_10/august_training_minutescript_labeling/*.json"
august_train_word_file_path = "D:/DATA_PREPROCESS/FIRESTORE_DATAS/12-08-etri-제출버전/android_VOICE_LABELING_FROM_7_TO_10/august_training_wordscript_labeling/*.json"
august_val_min_file_path = "D:/DATA_PREPROCESS/FIRESTORE_DATAS/12-08-etri-제출버전/android_VOICE_LABELING_FROM_7_TO_10/august_val_minutescript_labeling/*.json"
august_val_word_file_path = "D:/DATA_PREPROCESS/FIRESTORE_DATAS/12-08-etri-제출버전/android_VOICE_LABELING_FROM_7_TO_10/august_val_wordscript_labeling/*.json"

august_train_min_df = spark.read.json(august_train_min_file_path)
august_train_word_df = spark.read.json(august_train_word_file_path)
august_val_min_df = spark.read.json(august_val_min_file_path)
august_val_word_df = spark.read.json(august_val_word_file_path)

In [5]:
september_train_min_file_path = "D:/DATA_PREPROCESS/FIRESTORE_DATAS/12-08-etri-제출버전/android_VOICE_LABELING_FROM_7_TO_10/september_training_minutescript_labeling/*.json"
september_train_word_file_path = "D:/DATA_PREPROCESS/FIRESTORE_DATAS/12-08-etri-제출버전/android_VOICE_LABELING_FROM_7_TO_10/september_training_wordscript_labeling/*.json"
september_val_min_file_path = "D:/DATA_PREPROCESS/FIRESTORE_DATAS/12-08-etri-제출버전/android_VOICE_LABELING_FROM_7_TO_10/september_val_minutescript_labeling/*.json"
september_val_word_file_path = "D:/DATA_PREPROCESS/FIRESTORE_DATAS/12-08-etri-제출버전/android_VOICE_LABELING_FROM_7_TO_10/september_val_wordscript_labeling/*.json"

september_train_min_df = spark.read.json(september_train_min_file_path)
september_train_word_df = spark.read.json(september_train_word_file_path)
september_val_min_df = spark.read.json(september_val_min_file_path)
september_val_word_df = spark.read.json(september_val_word_file_path)

In [6]:
october_train_min_file_path = "D:/DATA_PREPROCESS/FIRESTORE_DATAS/12-08-etri-제출버전/android_VOICE_LABELING_FROM_7_TO_10/october_training_minutescript_labeling/*.json"
october_train_word_file_path = "D:/DATA_PREPROCESS/FIRESTORE_DATAS/12-08-etri-제출버전/android_VOICE_LABELING_FROM_7_TO_10/october_training_wordscript_labeling/*.json"
october_val_min_file_path = "D:/DATA_PREPROCESS/FIRESTORE_DATAS/12-08-etri-제출버전/android_VOICE_LABELING_FROM_7_TO_10/october_val_minutescript_labeling/*.json"
october_val_word_file_path = "D:/DATA_PREPROCESS/FIRESTORE_DATAS/12-08-etri-제출버전/android_VOICE_LABELING_FROM_7_TO_10/october_val_wordscript_labeling/*.json"

october_train_min_df = spark.read.json(october_train_min_file_path)
october_train_word_df = spark.read.json(october_train_word_file_path)
october_val_min_df = spark.read.json(october_val_min_file_path)
october_val_word_df = spark.read.json(october_val_word_file_path)

In [7]:
# november_train_min_file_path = "D:/DATA_PREPROCESS/FIRESTORE_DATAS/android_VOICE_LABELING_FROM_7_TO_10/november_training_minutescript_labeling/*.json"
# november_train_word_file_path = "D:/DATA_PREPROCESS/FIRESTORE_DATAS/android_VOICE_LABELING_FROM_7_TO_10/november_training_wordscript_labeling/*.json"
# november_val_min_file_path = "D:/DATA_PREPROCESS/FIRESTORE_DATAS/android_VOICE_LABELING_FROM_7_TO_10/november_val_minutescript_labeling/*.json"
# november_val_word_file_path = "D:/DATA_PREPROCESS/FIRESTORE_DATAS/android_VOICE_LABELING_FROM_7_TO_10/november_val_wordscript_labeling/*.json"

# november_train_min_df = spark.read.json(november_train_min_file_path)
# november_train_word_df = spark.read.json(november_train_word_file_path)
# november_val_min_df = spark.read.json(november_val_min_file_path)
# november_val_word_df = spark.read.json(november_val_word_file_path)

In [7]:
# 저장을 용이하게 하기 위해 spark df들을 pandas df들로 변환

july_train_min_pd_df = july_train_min_df.toPandas()
july_train_word_pd_df = july_train_word_df.toPandas()
july_val_min_pd_df = july_val_min_df.toPandas()
july_val_word_pd_df = july_val_word_df.toPandas()
august_train_min_pd_df = august_train_min_df.toPandas()
august_train_word_pd_df = august_train_word_df.toPandas()
august_val_min_pd_df = august_val_min_df.toPandas()
august_val_word_pd_df = august_val_word_df.toPandas()
september_train_min_pd_df = september_train_min_df.toPandas()
september_train_word_pd_df = september_train_word_df.toPandas()
september_val_min_pd_df = september_val_min_df.toPandas()
september_val_word_pd_df = september_val_word_df.toPandas()
october_train_min_pd_df = october_train_min_df.toPandas()
october_train_word_pd_df = october_train_word_df.toPandas()
october_val_min_pd_df = october_val_min_df.toPandas()
october_val_word_pd_df = october_val_word_df.toPandas()

# 음성파일 개수와 attend 개수 차이 원인 파악 
- pyspark df 기준

In [7]:
july_train_min_df.printSchema()

root
 |-- accuracy_array: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- ad_avg_tries: double (nullable = true)
 |-- ad_duration: string (nullable = true)
 |-- ad_link: string (nullable = true)
 |-- ad_name: string (nullable = true)
 |-- ad_script: string (nullable = true)
 |-- ad_title: string (nullable = true)
 |-- average_accuracy_by_ad: double (nullable = true)
 |-- average_accuracy_by_user: double (nullable = true)
 |-- birth_year: string (nullable = true)
 |-- collection: string (nullable = true)
 |-- created_timestamp_array: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- excepted_age_array: string (nullable = true)
 |-- first_created_date: long (nullable = true)
 |-- first_timestamp: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- is_passed: string (nullable = true)
 |-- job: string (nullable = true)
 |-- language: string (nullable = true)
 |-- level: string (nullable = true)
 |-- local_code: string (n

In [10]:
july_val_min_df.count()

151945

In [11]:
# accuracy_array의 마지막 원소가 80 이하인 행을 필터링
filtered_df = july_val_min_df.filter(
    (F.col("accuracy_array")[F.size("accuracy_array") - 1] <= 80)
)

# 필터링된 데이터프레임의 행 개수를 계산
count = filtered_df.count()

print("80% 이하의 정확도를 가진 마지막 원소를 갖는 행의 개수:", count)

80% 이하의 정확도를 가진 마지막 원소를 갖는 행의 개수: 7714


# 문서작업용

In [54]:
memor_df = pd.concat([july_train_word_pd_df, july_val_word_pd_df, august_train_word_pd_df, august_val_word_pd_df, september_train_word_pd_df, september_val_word_pd_df, october_train_word_pd_df, october_val_word_pd_df], axis=0)

In [8]:
point_df = pd.concat([july_train_min_pd_df, july_val_min_pd_df, august_train_min_pd_df, august_val_min_pd_df, september_train_min_pd_df, september_val_min_pd_df, october_train_min_pd_df, october_val_min_pd_df], axis=0)

In [66]:
# 원본 데이터프레임에서 ad_name 별 행의 개수를 계산
ad_name_counts = memor_df['ad_name'].value_counts()
# 원본 데이터프레임에 'num' 컬럼으로 ad_name 별 행의 개수 추가
memor_df['num'] = memor_df['ad_name'].map(ad_name_counts)

# 'ad_name' 컬럼을 기준으로 중복 제거
unique_memor_df = memor_df.drop_duplicates(subset=['ad_name'])
# 선택된 컬럼만 포함하는 최종 데이터프레임 생성 (이미 'num' 컬럼이 추가된 상태)
selected_columns = ['ad_name', 'ad_duration', 'ad_script', 'num']


# 선택된 컬럼만 포함하는 최종 데이터프레임 생성
unique_memor_df = unique_memor_df[selected_columns]

# 파일 저장 경로 설정
save_path = 'C:/Users/admin/Desktop/Vowing 앱 관련/VOWING_DATA_Info/new_amr_음성전사데이터_단어말하기_7월부터10월.xlsx'

# 엑셀 파일로 저장
unique_memor_df.to_excel(save_path, index=False)

# 저장 완료 메시지 출력
print(f"파일이 {save_path}에 저장되었습니다.")

파일이 C:/Users/admin/Desktop/Vowing 앱 관련/VOWING_DATA_Info/new_amr_음성전사데이터_단어말하기_7월부터10월.xlsx에 저장되었습니다.


In [9]:
# 원본 데이터프레임에서 ad_name 별 행의 개수를 계산
ad_name_counts = point_df['ad_name'].value_counts()
# 원본 데이터프레임에 'num' 컬럼으로 ad_name 별 행의 개수 추가
point_df['num'] = point_df['ad_name'].map(ad_name_counts)

# 'ad_name' 컬럼을 기준으로 중복 제거
unique_point_df = point_df.drop_duplicates(subset=['ad_name'])
# 선택된 컬럼만 포함하는 최종 데이터프레임 생성 (이미 'num' 컬럼이 추가된 상태)
selected_columns = ['ad_name', 'ad_duration', 'ad_script', 'num']


# 선택된 컬럼만 포함하는 최종 데이터프레임 생성
unique_point_df = unique_point_df[selected_columns]

# 파일 저장 경로 설정
save_path = 'C:/Users/admin/Desktop/Vowing 앱 관련/VOWING_DATA_Info/new_amr_음성전사데이터_1분말하기_7월부터10월.xlsx'

# 엑셀 파일로 저장
unique_point_df.to_excel(save_path, index=False)

# 저장 완료 메시지 출력
print(f"파일이 {save_path}에 저장되었습니다.")

파일이 C:/Users/admin/Desktop/Vowing 앱 관련/VOWING_DATA_Info/new_amr_음성전사데이터_1분말하기_7월부터10월.xlsx에 저장되었습니다.


In [61]:
july_train_word_pd_df.head(10)

Unnamed: 0,accuracy_array,ad_avg_tries,ad_duration,ad_link,ad_name,ad_script,ad_title,average_accuracy_by_ad,average_accuracy_by_user,birth_year,...,participant_count,pass_rate_by_ad,pass_rate_by_user,perfect,stt_text_array,text_length,try_count,user_avg_tries,user_id,voice_id
0,[100],1.086379,23.07.11 - 23.07.11,https://msearch.shopping.naver.com/product/863...,1+1 티블레스 스위트티 퍼퓸 바디로션 480ml 플로럴 향기 향 좋은 여름 대용량...,촉촉한바디,암기플러스,90.957187,94.44272,81,...,655.0,99.667774,99.79643,1986.0,[촉촉한 바디],5,1,1.062011,01YMZJPSJAWojkigVF0EKIlHdup2,voice_0000000709
1,[100],1.127397,23.07.12 - 23.07.12,https://msearch.shopping.naver.com/product/858...,1+1 티블레스 퍼플티 퍼퓸 바디워시 500g우디 향 향기 좋은 향수 올리브영 바디...,향기나는,암기플러스,88.651276,94.44272,81,...,825.0,100.0,99.79643,1986.0,[향기나는],4,1,1.062011,01YMZJPSJAWojkigVF0EKIlHdup2,voice_0000000732
2,[100],1.13587,23.07.12 - 23.07.12,https://msearch.shopping.naver.com/product/863...,1+1 티블레스 피그피치 우롱티 퍼퓸 바디로션 480ml 푸르츠 무화과 상큼한 향기...,탁월한보습,암기플러스,87.51555,94.44272,81,...,836.0,99.728261,99.79643,1986.0,[탁월한 보습],5,1,1.062011,01YMZJPSJAWojkigVF0EKIlHdup2,voice_0000000742
3,[100],1.034409,23.07.05 - 23.07.05,https://msearch.shopping.naver.com/product/863...,1+1 티블레스 피그피치 우롱티 퍼퓸 바디워시 500ml 푸르츠 무화과 상큼한 향기...,향좋은샴푸,암기플러스,95.178794,94.44272,81,...,481.0,99.784946,99.79643,1986.0,[향좋은 샴푸],5,1,1.062011,01YMZJPSJAWojkigVF0EKIlHdup2,voice_0000000749
4,[100],1.141956,23.07.14 - 23.07.14,https://msearch.shopping.naver.com/product/858...,1+1 티블레스 화이트티 퍼퓸 바디로션 480g머스크 향 향기 좋은 대용량 향수 올...,고급진향,암기플러스,87.292818,94.44272,81,...,724.0,99.526814,99.79643,1986.0,[고급진 향],4,1,1.062011,01YMZJPSJAWojkigVF0EKIlHdup2,voice_0000000772
5,[100],1.12585,23.07.10 - 23.07.10,https://msearch.shopping.naver.com/product/862...,1003. 여름 곰팡이 화장실 곰팡이 벽지 곰팡이 쁨이랑 10943,곰팡이,암기플러스,88.670695,94.44272,81,...,664.0,99.829932,99.79643,1986.0,[곰팡이],3,1,1.062011,01YMZJPSJAWojkigVF0EKIlHdup2,voice_0000000798
6,[100],2.042662,23.07.26 - 23.07.26,https://msearch.shopping.naver.com/product/869...,1kg 원두 커피 베트남 로부스타 블루드래곤 워시드 G1 홀빈 원두콩 맛있는 고소한...,원두콩,암기플러스,46.95071,94.44272,81,...,1202.0,95.904437,99.79643,1986.0,[원두콩],3,1,1.062011,01YMZJPSJAWojkigVF0EKIlHdup2,voice_0000000844
7,[100],1.477636,23.07.24 - 23.07.24,https://msearch.shopping.naver.com/product/842...,2023년형 휴대용 미니선풍기 핸디선풍기 손선풍기 13327,저소음,암기플러스,67.135135,94.44272,81,...,925.0,99.201278,99.79643,1986.0,[저소음],3,1,1.062011,01YMZJPSJAWojkigVF0EKIlHdup2,voice_0000000874
8,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",3.589581,23.07.12 - 23.07.12,https://msearch.shopping.naver.com/product/862...,74. 볏짚 마시멜로우 곤포 사일리지 쁨이랑 11397,정답이 아닙니다,암기플러스,25.699115,94.44272,81,...,2827.0,92.121982,99.79643,1986.0,"[권 보, 군포, 건 복, 곤, 군포, 고은 봉, 권 보, 공포, 고온 포, 권 보...",2,23,1.062011,01YMZJPSJAWojkigVF0EKIlHdup2,voice_0000000997
9,[100],1.090343,23.07.23 - 23.07.23,https://msearch.shopping.naver.com/product/837...,[50년청주 육거리소문난만두]수제 손 미친 왕 갈비 만두 국내산고기 사용 맛집 캠핑...,왕만두,암기플러스,91.857143,94.44272,81,...,702.0,100.0,99.79643,1986.0,[왕만두],3,1,1.062011,01YMZJPSJAWojkigVF0EKIlHdup2,voice_0000001162


In [64]:
selected_ad_name = '2023년형 휴대용 미니선풍기 핸디선풍기 손선풍기 13327'  # 실제 사용하고자 하는 ad_name으로 변경 필요
ad_data = july_train_word_pd_df[july_train_word_pd_df['ad_name'] == selected_ad_name]

# gender 비율 계산
gender_ratio = ad_data['gender'].value_counts(normalize=True).round(4)

# local_code 비율 계산
local_code_ratio = ad_data['local_code'].value_counts(normalize=True).round(4)

# birth_year 비율 계산을 위한 범위 함수 정의
def categorize_birth_year(year):
    if 50 <= year < 60:
        return '50-60'
    elif 60 <= year < 70:
        return '60-70'
    elif 70 <= year < 80:
        return '70-80'
    elif 90 <= year < 100:
        return '90-00'
    elif 0 <= year < 10:
        return '00-10'
    else:
        return 'Other'

# birth_year를 두 자리 숫자로 변환하고 범위에 따라 그룹화
ad_data['birth_year_group'] = ad_data['birth_year'].apply(lambda x: int(x) % 100 if pd.notnull(x) and x.isdigit() else x).apply(categorize_birth_year)

# birth_year 그룹 비율 계산
birth_year_ratio = ad_data['birth_year_group'].value_counts(normalize=True).round(4)

# 결과 출력
print(f"Gender Ratio:\n{gender_ratio}\n")
print(f"Local Code Ratio:\n{local_code_ratio}\n")
print(f"Birth Year Ratio:\n{birth_year_ratio}\n")

Gender Ratio:
W    0.7984
M    0.2016
Name: gender, dtype: float64

Local Code Ratio:
Se    0.5519
Gy    0.2240
Ch    0.0896
Jd    0.0835
Ga    0.0305
Je    0.0122
EE    0.0081
Name: local_code, dtype: float64

Birth Year Ratio:
Other    0.3016
70-80    0.2976
60-70    0.1943
90-00    0.1093
50-60    0.0526
00-10    0.0445
Name: birth_year_group, dtype: float64



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ad_data['birth_year_group'] = ad_data['birth_year'].apply(lambda x: int(x) % 100 if pd.notnull(x) and x.isdigit() else x).apply(categorize_birth_year)


In [15]:
july_train_min_pd_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 455486 entries, 0 to 455485
Data columns (total 31 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   accuracy_array            455486 non-null  object 
 1   ad_avg_tries              455486 non-null  float64
 2   ad_duration               455486 non-null  object 
 3   ad_link                   455486 non-null  object 
 4   ad_name                   455486 non-null  object 
 5   ad_script                 455486 non-null  object 
 6   ad_title                  455486 non-null  object 
 7   average_accuracy_by_ad    455486 non-null  float64
 8   average_accuracy_by_user  455486 non-null  float64
 9   birth_year                447007 non-null  object 
 10  collection                455486 non-null  object 
 11  created_timestamp_array   455486 non-null  object 
 12  excepted_age_array        455486 non-null  object 
 13  first_created_date        455486 non-null  i

In [10]:
count = len(july_train_min_pd_df[july_train_min_pd_df['text_length'] <= 15])
print(count)

0


In [16]:
july_train_word_pd_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1403970 entries, 0 to 1403969
Data columns (total 31 columns):
 #   Column                    Non-Null Count    Dtype  
---  ------                    --------------    -----  
 0   accuracy_array            1403970 non-null  object 
 1   ad_avg_tries              1403970 non-null  float64
 2   ad_duration               1403970 non-null  object 
 3   ad_link                   1403970 non-null  object 
 4   ad_name                   1403970 non-null  object 
 5   ad_script                 1403970 non-null  object 
 6   ad_title                  1403970 non-null  object 
 7   average_accuracy_by_ad    1403970 non-null  float64
 8   average_accuracy_by_user  1403970 non-null  float64
 9   birth_year                1382615 non-null  object 
 10  collection                1403970 non-null  object 
 11  created_timestamp_array   1403970 non-null  object 
 12  excepted_age_array        1403970 non-null  object 
 13  first_created_date        1

In [11]:
count = len(july_train_word_pd_df[july_train_word_pd_df['text_length'] <= 15])
print(count)

1402403


In [42]:
unique_ads = july_train_word_pd_df['ad_name'].nunique()
print("고유한 ads_id의 개수:", unique_ads)

고유한 ads_id의 개수: 4311


In [17]:
july_val_min_pd_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 151945 entries, 0 to 151944
Data columns (total 31 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   accuracy_array            151945 non-null  object 
 1   ad_avg_tries              151945 non-null  float64
 2   ad_duration               151945 non-null  object 
 3   ad_link                   151945 non-null  object 
 4   ad_name                   151945 non-null  object 
 5   ad_script                 151945 non-null  object 
 6   ad_title                  151945 non-null  object 
 7   average_accuracy_by_ad    151945 non-null  float64
 8   average_accuracy_by_user  151945 non-null  float64
 9   birth_year                149112 non-null  object 
 10  collection                151945 non-null  object 
 11  created_timestamp_array   151945 non-null  object 
 12  excepted_age_array        151945 non-null  object 
 13  first_created_date        151945 non-null  i

In [12]:
count = len(july_val_min_pd_df[july_val_min_pd_df['text_length'] <= 15])
print(count)

0


In [18]:
july_val_word_pd_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 468081 entries, 0 to 468080
Data columns (total 31 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   accuracy_array            468081 non-null  object 
 1   ad_avg_tries              468081 non-null  float64
 2   ad_duration               468081 non-null  object 
 3   ad_link                   468081 non-null  object 
 4   ad_name                   468081 non-null  object 
 5   ad_script                 468081 non-null  object 
 6   ad_title                  468081 non-null  object 
 7   average_accuracy_by_ad    468081 non-null  float64
 8   average_accuracy_by_user  468081 non-null  float64
 9   birth_year                460829 non-null  object 
 10  collection                468081 non-null  object 
 11  created_timestamp_array   468081 non-null  object 
 12  excepted_age_array        468081 non-null  object 
 13  first_created_date        468081 non-null  i

In [39]:
august_train_min_pd_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 471221 entries, 0 to 471220
Data columns (total 31 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   accuracy_array            471221 non-null  object 
 1   ad_avg_tries              471221 non-null  float64
 2   ad_duration               471221 non-null  object 
 3   ad_link                   471221 non-null  object 
 4   ad_name                   471221 non-null  object 
 5   ad_script                 471221 non-null  object 
 6   ad_title                  471221 non-null  object 
 7   average_accuracy_by_ad    471221 non-null  float64
 8   average_accuracy_by_user  471221 non-null  float64
 9   birth_year                463668 non-null  object 
 10  collection                471221 non-null  object 
 11  created_timestamp_array   471221 non-null  object 
 12  excepted_age_array        471221 non-null  object 
 13  first_created_date        471221 non-null  i

In [40]:
august_train_word_pd_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1814321 entries, 0 to 1814320
Data columns (total 31 columns):
 #   Column                    Dtype  
---  ------                    -----  
 0   accuracy_array            object 
 1   ad_avg_tries              float64
 2   ad_duration               object 
 3   ad_link                   object 
 4   ad_name                   object 
 5   ad_script                 object 
 6   ad_title                  object 
 7   average_accuracy_by_ad    float64
 8   average_accuracy_by_user  float64
 9   birth_year                object 
 10  collection                object 
 11  created_timestamp_array   object 
 12  excepted_age_array        object 
 13  first_created_date        int64  
 14  first_timestamp           object 
 15  gender                    object 
 16  is_passed                 object 
 17  job                       object 
 18  language                  object 
 19  level                     object 
 20  local_code              

In [43]:
unique_ads = august_train_word_pd_df['ad_name'].nunique()
print("고유한 ads_id의 개수:", unique_ads)

고유한 ads_id의 개수: 4406


In [21]:
august_val_min_pd_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156699 entries, 0 to 156698
Data columns (total 31 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   accuracy_array            156699 non-null  object 
 1   ad_avg_tries              156699 non-null  float64
 2   ad_duration               156699 non-null  object 
 3   ad_link                   156699 non-null  object 
 4   ad_name                   156699 non-null  object 
 5   ad_script                 156699 non-null  object 
 6   ad_title                  156699 non-null  object 
 7   average_accuracy_by_ad    156699 non-null  float64
 8   average_accuracy_by_user  156699 non-null  float64
 9   birth_year                154245 non-null  object 
 10  collection                156699 non-null  object 
 11  created_timestamp_array   156699 non-null  object 
 12  excepted_age_array        156699 non-null  object 
 13  first_created_date        156699 non-null  i

In [22]:
august_val_word_pd_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 604722 entries, 0 to 604721
Data columns (total 31 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   accuracy_array            604722 non-null  object 
 1   ad_avg_tries              604722 non-null  float64
 2   ad_duration               604722 non-null  object 
 3   ad_link                   604722 non-null  object 
 4   ad_name                   604722 non-null  object 
 5   ad_script                 604722 non-null  object 
 6   ad_title                  604722 non-null  object 
 7   average_accuracy_by_ad    604722 non-null  float64
 8   average_accuracy_by_user  604722 non-null  float64
 9   birth_year                598838 non-null  object 
 10  collection                604722 non-null  object 
 11  created_timestamp_array   604722 non-null  object 
 12  excepted_age_array        604722 non-null  object 
 13  first_created_date        604722 non-null  i

In [23]:
september_train_min_pd_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 185742 entries, 0 to 185741
Data columns (total 31 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   accuracy_array            185742 non-null  object 
 1   ad_avg_tries              185742 non-null  float64
 2   ad_duration               185742 non-null  object 
 3   ad_link                   185742 non-null  object 
 4   ad_name                   185742 non-null  object 
 5   ad_script                 185742 non-null  object 
 6   ad_title                  185742 non-null  object 
 7   average_accuracy_by_ad    185742 non-null  float64
 8   average_accuracy_by_user  185742 non-null  float64
 9   birth_year                183436 non-null  object 
 10  collection                185742 non-null  object 
 11  created_timestamp_array   185742 non-null  object 
 12  excepted_age_array        185742 non-null  object 
 13  first_created_date        185742 non-null  i

In [24]:
september_train_word_pd_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2232447 entries, 0 to 2232446
Data columns (total 31 columns):
 #   Column                    Dtype  
---  ------                    -----  
 0   accuracy_array            object 
 1   ad_avg_tries              float64
 2   ad_duration               object 
 3   ad_link                   object 
 4   ad_name                   object 
 5   ad_script                 object 
 6   ad_title                  object 
 7   average_accuracy_by_ad    float64
 8   average_accuracy_by_user  float64
 9   birth_year                object 
 10  collection                object 
 11  created_timestamp_array   object 
 12  excepted_age_array        object 
 13  first_created_date        int64  
 14  first_timestamp           object 
 15  gender                    object 
 16  is_passed                 object 
 17  job                       object 
 18  language                  object 
 19  level                     object 
 20  local_code              

In [44]:
unique_ads = september_train_word_pd_df['ad_name'].nunique()
print("고유한 ads_id의 개수:", unique_ads)

고유한 ads_id의 개수: 3771


In [25]:
september_val_min_pd_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61906 entries, 0 to 61905
Data columns (total 31 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   accuracy_array            61906 non-null  object 
 1   ad_avg_tries              61906 non-null  float64
 2   ad_duration               61906 non-null  object 
 3   ad_link                   61906 non-null  object 
 4   ad_name                   61906 non-null  object 
 5   ad_script                 61906 non-null  object 
 6   ad_title                  61906 non-null  object 
 7   average_accuracy_by_ad    61906 non-null  float64
 8   average_accuracy_by_user  61906 non-null  float64
 9   birth_year                61107 non-null  object 
 10  collection                61906 non-null  object 
 11  created_timestamp_array   61906 non-null  object 
 12  excepted_age_array        61906 non-null  object 
 13  first_created_date        61906 non-null  int64  
 14  first_

In [26]:
september_val_word_pd_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 744174 entries, 0 to 744173
Data columns (total 31 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   accuracy_array            744174 non-null  object 
 1   ad_avg_tries              744174 non-null  float64
 2   ad_duration               744174 non-null  object 
 3   ad_link                   744174 non-null  object 
 4   ad_name                   744174 non-null  object 
 5   ad_script                 744174 non-null  object 
 6   ad_title                  744174 non-null  object 
 7   average_accuracy_by_ad    744174 non-null  float64
 8   average_accuracy_by_user  744174 non-null  float64
 9   birth_year                739144 non-null  object 
 10  collection                744174 non-null  object 
 11  created_timestamp_array   744174 non-null  object 
 12  excepted_age_array        744174 non-null  object 
 13  first_created_date        744174 non-null  i

In [27]:
october_train_min_pd_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 182537 entries, 0 to 182536
Data columns (total 31 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   accuracy_array            182537 non-null  object 
 1   ad_avg_tries              182537 non-null  float64
 2   ad_duration               182537 non-null  object 
 3   ad_link                   182537 non-null  object 
 4   ad_name                   182537 non-null  object 
 5   ad_script                 182537 non-null  object 
 6   ad_title                  182537 non-null  object 
 7   average_accuracy_by_ad    182537 non-null  float64
 8   average_accuracy_by_user  182537 non-null  float64
 9   birth_year                180000 non-null  object 
 10  collection                182537 non-null  object 
 11  created_timestamp_array   182537 non-null  object 
 12  excepted_age_array        182537 non-null  object 
 13  first_created_date        182537 non-null  i

In [38]:
october_train_word_pd_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2221608 entries, 0 to 2221607
Data columns (total 31 columns):
 #   Column                    Dtype  
---  ------                    -----  
 0   accuracy_array            object 
 1   ad_avg_tries              float64
 2   ad_duration               object 
 3   ad_link                   object 
 4   ad_name                   object 
 5   ad_script                 object 
 6   ad_title                  object 
 7   average_accuracy_by_ad    float64
 8   average_accuracy_by_user  float64
 9   birth_year                object 
 10  collection                object 
 11  created_timestamp_array   object 
 12  excepted_age_array        object 
 13  first_created_date        int64  
 14  first_timestamp           object 
 15  gender                    object 
 16  is_passed                 object 
 17  job                       object 
 18  language                  object 
 19  level                     object 
 20  local_code              

In [45]:
unique_ads = october_train_word_pd_df['ad_name'].nunique()
print("고유한 ads_id의 개수:", unique_ads)

고유한 ads_id의 개수: 4466


In [29]:
october_val_min_pd_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60577 entries, 0 to 60576
Data columns (total 31 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   accuracy_array            60577 non-null  object 
 1   ad_avg_tries              60577 non-null  float64
 2   ad_duration               60577 non-null  object 
 3   ad_link                   60577 non-null  object 
 4   ad_name                   60577 non-null  object 
 5   ad_script                 60577 non-null  object 
 6   ad_title                  60577 non-null  object 
 7   average_accuracy_by_ad    60577 non-null  float64
 8   average_accuracy_by_user  60577 non-null  float64
 9   birth_year                59696 non-null  object 
 10  collection                60577 non-null  object 
 11  created_timestamp_array   60577 non-null  object 
 12  excepted_age_array        60577 non-null  object 
 13  first_created_date        60577 non-null  int64  
 14  first_

In [30]:
october_val_word_pd_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 740532 entries, 0 to 740531
Data columns (total 31 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   accuracy_array            740532 non-null  object 
 1   ad_avg_tries              740532 non-null  float64
 2   ad_duration               740532 non-null  object 
 3   ad_link                   740532 non-null  object 
 4   ad_name                   740532 non-null  object 
 5   ad_script                 740532 non-null  object 
 6   ad_title                  740532 non-null  object 
 7   average_accuracy_by_ad    740532 non-null  float64
 8   average_accuracy_by_user  740532 non-null  float64
 9   birth_year                737073 non-null  object 
 10  collection                740532 non-null  object 
 11  created_timestamp_array   740532 non-null  object 
 12  excepted_age_array        740532 non-null  object 
 13  first_created_date        740532 non-null  i

## 월별 음성의 개수가 5개 이하인 유저의 퍼센트 파악 -> 삭제

In [10]:
july_train_min_df.printSchema()

root
 |-- accuracy_array: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- ad_avg_tries: double (nullable = true)
 |-- ad_duration: string (nullable = true)
 |-- ad_link: string (nullable = true)
 |-- ad_name: string (nullable = true)
 |-- ad_script: string (nullable = true)
 |-- ad_title: string (nullable = true)
 |-- average_accuracy_by_ad: double (nullable = true)
 |-- average_accuracy_by_user: double (nullable = true)
 |-- birth_year: string (nullable = true)
 |-- collection: string (nullable = true)
 |-- created_timestamp_array: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- excepted_age_array: string (nullable = true)
 |-- first_created_date: long (nullable = true)
 |-- first_timestamp: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- is_passed: string (nullable = true)
 |-- job: string (nullable = true)
 |-- language: string (nullable = true)
 |-- level: string (nullable = true)
 |-- local_code: string (n

In [11]:
# DataFrame 리스트
dfs = [
    july_train_min_df, july_train_word_df, july_val_min_df, july_val_word_df,
    august_train_min_df, august_train_word_df, august_val_min_df, august_val_word_df,
    september_train_min_df, september_train_word_df, september_val_min_df, september_val_word_df,
    october_train_min_df, october_train_word_df, october_val_min_df, october_val_word_df
]

# 각 DataFrame에 대해 반복
for df in dfs:
    # 유저별 음성 파일 개수 계산
    count_df = df.groupBy("user_id").agg(count("ad_name").alias("audio_count"))

    # 5개 이하인 행 필터링
    less_than_five = count_df.filter(col("audio_count") <= 5)

    # 필터링된 행의 개수
    count_less_than_five = less_than_five.count()

    # 전체 행 개수
    total_count = df.count()

    # 비율 계산
    percentage = (count_less_than_five / total_count) * 100

    # 결과 출력
    print(f"5개 이하인 행의 개수: {count_less_than_five}, 전체 대비 비율: {percentage:.2f}%")

5개 이하인 행의 개수: 927, 전체 대비 비율: 0.20%
5개 이하인 행의 개수: 1727, 전체 대비 비율: 0.12%
5개 이하인 행의 개수: 2136, 전체 대비 비율: 1.41%
5개 이하인 행의 개수: 1310, 전체 대비 비율: 0.28%
5개 이하인 행의 개수: 1066, 전체 대비 비율: 0.23%
5개 이하인 행의 개수: 2043, 전체 대비 비율: 0.11%
5개 이하인 행의 개수: 2423, 전체 대비 비율: 1.55%
5개 이하인 행의 개수: 1944, 전체 대비 비율: 0.32%
5개 이하인 행의 개수: 2364, 전체 대비 비율: 1.27%
5개 이하인 행의 개수: 1666, 전체 대비 비율: 0.07%
5개 이하인 행의 개수: 3866, 전체 대비 비율: 6.24%
5개 이하인 행의 개수: 1275, 전체 대비 비율: 0.17%
5개 이하인 행의 개수: 2220, 전체 대비 비율: 1.22%
5개 이하인 행의 개수: 1715, 전체 대비 비율: 0.08%
5개 이하인 행의 개수: 4462, 전체 대비 비율: 7.37%
5개 이하인 행의 개수: 1354, 전체 대비 비율: 0.18%


In [13]:
# 각 DataFrame에 대해 반복하며 5개 이하인 음성 파일을 가진 유저들의 행을 삭제
for df in dfs:
    # 유저별 음성 파일 개수 계산
    count_df = df.groupBy("user_id").agg(F.count("ad_name").alias("audio_count"))

    # 5개 이하인 유저 ID 필터링
    users_less_than_five = count_df.filter(F.col("audio_count") <= 5).select("user_id")

    # 해당 유저들의 행을 삭제
    df = df.join(users_less_than_five, "user_id", "left_anti")

## 월별 음성의 개수가 5개 이하인 광고의 퍼센트 파악 -> 삭제

In [14]:
# DataFrame 리스트
dfs = [
    july_train_word_pd_df, july_val_word_pd_df,
    august_train_word_pd_df, august_val_word_pd_df,
    september_train_word_pd_df, september_val_word_pd_df,
    october_train_word_pd_df, october_val_word_pd_df
]

# # 각 spark DataFrame에 대해 반복
# for df in dfs:
#     # 광고별 음성 파일 개수 계산
#     count_df = df.groupBy("ad_name").agg(count("user_id").alias("audio_count"))

#     # 5개 이하인 행 필터링
#     less_than_five = count_df.filter(col("audio_count") <= 5)

#     # 필터링된 행의 개수
#     count_less_than_five = less_than_five.count()

#     # 전체 행 개수
#     total_count = df.count()

#     # 비율 계산
#     percentage = (count_less_than_five / total_count) * 100

#     # 결과 출력
#     print(f"5개 이하인 행의 개수: {count_less_than_five}, 전체 대비 비율: {percentage:.2f}%")

In [13]:
july_train_word_pd_df.count()

accuracy_array              1403970
ad_avg_tries                1403970
ad_duration                 1403970
ad_link                     1403970
ad_name                     1403970
ad_script                   1403970
ad_title                    1403970
average_accuracy_by_ad      1403970
average_accuracy_by_user    1403970
birth_year                  1382615
collection                  1403970
created_timestamp_array     1403970
excepted_age_array          1403970
first_created_date          1403970
first_timestamp             1403970
gender                      1395701
is_passed                   1403970
job                         1344787
language                    1382615
level                       1403970
local_code                  1395701
participant_count           1403970
pass_rate_by_ad             1403970
pass_rate_by_user           1403970
perfect                     1382174
stt_text_array              1403970
text_length                 1403970
try_count                   

In [15]:
# # spark df 버전
# for i in range(len(dfs)):
#     # 광고별 음성 파일 개수 계산 
#     count_df = dfs[i].groupBy("ad_name").agg(F.count("user_id").alias("audio_count"))

#     # 5개 이하인 광고 ID 필터링
#     users_less_than_five = count_df.filter(F.col("audio_count") <= 5).select("ad_name")

#     # 해당 광고들의 행을 삭제
#     dfs[i] = dfs[i].join(users_less_than_five, "ad_name", "left_anti")


# pandas df 버전
for i in range(len(dfs)):
    # 광고별 음성 파일 개수 계산
    count_df = dfs[i].groupby("ad_name").agg(audio_count=('user_id', 'count'))

    # 5개 이하인 광고명 필터링
    users_less_than_five = count_df[count_df["audio_count"] <= 5].index

    # 해당 광고들의 행을 삭제
    dfs[i] = dfs[i][~dfs[i]["ad_name"].isin(users_less_than_five)]

In [16]:
july_train_word_pd_df.count()

accuracy_array              1403970
ad_avg_tries                1403970
ad_duration                 1403970
ad_link                     1403970
ad_name                     1403970
ad_script                   1403970
ad_title                    1403970
average_accuracy_by_ad      1403970
average_accuracy_by_user    1403970
birth_year                  1382615
collection                  1403970
created_timestamp_array     1403970
excepted_age_array          1403970
first_created_date          1403970
first_timestamp             1403970
gender                      1395701
is_passed                   1403970
job                         1344787
language                    1382615
level                       1403970
local_code                  1395701
participant_count           1403970
pass_rate_by_ad             1403970
pass_rate_by_user           1403970
perfect                     1382174
stt_text_array              1403970
text_length                 1403970
try_count                   

## 광고별 JSON 파일로 저장

In [17]:
# pandas df 기준

# 경로 설정을 위한 매핑
path_mapping = {
    
    'july_train_word_pd_df': r'D:\DATA_PREPROCESS\INTEGRATED_DATASETS\new_WORD_SCRIPT\JULY\Training\AMR\라벨링데이터',
    # # 'july_val_min_pd_df': r'D:\DATA_PREPROCESS\INTEGRATED_DATASETS\MINUTE_SCRIPT\JULY\Validation\AMR\라벨링데이터',
    # 'july_val_word_pd_df': r'D:\DATA_PREPROCESS\INTEGRATED_DATASETS\new_WORD_SCRIPT\JULY\Validation\AMR\라벨링데이터',
    # # 'august_train_min_pd_df': r'D:\DATA_PREPROCESS\INTEGRATED_DATASETS\MINUTE_SCRIPT\AUGUST\Training\AMR\라벨링데이터',
    # 'august_train_word_pd_df': r'D:\DATA_PREPROCESS\INTEGRATED_DATASETS\new_WORD_SCRIPT\AUGUST\Training\AMR\라벨링데이터',
    # # 'august_val_min_pd_df': r'D:\DATA_PREPROCESS\INTEGRATED_DATASETS\MINUTE_SCRIPT\AUGUST\Validation\AMR\라벨링데이터',
    # 'august_val_word_pd_df': r'D:\DATA_PREPROCESS\INTEGRATED_DATASETS\new_WORD_SCRIPT\AUGUST\Validation\AMR\라벨링데이터',
    # # 'september_train_min_pd_df': r'D:\DATA_PREPROCESS\INTEGRATED_DATASETS\MINUTE_SCRIPT\SEPTEMBER\Training\AMR\라벨링데이터',
    # 'september_train_word_pd_df': r'D:\DATA_PREPROCESS\INTEGRATED_DATASETS\new_WORD_SCRIPT\SEPTEMBER\Training\AMR\라벨링데이터',
    # # 'september_val_min_pd_df': r'D:\DATA_PREPROCESS\INTEGRATED_DATASETS\MINUTE_SCRIPT\SEPTEMBER\Validation\AMR\라벨링데이터',
    # 'september_val_word_pd_df': r'D:\DATA_PREPROCESS\INTEGRATED_DATASETS\new_WORD_SCRIPT\SEPTEMBER\Validation\AMR\라벨링데이터',
    # # 'october_train_min_pd_df': r'D:\DATA_PREPROCESS\INTEGRATED_DATASETS\MINUTE_SCRIPT\OCTOBER\Training\AMR\라벨링데이터',
    # 'october_train_word_pd_df': r'D:\DATA_PREPROCESS\INTEGRATED_DATASETS\new_WORD_SCRIPT\OCTOBER\Training\AMR\라벨링데이터',
    # # 'october_val_min_pd_df': r'D:\DATA_PREPROCESS\INTEGRATED_DATASETS\MINUTE_SCRIPT\OCTOBER\Validation\AMR\라벨링데이터',
    # 'october_val_word_pd_df': r'D:\DATA_PREPROCESS\INTEGRATED_DATASETS\new_WORD_SCRIPT\OCTOBER\Validation\AMR\라벨링데이터'
}

# DataFrame 이름과 객체를 튜플로 매핑
df_tuples = [
    # ('july_train_min_pd_df', july_train_min_pd_df),
    ('july_train_word_pd_df', july_train_word_pd_df),
    # # ('july_val_min_pd_df', july_val_min_pd_df),
    # ('july_val_word_pd_df', july_val_word_pd_df),
    # # ('august_train_min_pd_df', august_train_min_pd_df),
    # ('august_train_word_pd_df', august_train_word_pd_df),
    # # ('august_val_min_pd_df', august_val_min_pd_df),
    # ('august_val_word_pd_df', august_val_word_pd_df),
    # # ('september_train_min_pd_df', september_train_min_pd_df),
    # ('september_train_word_pd_df', september_train_word_pd_df),
    # # ('september_val_min_pd_df', september_val_min_pd_df),
    # ('september_val_word_pd_df', september_val_word_pd_df),
    # # ('october_train_min_pd_df', october_train_min_pd_df),
    # ('october_train_word_pd_df', october_train_word_pd_df),
    # # ('october_val_min_pd_df', october_val_min_pd_df),
    # ('october_val_word_pd_df', october_val_word_pd_df)
]

# 각 DataFrame에 대해 반복하며 유저별로 JSON 파일로 저장
for df_name, df in df_tuples:
    # 해당 DataFrame의 경로 가져오기
    base_path = path_mapping[df_name]

    # 광고별로 그룹화
    grouped = df.groupby('ad_name')

    # 각 광고에 대해 반복
    for ad_name, group in grouped:
        # 저장할 파일 경로 설정
        file_path = os.path.join(base_path, f'{ad_name}.json')

        # 데이터를 JSON 형태로 변환하여 파일에 저장
        group.to_json(file_path, orient='records', force_ascii=False)

In [18]:
# # 경로 설정
# source_dir = r'D:\DATA_PREPROCESS\INTEGRATED_DATASETS\new_WORD_SCRIPT\JULY\Training\AMR\원천데이터'
# target_dir = r'D:\DATA_PREPROCESS\INTEGRATED_DATASETS\new_WORD_SCRIPT\JULY\Training\AMR\라벨링데이터'

# # 원천데이터 디렉토리의 폴더 및 파일명 추출
# source_files = set(os.listdir(source_dir))

# # 라벨링데이터 디렉토리의 파일명 추출 (확장자 제거)
# target_files = set(os.path.splitext(file)[0] for file in os.listdir(target_dir))

# # 원천데이터에만 존재하는 항목 찾기
# only_in_source = source_files - target_files

# # 결과 출력
# print("원천데이터 디렉토리에만 존재하는 항목:")
# for item in only_in_source:
#     print(item)

원천데이터 디렉토리에만 존재하는 항목:
플랭크 세계 기록
행복해지는 방법
청년, 신혼부부 매입 임대 주택 입주 모집
옷의 주인이 준비한 직업
지역축제 먹거리 가격 사전 공개 정책
당신의 핸드메이드 취향, 핸드아티코리아에서 찾자!
여름철 불청객 살모넬라 식중독
성격과 기질의 차이
책상 위의 비극
스웨그에이지, 외쳐 조선!
커피에 설탕 대신 이것을 추가하자
버스도 예약하고 타세요!
보윙 챌린지 스크립트 공모전!!
그가 돌아왔다! 미션 임파서블
탐정의 마지막 여행지
건강즙, 이런 사람에겐 독이 되기도 합니다
경기도미술관 이건희컬렉션 '사계'
광고 참여하고 리뷰 달면 포인트가 빵야빵야
자가용 소유자 90%는 하루 2시간도 이용 안 한다
모기 물렸을 때 대처법
서울 자전거 따릉이 마일리지 쌓는 꿀팁!
3D 프린터로 만든 생선은 무슨 맛일까
수도권 릴레이 오픈 기념 식자재 역대급 특가
금연 최면 치료 받은 썰
치즈 덮밥 털면 유빙 덮밥
장마철 냄새 걱정 없는 빨래 관리법
저작권 걱정없이 자유롭게 글꼴 쓰세요
불법촬영 없는 학교 만들기
올여름 얼마나 더울까
김정주 기관사의 방송
걷기의 무시할 수 없는 효과
서울에서 즐기는 북캉스 4곳
장수를 위해 필요한 근육
이럴 땐 무슨 나이 쓰나요
보윙을 소개합니다
제주 바다에 몸을 맡기다. 결국, 결
우리아이 황금응가를 위한 깐깐한 유산균 IPICKY
어지럼증 클리닉
마음에 묻은 얼룩을 지워드립니다
반려견과 물놀이 시 주의사항
거울 속 모습
반대의 매력, 엘리멘탈
가족돌봄휴가 활용하자
1.4.4 패치 노트 정리
입시에 필요한 모든 대학 정보를 한 곳에!
신나는 주말체육학교
셰프로 성장하는 즐거움을 모두 담은 요리게임
스티커 라벨 잘 떼는 법
서울아이발달지원센터 무료 검사
죽기 전 뇌의 활동
보다 깨끗하게 사용하는 법
한강공원 수영장 개장
자궁경부암 예방주사 무료접종
나만의 카페를 운영해보세요
귀염뽀짝 힐링게임
홀로 사는 어르신과 장애인을 위해
사랑하는 당신에게
도수가 높으면 살이 안찔까
보윙 12시 유튜브 라이브
안심전세 앱 2.0 

### 유저별 JSON 파일로 저장

In [23]:
# 저장을 용이하게 하기 위해 spark df들을 pandas df들로 변환

july_train_min_pd_df = july_train_min_df.toPandas()
july_train_word_pd_df = july_train_word_df.toPandas()
july_val_min_pd_df = july_val_min_df.toPandas()
july_val_word_pd_df = july_val_word_df.toPandas()
august_train_min_pd_df = august_train_min_df.toPandas()
august_train_word_pd_df = august_train_word_df.toPandas()
august_val_min_pd_df = august_val_min_df.toPandas()
august_val_word_pd_df = august_val_word_df.toPandas()
september_train_min_pd_df = september_train_min_df.toPandas()
september_train_word_pd_df = september_train_word_df.toPandas()
september_val_min_pd_df = september_val_min_df.toPandas()
september_val_word_pd_df = september_val_word_df.toPandas()
october_train_min_pd_df = october_train_min_df.toPandas()
october_train_word_pd_df = october_train_word_df.toPandas()
october_val_min_pd_df = october_val_min_df.toPandas()
october_val_word_pd_df = october_val_word_df.toPandas()

In [23]:
# 경로 설정을 위한 매핑
path_mapping = {
    'july_train_min_pd_df': r'D:\DATA_PREPROCESS\INTEGRATED_DATASETS\MINUTE_SCRIPT\JULY\Training\AMR\라벨링데이터',
    'july_train_word_pd_df': r'D:\DATA_PREPROCESS\INTEGRATED_DATASETS\WORD_SCRIPT\JULY\Training\AMR\라벨링데이터',
    'july_val_min_pd_df': r'D:\DATA_PREPROCESS\INTEGRATED_DATASETS\MINUTE_SCRIPT\JULY\Validation\AMR\라벨링데이터',
    'july_val_word_pd_df': r'D:\DATA_PREPROCESS\INTEGRATED_DATASETS\WORD_SCRIPT\JULY\Validation\AMR\라벨링데이터',
    'august_train_min_pd_df': r'D:\DATA_PREPROCESS\INTEGRATED_DATASETS\MINUTE_SCRIPT\AUGUST\Training\AMR\라벨링데이터',
    'august_train_word_pd_df': r'D:\DATA_PREPROCESS\INTEGRATED_DATASETS\WORD_SCRIPT\AUGUST\Training\AMR\라벨링데이터',
    'august_val_min_pd_df': r'D:\DATA_PREPROCESS\INTEGRATED_DATASETS\MINUTE_SCRIPT\AUGUST\Validation\AMR\라벨링데이터',
    'august_val_word_pd_df': r'D:\DATA_PREPROCESS\INTEGRATED_DATASETS\WORD_SCRIPT\AUGUST\Validation\AMR\라벨링데이터',
    'september_train_min_pd_df': r'D:\DATA_PREPROCESS\INTEGRATED_DATASETS\MINUTE_SCRIPT\SEPTEMBER\Training\AMR\라벨링데이터',
    'september_train_word_pd_df': r'D:\DATA_PREPROCESS\INTEGRATED_DATASETS\WORD_SCRIPT\SEPTEMBER\Training\AMR\라벨링데이터',
    'september_val_min_pd_df': r'D:\DATA_PREPROCESS\INTEGRATED_DATASETS\MINUTE_SCRIPT\SEPTEMBER\Validation\AMR\라벨링데이터',
    'september_val_word_pd_df': r'D:\DATA_PREPROCESS\INTEGRATED_DATASETS\WORD_SCRIPT\SEPTEMBER\Validation\AMR\라벨링데이터',
    'october_train_min_pd_df': r'D:\DATA_PREPROCESS\INTEGRATED_DATASETS\MINUTE_SCRIPT\OCTOBER\Training\AMR\라벨링데이터',
    'october_train_word_pd_df': r'D:\DATA_PREPROCESS\INTEGRATED_DATASETS\WORD_SCRIPT\OCTOBER\Training\AMR\라벨링데이터',
    'october_val_min_pd_df': r'D:\DATA_PREPROCESS\INTEGRATED_DATASETS\MINUTE_SCRIPT\OCTOBER\Validation\AMR\라벨링데이터',
    'october_val_word_pd_df': r'D:\DATA_PREPROCESS\INTEGRATED_DATASETS\WORD_SCRIPT\OCTOBER\Validation\AMR\라벨링데이터'
}

# DataFrame 이름과 객체를 튜플로 매핑
df_tuples = [
    ('july_train_min_pd_df', july_train_min_pd_df),
    ('july_train_word_pd_df', july_train_word_pd_df),
    ('july_val_min_pd_df', july_val_min_pd_df),
    ('july_val_word_pd_df', july_val_word_pd_df),
    ('august_train_min_pd_df', august_train_min_pd_df),
    ('august_train_word_pd_df', august_train_word_pd_df),
    ('august_val_min_pd_df', august_val_min_pd_df),
    ('august_val_word_pd_df', august_val_word_pd_df),
    ('september_train_min_pd_df', september_train_min_pd_df),
    ('september_train_word_pd_df', september_train_word_pd_df),
    ('september_val_min_pd_df', september_val_min_pd_df),
    ('september_val_word_pd_df', september_val_word_pd_df),
    ('october_train_min_pd_df', october_train_min_pd_df),
    ('october_train_word_pd_df', october_train_word_pd_df),
    ('october_val_min_pd_df', october_val_min_pd_df),
    ('october_val_word_pd_df', october_val_word_pd_df)
]

# 각 DataFrame에 대해 반복하며 유저별로 JSON 파일로 저장
for df_name, df in df_tuples:
    # 해당 DataFrame의 경로 가져오기
    base_path = path_mapping[df_name]

    # 유저별로 그룹화
    grouped = df.groupby('user_id')

    # 각 유저에 대해 반복
    for user_id, group in grouped:
        # 저장할 파일 경로 설정
        file_path = os.path.join(base_path, f'{user_id}.json')

        # 데이터를 JSON 형태로 변환하여 파일에 저장
        group.to_json(file_path, orient='records', force_ascii=False)

## 음성파일 위치 옮기기 231215(유저폴더별)

In [51]:
# 기본 파일 경로
base_current_path = r'D:\DATA_PREPROCESS\FIRESTORE_DATAS\12-08-etri-제출버전\android_VOICE_DATA_FROM_7_TO_10'
base_new_path = r'D:\DATA_PREPROCESS\INTEGRATED_DATASETS'
fixed_new_path_part = r'\AMR\원천데이터'


# DataFrame 이름에서 월과 유형을 추출하여 경로를 결정하는 함수
def get_paths(df_name):
    # 월과 유형 추출
    parts = df_name.split('_')
    month = parts[0].upper()

    type_part = 'wordscript' if 'word' in df_name else 'minutescript'
    section_part = 'validation' if 'val' in df_name else 'training'
    after_type_part = 'WORD_SCRIPT' if 'word' in df_name else 'MINUTE_SCRIPT'
    after_section_part = 'Validation' if 'val' in df_name else 'Training'

    # 현재 파일 경로
    current_file_path = os.path.join(base_current_path, month, section_part + '_' + type_part.lower())
    # 새 파일 경로
    new_file_path = os.path.join(base_new_path, after_type_part, month, after_section_part, fixed_new_path_part.lstrip('\\'))


    print(f"current_file_path = {current_file_path}, new_file_path = {new_file_path}")

    return current_file_path, new_file_path


In [None]:
# def move_file(current_file, new_file):
#     try:
#         os.rename(current_file, new_file)
#     except FileNotFoundError:
#         print(f"File not found: {current_file}, skipped.")
# def move_files(df, df_name, paths_function):
#     current_path, new_base_path = paths_function(df_name)
#     for voice_id in df['voice_id']:
#         # user_id 폴더 생성
#         user_id = df[df['voice_id'] == voice_id]['user_id'].iloc[0]
#         new_path = os.path.join(new_base_path, user_id)
#         if not os.path.exists(new_path):
#             os.makedirs(new_path)

#         # 파일 이동
#         current_file = os.path.join(current_path, voice_id)
#         new_file = os.path.join(new_path, voice_id)
#         move_file(current_file, new_file)

def move_files_for_user(df, user_id, current_path, new_base_path):
    user_df = df[df['user_id'] == user_id]
    new_path = os.path.join(new_base_path, user_id)
    if not os.path.exists(new_path):
        os.makedirs(new_path)

    for voice_id in user_df['voice_id']:
        current_file = os.path.join(current_path, voice_id)
        new_file = os.path.join(new_path, voice_id)
        if os.path.exists(current_file):
            os.rename(current_file, new_file)

def move_files_parallel_by_user(df, df_name, paths_function):
    current_path, new_base_path = paths_function(df_name)
    user_ids = df['user_id'].unique()
    
    with ThreadPoolExecutor(max_workers=10) as executor:
        for user_id in user_ids:
            executor.submit(move_files_for_user, df, user_id, current_path, new_base_path)

In [52]:
move_files_parallel_by_user(july_train_min_pd_df, 'july_train_min_pd_df', get_paths)

current_file_path = D:\DATA_PREPROCESS\FIRESTORE_DATAS\12-08-etri-제출버전\android_VOICE_DATA_FROM_7_TO_10\JULY\training_minutescript, new_file_path = D:\DATA_PREPROCESS\INTEGRATED_DATASETS\MINUTE_SCRIPT\JULY\Training\AMR\원천데이터


In [54]:
move_files_parallel_by_user(july_train_word_pd_df, 'july_train_word_pd_df', get_paths)
move_files_parallel_by_user(july_val_min_pd_df, 'july_val_min_pd_df', get_paths)
move_files_parallel_by_user(july_val_word_pd_df, 'july_val_word_pd_df', get_paths)

current_file_path = D:\DATA_PREPROCESS\FIRESTORE_DATAS\12-08-etri-제출버전\android_VOICE_DATA_FROM_7_TO_10\JULY\training_wordscript, new_file_path = D:\DATA_PREPROCESS\INTEGRATED_DATASETS\WORD_SCRIPT\JULY\Training\AMR\원천데이터
current_file_path = D:\DATA_PREPROCESS\FIRESTORE_DATAS\12-08-etri-제출버전\android_VOICE_DATA_FROM_7_TO_10\JULY\validation_minutescript, new_file_path = D:\DATA_PREPROCESS\INTEGRATED_DATASETS\MINUTE_SCRIPT\JULY\Validation\AMR\원천데이터
current_file_path = D:\DATA_PREPROCESS\FIRESTORE_DATAS\12-08-etri-제출버전\android_VOICE_DATA_FROM_7_TO_10\JULY\validation_wordscript, new_file_path = D:\DATA_PREPROCESS\INTEGRATED_DATASETS\WORD_SCRIPT\JULY\Validation\AMR\원천데이터


In [55]:
move_files_parallel_by_user(august_train_min_pd_df, 'august_train_min_pd_df', get_paths)
move_files_parallel_by_user(august_train_word_pd_df, 'august_train_word_pd_df', get_paths)
move_files_parallel_by_user(august_val_min_pd_df, 'august_val_min_pd_df', get_paths)
move_files_parallel_by_user(august_val_word_pd_df, 'august_val_word_pd_df', get_paths)

current_file_path = D:\DATA_PREPROCESS\FIRESTORE_DATAS\12-08-etri-제출버전\android_VOICE_DATA_FROM_7_TO_10\AUGUST\training_minutescript, new_file_path = D:\DATA_PREPROCESS\INTEGRATED_DATASETS\MINUTE_SCRIPT\AUGUST\Training\AMR\원천데이터
current_file_path = D:\DATA_PREPROCESS\FIRESTORE_DATAS\12-08-etri-제출버전\android_VOICE_DATA_FROM_7_TO_10\AUGUST\training_wordscript, new_file_path = D:\DATA_PREPROCESS\INTEGRATED_DATASETS\WORD_SCRIPT\AUGUST\Training\AMR\원천데이터
current_file_path = D:\DATA_PREPROCESS\FIRESTORE_DATAS\12-08-etri-제출버전\android_VOICE_DATA_FROM_7_TO_10\AUGUST\validation_minutescript, new_file_path = D:\DATA_PREPROCESS\INTEGRATED_DATASETS\MINUTE_SCRIPT\AUGUST\Validation\AMR\원천데이터
current_file_path = D:\DATA_PREPROCESS\FIRESTORE_DATAS\12-08-etri-제출버전\android_VOICE_DATA_FROM_7_TO_10\AUGUST\validation_wordscript, new_file_path = D:\DATA_PREPROCESS\INTEGRATED_DATASETS\WORD_SCRIPT\AUGUST\Validation\AMR\원천데이터


In [56]:
move_files_parallel_by_user(september_train_min_pd_df, 'september_train_min_pd_df', get_paths)
move_files_parallel_by_user(september_train_word_pd_df, 'september_train_word_pd_df', get_paths)
move_files_parallel_by_user(september_val_min_pd_df, 'september_val_min_pd_df', get_paths)
move_files_parallel_by_user(september_val_word_pd_df, 'september_val_word_pd_df', get_paths)

current_file_path = D:\DATA_PREPROCESS\FIRESTORE_DATAS\12-08-etri-제출버전\android_VOICE_DATA_FROM_7_TO_10\SEPTEMBER\training_minutescript, new_file_path = D:\DATA_PREPROCESS\INTEGRATED_DATASETS\MINUTE_SCRIPT\SEPTEMBER\Training\AMR\원천데이터
current_file_path = D:\DATA_PREPROCESS\FIRESTORE_DATAS\12-08-etri-제출버전\android_VOICE_DATA_FROM_7_TO_10\SEPTEMBER\training_wordscript, new_file_path = D:\DATA_PREPROCESS\INTEGRATED_DATASETS\WORD_SCRIPT\SEPTEMBER\Training\AMR\원천데이터
current_file_path = D:\DATA_PREPROCESS\FIRESTORE_DATAS\12-08-etri-제출버전\android_VOICE_DATA_FROM_7_TO_10\SEPTEMBER\validation_minutescript, new_file_path = D:\DATA_PREPROCESS\INTEGRATED_DATASETS\MINUTE_SCRIPT\SEPTEMBER\Validation\AMR\원천데이터
current_file_path = D:\DATA_PREPROCESS\FIRESTORE_DATAS\12-08-etri-제출버전\android_VOICE_DATA_FROM_7_TO_10\SEPTEMBER\validation_wordscript, new_file_path = D:\DATA_PREPROCESS\INTEGRATED_DATASETS\WORD_SCRIPT\SEPTEMBER\Validation\AMR\원천데이터


In [57]:
move_files_parallel_by_user(october_train_min_pd_df, 'october_train_min_pd_df', get_paths)
move_files_parallel_by_user(october_train_word_pd_df, 'october_train_word_pd_df', get_paths)
move_files_parallel_by_user(october_val_min_pd_df, 'october_val_min_pd_df', get_paths)
move_files_parallel_by_user(october_val_word_pd_df, 'october_val_word_pd_df', get_paths)

current_file_path = D:\DATA_PREPROCESS\FIRESTORE_DATAS\12-08-etri-제출버전\android_VOICE_DATA_FROM_7_TO_10\OCTOBER\training_minutescript, new_file_path = D:\DATA_PREPROCESS\INTEGRATED_DATASETS\MINUTE_SCRIPT\OCTOBER\Training\AMR\원천데이터
current_file_path = D:\DATA_PREPROCESS\FIRESTORE_DATAS\12-08-etri-제출버전\android_VOICE_DATA_FROM_7_TO_10\OCTOBER\training_wordscript, new_file_path = D:\DATA_PREPROCESS\INTEGRATED_DATASETS\WORD_SCRIPT\OCTOBER\Training\AMR\원천데이터
current_file_path = D:\DATA_PREPROCESS\FIRESTORE_DATAS\12-08-etri-제출버전\android_VOICE_DATA_FROM_7_TO_10\OCTOBER\validation_minutescript, new_file_path = D:\DATA_PREPROCESS\INTEGRATED_DATASETS\MINUTE_SCRIPT\OCTOBER\Validation\AMR\원천데이터
current_file_path = D:\DATA_PREPROCESS\FIRESTORE_DATAS\12-08-etri-제출버전\android_VOICE_DATA_FROM_7_TO_10\OCTOBER\validation_wordscript, new_file_path = D:\DATA_PREPROCESS\INTEGRATED_DATASETS\WORD_SCRIPT\OCTOBER\Validation\AMR\원천데이터


In [59]:
# 모든 작업이 완료되면 스레드 풀을 종료
executor.shutdown(wait=True)

NameError: name 'executor' is not defined

## 음성파일 위치 옮기기 231218(광고 폴더별)

In [27]:
july_train_word_df.printSchema()

root
 |-- accuracy_array: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- ad_avg_tries: double (nullable = true)
 |-- ad_duration: string (nullable = true)
 |-- ad_link: string (nullable = true)
 |-- ad_name: string (nullable = true)
 |-- ad_script: string (nullable = true)
 |-- ad_title: string (nullable = true)
 |-- average_accuracy_by_ad: double (nullable = true)
 |-- average_accuracy_by_user: double (nullable = true)
 |-- birth_year: string (nullable = true)
 |-- collection: string (nullable = true)
 |-- created_timestamp_array: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- excepted_age_array: string (nullable = true)
 |-- first_created_date: long (nullable = true)
 |-- first_timestamp: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- is_passed: string (nullable = true)
 |-- job: string (nullable = true)
 |-- language: string (nullable = true)
 |-- level: string (nullable = true)
 |-- local_code: string (n

In [24]:
# 기본 파일 경로
base_current_path = r'D:\DATA_PREPROCESS\INTEGRATED_DATASETS\WORD_SCRIPT'
base_new_path = r'D:\DATA_PREPROCESS\INTEGRATED_DATASETS\new_WORD_SCRIPT'
fixed_new_path_part = r'\AMR\원천데이터'

# DataFrame 이름에서 월과 유형을 추출하여 경로를 결정하는 함수
def get_paths(df_name):
    # 월과 유형 추출
    parts = df_name.split('_')
    month = parts[0].upper()

    after_section_part = 'Validation' if 'val' in df_name else 'Training'

    # 현재 파일 경로
    current_file_path = os.path.join(base_current_path, month, after_section_part, fixed_new_path_part.lstrip('\\'))
    # 새 파일 경로
    new_file_path = os.path.join(base_new_path, month, after_section_part, fixed_new_path_part.lstrip('\\'))

    print(f"current_file_path = {current_file_path}, new_file_path = {new_file_path}")

    return current_file_path, new_file_path

In [30]:
# 광고별 음성 파일 이동 함수
def move_files_for_ad(df, ad_id, current_path, new_base_path):
    # 필터링하여 광고별 DataFrame 생성
    ad_df = df.filter(col('ad_name') == ad_id).select('voice_id', 'user_id')
    new_path = os.path.join(new_base_path, ad_id)
    if not os.path.exists(new_path):
        os.makedirs(new_path)

    for row in ad_df.collect():
        voice_id = row['voice_id']
        user_id = row['user_id']
        # 원본 파일 경로에 user_id 폴더 포함
        current_file = os.path.join(current_path, user_id, voice_id)
        new_file = os.path.join(new_path, voice_id)
        if os.path.exists(current_file):
            os.rename(current_file, new_file)

# 광고별 병렬 파일 이동 함수
def move_files_parallel_by_ad(df, df_name, paths_function):
    current_path, new_base_path = paths_function(df_name)
    ad_ids = [row['ad_name'] for row in df.select('ad_name').distinct().collect()]
    
    with ThreadPoolExecutor(max_workers=8) as executor:
        for ad_id in ad_ids:
            executor.submit(move_files_for_ad, df, ad_id, current_path, new_base_path)


In [32]:
move_files_parallel_by_ad(july_val_word_df, 'july_val_word_df', get_paths)

current_file_path = D:\DATA_PREPROCESS\INTEGRATED_DATASETS\WORD_SCRIPT\JULY\Validation\AMR\원천데이터, new_file_path = D:\DATA_PREPROCESS\INTEGRATED_DATASETS\new_WORD_SCRIPT\JULY\Validation\AMR\원천데이터


In [34]:
move_files_parallel_by_ad(july_train_word_df, 'july_train_word_df', get_paths)

current_file_path = D:\DATA_PREPROCESS\INTEGRATED_DATASETS\WORD_SCRIPT\JULY\Training\AMR\원천데이터, new_file_path = D:\DATA_PREPROCESS\INTEGRATED_DATASETS\new_WORD_SCRIPT\JULY\Training\AMR\원천데이터


In [35]:
move_files_parallel_by_ad(august_train_word_df, 'august_train_word_df', get_paths)
move_files_parallel_by_ad(august_val_word_df, 'august_val_word_df', get_paths)

current_file_path = D:\DATA_PREPROCESS\INTEGRATED_DATASETS\WORD_SCRIPT\AUGUST\Training\AMR\원천데이터, new_file_path = D:\DATA_PREPROCESS\INTEGRATED_DATASETS\new_WORD_SCRIPT\AUGUST\Training\AMR\원천데이터
current_file_path = D:\DATA_PREPROCESS\INTEGRATED_DATASETS\WORD_SCRIPT\AUGUST\Validation\AMR\원천데이터, new_file_path = D:\DATA_PREPROCESS\INTEGRATED_DATASETS\new_WORD_SCRIPT\AUGUST\Validation\AMR\원천데이터


In [36]:
move_files_parallel_by_ad(september_train_word_df, 'september_train_word_df', get_paths)
move_files_parallel_by_ad(september_val_word_df, 'september_val_word_df', get_paths)

current_file_path = D:\DATA_PREPROCESS\INTEGRATED_DATASETS\WORD_SCRIPT\SEPTEMBER\Training\AMR\원천데이터, new_file_path = D:\DATA_PREPROCESS\INTEGRATED_DATASETS\new_WORD_SCRIPT\SEPTEMBER\Training\AMR\원천데이터
current_file_path = D:\DATA_PREPROCESS\INTEGRATED_DATASETS\WORD_SCRIPT\SEPTEMBER\Validation\AMR\원천데이터, new_file_path = D:\DATA_PREPROCESS\INTEGRATED_DATASETS\new_WORD_SCRIPT\SEPTEMBER\Validation\AMR\원천데이터


In [37]:
move_files_parallel_by_ad(october_train_word_df, 'october_train_word_df', get_paths)
move_files_parallel_by_ad(october_val_word_df, 'october_val_word_df', get_paths)

current_file_path = D:\DATA_PREPROCESS\INTEGRATED_DATASETS\WORD_SCRIPT\OCTOBER\Training\AMR\원천데이터, new_file_path = D:\DATA_PREPROCESS\INTEGRATED_DATASETS\new_WORD_SCRIPT\OCTOBER\Training\AMR\원천데이터
current_file_path = D:\DATA_PREPROCESS\INTEGRATED_DATASETS\WORD_SCRIPT\OCTOBER\Validation\AMR\원천데이터, new_file_path = D:\DATA_PREPROCESS\INTEGRATED_DATASETS\new_WORD_SCRIPT\OCTOBER\Validation\AMR\원천데이터


# '유저id 폴더'/'광고명 파일' 구조에서 'voice_id 파일'로 바꾸기

In [7]:
july_train_min_target_path = f"D:\\DATA_PREPROCESS\\FIRESTORE_DATAS\\android_VOICE_DATA_FROM_7_TO_10\\JULY\\training_minutescript"
july_train_word_target_path = f"D:\\DATA_PREPROCESS\\FIRESTORE_DATAS\\android_VOICE_DATA_FROM_7_TO_10\\JULY\\training_wordscript"
july_val_min_target_path = f"D:\\DATA_PREPROCESS\\FIRESTORE_DATAS\\android_VOICE_DATA_FROM_7_TO_10\\JULY\\validation_minutescript"
july_val_word_target_path = f"D:\\DATA_PREPROCESS\\FIRESTORE_DATAS\\android_VOICE_DATA_FROM_7_TO_10\\JULY\\validation_wordscript"

august_train_min_target_path = f"D:\\DATA_PREPROCESS\\FIRESTORE_DATAS\\android_VOICE_DATA_FROM_7_TO_10\\AUGUST\\training_minutescript"
august_train_word_target_path = f"D:\\DATA_PREPROCESS\\FIRESTORE_DATAS\\android_VOICE_DATA_FROM_7_TO_10\\AUGUST\\training_wordscript"
august_val_min_target_path = f"D:\\DATA_PREPROCESS\\FIRESTORE_DATAS\\android_VOICE_DATA_FROM_7_TO_10\\AUGUST\\validation_minutescript"
august_val_word_target_path = f"D:\\DATA_PREPROCESS\\FIRESTORE_DATAS\\android_VOICE_DATA_FROM_7_TO_10\\AUGUST\\validation_wordscript"

september_train_min_target_path = f"D:\\DATA_PREPROCESS\\FIRESTORE_DATAS\\android_VOICE_DATA_FROM_7_TO_10\\SEPTEMBER\\training_minutescript"
september_train_word_target_path = f"D:\\DATA_PREPROCESS\\FIRESTORE_DATAS\\android_VOICE_DATA_FROM_7_TO_10\\SEPTEMBER\\training_wordscript"
september_val_min_target_path = f"D:\\DATA_PREPROCESS\\FIRESTORE_DATAS\\android_VOICE_DATA_FROM_7_TO_10\\SEPTEMBER\\validation_minutescript"
september_val_word_target_path = f"D:\\DATA_PREPROCESS\\FIRESTORE_DATAS\\android_VOICE_DATA_FROM_7_TO_10\\SEPTEMBER\\validation_wordscript"

october_train_min_target_path = f"D:\\DATA_PREPROCESS\\FIRESTORE_DATAS\\android_VOICE_DATA_FROM_7_TO_10\\OCTOBER\\training_minutescript"
october_train_word_target_path = f"D:\\DATA_PREPROCESS\\FIRESTORE_DATAS\\android_VOICE_DATA_FROM_7_TO_10\\OCTOBER\\training_wordscript"
october_val_min_target_path = f"D:\\DATA_PREPROCESS\\FIRESTORE_DATAS\\android_VOICE_DATA_FROM_7_TO_10\\OCTOBER\\validation_minutescript"
october_val_word_target_path = f"D:\\DATA_PREPROCESS\\FIRESTORE_DATAS\\android_VOICE_DATA_FROM_7_TO_10\\OCTOBER\\validation_wordscript"


In [8]:
def rename_files(df, target_path):
    # DataFrame에서 필요한 정보 추출
    info = df.select("user_id", "ad_name", "voice_id").collect()

    # 각 레코드에 대한 파일명 변경
    for row in info:
        old_path = os.path.join(target_path, row.user_id, row.ad_name)
        new_path = os.path.join(target_path, row.user_id, row.voice_id)

        if os.path.exists(old_path):
            os.rename(old_path, new_path)

In [6]:
rename_files(july_train_min_df, july_train_min_target_path)

In [9]:
# rename_files(july_train_word_df, july_train_word_target_path)
# rename_files(july_val_min_df, july_val_min_target_path)
# rename_files(july_val_word_df, july_val_word_target_path)

rename_files(august_train_min_df, august_train_min_target_path)
rename_files(august_train_word_df, august_train_word_target_path)
rename_files(august_val_min_df, august_val_min_target_path)
rename_files(august_val_word_df, august_val_word_target_path)

rename_files(september_train_min_df, september_train_min_target_path)
rename_files(september_train_word_df, september_train_word_target_path)
rename_files(september_val_min_df, september_val_min_target_path)
rename_files(september_val_word_df, september_val_word_target_path)

rename_files(october_train_min_df, october_train_min_target_path)
rename_files(october_train_word_df, october_train_word_target_path)
rename_files(october_val_min_df, october_val_min_target_path)
rename_files(october_val_word_df, october_val_word_target_path)

## 유저명 폴더 없에고 그 안의 파일들을 상위폴더로 이동

In [10]:
def move_files_and_remove_folder(root_path):
    # root_path 내의 모든 폴더를 순회
    for user_folder in os.listdir(root_path):
        user_folder_path = os.path.join(root_path, user_folder)

        # 폴더인지 확인
        if os.path.isdir(user_folder_path):
            # 해당 폴더 내의 모든 파일에 대해
            for file_name in os.listdir(user_folder_path):
                file_path = os.path.join(user_folder_path, file_name)
                new_path = os.path.join(root_path, file_name)

                # 파일 이동
                shutil.move(file_path, new_path)

            # 빈 폴더 삭제
            os.rmdir(user_folder_path)

In [11]:
move_files_and_remove_folder("D:\\DATA_PREPROCESS\\FIRESTORE_DATAS\\android_VOICE_DATA_FROM_7_TO_10\\JULY\\training_minutescript")

In [13]:
# 7월 데이터에 대한 작업
move_files_and_remove_folder("D:\\DATA_PREPROCESS\\FIRESTORE_DATAS\\android_VOICE_DATA_FROM_7_TO_10\\JULY\\training_wordscript")
move_files_and_remove_folder("D:\\DATA_PREPROCESS\\FIRESTORE_DATAS\\android_VOICE_DATA_FROM_7_TO_10\\JULY\\validation_minutescript")
move_files_and_remove_folder("D:\\DATA_PREPROCESS\\FIRESTORE_DATAS\\android_VOICE_DATA_FROM_7_TO_10\\JULY\\validation_wordscript")

# 8월 데이터에 대한 작업
move_files_and_remove_folder("D:\\DATA_PREPROCESS\\FIRESTORE_DATAS\\android_VOICE_DATA_FROM_7_TO_10\\AUGUST\\training_minutescript")
move_files_and_remove_folder("D:\\DATA_PREPROCESS\\FIRESTORE_DATAS\\android_VOICE_DATA_FROM_7_TO_10\\AUGUST\\training_wordscript")
move_files_and_remove_folder("D:\\DATA_PREPROCESS\\FIRESTORE_DATAS\\android_VOICE_DATA_FROM_7_TO_10\\AUGUST\\validation_minutescript")
move_files_and_remove_folder("D:\\DATA_PREPROCESS\\FIRESTORE_DATAS\\android_VOICE_DATA_FROM_7_TO_10\\AUGUST\\validation_wordscript")

# 9월 데이터에 대한 작업
move_files_and_remove_folder("D:\\DATA_PREPROCESS\\FIRESTORE_DATAS\\android_VOICE_DATA_FROM_7_TO_10\\SEPTEMBER\\training_minutescript")
move_files_and_remove_folder("D:\\DATA_PREPROCESS\\FIRESTORE_DATAS\\android_VOICE_DATA_FROM_7_TO_10\\SEPTEMBER\\training_wordscript")
move_files_and_remove_folder("D:\\DATA_PREPROCESS\\FIRESTORE_DATAS\\android_VOICE_DATA_FROM_7_TO_10\\SEPTEMBER\\validation_minutescript")
move_files_and_remove_folder("D:\\DATA_PREPROCESS\\FIRESTORE_DATAS\\android_VOICE_DATA_FROM_7_TO_10\\SEPTEMBER\\validation_wordscript")

# 10월 데이터에 대한 작업
move_files_and_remove_folder("D:\\DATA_PREPROCESS\\FIRESTORE_DATAS\\android_VOICE_DATA_FROM_7_TO_10\\OCTOBER\\training_minutescript")
move_files_and_remove_folder("D:\\DATA_PREPROCESS\\FIRESTORE_DATAS\\android_VOICE_DATA_FROM_7_TO_10\\OCTOBER\\training_wordscript")
move_files_and_remove_folder("D:\\DATA_PREPROCESS\\FIRESTORE_DATAS\\android_VOICE_DATA_FROM_7_TO_10\\OCTOBER\\validation_minutescript")
move_files_and_remove_folder("D:\\DATA_PREPROCESS\\FIRESTORE_DATAS\\android_VOICE_DATA_FROM_7_TO_10\\OCTOBER\\validation_wordscript")

# 각각의 데이터 뽑기

In [8]:
# 후보 2
def copy_file(source, target):
    try:
        shutil.copy(source, target)
        return 1  # 성공한 경우
    except Exception as e:
        return 0  # 실패한 경우

def find_and_copy_files(source_path, target_path, df):
    # # 결과 저장을 위한 카운터
    # results = []

    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = []

        for row in df.collect():
            user_id = row['user_id']
            ad_name = row['ad_name']
            user_folder_path = os.path.join(source_path, user_id)
            ad_file_path = os.path.join(user_folder_path, ad_name)
            target_dir = os.path.join(target_path, user_id)

            if not os.path.exists(target_dir):
                os.makedirs(target_dir)

            # 병렬 처리를 위한 작업 추가
            if os.path.isfile(ad_file_path):
                futures.append(executor.submit(copy_file, ad_file_path, target_dir))

        for future in futures:
            future.result()


In [9]:
july_source_path = f"D:\\DATA_PREPROCESS\\FIRESTORE_DATAS\\android_VOICE_DATA_FROM_7_TO_10\\voice_files_from_july"

july_train_min_target_path = f"D:\\DATA_PREPROCESS\\FIRESTORE_DATAS\\android_VOICE_DATA_FROM_7_TO_10\\JULY\\training_minutescript"
july_train_word_target_path = f"D:\\DATA_PREPROCESS\\FIRESTORE_DATAS\\android_VOICE_DATA_FROM_7_TO_10\\JULY\\training_wordscript"
july_val_min_target_path = f"D:\\DATA_PREPROCESS\\FIRESTORE_DATAS\\android_VOICE_DATA_FROM_7_TO_10\\JULY\\validation_minutescript"
july_val_word_target_path = f"D:\\DATA_PREPROCESS\\FIRESTORE_DATAS\\android_VOICE_DATA_FROM_7_TO_10\\JULY\\validation_wordscript"

find_and_copy_files(july_source_path, july_train_min_target_path, july_train_min_df)
find_and_copy_files(july_source_path, july_train_word_target_path, july_train_word_df)
find_and_copy_files(july_source_path, july_val_min_target_path, july_val_min_df)
find_and_copy_files(july_source_path, july_val_word_target_path, july_val_word_df)

In [11]:
august_source_path = f"D:\\DATA_PREPROCESS\\FIRESTORE_DATAS\\android_VOICE_DATA_FROM_7_TO_10\\voice_files_from_august"

august_train_min_target_path = f"D:\\DATA_PREPROCESS\\FIRESTORE_DATAS\\android_VOICE_DATA_FROM_7_TO_10\\AUGUST\\training_minutescript"
august_train_word_target_path = f"D:\\DATA_PREPROCESS\\FIRESTORE_DATAS\\android_VOICE_DATA_FROM_7_TO_10\\AUGUST\\training_wordscript"
august_val_min_target_path = f"D:\\DATA_PREPROCESS\\FIRESTORE_DATAS\\android_VOICE_DATA_FROM_7_TO_10\\AUGUST\\validation_minutescript"
august_val_word_target_path = f"D:\\DATA_PREPROCESS\\FIRESTORE_DATAS\\android_VOICE_DATA_FROM_7_TO_10\\AUGUST\\validation_wordscript"

find_and_copy_files(august_source_path, august_train_min_target_path, august_train_min_df)
find_and_copy_files(august_source_path, august_train_word_target_path, august_train_word_df)
find_and_copy_files(august_source_path, august_val_min_target_path, august_val_min_df)
find_and_copy_files(august_source_path, august_val_word_target_path, august_val_word_df)

In [13]:
september_source_path = f"D:\\DATA_PREPROCESS\\FIRESTORE_DATAS\\android_VOICE_DATA_FROM_7_TO_10\\voice_files_from_september"

september_train_min_target_path = f"D:\\DATA_PREPROCESS\\FIRESTORE_DATAS\\android_VOICE_DATA_FROM_7_TO_10\\SEPTEMBER\\training_minutescript"
september_train_word_target_path = f"D:\\DATA_PREPROCESS\\FIRESTORE_DATAS\\android_VOICE_DATA_FROM_7_TO_10\\SEPTEMBER\\training_wordscript"
september_val_min_target_path = f"D:\\DATA_PREPROCESS\\FIRESTORE_DATAS\\android_VOICE_DATA_FROM_7_TO_10\\SEPTEMBER\\validation_minutescript"
september_val_word_target_path = f"D:\\DATA_PREPROCESS\\FIRESTORE_DATAS\\android_VOICE_DATA_FROM_7_TO_10\\SEPTEMBER\\validation_wordscript"

find_and_copy_files(september_source_path, september_train_min_target_path, september_train_min_df)
find_and_copy_files(september_source_path, september_train_word_target_path, september_train_word_df)
find_and_copy_files(september_source_path, september_val_min_target_path, september_val_min_df)
find_and_copy_files(september_source_path, september_val_word_target_path, september_val_word_df)

In [12]:
october_source_path = f"D:\\DATA_PREPROCESS\\FIRESTORE_DATAS\\android_VOICE_DATA_FROM_7_TO_10\\voice_files_from_october"

october_train_min_target_path = f"D:\\DATA_PREPROCESS\\FIRESTORE_DATAS\\android_VOICE_DATA_FROM_7_TO_10\\OCTOBER\\training_minutescript"
october_train_word_target_path = f"D:\\DATA_PREPROCESS\\FIRESTORE_DATAS\\android_VOICE_DATA_FROM_7_TO_10\\OCTOBER\\training_wordscript"
october_val_min_target_path = f"D:\\DATA_PREPROCESS\\FIRESTORE_DATAS\\android_VOICE_DATA_FROM_7_TO_10\\OCTOBER\\validation_minutescript"
october_val_word_target_path = f"D:\\DATA_PREPROCESS\\FIRESTORE_DATAS\\android_VOICE_DATA_FROM_7_TO_10\\OCTOBER\\validation_wordscript"

find_and_copy_files(october_source_path, october_train_min_target_path, october_train_min_df)
find_and_copy_files(october_source_path, october_train_word_target_path, october_train_word_df)
find_and_copy_files(october_source_path, october_val_min_target_path, october_val_min_df)
find_and_copy_files(october_source_path, october_val_word_target_path, october_val_word_df)

# 뭉테기 라벨링데이터 저장

In [8]:
# 7월 데이터 합치기
july_df = pd.concat([july_train_min_pd_df, july_train_word_pd_df, july_val_min_pd_df, july_val_word_pd_df], axis=0)

# 8월 데이터 합치기
august_df = pd.concat([august_train_min_pd_df, august_train_word_pd_df, august_val_min_pd_df, august_val_word_pd_df], axis=0)

# 9월 데이터 합치기
september_df = pd.concat([september_train_min_pd_df, september_train_word_pd_df, september_val_min_pd_df, september_val_word_pd_df], axis=0)

# 10월 데이터 합치기
october_df = pd.concat([october_train_min_pd_df, october_train_word_pd_df, october_val_min_pd_df, october_val_word_pd_df], axis=0)

In [9]:
july_df.count()

accuracy_array              2479482
ad_avg_tries                2479482
ad_duration                 2479482
ad_link                     2479482
ad_name                     2479482
ad_script                   2479482
ad_title                    2479482
average_accuracy_by_ad      2479482
average_accuracy_by_user    2479482
birth_year                  2439563
collection                  2479482
created_timestamp_array     2479482
excepted_age_array          2479482
first_created_date          2479482
first_timestamp             2479482
gender                      2460606
is_passed                   2479482
job                         2379139
language                    2439563
level                       2479482
local_code                  2460606
participant_count           2471780
pass_rate_by_ad             2479482
pass_rate_by_user           2479482
perfect                     2435323
stt_text_array              2479482
text_length                 2479482
try_count                   

In [34]:
unique_user_count = july_df['user_id'].nunique()
print("고유한 user_id의 개수:", unique_user_count)

고유한 user_id의 개수: 8582


In [31]:
august_df.count()

accuracy_array              3046963
ad_avg_tries                3046963
ad_duration                 3046963
ad_link                     3046963
ad_name                     3046963
ad_script                   3046963
ad_title                    3046963
average_accuracy_by_ad      3046963
average_accuracy_by_user    3046963
birth_year                  3013743
collection                  3046963
created_timestamp_array     3046963
excepted_age_array          3046963
first_created_date          3046963
first_timestamp             3046963
gender                      3034218
is_passed                   3046963
job                         2414312
language                    3013743
level                       3046963
local_code                  3034218
participant_count           3046963
pass_rate_by_ad             3046963
pass_rate_by_user           3046963
perfect                     3008724
stt_text_array              3046963
text_length                 3046963
try_count                   

In [35]:
unique_user_count = august_df['user_id'].nunique()
print("고유한 user_id의 개수:", unique_user_count)

고유한 user_id의 개수: 10385


In [32]:
september_df.count()

accuracy_array              3224269
ad_avg_tries                3224269
ad_duration                 3224269
ad_link                     3224269
ad_name                     3224269
ad_script                   3224269
ad_title                    3224269
average_accuracy_by_ad      3224269
average_accuracy_by_user    3224269
birth_year                  3201233
collection                  3224269
created_timestamp_array     3224269
excepted_age_array          3224269
first_created_date          3224269
first_timestamp             3224269
gender                      3221630
is_passed                   3224269
job                         2007092
language                    3201233
level                       3224269
local_code                  3221630
participant_count           3207561
pass_rate_by_ad             3224269
pass_rate_by_user           3224269
perfect                     3199712
stt_text_array              3224269
text_length                 3224269
try_count                   

In [36]:
unique_user_count = september_df['user_id'].nunique()
print("고유한 user_id의 개수:", unique_user_count)

고유한 user_id의 개수: 9859


In [33]:
october_df.count()

accuracy_array              3205254
ad_avg_tries                3205254
ad_duration                 3205254
ad_link                     3205254
ad_name                     3205254
ad_script                   3205254
ad_title                    3205254
average_accuracy_by_ad      3205254
average_accuracy_by_user    3205254
birth_year                  3188101
collection                  3205254
created_timestamp_array     3205254
excepted_age_array          3205254
first_created_date          3205254
first_timestamp             3205254
gender                      3200336
is_passed                   3205254
job                         1627253
language                    3188101
level                       3205254
local_code                  3200336
participant_count           3185224
pass_rate_by_ad             3205254
pass_rate_by_user           3205254
perfect                     3186194
stt_text_array              3205254
text_length                 3205254
try_count                   

In [37]:
unique_user_count = october_df['user_id'].nunique()
print("고유한 user_id의 개수:", unique_user_count)

고유한 user_id의 개수: 11009


In [10]:
def save_df_to_json(df, base_path, month):
    # 파일 경로 설정
    folder_path = os.path.join(base_path, month.upper(), "AMR", "라벨링데이터")

    # 폴더가 존재하지 않으면 생성
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)

    # 5000행 단위로 분할
    num_splits = math.ceil(len(df) / 5000)
    
    for i in range(num_splits):
        # 분할된 데이터 프레임
        split_df = df.iloc[i*5000 : (i+1)*5000]

        # JSON 파일로 저장
        file_name = f"{month}_part_{i+1}.json"
        file_path = os.path.join(folder_path, file_name)
        split_df.to_json(file_path, orient='records', force_ascii=False, lines=True)


In [11]:
save_df_to_json(july_df, 'E:\\monthly_ETRI_VOWING_VOICE_DATASET_7_to_10', 'JULY')

In [11]:
save_df_to_json(august_df, 'E:\\monthly_ETRI_VOWING_VOICE_DATASET_7_to_10', 'AUGUST')

In [12]:
save_df_to_json(september_df, 'E:\\monthly_ETRI_VOWING_VOICE_DATASET_7_to_10', 'SEPTEMBER')

In [13]:
save_df_to_json(october_df, 'E:\\monthly_ETRI_VOWING_VOICE_DATASET_7_to_10', 'OCTOBER')