In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re # 정규식 사용을 위한 모듈
# from scipy.stats import gaussian_kde

import pyarrow.parquet as pq

from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import types as T
from pyspark.sql import Window
from pyspark.sql.functions import to_timestamp, collect_list, to_date, month, year, broadcast, element_at, regexp_extract, udf, col, isnull, count, when, substring, coalesce, from_json, size, avg, expr, concat_ws
from pyspark.sql.types import BooleanType, TimestampType, ArrayType, StringType, StructType, StructField, DoubleType, IntegerType, FloatType

from functools import reduce
from matplotlib.ticker import FuncFormatter
from collections import Counter
from datetime import datetime

import shutil

from concurrent.futures import ThreadPoolExecutor, as_completed

from sklearn.model_selection import train_test_split

import math

import json

In [2]:
# 가상 환경의 Python 경로를 지정
python_path = "C:/Users/admin/anaconda3/envs/my_conda_01/python.exe"

# Spark 세션 생성
spark = SparkSession.builder \
    .appName("voice files classification operation") \
    .config("spark.driver.memory", "9g") \
    .config("spark.executor.memory", "9g") \
    .config("spark.driver.maxResultSize", "5g") \
    .config("spark.pyspark.python", python_path) \
    .config("spark.local.dir", "D:/spark_tmp") \
    .getOrCreate()

# SparkContext 가져오기
sc = spark.sparkContext

# Spark 중간 파일이 저장되는 경로 확인
current_spark_local_dir = spark.conf.get("spark.local.dir", "Not Set")
print("Spark local directory:", current_spark_local_dir)

Spark local directory: D:/spark_tmp


# 불러오기

In [3]:
labelling_point_file_path = "D:\\DATA_PREPROCESS\\iOS_DATAS\\ios_VOICE_DATA_FROM_7_TO_10\\labelling_data_from_7_to_10\\labelling_point.json\\*.json"
labelling_memor_file_path = "D:\\DATA_PREPROCESS\\iOS_DATAS\\ios_VOICE_DATA_FROM_7_TO_10\\labelling_data_from_7_to_10\\labelling_memor.json\\*.json"

point_df = spark.read.json(labelling_point_file_path)
memor_df = spark.read.json(labelling_memor_file_path)

In [9]:
test_file_path = "D:/DATA_PREPROCESS/iOS_DATAS/ios_VOICE_DATA_FROM_7_TO_10/labelling_data_from_7_to_10/JULY/july_training_minutescript_labeling/*.json"

test_df = spark.read.json(test_file_path)

In [11]:
test_df.show(truncate=False)

+--------+------+--------+----------+--------+-----------+----------+-----------------------------+------+---------+----------------+-----------------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

## 유저별로 조각난 json 모아서 불러오기

In [3]:
train_point_july_path = "D:\\DATA_PREPROCESS\\INTEGRATED_DATASETS\\MINUTE_SCRIPT\\JULY\\Training\\AAC\\라벨링데이터\\*.json"
train_memor_july_path = "D:\\DATA_PREPROCESS\\INTEGRATED_DATASETS\\WORD_SCRIPT\\JULY\\Training\\AAC\\라벨링데이터\\*.json"
val_point_july_path = "D:\\DATA_PREPROCESS\\INTEGRATED_DATASETS\\MINUTE_SCRIPT\\JULY\\Validation\\AAC\\라벨링데이터\\*.json"
val_memor_july_path = "D:\\DATA_PREPROCESS\\INTEGRATED_DATASETS\\WORD_SCRIPT\\JULY\\Validation\\AAC\\라벨링데이터\\*.json"

train_point_july_df = spark.read.json(train_point_july_path)
train_memor_july_df = spark.read.json(train_memor_july_path)
val_point_july_df = spark.read.json(val_point_july_path)
val_memor_july_df = spark.read.json(val_memor_july_path)

In [4]:
train_point_august_path = "D:\\DATA_PREPROCESS\\INTEGRATED_DATASETS\\MINUTE_SCRIPT\\AUGUST\\Training\\AAC\\라벨링데이터\\*.json"
train_memor_august_path = "D:\\DATA_PREPROCESS\\INTEGRATED_DATASETS\\WORD_SCRIPT\\AUGUST\\Training\\AAC\\라벨링데이터\\*.json"
val_point_august_path = "D:\\DATA_PREPROCESS\\INTEGRATED_DATASETS\\MINUTE_SCRIPT\\AUGUST\\Validation\\AAC\\라벨링데이터\\*.json"
val_memor_august_path = "D:\\DATA_PREPROCESS\\INTEGRATED_DATASETS\\WORD_SCRIPT\\AUGUST\\Validation\\AAC\\라벨링데이터\\*.json"

train_point_august_df = spark.read.json(train_point_august_path)
train_memor_august_df = spark.read.json(train_memor_august_path)
val_point_august_df = spark.read.json(val_point_august_path)
val_memor_august_df = spark.read.json(val_memor_august_path)

In [5]:
train_point_september_path = "D:\\DATA_PREPROCESS\\INTEGRATED_DATASETS\\MINUTE_SCRIPT\\SEPTEMBER\\Training\\AAC\\라벨링데이터\\*.json"
train_memor_september_path = "D:\\DATA_PREPROCESS\\INTEGRATED_DATASETS\\WORD_SCRIPT\\SEPTEMBER\\Training\\AAC\\라벨링데이터\\*.json"
val_point_september_path = "D:\\DATA_PREPROCESS\\INTEGRATED_DATASETS\\MINUTE_SCRIPT\\SEPTEMBER\\Validation\\AAC\\라벨링데이터\\*.json"
val_memor_september_path = "D:\\DATA_PREPROCESS\\INTEGRATED_DATASETS\\WORD_SCRIPT\\SEPTEMBER\\Validation\\AAC\\라벨링데이터\\*.json"

train_point_september_df = spark.read.json(train_point_september_path)
train_memor_september_df = spark.read.json(train_memor_september_path)
val_point_september_df = spark.read.json(val_point_september_path)
val_memor_september_df = spark.read.json(val_memor_september_path)

In [6]:
train_point_october_path = "D:\\DATA_PREPROCESS\\INTEGRATED_DATASETS\\MINUTE_SCRIPT\\OCTOBER\\Training\\AAC\\라벨링데이터\\*.json"
train_memor_october_path = "D:\\DATA_PREPROCESS\\INTEGRATED_DATASETS\\WORD_SCRIPT\\OCTOBER\\Training\\AAC\\라벨링데이터\\*.json"
val_point_october_path = "D:\\DATA_PREPROCESS\\INTEGRATED_DATASETS\\MINUTE_SCRIPT\\OCTOBER\\Validation\\AAC\\라벨링데이터\\*.json"
val_memor_october_path = "D:\\DATA_PREPROCESS\\INTEGRATED_DATASETS\\WORD_SCRIPT\\OCTOBER\\Validation\\AAC\\라벨링데이터\\*.json"

train_point_october_df = spark.read.json(train_point_october_path)
train_memor_october_df = spark.read.json(train_memor_october_path)
val_point_october_df = spark.read.json(val_point_october_path)
val_memor_october_df = spark.read.json(val_memor_october_path)

In [7]:
# pandas df들을 spark df들로 바꿈

train_point_july_df = train_point_july_df.toPandas()
train_memor_july_df = train_memor_july_df.toPandas()
val_point_july_df = val_point_july_df.toPandas()
val_memor_july_df = val_memor_july_df.toPandas()

train_point_august_df = train_point_august_df.toPandas()
train_memor_august_df = train_memor_august_df.toPandas()
val_point_august_df = val_point_august_df.toPandas()
val_memor_august_df = val_memor_august_df.toPandas()

train_point_september_df = train_point_september_df.toPandas()
train_memor_september_df = train_memor_september_df.toPandas()
val_point_september_df = val_point_september_df.toPandas()
val_memor_september_df = val_memor_september_df.toPandas()

train_point_october_df = train_point_october_df.toPandas()
train_memor_october_df = train_memor_october_df.toPandas()
val_point_october_df = val_point_october_df.toPandas()
val_memor_october_df = val_memor_october_df.toPandas()

# 문서 작업용

In [8]:
memor_df = pd.concat([train_memor_july_df, val_memor_july_df, train_memor_august_df, val_memor_august_df, train_memor_september_df, val_memor_september_df, train_memor_october_df, val_memor_october_df], axis=0)

In [9]:
point_df = pd.concat([train_point_july_df, val_point_july_df, train_point_august_df, val_point_august_df, train_point_september_df, val_point_september_df, train_point_october_df, val_point_october_df], axis=0)

In [10]:
# 원본 데이터프레임에서 title 별 행의 개수를 계산
title_counts = memor_df['title'].value_counts()
# 원본 데이터프레임에 'num' 컬럼으로 title 별 행의 개수 추가
memor_df['num'] = memor_df['title'].map(title_counts)

# 'title' 컬럼을 기준으로 중복 제거
unique_memor_df = memor_df.drop_duplicates(subset=['title'])

# 날짜 포매팅
unique_memor_df['start_at_formatted'] = pd.to_datetime(unique_memor_df['start_at']).dt.strftime('%y.%m.%d')
unique_memor_df['end_at_formatted'] = pd.to_datetime(unique_memor_df['end_at']).dt.strftime('%y.%m.%d')
unique_memor_df['duration'] = unique_memor_df['start_at_formatted'] + ' - ' + unique_memor_df['end_at_formatted']

# 선택된 컬럼만 포함하는 최종 데이터프레임 생성 (이미 'num' 컬럼이 추가된 상태)
selected_columns = ['title', 'duration', 'script', 'num']
unique_memor_df = unique_memor_df[selected_columns]

# 파일 저장 경로 설정
save_path = 'C:/Users/admin/Desktop/Vowing 앱 관련/VOWING_DATA_Info/aac_음성전사데이터_단어말하기_7월부터10월.xlsx'

# 엑셀 파일로 저장
unique_memor_df.to_excel(save_path, index=False)

# 저장 완료 메시지 출력
print(f"파일이 {save_path}에 저장되었습니다.")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unique_memor_df['start_at_formatted'] = pd.to_datetime(unique_memor_df['start_at']).dt.strftime('%y.%m.%d')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unique_memor_df['end_at_formatted'] = pd.to_datetime(unique_memor_df['end_at']).dt.strftime('%y.%m.%d')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-

파일이 C:/Users/admin/Desktop/Vowing 앱 관련/VOWING_DATA_Info/aac_음성전사데이터_단어말하기_7월부터10월.xlsx에 저장되었습니다.


In [14]:
# 원본 데이터프레임에서 title 별 행의 개수를 계산
title_counts = point_df['title'].value_counts()
# 원본 데이터프레임에 'num' 컬럼으로 title 별 행의 개수 추가
point_df['num'] = point_df['title'].map(title_counts)

# 'title' 컬럼을 기준으로 중복 제거
unique_point_df = point_df.drop_duplicates(subset=['title'])

# 날짜 포매팅
unique_point_df['start_at_formatted'] = pd.to_datetime(unique_point_df['start_at']).dt.strftime('%y.%m.%d')
unique_point_df['end_at_formatted'] = pd.to_datetime(unique_point_df['end_at']).dt.strftime('%y.%m.%d')
unique_point_df['duration'] = unique_point_df['start_at_formatted'] + ' - ' + unique_point_df['end_at_formatted']

# 선택된 컬럼만 포함하는 최종 데이터프레임 생성 (이미 'num' 컬럼이 추가된 상태)
selected_columns = ['title', 'duration', 'script', 'num']
unique_point_df = unique_point_df[selected_columns]

# 파일 저장 경로 설정
save_path = 'C:/Users/admin/Desktop/Vowing 앱 관련/VOWING_DATA_Info/aac_음성전사데이터_1분말하기_7월부터10월.xlsx'

# 엑셀 파일로 저장
unique_point_df.to_excel(save_path, index=False)

# 저장 완료 메시지 출력
print(f"파일이 {save_path}에 저장되었습니다.")

파일이 C:/Users/admin/Desktop/Vowing 앱 관련/VOWING_DATA_Info/aac_음성전사데이터_1분말하기_7월부터10월.xlsx에 저장되었습니다.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unique_point_df['start_at_formatted'] = pd.to_datetime(unique_point_df['start_at']).dt.strftime('%y.%m.%d')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unique_point_df['end_at_formatted'] = pd.to_datetime(unique_point_df['end_at']).dt.strftime('%y.%m.%d')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-

## 각각의 df들에 대한

In [8]:
train_point_july_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19451 entries, 0 to 19450
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   accuracy          19451 non-null  int64 
 1   ads_id            19451 non-null  int64 
 2   audio_id          19451 non-null  int64 
 3   birth             19451 non-null  object
 4   category          19451 non-null  int64 
 5   description       19451 non-null  object
 6   difficulty        19451 non-null  int64 
 7   end_at            19451 non-null  object
 8   gender            19451 non-null  int64 
 9   is_passed         19451 non-null  int64 
 10  minimum_accuracy  19451 non-null  int64 
 11  participated_at   19451 non-null  int64 
 12  participation_id  19451 non-null  object
 13  recorded_text     19451 non-null  object
 14  script            19451 non-null  object
 15  signed_up_at      19451 non-null  object
 16  start_at          19451 non-null  object
 17  title       

In [37]:
unique_user_count = train_point_july_df['user_id'].nunique()
print("고유한 user_id의 개수:", unique_user_count)

고유한 user_id의 개수: 753


In [46]:
# 'script' 컬럼에서 중복을 제거하고 랜덤하게 10개의 스크립트를 선택
unique_scripts = train_point_july_df['script'].drop_duplicates().sample(10)

# 선택된 스크립트에 해당하는 모든 데이터를 필터링
filtered_df = train_point_july_df[train_point_july_df['script'].isin(unique_scripts)]

# 각 스크립트별로 'recorded_text' 컬럼의 유니크한 값 중 최대 3개를 추출
def get_unique_recorded_texts(group):
    return group.drop_duplicates(subset='recorded_text').head(3)

# apply를 사용하여 각 그룹에서 유니크한 'recorded_text'를 가져오고, 다른 필요한 컬럼들도 함께 가져옴
sampled_df = filtered_df.groupby('script', group_keys=False).apply(get_unique_recorded_texts)

# Excel 파일로 저장
save_path = 'C:\\Users\\admin\\Desktop\\Vowing 앱 관련\\VOWING_DATA_Info\\train_point_july_stt_sample.xlsx'
sampled_df.to_excel(save_path, index=False)

print(f"File saved at {save_path}")

File saved at C:\Users\admin\Desktop\Vowing 앱 관련\VOWING_DATA_Info\train_point_july_stt_sample.xlsx


In [9]:
train_memor_july_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27382 entries, 0 to 27381
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   accuracy          27382 non-null  int64 
 1   ads_id            27382 non-null  int64 
 2   audio_id          27382 non-null  int64 
 3   birth             27382 non-null  object
 4   category          27382 non-null  int64 
 5   description       27382 non-null  object
 6   difficulty        27382 non-null  int64 
 7   end_at            27382 non-null  object
 8   gender            27382 non-null  int64 
 9   is_passed         27382 non-null  int64 
 10  minimum_accuracy  27382 non-null  int64 
 11  participated_at   27382 non-null  int64 
 12  participation_id  27382 non-null  object
 13  recorded_text     27382 non-null  object
 14  script            27382 non-null  object
 15  signed_up_at      27382 non-null  object
 16  start_at          27382 non-null  object
 17  title       

In [40]:
unique_ads = train_memor_july_df['title'].nunique()
print("고유한 ads_id의 개수:", unique_ads)

고유한 ads_id의 개수: 617


In [10]:
val_point_july_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5687 entries, 0 to 5686
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   accuracy          5687 non-null   int64 
 1   ads_id            5687 non-null   int64 
 2   audio_id          5687 non-null   int64 
 3   birth             5687 non-null   object
 4   category          5687 non-null   int64 
 5   description       5687 non-null   object
 6   difficulty        5687 non-null   int64 
 7   end_at            5687 non-null   object
 8   gender            5687 non-null   int64 
 9   is_passed         5687 non-null   int64 
 10  minimum_accuracy  5687 non-null   int64 
 11  participated_at   5687 non-null   int64 
 12  participation_id  5687 non-null   object
 13  recorded_text     5687 non-null   object
 14  script            5687 non-null   object
 15  signed_up_at      5687 non-null   object
 16  start_at          5687 non-null   object
 17  title         

In [11]:
val_memor_july_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9014 entries, 0 to 9013
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   accuracy          9014 non-null   int64 
 1   ads_id            9014 non-null   int64 
 2   audio_id          9014 non-null   int64 
 3   birth             9014 non-null   object
 4   category          9014 non-null   int64 
 5   description       9014 non-null   object
 6   difficulty        9014 non-null   int64 
 7   end_at            9014 non-null   object
 8   gender            9014 non-null   int64 
 9   is_passed         9014 non-null   int64 
 10  minimum_accuracy  9014 non-null   int64 
 11  participated_at   9014 non-null   int64 
 12  participation_id  9014 non-null   object
 13  recorded_text     9014 non-null   object
 14  script            9014 non-null   object
 15  signed_up_at      9014 non-null   object
 16  start_at          9014 non-null   object
 17  title         

In [12]:
train_point_august_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24578 entries, 0 to 24577
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   accuracy          24578 non-null  int64 
 1   ads_id            24578 non-null  int64 
 2   audio_id          24578 non-null  int64 
 3   birth             24578 non-null  object
 4   category          24578 non-null  int64 
 5   description       24578 non-null  object
 6   difficulty        24578 non-null  int64 
 7   end_at            24578 non-null  object
 8   gender            24578 non-null  int64 
 9   is_passed         24578 non-null  int64 
 10  minimum_accuracy  24578 non-null  int64 
 11  participated_at   24578 non-null  int64 
 12  participation_id  24578 non-null  object
 13  recorded_text     24578 non-null  object
 14  script            24578 non-null  object
 15  signed_up_at      24578 non-null  object
 16  start_at          24578 non-null  object
 17  title       

In [29]:
unique_user_count = train_point_august_df['user_id'].nunique()
print("고유한 user_id의 개수:", unique_user_count)

고유한 user_id의 개수: 977


In [13]:
train_memor_august_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 114531 entries, 0 to 114530
Data columns (total 21 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   accuracy          114531 non-null  int64 
 1   ads_id            114531 non-null  int64 
 2   audio_id          114531 non-null  int64 
 3   birth             114531 non-null  object
 4   category          114531 non-null  int64 
 5   description       114531 non-null  object
 6   difficulty        114531 non-null  int64 
 7   end_at            114531 non-null  object
 8   gender            114531 non-null  int64 
 9   is_passed         114531 non-null  int64 
 10  minimum_accuracy  114531 non-null  int64 
 11  participated_at   114531 non-null  int64 
 12  participation_id  114531 non-null  object
 13  recorded_text     114531 non-null  object
 14  script            114531 non-null  object
 15  signed_up_at      114531 non-null  object
 16  start_at          114531 non-null  obj

In [41]:
unique_ads = train_memor_august_df['title'].nunique()
print("고유한 ads_id의 개수:", unique_ads)

고유한 ads_id의 개수: 1799


In [14]:
val_point_august_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7089 entries, 0 to 7088
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   accuracy          7089 non-null   int64 
 1   ads_id            7089 non-null   int64 
 2   audio_id          7089 non-null   int64 
 3   birth             7089 non-null   object
 4   category          7089 non-null   int64 
 5   description       7089 non-null   object
 6   difficulty        7089 non-null   int64 
 7   end_at            7089 non-null   object
 8   gender            7089 non-null   int64 
 9   is_passed         7089 non-null   int64 
 10  minimum_accuracy  7089 non-null   int64 
 11  participated_at   7089 non-null   int64 
 12  participation_id  7089 non-null   object
 13  recorded_text     7089 non-null   object
 14  script            7089 non-null   object
 15  signed_up_at      7089 non-null   object
 16  start_at          7089 non-null   object
 17  title         

In [15]:
val_memor_august_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37980 entries, 0 to 37979
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   accuracy          37980 non-null  int64 
 1   ads_id            37980 non-null  int64 
 2   audio_id          37980 non-null  int64 
 3   birth             37980 non-null  object
 4   category          37980 non-null  int64 
 5   description       37980 non-null  object
 6   difficulty        37980 non-null  int64 
 7   end_at            37980 non-null  object
 8   gender            37980 non-null  int64 
 9   is_passed         37980 non-null  int64 
 10  minimum_accuracy  37980 non-null  int64 
 11  participated_at   37980 non-null  int64 
 12  participation_id  37980 non-null  object
 13  recorded_text     37980 non-null  object
 14  script            37980 non-null  object
 15  signed_up_at      37980 non-null  object
 16  start_at          37980 non-null  object
 17  title       

In [16]:
train_point_september_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17922 entries, 0 to 17921
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   accuracy          17922 non-null  int64 
 1   ads_id            17922 non-null  int64 
 2   audio_id          17922 non-null  int64 
 3   birth             17922 non-null  object
 4   category          17922 non-null  int64 
 5   description       17922 non-null  object
 6   difficulty        17922 non-null  int64 
 7   end_at            17922 non-null  object
 8   gender            17922 non-null  int64 
 9   is_passed         17922 non-null  int64 
 10  minimum_accuracy  17922 non-null  int64 
 11  participated_at   17922 non-null  int64 
 12  participation_id  17922 non-null  object
 13  recorded_text     17922 non-null  object
 14  script            17922 non-null  object
 15  signed_up_at      17922 non-null  object
 16  start_at          17922 non-null  object
 17  title       

In [27]:
unique_user_count = train_point_september_df['user_id'].nunique()
print("고유한 user_id의 개수:", unique_user_count)

고유한 user_id의 개수: 845


In [17]:
train_memor_september_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 292226 entries, 0 to 292225
Data columns (total 21 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   accuracy          292226 non-null  int64 
 1   ads_id            292226 non-null  int64 
 2   audio_id          292226 non-null  int64 
 3   birth             292226 non-null  object
 4   category          292226 non-null  int64 
 5   description       292226 non-null  object
 6   difficulty        292226 non-null  int64 
 7   end_at            292226 non-null  object
 8   gender            292226 non-null  int64 
 9   is_passed         292226 non-null  int64 
 10  minimum_accuracy  292226 non-null  int64 
 11  participated_at   292226 non-null  int64 
 12  participation_id  292226 non-null  object
 13  recorded_text     292226 non-null  object
 14  script            292226 non-null  object
 15  signed_up_at      292226 non-null  object
 16  start_at          292226 non-null  obj

In [42]:
unique_ads = train_memor_september_df['title'].nunique()
print("고유한 ads_id의 개수:", unique_ads)

고유한 ads_id의 개수: 4523


In [18]:
val_point_september_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4856 entries, 0 to 4855
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   accuracy          4856 non-null   int64 
 1   ads_id            4856 non-null   int64 
 2   audio_id          4856 non-null   int64 
 3   birth             4856 non-null   object
 4   category          4856 non-null   int64 
 5   description       4856 non-null   object
 6   difficulty        4856 non-null   int64 
 7   end_at            4856 non-null   object
 8   gender            4856 non-null   int64 
 9   is_passed         4856 non-null   int64 
 10  minimum_accuracy  4856 non-null   int64 
 11  participated_at   4856 non-null   int64 
 12  participation_id  4856 non-null   object
 13  recorded_text     4856 non-null   object
 14  script            4856 non-null   object
 15  signed_up_at      4856 non-null   object
 16  start_at          4856 non-null   object
 17  title         

In [19]:
val_memor_september_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 97216 entries, 0 to 97215
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   accuracy          97216 non-null  int64 
 1   ads_id            97216 non-null  int64 
 2   audio_id          97216 non-null  int64 
 3   birth             97216 non-null  object
 4   category          97216 non-null  int64 
 5   description       97216 non-null  object
 6   difficulty        97216 non-null  int64 
 7   end_at            97216 non-null  object
 8   gender            97216 non-null  int64 
 9   is_passed         97216 non-null  int64 
 10  minimum_accuracy  97216 non-null  int64 
 11  participated_at   97216 non-null  int64 
 12  participation_id  97216 non-null  object
 13  recorded_text     97216 non-null  object
 14  script            97216 non-null  object
 15  signed_up_at      97216 non-null  object
 16  start_at          97216 non-null  object
 17  title       

In [20]:
train_point_october_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16959 entries, 0 to 16958
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   accuracy          16959 non-null  int64  
 1   ads_id            16959 non-null  int64  
 2   audio_id          16959 non-null  int64  
 3   birth             16959 non-null  object 
 4   category          16959 non-null  int64  
 5   description       16959 non-null  object 
 6   difficulty        16959 non-null  int64  
 7   end_at            16959 non-null  object 
 8   gender            16959 non-null  float64
 9   is_passed         16959 non-null  int64  
 10  minimum_accuracy  16959 non-null  int64  
 11  participated_at   16959 non-null  int64  
 12  participation_id  16959 non-null  object 
 13  recorded_text     16959 non-null  object 
 14  script            16959 non-null  object 
 15  signed_up_at      16959 non-null  object 
 16  start_at          16959 non-null  object

In [25]:
unique_user_count = train_point_october_df['user_id'].nunique()
print("고유한 user_id의 개수:", unique_user_count)

고유한 user_id의 개수: 883


In [21]:
train_memor_october_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 245307 entries, 0 to 245306
Data columns (total 21 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   accuracy          245307 non-null  int64  
 1   ads_id            245307 non-null  int64  
 2   audio_id          245307 non-null  int64  
 3   birth             245307 non-null  object 
 4   category          245307 non-null  int64  
 5   description       245307 non-null  object 
 6   difficulty        245307 non-null  int64  
 7   end_at            245307 non-null  object 
 8   gender            245307 non-null  float64
 9   is_passed         245307 non-null  int64  
 10  minimum_accuracy  245307 non-null  int64  
 11  participated_at   245307 non-null  int64  
 12  participation_id  245307 non-null  object 
 13  recorded_text     245307 non-null  object 
 14  script            245307 non-null  object 
 15  signed_up_at      245307 non-null  object 
 16  start_at          24

In [43]:
unique_ads = train_memor_october_df['title'].nunique()
print("고유한 ads_id의 개수:", unique_ads)

고유한 ads_id의 개수: 3858


In [22]:
val_point_october_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4369 entries, 0 to 4368
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   accuracy          4369 non-null   int64  
 1   ads_id            4369 non-null   int64  
 2   audio_id          4369 non-null   int64  
 3   birth             4369 non-null   object 
 4   category          4369 non-null   int64  
 5   description       4369 non-null   object 
 6   difficulty        4369 non-null   int64  
 7   end_at            4369 non-null   object 
 8   gender            4369 non-null   float64
 9   is_passed         4369 non-null   int64  
 10  minimum_accuracy  4369 non-null   int64  
 11  participated_at   4369 non-null   int64  
 12  participation_id  4369 non-null   object 
 13  recorded_text     4369 non-null   object 
 14  script            4369 non-null   object 
 15  signed_up_at      4369 non-null   object 
 16  start_at          4369 non-null   object 


In [23]:
val_memor_october_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81506 entries, 0 to 81505
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   accuracy          81506 non-null  int64  
 1   ads_id            81506 non-null  int64  
 2   audio_id          81506 non-null  int64  
 3   birth             81506 non-null  object 
 4   category          81506 non-null  int64  
 5   description       81506 non-null  object 
 6   difficulty        81506 non-null  int64  
 7   end_at            81506 non-null  object 
 8   gender            81506 non-null  float64
 9   is_passed         81506 non-null  int64  
 10  minimum_accuracy  81506 non-null  int64  
 11  participated_at   81506 non-null  int64  
 12  participation_id  81506 non-null  object 
 13  recorded_text     81506 non-null  object 
 14  script            81506 non-null  object 
 15  signed_up_at      81506 non-null  object 
 16  start_at          81506 non-null  object

# 광고별 음성파일 위치 옮기기

In [42]:
train_memor_july_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27382 entries, 0 to 27381
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   accuracy          27382 non-null  int64 
 1   ads_id            27382 non-null  int64 
 2   audio_id          27382 non-null  int64 
 3   birth             27382 non-null  object
 4   category          27382 non-null  int64 
 5   description       27382 non-null  object
 6   difficulty        27382 non-null  int64 
 7   end_at            27382 non-null  object
 8   gender            27382 non-null  int64 
 9   is_passed         27382 non-null  int64 
 10  minimum_accuracy  27382 non-null  int64 
 11  participated_at   27382 non-null  int64 
 12  participation_id  27382 non-null  object
 13  recorded_text     27382 non-null  object
 14  script            27382 non-null  object
 15  signed_up_at      27382 non-null  object
 16  start_at          27382 non-null  object
 17  title       

In [28]:
# 기본 파일 경로
base_current_path = r'D:\DATA_PREPROCESS\INTEGRATED_DATASETS\WORD_SCRIPT'
base_new_path = r'D:\DATA_PREPROCESS\INTEGRATED_DATASETS\new_WORD_SCRIPT'
fixed_new_path_part = r'\AAC\원천데이터'

# DataFrame 이름에서 월과 유형을 추출하여 경로를 결정하는 함수
def get_paths(df_name):
    # 월과 유형 추출
    parts = df_name.split('_')
    month = parts[2].upper()

    after_section_part = 'Validation' if 'val' in df_name else 'Training'

    # 현재 파일 경로
    current_file_path = os.path.join(base_current_path, month, after_section_part, fixed_new_path_part.lstrip('\\'))
    # 새 파일 경로
    new_file_path = os.path.join(base_new_path, month, after_section_part, fixed_new_path_part.lstrip('\\'))

    print(f"current_file_path = {current_file_path}, new_file_path = {new_file_path}")

    return current_file_path, new_file_path

In [31]:
# 광고별 음성 파일 이동 함수
def move_files_for_ad(df, ad_id, current_path, new_base_path):
    # 필터링하여 광고별 DataFrame 생성
    ad_df = df.filter(col('title') == ad_id).select('participation_id', 'user_id')
    new_path = os.path.join(new_base_path, ad_id)
    if not os.path.exists(new_path):
        os.makedirs(new_path)

    for row in ad_df.collect():
        participation_id = row['participation_id']
        user_id = str(row['user_id'])
        # 원본 파일 경로에 user_id 폴더 포함
        current_file = os.path.join(current_path, user_id, participation_id)
        new_file = os.path.join(new_path, participation_id)
        if os.path.exists(current_file):
            os.rename(current_file, new_file)

# 광고별 병렬 파일 이동 함수
def move_files_parallel_by_ad(df, df_name, paths_function):
    current_path, new_base_path = paths_function(df_name)
    ad_ids = [row['title'] for row in df.select('title').distinct().collect()]
    
    with ThreadPoolExecutor(max_workers=8) as executor:
        for ad_id in ad_ids:
            executor.submit(move_files_for_ad, df, ad_id, current_path, new_base_path)


In [32]:
move_files_parallel_by_ad(train_memor_july_df, 'train_memor_july_df', get_paths)

current_file_path = D:\DATA_PREPROCESS\INTEGRATED_DATASETS\WORD_SCRIPT\JULY\Training\AAC\원천데이터, new_file_path = D:\DATA_PREPROCESS\INTEGRATED_DATASETS\new_WORD_SCRIPT\JULY\Training\AAC\원천데이터


In [33]:
move_files_parallel_by_ad(val_memor_july_df, 'val_memor_july_df', get_paths)

current_file_path = D:\DATA_PREPROCESS\INTEGRATED_DATASETS\WORD_SCRIPT\JULY\Validation\AAC\원천데이터, new_file_path = D:\DATA_PREPROCESS\INTEGRATED_DATASETS\new_WORD_SCRIPT\JULY\Validation\AAC\원천데이터


In [37]:
move_files_parallel_by_ad(train_memor_august_df, 'train_memor_august_df', get_paths)
move_files_parallel_by_ad(val_memor_august_df, 'val_memor_august_df', get_paths)

current_file_path = D:\DATA_PREPROCESS\INTEGRATED_DATASETS\WORD_SCRIPT\AUGUST\Training\AAC\원천데이터, new_file_path = D:\DATA_PREPROCESS\INTEGRATED_DATASETS\new_WORD_SCRIPT\AUGUST\Training\AAC\원천데이터
current_file_path = D:\DATA_PREPROCESS\INTEGRATED_DATASETS\WORD_SCRIPT\AUGUST\Validation\AAC\원천데이터, new_file_path = D:\DATA_PREPROCESS\INTEGRATED_DATASETS\new_WORD_SCRIPT\AUGUST\Validation\AAC\원천데이터


In [35]:
move_files_parallel_by_ad(train_memor_september_df, 'train_memor_september_df', get_paths)
move_files_parallel_by_ad(val_memor_september_df, 'val_memor_september_df', get_paths)

current_file_path = D:\DATA_PREPROCESS\INTEGRATED_DATASETS\WORD_SCRIPT\SEPTEMBER\Training\AAC\원천데이터, new_file_path = D:\DATA_PREPROCESS\INTEGRATED_DATASETS\new_WORD_SCRIPT\SEPTEMBER\Training\AAC\원천데이터
current_file_path = D:\DATA_PREPROCESS\INTEGRATED_DATASETS\WORD_SCRIPT\SEPTEMBER\Validation\AAC\원천데이터, new_file_path = D:\DATA_PREPROCESS\INTEGRATED_DATASETS\new_WORD_SCRIPT\SEPTEMBER\Validation\AAC\원천데이터


In [36]:
move_files_parallel_by_ad(train_memor_october_df, 'train_memor_october_df', get_paths)
move_files_parallel_by_ad(val_memor_october_df, 'val_memor_october_df', get_paths)

current_file_path = D:\DATA_PREPROCESS\INTEGRATED_DATASETS\WORD_SCRIPT\OCTOBER\Training\AAC\원천데이터, new_file_path = D:\DATA_PREPROCESS\INTEGRATED_DATASETS\new_WORD_SCRIPT\OCTOBER\Training\AAC\원천데이터
current_file_path = D:\DATA_PREPROCESS\INTEGRATED_DATASETS\WORD_SCRIPT\OCTOBER\Validation\AAC\원천데이터, new_file_path = D:\DATA_PREPROCESS\INTEGRATED_DATASETS\new_WORD_SCRIPT\OCTOBER\Validation\AAC\원천데이터


## 광고별 메타데이터 JSON 저장

In [40]:
# pandas df 기준

# 경로 설정을 위한 매핑
path_mapping = {
    
    'train_memor_july_df': r'D:\DATA_PREPROCESS\INTEGRATED_DATASETS\new_WORD_SCRIPT\JULY\Training\AAC\라벨링데이터',
    # 'july_val_min_pd_df': r'D:\DATA_PREPROCESS\INTEGRATED_DATASETS\MINUTE_SCRIPT\JULY\Validation\AAC\라벨링데이터',
    'val_memor_july_df': r'D:\DATA_PREPROCESS\INTEGRATED_DATASETS\new_WORD_SCRIPT\JULY\Validation\AAC\라벨링데이터',
    # 'august_train_min_pd_df': r'D:\DATA_PREPROCESS\INTEGRATED_DATASETS\MINUTE_SCRIPT\AUGUST\Training\AAC\라벨링데이터',
    'train_memor_august_df': r'D:\DATA_PREPROCESS\INTEGRATED_DATASETS\new_WORD_SCRIPT\AUGUST\Training\AAC\라벨링데이터',
    # 'august_val_min_pd_df': r'D:\DATA_PREPROCESS\INTEGRATED_DATASETS\MINUTE_SCRIPT\AUGUST\Validation\AAC\라벨링데이터',
    'val_memor_august_df': r'D:\DATA_PREPROCESS\INTEGRATED_DATASETS\new_WORD_SCRIPT\AUGUST\Validation\AAC\라벨링데이터',
    # 'september_train_min_pd_df': r'D:\DATA_PREPROCESS\INTEGRATED_DATASETS\MINUTE_SCRIPT\SEPTEMBER\Training\AAC\라벨링데이터',
    'train_memor_september_df': r'D:\DATA_PREPROCESS\INTEGRATED_DATASETS\new_WORD_SCRIPT\SEPTEMBER\Training\AAC\라벨링데이터',
    # 'september_val_min_pd_df': r'D:\DATA_PREPROCESS\INTEGRATED_DATASETS\MINUTE_SCRIPT\SEPTEMBER\Validation\AAC\라벨링데이터',
    'val_memor_september_df': r'D:\DATA_PREPROCESS\INTEGRATED_DATASETS\new_WORD_SCRIPT\SEPTEMBER\Validation\AAC\라벨링데이터',
    # 'october_train_min_pd_df': r'D:\DATA_PREPROCESS\INTEGRATED_DATASETS\MINUTE_SCRIPT\OCTOBER\Training\AAC\라벨링데이터',
    'train_memor_october_df': r'D:\DATA_PREPROCESS\INTEGRATED_DATASETS\new_WORD_SCRIPT\OCTOBER\Training\AAC\라벨링데이터',
    # 'october_val_min_pd_df': r'D:\DATA_PREPROCESS\INTEGRATED_DATASETS\MINUTE_SCRIPT\OCTOBER\Validation\AAC\라벨링데이터',
    'val_memor_october_df': r'D:\DATA_PREPROCESS\INTEGRATED_DATASETS\new_WORD_SCRIPT\OCTOBER\Validation\AAC\라벨링데이터'
}

# DataFrame 이름과 객체를 튜플로 매핑
df_tuples = [
    # ('july_train_min_pd_df', july_train_min_pd_df),
    ('train_memor_july_df', train_memor_july_df),
    # ('july_val_min_pd_df', july_val_min_pd_df),
    ('val_memor_july_df', val_memor_july_df),
    # ('august_train_min_pd_df', august_train_min_pd_df),
    ('train_memor_august_df', train_memor_august_df),
    # ('august_val_min_pd_df', august_val_min_pd_df),
    ('val_memor_august_df', val_memor_august_df),
    # ('september_train_min_pd_df', september_train_min_pd_df),
    ('train_memor_september_df', train_memor_september_df),
    # ('september_val_min_pd_df', september_val_min_pd_df),
    ('val_memor_september_df', val_memor_september_df),
    # ('october_train_min_pd_df', october_train_min_pd_df),
    ('train_memor_october_df', train_memor_october_df),
    # ('october_val_min_pd_df', october_val_min_pd_df),
    ('val_memor_october_df', val_memor_october_df)
]

# 각 DataFrame에 대해 반복하며 유저별로 JSON 파일로 저장
for df_name, df in df_tuples:
    # 해당 DataFrame의 경로 가져오기
    base_path = path_mapping[df_name]

    # 광고별로 그룹화
    grouped = df.groupby('title')

    # 각 광고에 대해 반복
    for ad_name, group in grouped:
        # 저장할 파일 경로 설정
        file_path = os.path.join(base_path, f'{ad_name}.json')

        # 데이터를 JSON 형태로 변환하여 파일에 저장
        group.to_json(file_path, orient='records', force_ascii=False)

# 불러온 데이터 확인

In [4]:
point_df.printSchema()
point_df.count()

root
 |-- accuracy: long (nullable = true)
 |-- ads_id: long (nullable = true)
 |-- audio_id: long (nullable = true)
 |-- birth: string (nullable = true)
 |-- category: long (nullable = true)
 |-- description: string (nullable = true)
 |-- difficulty: long (nullable = true)
 |-- end_at: string (nullable = true)
 |-- gender: long (nullable = true)
 |-- is_passed: long (nullable = true)
 |-- minimum_accuracy: long (nullable = true)
 |-- participated_at: string (nullable = true)
 |-- participation_id: string (nullable = true)
 |-- recorded_text: string (nullable = true)
 |-- script: string (nullable = true)
 |-- signed_up_at: string (nullable = true)
 |-- start_at: string (nullable = true)
 |-- title: string (nullable = true)
 |-- type: long (nullable = true)
 |-- user_id: long (nullable = true)
 |-- withdrawn_at: string (nullable = true)



113714

In [10]:
point_df.select("participated_at").show(truncate=False)

+-----------------------------+
|participated_at              |
+-----------------------------+
|2023-07-01T00:20:09.781+09:00|
|2023-07-01T00:37:11.074+09:00|
|2023-07-01T02:46:50.785+09:00|
|2023-07-01T04:19:43.956+09:00|
|2023-07-01T04:22:53.259+09:00|
|2023-07-01T05:44:08.982+09:00|
|2023-07-01T05:48:20.800+09:00|
|2023-07-01T10:42:18.906+09:00|
|2023-07-01T10:44:45.943+09:00|
|2023-07-01T16:27:26.334+09:00|
|2023-07-01T20:21:43.456+09:00|
|2023-07-01T22:59:17.667+09:00|
|2023-07-01T23:19:07.386+09:00|
|2023-07-01T23:22:06.089+09:00|
|2023-07-01T23:59:41.286+09:00|
|2023-07-02T01:17:27.976+09:00|
|2023-07-02T13:26:21.097+09:00|
|2023-07-02T13:31:01.787+09:00|
|2023-07-02T13:45:22.848+09:00|
|2023-07-02T15:18:30.472+09:00|
+-----------------------------+
only showing top 20 rows



In [30]:
point_df.show()

+--------+------+--------+----------+--------+-----------+----------+--------------------+------+---------+----------------+--------------------+-------------------+----------------------------------+----------------------------------+--------------------+--------------------+------------------------------+----+-------------------+--------------------+
|accuracy|ads_id|audio_id|     birth|category|description|difficulty|              end_at|gender|is_passed|minimum_accuracy|     participated_at|   participation_id|                     recorded_text|                            script|        signed_up_at|            start_at|                         title|type|            user_id|        withdrawn_at|
+--------+------+--------+----------+--------+-----------+----------+--------------------+------+---------+----------------+--------------------+-------------------+----------------------------------+----------------------------------+--------------------+--------------------+-------------

In [14]:
memor_df.printSchema()
memor_df.count()

root
 |-- accuracy: long (nullable = true)
 |-- ads_id: long (nullable = true)
 |-- audio_id: long (nullable = true)
 |-- birth: string (nullable = true)
 |-- category: long (nullable = true)
 |-- description: string (nullable = true)
 |-- difficulty: long (nullable = true)
 |-- end_at: string (nullable = true)
 |-- gender: long (nullable = true)
 |-- is_passed: long (nullable = true)
 |-- minimum_accuracy: long (nullable = true)
 |-- participated_at: string (nullable = true)
 |-- participation_id: string (nullable = true)
 |-- recorded_text: string (nullable = true)
 |-- script: string (nullable = true)
 |-- signed_up_at: string (nullable = true)
 |-- start_at: string (nullable = true)
 |-- title: string (nullable = true)
 |-- type: long (nullable = true)
 |-- user_id: long (nullable = true)
 |-- withdrawn_at: string (nullable = true)



907406

# 월별로 라벨링 데이터 분리
  
포인트벌기, 암기플러스 별 월별 데이터

In [4]:
# participated_at 컬럼을 timestamp 타입으로 변환
point_df = point_df.withColumn("participated_at", to_timestamp("participated_at"))
memor_df = memor_df.withColumn("participated_at", to_timestamp("participated_at"))

In [5]:
# 함수 정의: 주어진 연도와 월에 해당하는 데이터만 필터링하는 함수
def filter_df_by_month(df, year_val, month_val):
    return df.filter((year(col("participated_at")) == year_val) & (month(col("participated_at")) == month_val))

point_july_df = filter_df_by_month(point_df, 2023, 7)
point_august_df = filter_df_by_month(point_df, 2023, 8)
point_september_df = filter_df_by_month(point_df, 2023, 9)
point_october_df = filter_df_by_month(point_df, 2023, 10)

memor_july_df = filter_df_by_month(memor_df, 2023, 7)
memor_august_df = filter_df_by_month(memor_df, 2023, 8)
memor_september_df = filter_df_by_month(memor_df, 2023, 9)
memor_october_df = filter_df_by_month(memor_df, 2023, 10)

In [6]:
print(f"7월 총: {point_july_df.count() + memor_july_df.count()} 포인트: {point_july_df.count()} + 7월 암기플: {memor_july_df.count()} ")
print(f"8월 총: {point_august_df.count() + memor_august_df.count()} 포인트: {point_august_df.count()} + 7월 암기플: {memor_august_df.count()} ")
print(f"9월 총: {point_september_df.count() + memor_september_df.count()} 포인트: {point_september_df.count()} + 7월 암기플: {memor_september_df.count()} ")
print(f"10월 총: {point_october_df.count() + memor_october_df.count()} 포인트: {point_october_df.count()} + 7월 암기플: {memor_october_df.count()} ")

7월 총: 64551 포인트: 27853 + 7월 암기플: 36698 
8월 총: 188471 포인트: 35209 + 7월 암기플: 153262 
9월 총: 415980 포인트: 25992 + 7월 암기플: 389988 
10월 총: 352118 포인트: 24660 + 7월 암기플: 327458 


# Training & Validation sets로 데이터셋 분리

포인트벌기, 암기플러스 별 > 월별 > Training, Validation 별

In [9]:
# PySpark DataFrame을 Pandas DataFrame으로 변환하는 함수
def split_dataframe(spark_df, test_size=0.25):
    # PySpark DataFrame을 Pandas DataFrame으로 변환
    pandas_df = spark_df.toPandas()
    
    # train_test_split 함수를 사용해 데이터 분할
    train_df, val_df = train_test_split(pandas_df, test_size=test_size, random_state=42)
    
    return train_df, val_df

In [10]:
train_point_july_df, val_point_july_df = split_dataframe(point_july_df)
train_point_august_df, val_point_august_df = split_dataframe(point_august_df)
train_point_september_df, val_point_september_df = split_dataframe(point_september_df)
train_point_october_df, val_point_october_df = split_dataframe(point_october_df)

train_memor_july_df, val_memor_july_df = split_dataframe(memor_july_df)
train_memor_august_df, val_memor_august_df = split_dataframe(memor_august_df)
train_memor_september_df, val_memor_september_df = split_dataframe(memor_september_df)
train_memor_october_df, val_memor_october_df = split_dataframe(memor_october_df)

  series = series.astype(t, copy=False)
  series = series.astype(t, copy=False)
  series = series.astype(t, copy=False)
  series = series.astype(t, copy=False)
  series = series.astype(t, copy=False)
  series = series.astype(t, copy=False)
  series = series.astype(t, copy=False)
  series = series.astype(t, copy=False)


## 월별 음성의 개수가 5개 이하인 유저의 퍼센트 파악 -> 삭제

In [13]:
# DataFrame 리스트
dfs = [
    train_point_july_df, val_point_july_df, train_memor_july_df, val_memor_july_df,
    train_point_august_df, val_point_august_df, train_memor_august_df, val_memor_august_df,
    train_point_september_df, val_point_september_df, train_memor_september_df, val_memor_september_df,
    train_point_october_df, val_point_october_df, train_memor_october_df, val_memor_october_df
]

# 각 Pandas DataFrame에 대해 반복
for df in dfs:
    # 유저별 음성 파일 개수 계산
    count_df = df.groupby("user_id").size().reset_index(name='audio_count')

    # 5개 이하인 행 필터링
    less_than_five = count_df[count_df['audio_count'] <= 5]

    # 필터링된 행의 개수
    count_less_than_five = less_than_five.shape[0]

    # 전체 행 개수
    total_count = df.shape[0]

    # 비율 계산
    percentage = (count_less_than_five / total_count) * 100

    # 결과 출력
    print(f"5개 이하인 행의 개수: {count_less_than_five}, 전체 대비 비율: {percentage:.2f}%")

5개 이하인 행의 개수: 555, 전체 대비 비율: 2.66%
5개 이하인 행의 개수: 573, 전체 대비 비율: 8.23%
5개 이하인 행의 개수: 72, 전체 대비 비율: 0.26%
5개 이하인 행의 개수: 68, 전체 대비 비율: 0.74%
5개 이하인 행의 개수: 755, 전체 대비 비율: 2.86%
5개 이하인 행의 개수: 792, 전체 대비 비율: 9.00%
5개 이하인 행의 개수: 209, 전체 대비 비율: 0.18%
5개 이하인 행의 개수: 175, 전체 대비 비율: 0.46%
5개 이하인 행의 개수: 668, 전체 대비 비율: 3.43%
5개 이하인 행의 개수: 727, 전체 대비 비율: 11.19%
5개 이하인 행의 개수: 123, 전체 대비 비율: 0.04%
5개 이하인 행의 개수: 131, 전체 대비 비율: 0.13%
5개 이하인 행의 개수: 621, 전체 대비 비율: 3.36%
5개 이하인 행의 개수: 758, 전체 대비 비율: 12.30%
5개 이하인 행의 개수: 126, 전체 대비 비율: 0.05%
5개 이하인 행의 개수: 159, 전체 대비 비율: 0.19%


In [14]:
# 각 DataFrame에 대해 반복하며 5개 이하인 음성 파일을 가진 유저들의 행을 삭제
for df in dfs:
    # 유저별 음성 파일 개수 계산
    count_df = df.groupby("user_id").size().reset_index(name='audio_count')

    # 5개 이하인 유저 ID 필터링
    users_less_than_five = count_df[count_df['audio_count'] <= 5]['user_id']

    # 해당 유저들의 행을 삭제
    df.drop(df[df['user_id'].isin(users_less_than_five)].index, inplace=True)

## 로컬에 저장

In [15]:
def create_directory(path):
    if not os.path.exists(path):
        os.makedirs(path)

def save_df_as_json_chunks(df, base_path, chunk_size):
    create_directory(os.path.dirname(base_path))  
    
    # 청크 개수 계산
    num_chunks = math.ceil(len(df) / chunk_size)
    
    # 각 청크를 별도의 파일로 저장
    for i in range(num_chunks):
        chunk_df = df[i * chunk_size:(i + 1) * chunk_size]
        chunk_path = f"{base_path}_part{i}.json"
        chunk_df.to_json(chunk_path, orient='records', lines=True)

In [18]:
# 각 DataFrame을 여러 JSON 파일로 분할하여 저장
chunk_size = 500000  # 청크 크기를 50만으로 설정

save_df_as_json_chunks(train_point_july_df, 'D:/DATA_PREPROCESS/iOS_DATAS/ios_VOICE_DATA_FROM_7_TO_10/labelling_data_from_7_to_10/july_training_minutescript_labeling/df_july_train_point', chunk_size)
save_df_as_json_chunks(train_memor_july_df, 'D:/DATA_PREPROCESS/iOS_DATAS/ios_VOICE_DATA_FROM_7_TO_10/labelling_data_from_7_to_10/july_training_wordscript_labeling/df_july_train_memo', chunk_size)
save_df_as_json_chunks(val_point_july_df, 'D:/DATA_PREPROCESS/iOS_DATAS/ios_VOICE_DATA_FROM_7_TO_10/labelling_data_from_7_to_10/july_val_minutescript_labeling/df_july_val_point', chunk_size)
save_df_as_json_chunks(val_memor_july_df, 'D:/DATA_PREPROCESS/iOS_DATAS/ios_VOICE_DATA_FROM_7_TO_10/labelling_data_from_7_to_10/july_val_wordscript_labeling/df_july_val_memo', chunk_size)

save_df_as_json_chunks(train_point_august_df, 'D:/DATA_PREPROCESS/iOS_DATAS/ios_VOICE_DATA_FROM_7_TO_10/labelling_data_from_7_to_10/august_training_minutescript_labeling/df_august_train_point', chunk_size)
save_df_as_json_chunks(train_memor_august_df, 'D:/DATA_PREPROCESS/iOS_DATAS/ios_VOICE_DATA_FROM_7_TO_10/labelling_data_from_7_to_10/august_training_wordscript_labeling/df_august_train_memo', chunk_size)
save_df_as_json_chunks(val_point_august_df, 'D:/DATA_PREPROCESS/iOS_DATAS/ios_VOICE_DATA_FROM_7_TO_10/labelling_data_from_7_to_10/august_val_minutescript_labeling/df_august_val_point', chunk_size)
save_df_as_json_chunks(val_memor_august_df, 'D:/DATA_PREPROCESS/iOS_DATAS/ios_VOICE_DATA_FROM_7_TO_10/labelling_data_from_7_to_10/august_val_wordscript_labeling/df_august_val_memo', chunk_size)

save_df_as_json_chunks(train_point_september_df, 'D:/DATA_PREPROCESS/iOS_DATAS/ios_VOICE_DATA_FROM_7_TO_10/labelling_data_from_7_to_10/september_training_minutescript_labeling/df_september_train_point', chunk_size)
save_df_as_json_chunks(train_memor_september_df, 'D:/DATA_PREPROCESS/iOS_DATAS/ios_VOICE_DATA_FROM_7_TO_10/labelling_data_from_7_to_10/september_training_wordscript_labeling/df_september_train_memo', chunk_size)
save_df_as_json_chunks(val_point_september_df, 'D:/DATA_PREPROCESS/iOS_DATAS/ios_VOICE_DATA_FROM_7_TO_10/labelling_data_from_7_to_10/september_val_minutescript_labeling/df_september_val_point', chunk_size)
save_df_as_json_chunks(val_memor_september_df, 'D:/DATA_PREPROCESS/iOS_DATAS/ios_VOICE_DATA_FROM_7_TO_10/labelling_data_from_7_to_10/september_val_wordscript_labeling/df_september_val_memo', chunk_size)

save_df_as_json_chunks(train_point_october_df, 'D:/DATA_PREPROCESS/iOS_DATAS/ios_VOICE_DATA_FROM_7_TO_10/labelling_data_from_7_to_10/october_training_minutescript_labeling/df_october_train_point', chunk_size)
save_df_as_json_chunks(train_memor_october_df, 'D:/DATA_PREPROCESS/iOS_DATAS/ios_VOICE_DATA_FROM_7_TO_10/labelling_data_from_7_to_10/october_training_wordscript_labeling/df_october_train_memo', chunk_size)
save_df_as_json_chunks(val_point_october_df, 'D:/DATA_PREPROCESS/iOS_DATAS/ios_VOICE_DATA_FROM_7_TO_10/labelling_data_from_7_to_10/october_val_minutescript_labeling/df_october_val_point', chunk_size)
save_df_as_json_chunks(val_memor_october_df, 'D:/DATA_PREPROCESS/iOS_DATAS/ios_VOICE_DATA_FROM_7_TO_10/labelling_data_from_7_to_10/october_val_wordscript_labeling/df_october_val_memo', chunk_size)


### 유저 별 로컬에 저장

In [18]:
# 경로 설정을 위한 매핑
path_mapping = {
    # 7월 데이터
    'train_point_july_df': r'D:\DATA_PREPROCESS\INTEGRATED_DATASETS\MINUTE_SCRIPT\JULY\Training\AAC\라벨링데이터',
    'val_point_july_df': r'D:\DATA_PREPROCESS\INTEGRATED_DATASETS\MINUTE_SCRIPT\JULY\Validation\AAC\라벨링데이터',
    'train_memor_july_df': r'D:\DATA_PREPROCESS\INTEGRATED_DATASETS\WORD_SCRIPT\JULY\Training\AAC\라벨링데이터',
    'val_memor_july_df': r'D:\DATA_PREPROCESS\INTEGRATED_DATASETS\WORD_SCRIPT\JULY\Validation\AAC\라벨링데이터',
    # 8월 데이터
    'train_point_august_df': r'D:\DATA_PREPROCESS\INTEGRATED_DATASETS\MINUTE_SCRIPT\AUGUST\Training\AAC\라벨링데이터',
    'val_point_august_df': r'D:\DATA_PREPROCESS\INTEGRATED_DATASETS\MINUTE_SCRIPT\AUGUST\Validation\AAC\라벨링데이터',
    'train_memor_august_df': r'D:\DATA_PREPROCESS\INTEGRATED_DATASETS\WORD_SCRIPT\AUGUST\Training\AAC\라벨링데이터',
    'val_memor_august_df': r'D:\DATA_PREPROCESS\INTEGRATED_DATASETS\WORD_SCRIPT\AUGUST\Validation\AAC\라벨링데이터',
    # 9월 데이터
    'train_point_september_df': r'D:\DATA_PREPROCESS\INTEGRATED_DATASETS\MINUTE_SCRIPT\SEPTEMBER\Training\AAC\라벨링데이터',
    'val_point_september_df': r'D:\DATA_PREPROCESS\INTEGRATED_DATASETS\MINUTE_SCRIPT\SEPTEMBER\Validation\AAC\라벨링데이터',
    'train_memor_september_df': r'D:\DATA_PREPROCESS\INTEGRATED_DATASETS\WORD_SCRIPT\SEPTEMBER\Training\AAC\라벨링데이터',
    'val_memor_september_df': r'D:\DATA_PREPROCESS\INTEGRATED_DATASETS\WORD_SCRIPT\SEPTEMBER\Validation\AAC\라벨링데이터',
    # 10월 데이터
    'train_point_october_df': r'D:\DATA_PREPROCESS\INTEGRATED_DATASETS\MINUTE_SCRIPT\OCTOBER\Training\AAC\라벨링데이터',
    'val_point_october_df': r'D:\DATA_PREPROCESS\INTEGRATED_DATASETS\MINUTE_SCRIPT\OCTOBER\Validation\AAC\라벨링데이터',
    'train_memor_october_df': r'D:\DATA_PREPROCESS\INTEGRATED_DATASETS\WORD_SCRIPT\OCTOBER\Training\AAC\라벨링데이터',
    'val_memor_october_df': r'D:\DATA_PREPROCESS\INTEGRATED_DATASETS\WORD_SCRIPT\OCTOBER\Validation\AAC\라벨링데이터'
}

# DataFrame 이름과 객체를 튜플로 매핑
df_tuples = [
    ('train_point_july_df', train_point_july_df), 
    ('val_point_july_df', val_point_july_df),
    ('train_memor_july_df', train_memor_july_df), 
    ('val_memor_july_df', val_memor_july_df),
    ('train_point_august_df', train_point_august_df),
    ('val_point_august_df', val_point_august_df),
    ('train_memor_august_df', train_memor_august_df),
    ('val_memor_august_df', val_memor_august_df),
    ('train_point_september_df', train_point_september_df),
    ('val_point_september_df', val_point_september_df),
    ('train_memor_september_df', train_memor_september_df),
    ('val_memor_september_df', val_memor_september_df),
    ('train_point_october_df', train_point_october_df),
    ('val_point_october_df', val_point_october_df),
    ('train_memor_october_df', train_memor_october_df),
    ('val_memor_october_df', val_memor_october_df)
]

# 각 DataFrame에 대해 반복하며 유저별로 JSON 파일로 저장
for df_name, df in df_tuples:
    # 해당 DataFrame의 경로 가져오기
    base_path = path_mapping[df_name]

    # 유저별로 그룹화
    grouped = df.groupby('user_id')

    # 각 유저에 대해 반복
    for user_id, group in grouped:
        # 저장할 파일 경로 설정
        file_path = os.path.join(base_path, f'{user_id}.json')

        # 데이터를 JSON 형태로 변환하여 파일에 저장
        group.to_json(file_path, orient='records', force_ascii=False)

# 음성파일 위치 옮기기 231215(유저폴더별)

In [17]:
train_memor_july_df['participation_id']

0        2023073120144239400
1        2023071808572965800
2        2023072113255427400
3        2023071907030767500
4        2023072907212905300
                ...         
27377    2023073112580651100
27378    2023073112532940700
27379    2023073112572585600
27380    2023073112542483300
27381    2023073112584255700
Name: participation_id, Length: 27382, dtype: object

In [46]:
# 병렬처리 로직을 사용한 셀!
########################################################################################################

# 기본 파일 경로
base_current_path = r'D:\DATA_PREPROCESS\iOS_DATAS\ios_VOICE_DATA_FROM_7_TO_10\voice_file_from_7_to_10'
base_new_path = r'D:\DATA_PREPROCESS\INTEGRATED_DATASETS'
fixed_new_path_part = r'AAC\원천데이터'

# DataFrame 이름에서 월과 유형을 추출하여 경로를 결정하는 함수
def get_paths(df_name):
    # 월과 유형 추출
    parts = df_name.split('_')
    month = parts[2].upper()

    type_part = 'wordscript' if 'memor' in df_name else 'minutescript'
    section_part = 'validation' if 'val' in df_name else 'training'
    after_type_part = 'WORD_SCRIPT' if 'memor' in df_name else 'MINUTE_SCRIPT'
    after_section_part = 'Validation' if 'val' in df_name else 'Training'

    # 현재 파일 경로
    current_file_path = os.path.join(base_current_path, month, fixed_new_path_part.lstrip('\\'), section_part.capitalize(), section_part + '_' + type_part)
    # 새 파일 경로
    new_file_path = os.path.join(base_new_path, after_type_part, month, after_section_part, fixed_new_path_part.lstrip('\\'))


    print(f"current_file_path = {current_file_path}, new_file_path = {new_file_path}")

    return current_file_path, new_file_path


# def move_files_for_user(df, user_id, current_path, new_base_path):
#     user_id = str(user_id)
#     df['user_id'] = df['user_id'].astype(str)
    
#     user_df = df[df['user_id'] == user_id]

#     new_path = os.path.join(new_base_path, user_id)
#     if not os.path.exists(new_path):
#         os.makedirs(new_path)

#     for voice_id in user_df['participation_id']:
#         current_file = os.path.join(current_path, voice_id + '.wav')

#         # 파일이 기본 경로에 없으면 대체 경로에서 찾음
#         if not os.path.exists(current_file):
#             current_file = os.path.join(alternate_path, voice_id + '.wav')
#         new_file = os.path.join(new_path, voice_id)
#         if os.path.exists(current_file):
#             os.rename(current_file, new_file)
def move_files_for_user(df, user_id, current_path, new_base_path, alternate_path):
    user_id = str(user_id)
    df['user_id'] = df['user_id'].astype(str)
    
    user_df = df[df['user_id'] == user_id]
    
    new_path = os.path.join(new_base_path, user_id)
    if not os.path.exists(new_path):
        os.makedirs(new_path)

    for voice_id in user_df['participation_id']:
        current_file = os.path.join(current_path, voice_id + '.wav')
        
        if not os.path.exists(current_file):
            current_file = os.path.join(alternate_path, voice_id + '.wav')
        
        new_file = os.path.join(new_path, voice_id)
        if os.path.exists(current_file):
            os.rename(current_file, new_file)



def move_files_parallel_by_user(df, df_name, paths_function):
    current_path, new_base_path = paths_function(df_name)

    # 대체 경로 설정
    if 'Training' in current_path:
        alternate_path = current_path.replace('Training', 'Validation')
    else:
        alternate_path = current_path.replace('Validation', 'Training')
    
    user_ids = df['user_id'].unique()
    
    with ThreadPoolExecutor(max_workers=10) as executor:
        for user_id in user_ids:
            executor.submit(move_files_for_user, df, user_id, current_path, new_base_path)
# def move_files_parallel_by_user(df, df_name, paths_function):
#     current_path, new_base_path = paths_function(df_name)

#     # 대체 경로 설정
#     if 'Training' in current_path:
#         alternate_path = current_path.replace('Training', 'Validation')
#     else:
#         alternate_path = current_path.replace('Validation', 'Training')

#     user_ids = df['user_id'].unique()

#     print(f"Total unique user_ids: {len(user_ids)}")

#     if len(user_ids) == 0:
#         print("No user_ids found in DataFrame.")
#         return

#     futures = []
#     with ThreadPoolExecutor(max_workers=10) as executor:
#         for user_id in user_ids:
#             future = executor.submit(move_files_for_user, df, user_id, current_path, new_base_path, alternate_path)
#             futures.append(future)

#     # 결과 확인
#     for future in as_completed(futures):
#         try:
#             future.result()
#         except Exception as e:
#             print(f"Error in thread execution: {e}")

In [49]:
# 순차적으로 파일 이동 작업을 수행하는 셀
#################################################################################################
# 기본 파일 경로
base_current_path = r'D:\DATA_PREPROCESS\iOS_DATAS\ios_VOICE_DATA_FROM_7_TO_10\voice_file_from_7_to_10'
base_new_path = r'D:\DATA_PREPROCESS\INTEGRATED_DATASETS'
fixed_new_path_part = r'AAC\원천데이터'

# DataFrame 이름에서 월과 유형을 추출하여 경로를 결정하는 함수
def get_paths(df_name):
    # 월과 유형 추출
    parts = df_name.split('_')
    month = parts[2].upper()

    type_part = 'wordscript' if 'memor' in df_name else 'minutescript'
    section_part = 'validation' if 'val' in df_name else 'training'
    after_type_part = 'WORD_SCRIPT' if 'memor' in df_name else 'MINUTE_SCRIPT'
    after_section_part = 'Validation' if 'val' in df_name else 'Training'

    # 현재 파일 경로
    current_file_path = os.path.join(base_current_path, month, fixed_new_path_part.lstrip('\\'), section_part.capitalize(), section_part + '_' + type_part)
    # 새 파일 경로
    new_file_path = os.path.join(base_new_path, after_type_part, month, after_section_part, fixed_new_path_part.lstrip('\\'))


    print(f"current_file_path = {current_file_path}, new_file_path = {new_file_path}")

    return current_file_path, new_file_path

def move_files_for_user(df, user_id, current_path, new_base_path, alternate_path):
    user_id = str(user_id)
    df['user_id'] = df['user_id'].astype(str)
    
    user_df = df[df['user_id'] == user_id]
    
    new_path = os.path.join(new_base_path, user_id)
    if not os.path.exists(new_path):
        os.makedirs(new_path)

    for voice_id in user_df['participation_id']:
        current_file = os.path.join(current_path, voice_id + '.wav')
        
        if not os.path.exists(current_file):
            current_file = os.path.join(alternate_path, voice_id + '.wav')
        
        new_file = os.path.join(new_path, voice_id)
        if os.path.exists(current_file):
            os.rename(current_file, new_file)

def move_files_parallel_by_user(df, df_name, paths_function):
    current_path, new_base_path = paths_function(df_name)

    # 대체 경로 설정
    if 'Training' in current_path:
        alternate_path = current_path.replace('Training', 'Validation')
    else:
        alternate_path = current_path.replace('Validation', 'Training')
    
    user_ids = df['user_id'].unique()

    # 각 user_id에 대해 순차적으로 move_files_for_user 함수 호출
    for user_id in user_ids:
        move_files_for_user(df, user_id, current_path, new_base_path, alternate_path)

In [28]:
move_files_parallel_by_user(train_point_july_df, 'train_point_july_df', get_paths)

current_file_path = D:\DATA_PREPROCESS\iOS_DATAS\ios_VOICE_DATA_FROM_7_TO_10\voice_file_from_7_to_10\JULY\AAC\원천데이터\Training\training_minutescript, new_file_path = D:\DATA_PREPROCESS\INTEGRATED_DATASETS\MINUTE_SCRIPT\JULY\Training\AAC\원천데이터


In [50]:
move_files_parallel_by_user(train_memor_july_df, 'train_memor_july_df', get_paths)

current_file_path = D:\DATA_PREPROCESS\iOS_DATAS\ios_VOICE_DATA_FROM_7_TO_10\voice_file_from_7_to_10\JULY\AAC\원천데이터\Training\training_wordscript, new_file_path = D:\DATA_PREPROCESS\INTEGRATED_DATASETS\WORD_SCRIPT\JULY\Training\AAC\원천데이터


In [51]:
move_files_parallel_by_user(train_memor_july_df, 'train_memor_july_df', get_paths)
move_files_parallel_by_user(val_point_july_df, 'val_point_july_df', get_paths)
move_files_parallel_by_user(val_memor_july_df, 'val_memor_july_df', get_paths)

move_files_parallel_by_user(train_point_august_df, 'train_point_august_df', get_paths)
move_files_parallel_by_user(train_memor_august_df, 'train_memor_august_df', get_paths)
move_files_parallel_by_user(val_point_august_df, 'val_point_august_df', get_paths)
move_files_parallel_by_user(val_memor_august_df, 'val_memor_august_df', get_paths)

move_files_parallel_by_user(train_point_september_df, 'train_point_september_df', get_paths)
move_files_parallel_by_user(train_memor_september_df, 'train_memor_september_df', get_paths)
move_files_parallel_by_user(val_point_september_df, 'val_point_september_df', get_paths)
move_files_parallel_by_user(val_memor_september_df, 'val_memor_september_df', get_paths)

move_files_parallel_by_user(train_point_october_df, 'train_point_october_df', get_paths)
move_files_parallel_by_user(train_memor_october_df, 'train_memor_october_df', get_paths)
move_files_parallel_by_user(val_point_october_df, 'val_point_october_df', get_paths)
move_files_parallel_by_user(val_memor_october_df, 'val_memor_october_df', get_paths)

current_file_path = D:\DATA_PREPROCESS\iOS_DATAS\ios_VOICE_DATA_FROM_7_TO_10\voice_file_from_7_to_10\JULY\AAC\원천데이터\Training\training_wordscript, new_file_path = D:\DATA_PREPROCESS\INTEGRATED_DATASETS\WORD_SCRIPT\JULY\Training\AAC\원천데이터
current_file_path = D:\DATA_PREPROCESS\iOS_DATAS\ios_VOICE_DATA_FROM_7_TO_10\voice_file_from_7_to_10\JULY\AAC\원천데이터\Validation\validation_minutescript, new_file_path = D:\DATA_PREPROCESS\INTEGRATED_DATASETS\MINUTE_SCRIPT\JULY\Validation\AAC\원천데이터
current_file_path = D:\DATA_PREPROCESS\iOS_DATAS\ios_VOICE_DATA_FROM_7_TO_10\voice_file_from_7_to_10\JULY\AAC\원천데이터\Validation\validation_wordscript, new_file_path = D:\DATA_PREPROCESS\INTEGRATED_DATASETS\WORD_SCRIPT\JULY\Validation\AAC\원천데이터
current_file_path = D:\DATA_PREPROCESS\iOS_DATAS\ios_VOICE_DATA_FROM_7_TO_10\voice_file_from_7_to_10\AUGUST\AAC\원천데이터\Training\training_minutescript, new_file_path = D:\DATA_PREPROCESS\INTEGRATED_DATASETS\MINUTE_SCRIPT\AUGUST\Training\AAC\원천데이터
current_file_path = D:\DATA_

In [59]:
val_memor_october_df.count()

accuracy            81506
ads_id              81506
audio_id            81506
birth               81506
category            81506
description         81506
difficulty          81506
end_at              81506
gender              81506
is_passed           81506
minimum_accuracy    81506
participated_at     81506
participation_id    81506
recorded_text       81506
script              81506
signed_up_at        81506
start_at            81506
title               81506
type                81506
user_id             81506
withdrawn_at        81506
dtype: int64

# 데이터 뽑아와서 분류

In [27]:
# Pandas DataFrame을 Spark DataFrame으로 변환하는 함수
def convert_to_spark_df(pandas_df):
    return spark.createDataFrame(pandas_df)

# Point DataFrame들 변환
train_point_july_df = convert_to_spark_df(train_point_july_df)
val_point_july_df = convert_to_spark_df(val_point_july_df)

train_point_august_df = convert_to_spark_df(train_point_august_df)
val_point_august_df = convert_to_spark_df(val_point_august_df)

train_point_september_df = convert_to_spark_df(train_point_september_df)
val_point_september_df = convert_to_spark_df(val_point_september_df)

train_point_october_df = convert_to_spark_df(train_point_october_df)
val_point_october_df = convert_to_spark_df(val_point_october_df)

# Memor DataFrame들 변환
train_memor_july_df = convert_to_spark_df(train_memor_july_df)
val_memor_july_df = convert_to_spark_df(val_memor_july_df)

train_memor_august_df = convert_to_spark_df(train_memor_august_df)
val_memor_august_df = convert_to_spark_df(val_memor_august_df)

train_memor_september_df = convert_to_spark_df(train_memor_september_df)
val_memor_september_df = convert_to_spark_df(val_memor_september_df)

train_memor_october_df = convert_to_spark_df(train_memor_october_df)
val_memor_october_df = convert_to_spark_df(val_memor_october_df)

In [33]:
def copy_file(source, target):
    try:
        shutil.copy(source, target)
        return 1  # 성공한 경우
    except Exception as e:
        return 0  # 실패한 경우

def find_and_copy_files(source_path, target_path, df):
    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = []

        for row in df.collect():
            participation_id = row['participation_id']
            source_file_path = os.path.join(source_path, f"{participation_id}.wav")  

            if not os.path.exists(target_path):
                os.makedirs(target_path)

            # 병렬 처리를 위한 작업 추가
            if os.path.isfile(source_file_path):
                target_file_path = os.path.join(target_path, f"{participation_id}.wav")  # 타겟 파일 경로에도 파일명 추가
                futures.append(executor.submit(copy_file, source_file_path, target_file_path))

        for future in futures:
            future.result()


In [34]:
point_source_path = "D:\\DATA_PREPROCESS\\iOS_DATAS\\ios_VOICE_DATA_FROM_7_TO_10\\voice_file_from_7_to_10\\point_from_7_to_10"
memor_source_path = "D:\\DATA_PREPROCESS\\iOS_DATAS\\ios_VOICE_DATA_FROM_7_TO_10\\voice_file_from_7_to_10\\memor_from_7_to_10"

train_point_july_target_path = "D:\\DATA_PREPROCESS\\iOS_DATAS\\ios_VOICE_DATA_FROM_7_TO_10\\voice_file_from_7_to_10\\JULY\\training_minutescript"
train_memor_july_target_path = "D:\\DATA_PREPROCESS\\iOS_DATAS\\ios_VOICE_DATA_FROM_7_TO_10\\voice_file_from_7_to_10\\JULY\\training_wordscript"
val_point_july_target_path = "D:\\DATA_PREPROCESS\\iOS_DATAS\\ios_VOICE_DATA_FROM_7_TO_10\\voice_file_from_7_to_10\\JULY\\validation_minutescript"
val_memor_july_target_path = "D:\\DATA_PREPROCESS\\iOS_DATAS\\ios_VOICE_DATA_FROM_7_TO_10\\voice_file_from_7_to_10\\JULY\\validation_wordscript"

find_and_copy_files(point_source_path, train_point_july_target_path, train_point_july_df)
find_and_copy_files(memor_source_path, train_memor_july_target_path, train_memor_july_df)
find_and_copy_files(point_source_path, val_point_july_target_path, val_point_july_df)
find_and_copy_files(memor_source_path, val_memor_july_target_path, val_memor_july_df)

In [35]:
train_point_august_target_path = "D:\\DATA_PREPROCESS\\iOS_DATAS\\ios_VOICE_DATA_FROM_7_TO_10\\voice_file_from_7_to_10\\AUGUST\\training_minutescript"
train_memor_august_target_path = "D:\\DATA_PREPROCESS\\iOS_DATAS\\ios_VOICE_DATA_FROM_7_TO_10\\voice_file_from_7_to_10\\AUGUST\\training_wordscript"
val_point_august_target_path = "D:\\DATA_PREPROCESS\\iOS_DATAS\\ios_VOICE_DATA_FROM_7_TO_10\\voice_file_from_7_to_10\\AUGUST\\validation_minutescript"
val_memor_august_target_path = "D:\\DATA_PREPROCESS\\iOS_DATAS\\ios_VOICE_DATA_FROM_7_TO_10\\voice_file_from_7_to_10\\AUGUST\\validation_wordscript"

find_and_copy_files(point_source_path, train_point_august_target_path, train_point_august_df)
find_and_copy_files(memor_source_path, train_memor_august_target_path, train_memor_august_df)
find_and_copy_files(point_source_path, val_point_august_target_path, val_point_august_df)
find_and_copy_files(memor_source_path, val_memor_august_target_path, val_memor_august_df)

In [36]:
train_point_september_target_path = "D:\\DATA_PREPROCESS\\iOS_DATAS\\ios_VOICE_DATA_FROM_7_TO_10\\voice_file_from_7_to_10\\SEPTEMBER\\training_minutescript"
train_memor_september_target_path = "D:\\DATA_PREPROCESS\\iOS_DATAS\\ios_VOICE_DATA_FROM_7_TO_10\\voice_file_from_7_to_10\\SEPTEMBER\\training_wordscript"
val_point_september_target_path = "D:\\DATA_PREPROCESS\\iOS_DATAS\\ios_VOICE_DATA_FROM_7_TO_10\\voice_file_from_7_to_10\\SEPTEMBER\\validation_minutescript"
val_memor_september_target_path = "D:\\DATA_PREPROCESS\\iOS_DATAS\\ios_VOICE_DATA_FROM_7_TO_10\\voice_file_from_7_to_10\\SEPTEMBER\\validation_wordscript"

find_and_copy_files(point_source_path, train_point_september_target_path, train_point_september_df)
find_and_copy_files(memor_source_path, train_memor_september_target_path, train_memor_september_df)
find_and_copy_files(point_source_path, val_point_september_target_path, val_point_september_df)
find_and_copy_files(memor_source_path, val_memor_september_target_path, val_memor_september_df)

In [37]:
train_point_october_target_path = "D:\\DATA_PREPROCESS\\iOS_DATAS\\ios_VOICE_DATA_FROM_7_TO_10\\voice_file_from_7_to_10\\OCTOBER\\training_minutescript"
train_memor_october_target_path = "D:\\DATA_PREPROCESS\\iOS_DATAS\\ios_VOICE_DATA_FROM_7_TO_10\\voice_file_from_7_to_10\\OCTOBER\\training_wordscript"
val_point_october_target_path = "D:\\DATA_PREPROCESS\\iOS_DATAS\\ios_VOICE_DATA_FROM_7_TO_10\\voice_file_from_7_to_10\\OCTOBER\\validation_minutescript"
val_memor_october_target_path = "D:\\DATA_PREPROCESS\\iOS_DATAS\\ios_VOICE_DATA_FROM_7_TO_10\\voice_file_from_7_to_10\\OCTOBER\\validation_wordscript"

find_and_copy_files(point_source_path, train_point_october_target_path, train_point_october_df)
find_and_copy_files(memor_source_path, train_memor_october_target_path, train_memor_october_df)
find_and_copy_files(point_source_path, val_point_october_target_path, val_point_october_df)
find_and_copy_files(memor_source_path, val_memor_october_target_path, val_memor_october_df)

# 뭉테기 라벨링데이터 저장

In [52]:
# 7월 데이터 합치기
july_df = pd.concat([train_point_july_df, train_memor_july_df, val_point_july_df, val_memor_july_df], axis=0)

# 8월 데이터 합치기
august_df = pd.concat([train_point_august_df, train_memor_august_df, val_point_august_df, val_memor_august_df], axis=0)

# 9월 데이터 합치기
september_df = pd.concat([train_point_september_df, train_memor_september_df, val_point_september_df, val_memor_september_df], axis=0)

# 10월 데이터 합치기
october_df = pd.concat([train_point_october_df, train_memor_october_df, val_point_october_df, val_memor_october_df], axis=0)

In [45]:
def save_df_to_json(df, base_path, month):
    # 파일 경로 설정
    folder_path = os.path.join(base_path, month.upper(), "AAC", "라벨링데이터")

    # 폴더가 존재하지 않으면 생성
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)

    # 5000행 단위로 분할
    num_splits = math.ceil(len(df) / 5000)
    
    for i in range(num_splits):
        # 분할된 데이터 프레임
        split_df = df.iloc[i*5000 : (i+1)*5000]

        # JSON 파일로 저장
        file_name = f"{month}_part_{i+1}.json"
        file_path = os.path.join(folder_path, file_name)
        split_df.to_json(file_path, orient='records', force_ascii=False, lines=True)


In [46]:
save_df_to_json(july_df, 'E:\\monthly_ETRI_VOWING_VOICE_DATASET_7_to_10', 'JULY')

In [47]:
save_df_to_json(august_df, 'E:\\monthly_ETRI_VOWING_VOICE_DATASET_7_to_10', 'AUGUST')
save_df_to_json(september_df, 'E:\\monthly_ETRI_VOWING_VOICE_DATASET_7_to_10', 'SEPTEMBER')
save_df_to_json(october_df, 'E:\\monthly_ETRI_VOWING_VOICE_DATASET_7_to_10', 'OCTOBER')

In [53]:
july_df.count()

accuracy            61534
ads_id              61534
audio_id            61534
birth               61534
category            61534
description         61534
difficulty          61534
end_at              61534
gender              61534
is_passed           61534
minimum_accuracy    61534
participated_at     61534
participation_id    61534
recorded_text       61534
script              61534
signed_up_at        61534
start_at            61534
title               61534
type                61534
user_id             61534
withdrawn_at        61534
dtype: int64

In [54]:
august_df.count()

accuracy            184178
ads_id              184178
audio_id            184178
birth               184178
category            184178
description         184178
difficulty          184178
end_at              184178
gender              184178
is_passed           184178
minimum_accuracy    184178
participated_at     184178
participation_id    184178
recorded_text       184178
script              184178
signed_up_at        184178
start_at            184178
title               184178
type                184178
user_id             184178
withdrawn_at        184178
dtype: int64

In [55]:
september_df.count()

accuracy            412220
ads_id              412220
audio_id            412220
birth               412220
category            412220
description         412220
difficulty          412220
end_at              412220
gender              412220
is_passed           412220
minimum_accuracy    412220
participated_at     412220
participation_id    412220
recorded_text       412220
script              412220
signed_up_at        412220
start_at            412220
title               412220
type                412220
user_id             412220
withdrawn_at        412220
dtype: int64

In [56]:
october_df.count()

accuracy            348141
ads_id              348141
audio_id            348141
birth               348141
category            348141
description         348141
difficulty          348141
end_at              348141
gender              348141
is_passed           348141
minimum_accuracy    348141
participated_at     348141
participation_id    348141
recorded_text       348141
script              348141
signed_up_at        348141
start_at            348141
title               348141
type                348141
user_id             348141
withdrawn_at        348141
dtype: int64