In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re # 정규식 사용을 위한 모듈
# from scipy.stats import gaussian_kde

import pyarrow.parquet as pq

from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import types as T
from pyspark.sql import Window
from pyspark.sql.functions import to_timestamp, collect_list, to_date, month, year, broadcast, element_at, regexp_extract, udf, col, isnull, count, when, substring, coalesce, from_json, size, avg, expr, concat_ws
from pyspark.sql.types import BooleanType, TimestampType, ArrayType, StringType, StructType, StructField, DoubleType, IntegerType, FloatType

from functools import reduce
from matplotlib.ticker import FuncFormatter
from collections import Counter
from datetime import datetime

import shutil

from concurrent.futures import ThreadPoolExecutor

from sklearn.model_selection import train_test_split

import math

import json

In [2]:
# 가상 환경의 Python 경로를 지정
python_path = "C:/Users/admin/anaconda3/envs/my_conda_01/python.exe"

# Spark 세션 생성
spark = SparkSession.builder \
    .appName("voice files classification operation") \
    .config("spark.driver.memory", "9g") \
    .config("spark.executor.memory", "9g") \
    .config("spark.driver.maxResultSize", "5g") \
    .config("spark.pyspark.python", python_path) \
    .config("spark.local.dir", "D:/spark_tmp") \
    .getOrCreate()

# SparkContext 가져오기
sc = spark.sparkContext

# Spark 중간 파일이 저장되는 경로 확인
current_spark_local_dir = spark.conf.get("spark.local.dir", "Not Set")
print("Spark local directory:", current_spark_local_dir)

Spark local directory: D:/spark_tmp


# 불러오기

In [3]:
labelling_point_file_path = "D:\\DATA_PREPROCESS\\iOS_DATAS\\ios_VOICE_DATA_FROM_7_TO_10\\labelling_data_from_7_to_10\\labelling_point.json\\*.json"
labelling_memor_file_path = "D:\\DATA_PREPROCESS\\iOS_DATAS\\ios_VOICE_DATA_FROM_7_TO_10\\labelling_data_from_7_to_10\\labelling_memor.json\\*.json"

point_df = spark.read.json(labelling_point_file_path)
memor_df = spark.read.json(labelling_memor_file_path)

In [9]:
test_file_path = "D:/DATA_PREPROCESS/iOS_DATAS/ios_VOICE_DATA_FROM_7_TO_10/labelling_data_from_7_to_10/JULY/july_training_minutescript_labeling/*.json"

test_df = spark.read.json(test_file_path)

In [11]:
test_df.show(truncate=False)

+--------+------+--------+----------+--------+-----------+----------+-----------------------------+------+---------+----------------+-----------------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

## 유저별로 조각난 json 모아서 불러오기

In [3]:
train_point_july_path = "D:\\DATA_PREPROCESS\\INTEGRATED_DATASETS\\MINUTE_SCRIPT\\JULY\\Training\\AAC\\라벨링데이터\\*.json"
train_memor_july_path = "D:\\DATA_PREPROCESS\\INTEGRATED_DATASETS\\MINUTE_SCRIPT\\JULY\\Training\\AAC\\라벨링데이터\\*.json"
val_point_july_path = "D:\\DATA_PREPROCESS\\INTEGRATED_DATASETS\\MINUTE_SCRIPT\\JULY\\Validation\\AAC\\라벨링데이터\\*.json"
val_memor_july_path = "D:\\DATA_PREPROCESS\\INTEGRATED_DATASETS\\MINUTE_SCRIPT\\JULY\\Validation\\AAC\\라벨링데이터\\*.json"

train_point_july_df = spark.read.json(train_point_july_path)
train_memor_july_df = spark.read.json(train_memor_july_path)
val_point_july_df = spark.read.json(val_point_july_path)
val_memor_july_df = spark.read.json(val_memor_july_path)

In [4]:
train_point_august_path = "D:\\DATA_PREPROCESS\\INTEGRATED_DATASETS\\MINUTE_SCRIPT\\AUGUST\\Training\\AAC\\라벨링데이터\\*.json"
train_memor_august_path = "D:\\DATA_PREPROCESS\\INTEGRATED_DATASETS\\MINUTE_SCRIPT\\AUGUST\\Training\\AAC\\라벨링데이터\\*.json"
val_point_august_path = "D:\\DATA_PREPROCESS\\INTEGRATED_DATASETS\\MINUTE_SCRIPT\\AUGUST\\Validation\\AAC\\라벨링데이터\\*.json"
val_memor_august_path = "D:\\DATA_PREPROCESS\\INTEGRATED_DATASETS\\MINUTE_SCRIPT\\AUGUST\\Validation\\AAC\\라벨링데이터\\*.json"

train_point_august_df = spark.read.json(train_point_august_path)
train_memor_august_df = spark.read.json(train_memor_august_path)
val_point_august_df = spark.read.json(val_point_august_path)
val_memor_august_df = spark.read.json(val_memor_august_path)

In [5]:
train_point_september_path = "D:\\DATA_PREPROCESS\\INTEGRATED_DATASETS\\MINUTE_SCRIPT\\SEPTEMBER\\Training\\AAC\\라벨링데이터\\*.json"
train_memor_september_path = "D:\\DATA_PREPROCESS\\INTEGRATED_DATASETS\\MINUTE_SCRIPT\\SEPTEMBER\\Training\\AAC\\라벨링데이터\\*.json"
val_point_september_path = "D:\\DATA_PREPROCESS\\INTEGRATED_DATASETS\\MINUTE_SCRIPT\\SEPTEMBER\\Validation\\AAC\\라벨링데이터\\*.json"
val_memor_september_path = "D:\\DATA_PREPROCESS\\INTEGRATED_DATASETS\\MINUTE_SCRIPT\\SEPTEMBER\\Validation\\AAC\\라벨링데이터\\*.json"

train_point_september_df = spark.read.json(train_point_september_path)
train_memor_september_df = spark.read.json(train_memor_september_path)
val_point_september_df = spark.read.json(val_point_september_path)
val_memor_september_df = spark.read.json(val_memor_september_path)

In [6]:
train_point_october_path = "D:\\DATA_PREPROCESS\\INTEGRATED_DATASETS\\MINUTE_SCRIPT\\OCTOBER\\Training\\AAC\\라벨링데이터\\*.json"
train_memor_october_path = "D:\\DATA_PREPROCESS\\INTEGRATED_DATASETS\\MINUTE_SCRIPT\\OCTOBER\\Training\\AAC\\라벨링데이터\\*.json"
val_point_october_path = "D:\\DATA_PREPROCESS\\INTEGRATED_DATASETS\\MINUTE_SCRIPT\\OCTOBER\\Validation\\AAC\\라벨링데이터\\*.json"
val_memor_october_path = "D:\\DATA_PREPROCESS\\INTEGRATED_DATASETS\\MINUTE_SCRIPT\\OCTOBER\\Validation\\AAC\\라벨링데이터\\*.json"

train_point_october_df = spark.read.json(train_point_october_path)
train_memor_october_df = spark.read.json(train_memor_october_path)
val_point_october_df = spark.read.json(val_point_october_path)
val_memor_october_df = spark.read.json(val_memor_october_path)

In [8]:
# pandas df들을 spark df들로 바꿈

train_point_july_df = train_point_july_df.toPandas()
train_memor_july_df = train_memor_july_df.toPandas()
val_point_july_df = val_point_july_df.toPandas()
val_memor_july_df = val_memor_july_df.toPandas()

train_point_august_df = train_point_august_df.toPandas()
train_memor_august_df = train_memor_august_df.toPandas()
val_point_august_df = val_point_august_df.toPandas()
val_memor_august_df = val_memor_august_df.toPandas()

train_point_september_df = train_point_september_df.toPandas()
train_memor_september_df = train_memor_september_df.toPandas()
val_point_september_df = val_point_september_df.toPandas()
val_memor_september_df = val_memor_september_df.toPandas()

train_point_october_df = train_point_october_df.toPandas()
train_memor_october_df = train_memor_october_df.toPandas()
val_point_october_df = val_point_october_df.toPandas()
val_memor_october_df = val_memor_october_df.toPandas()

# 불러온 데이터 확인

In [4]:
point_df.printSchema()
point_df.count()

root
 |-- accuracy: long (nullable = true)
 |-- ads_id: long (nullable = true)
 |-- audio_id: long (nullable = true)
 |-- birth: string (nullable = true)
 |-- category: long (nullable = true)
 |-- description: string (nullable = true)
 |-- difficulty: long (nullable = true)
 |-- end_at: string (nullable = true)
 |-- gender: long (nullable = true)
 |-- is_passed: long (nullable = true)
 |-- minimum_accuracy: long (nullable = true)
 |-- participated_at: string (nullable = true)
 |-- participation_id: string (nullable = true)
 |-- recorded_text: string (nullable = true)
 |-- script: string (nullable = true)
 |-- signed_up_at: string (nullable = true)
 |-- start_at: string (nullable = true)
 |-- title: string (nullable = true)
 |-- type: long (nullable = true)
 |-- user_id: long (nullable = true)
 |-- withdrawn_at: string (nullable = true)



113714

In [10]:
point_df.select("participated_at").show(truncate=False)

+-----------------------------+
|participated_at              |
+-----------------------------+
|2023-07-01T00:20:09.781+09:00|
|2023-07-01T00:37:11.074+09:00|
|2023-07-01T02:46:50.785+09:00|
|2023-07-01T04:19:43.956+09:00|
|2023-07-01T04:22:53.259+09:00|
|2023-07-01T05:44:08.982+09:00|
|2023-07-01T05:48:20.800+09:00|
|2023-07-01T10:42:18.906+09:00|
|2023-07-01T10:44:45.943+09:00|
|2023-07-01T16:27:26.334+09:00|
|2023-07-01T20:21:43.456+09:00|
|2023-07-01T22:59:17.667+09:00|
|2023-07-01T23:19:07.386+09:00|
|2023-07-01T23:22:06.089+09:00|
|2023-07-01T23:59:41.286+09:00|
|2023-07-02T01:17:27.976+09:00|
|2023-07-02T13:26:21.097+09:00|
|2023-07-02T13:31:01.787+09:00|
|2023-07-02T13:45:22.848+09:00|
|2023-07-02T15:18:30.472+09:00|
+-----------------------------+
only showing top 20 rows



In [30]:
point_df.show()

+--------+------+--------+----------+--------+-----------+----------+--------------------+------+---------+----------------+--------------------+-------------------+----------------------------------+----------------------------------+--------------------+--------------------+------------------------------+----+-------------------+--------------------+
|accuracy|ads_id|audio_id|     birth|category|description|difficulty|              end_at|gender|is_passed|minimum_accuracy|     participated_at|   participation_id|                     recorded_text|                            script|        signed_up_at|            start_at|                         title|type|            user_id|        withdrawn_at|
+--------+------+--------+----------+--------+-----------+----------+--------------------+------+---------+----------------+--------------------+-------------------+----------------------------------+----------------------------------+--------------------+--------------------+-------------

In [14]:
memor_df.printSchema()
memor_df.count()

root
 |-- accuracy: long (nullable = true)
 |-- ads_id: long (nullable = true)
 |-- audio_id: long (nullable = true)
 |-- birth: string (nullable = true)
 |-- category: long (nullable = true)
 |-- description: string (nullable = true)
 |-- difficulty: long (nullable = true)
 |-- end_at: string (nullable = true)
 |-- gender: long (nullable = true)
 |-- is_passed: long (nullable = true)
 |-- minimum_accuracy: long (nullable = true)
 |-- participated_at: string (nullable = true)
 |-- participation_id: string (nullable = true)
 |-- recorded_text: string (nullable = true)
 |-- script: string (nullable = true)
 |-- signed_up_at: string (nullable = true)
 |-- start_at: string (nullable = true)
 |-- title: string (nullable = true)
 |-- type: long (nullable = true)
 |-- user_id: long (nullable = true)
 |-- withdrawn_at: string (nullable = true)



907406

# 월별로 라벨링 데이터 분리
  
포인트벌기, 암기플러스 별 월별 데이터

In [4]:
# participated_at 컬럼을 timestamp 타입으로 변환
point_df = point_df.withColumn("participated_at", to_timestamp("participated_at"))
memor_df = memor_df.withColumn("participated_at", to_timestamp("participated_at"))

In [5]:
# 함수 정의: 주어진 연도와 월에 해당하는 데이터만 필터링하는 함수
def filter_df_by_month(df, year_val, month_val):
    return df.filter((year(col("participated_at")) == year_val) & (month(col("participated_at")) == month_val))

point_july_df = filter_df_by_month(point_df, 2023, 7)
point_august_df = filter_df_by_month(point_df, 2023, 8)
point_september_df = filter_df_by_month(point_df, 2023, 9)
point_october_df = filter_df_by_month(point_df, 2023, 10)

memor_july_df = filter_df_by_month(memor_df, 2023, 7)
memor_august_df = filter_df_by_month(memor_df, 2023, 8)
memor_september_df = filter_df_by_month(memor_df, 2023, 9)
memor_october_df = filter_df_by_month(memor_df, 2023, 10)

In [6]:
print(f"7월 총: {point_july_df.count() + memor_july_df.count()} 포인트: {point_july_df.count()} + 7월 암기플: {memor_july_df.count()} ")
print(f"8월 총: {point_august_df.count() + memor_august_df.count()} 포인트: {point_august_df.count()} + 7월 암기플: {memor_august_df.count()} ")
print(f"9월 총: {point_september_df.count() + memor_september_df.count()} 포인트: {point_september_df.count()} + 7월 암기플: {memor_september_df.count()} ")
print(f"10월 총: {point_october_df.count() + memor_october_df.count()} 포인트: {point_october_df.count()} + 7월 암기플: {memor_october_df.count()} ")

7월 총: 64551 포인트: 27853 + 7월 암기플: 36698 
8월 총: 188471 포인트: 35209 + 7월 암기플: 153262 
9월 총: 415980 포인트: 25992 + 7월 암기플: 389988 
10월 총: 352118 포인트: 24660 + 7월 암기플: 327458 


# Training & Validation sets로 데이터셋 분리

포인트벌기, 암기플러스 별 > 월별 > Training, Validation 별

In [9]:
# PySpark DataFrame을 Pandas DataFrame으로 변환하는 함수
def split_dataframe(spark_df, test_size=0.25):
    # PySpark DataFrame을 Pandas DataFrame으로 변환
    pandas_df = spark_df.toPandas()
    
    # train_test_split 함수를 사용해 데이터 분할
    train_df, val_df = train_test_split(pandas_df, test_size=test_size, random_state=42)
    
    return train_df, val_df

In [10]:
train_point_july_df, val_point_july_df = split_dataframe(point_july_df)
train_point_august_df, val_point_august_df = split_dataframe(point_august_df)
train_point_september_df, val_point_september_df = split_dataframe(point_september_df)
train_point_october_df, val_point_october_df = split_dataframe(point_october_df)

train_memor_july_df, val_memor_july_df = split_dataframe(memor_july_df)
train_memor_august_df, val_memor_august_df = split_dataframe(memor_august_df)
train_memor_september_df, val_memor_september_df = split_dataframe(memor_september_df)
train_memor_october_df, val_memor_october_df = split_dataframe(memor_october_df)

  series = series.astype(t, copy=False)
  series = series.astype(t, copy=False)
  series = series.astype(t, copy=False)
  series = series.astype(t, copy=False)
  series = series.astype(t, copy=False)
  series = series.astype(t, copy=False)
  series = series.astype(t, copy=False)
  series = series.astype(t, copy=False)


## 월별 음성의 개수가 5개 이하인 유저의 퍼센트 파악 -> 삭제

In [13]:
# DataFrame 리스트
dfs = [
    train_point_july_df, val_point_july_df, train_memor_july_df, val_memor_july_df,
    train_point_august_df, val_point_august_df, train_memor_august_df, val_memor_august_df,
    train_point_september_df, val_point_september_df, train_memor_september_df, val_memor_september_df,
    train_point_october_df, val_point_october_df, train_memor_october_df, val_memor_october_df
]

# 각 Pandas DataFrame에 대해 반복
for df in dfs:
    # 유저별 음성 파일 개수 계산
    count_df = df.groupby("user_id").size().reset_index(name='audio_count')

    # 5개 이하인 행 필터링
    less_than_five = count_df[count_df['audio_count'] <= 5]

    # 필터링된 행의 개수
    count_less_than_five = less_than_five.shape[0]

    # 전체 행 개수
    total_count = df.shape[0]

    # 비율 계산
    percentage = (count_less_than_five / total_count) * 100

    # 결과 출력
    print(f"5개 이하인 행의 개수: {count_less_than_five}, 전체 대비 비율: {percentage:.2f}%")

5개 이하인 행의 개수: 555, 전체 대비 비율: 2.66%
5개 이하인 행의 개수: 573, 전체 대비 비율: 8.23%
5개 이하인 행의 개수: 72, 전체 대비 비율: 0.26%
5개 이하인 행의 개수: 68, 전체 대비 비율: 0.74%
5개 이하인 행의 개수: 755, 전체 대비 비율: 2.86%
5개 이하인 행의 개수: 792, 전체 대비 비율: 9.00%
5개 이하인 행의 개수: 209, 전체 대비 비율: 0.18%
5개 이하인 행의 개수: 175, 전체 대비 비율: 0.46%
5개 이하인 행의 개수: 668, 전체 대비 비율: 3.43%
5개 이하인 행의 개수: 727, 전체 대비 비율: 11.19%
5개 이하인 행의 개수: 123, 전체 대비 비율: 0.04%
5개 이하인 행의 개수: 131, 전체 대비 비율: 0.13%
5개 이하인 행의 개수: 621, 전체 대비 비율: 3.36%
5개 이하인 행의 개수: 758, 전체 대비 비율: 12.30%
5개 이하인 행의 개수: 126, 전체 대비 비율: 0.05%
5개 이하인 행의 개수: 159, 전체 대비 비율: 0.19%


In [14]:
# 각 DataFrame에 대해 반복하며 5개 이하인 음성 파일을 가진 유저들의 행을 삭제
for df in dfs:
    # 유저별 음성 파일 개수 계산
    count_df = df.groupby("user_id").size().reset_index(name='audio_count')

    # 5개 이하인 유저 ID 필터링
    users_less_than_five = count_df[count_df['audio_count'] <= 5]['user_id']

    # 해당 유저들의 행을 삭제
    df.drop(df[df['user_id'].isin(users_less_than_five)].index, inplace=True)

## 로컬에 저장

In [15]:
def create_directory(path):
    if not os.path.exists(path):
        os.makedirs(path)

def save_df_as_json_chunks(df, base_path, chunk_size):
    create_directory(os.path.dirname(base_path))  
    
    # 청크 개수 계산
    num_chunks = math.ceil(len(df) / chunk_size)
    
    # 각 청크를 별도의 파일로 저장
    for i in range(num_chunks):
        chunk_df = df[i * chunk_size:(i + 1) * chunk_size]
        chunk_path = f"{base_path}_part{i}.json"
        chunk_df.to_json(chunk_path, orient='records', lines=True)

In [18]:
# 각 DataFrame을 여러 JSON 파일로 분할하여 저장
chunk_size = 500000  # 청크 크기를 50만으로 설정

save_df_as_json_chunks(train_point_july_df, 'D:/DATA_PREPROCESS/iOS_DATAS/ios_VOICE_DATA_FROM_7_TO_10/labelling_data_from_7_to_10/july_training_minutescript_labeling/df_july_train_point', chunk_size)
save_df_as_json_chunks(train_memor_july_df, 'D:/DATA_PREPROCESS/iOS_DATAS/ios_VOICE_DATA_FROM_7_TO_10/labelling_data_from_7_to_10/july_training_wordscript_labeling/df_july_train_memo', chunk_size)
save_df_as_json_chunks(val_point_july_df, 'D:/DATA_PREPROCESS/iOS_DATAS/ios_VOICE_DATA_FROM_7_TO_10/labelling_data_from_7_to_10/july_val_minutescript_labeling/df_july_val_point', chunk_size)
save_df_as_json_chunks(val_memor_july_df, 'D:/DATA_PREPROCESS/iOS_DATAS/ios_VOICE_DATA_FROM_7_TO_10/labelling_data_from_7_to_10/july_val_wordscript_labeling/df_july_val_memo', chunk_size)

save_df_as_json_chunks(train_point_august_df, 'D:/DATA_PREPROCESS/iOS_DATAS/ios_VOICE_DATA_FROM_7_TO_10/labelling_data_from_7_to_10/august_training_minutescript_labeling/df_august_train_point', chunk_size)
save_df_as_json_chunks(train_memor_august_df, 'D:/DATA_PREPROCESS/iOS_DATAS/ios_VOICE_DATA_FROM_7_TO_10/labelling_data_from_7_to_10/august_training_wordscript_labeling/df_august_train_memo', chunk_size)
save_df_as_json_chunks(val_point_august_df, 'D:/DATA_PREPROCESS/iOS_DATAS/ios_VOICE_DATA_FROM_7_TO_10/labelling_data_from_7_to_10/august_val_minutescript_labeling/df_august_val_point', chunk_size)
save_df_as_json_chunks(val_memor_august_df, 'D:/DATA_PREPROCESS/iOS_DATAS/ios_VOICE_DATA_FROM_7_TO_10/labelling_data_from_7_to_10/august_val_wordscript_labeling/df_august_val_memo', chunk_size)

save_df_as_json_chunks(train_point_september_df, 'D:/DATA_PREPROCESS/iOS_DATAS/ios_VOICE_DATA_FROM_7_TO_10/labelling_data_from_7_to_10/september_training_minutescript_labeling/df_september_train_point', chunk_size)
save_df_as_json_chunks(train_memor_september_df, 'D:/DATA_PREPROCESS/iOS_DATAS/ios_VOICE_DATA_FROM_7_TO_10/labelling_data_from_7_to_10/september_training_wordscript_labeling/df_september_train_memo', chunk_size)
save_df_as_json_chunks(val_point_september_df, 'D:/DATA_PREPROCESS/iOS_DATAS/ios_VOICE_DATA_FROM_7_TO_10/labelling_data_from_7_to_10/september_val_minutescript_labeling/df_september_val_point', chunk_size)
save_df_as_json_chunks(val_memor_september_df, 'D:/DATA_PREPROCESS/iOS_DATAS/ios_VOICE_DATA_FROM_7_TO_10/labelling_data_from_7_to_10/september_val_wordscript_labeling/df_september_val_memo', chunk_size)

save_df_as_json_chunks(train_point_october_df, 'D:/DATA_PREPROCESS/iOS_DATAS/ios_VOICE_DATA_FROM_7_TO_10/labelling_data_from_7_to_10/october_training_minutescript_labeling/df_october_train_point', chunk_size)
save_df_as_json_chunks(train_memor_october_df, 'D:/DATA_PREPROCESS/iOS_DATAS/ios_VOICE_DATA_FROM_7_TO_10/labelling_data_from_7_to_10/october_training_wordscript_labeling/df_october_train_memo', chunk_size)
save_df_as_json_chunks(val_point_october_df, 'D:/DATA_PREPROCESS/iOS_DATAS/ios_VOICE_DATA_FROM_7_TO_10/labelling_data_from_7_to_10/october_val_minutescript_labeling/df_october_val_point', chunk_size)
save_df_as_json_chunks(val_memor_october_df, 'D:/DATA_PREPROCESS/iOS_DATAS/ios_VOICE_DATA_FROM_7_TO_10/labelling_data_from_7_to_10/october_val_wordscript_labeling/df_october_val_memo', chunk_size)


### 유저 별 로컬에 저장

In [18]:
# 경로 설정을 위한 매핑
path_mapping = {
    # 7월 데이터
    'train_point_july_df': r'D:\DATA_PREPROCESS\INTEGRATED_DATASETS\MINUTE_SCRIPT\JULY\Training\AAC\라벨링데이터',
    'val_point_july_df': r'D:\DATA_PREPROCESS\INTEGRATED_DATASETS\MINUTE_SCRIPT\JULY\Validation\AAC\라벨링데이터',
    'train_memor_july_df': r'D:\DATA_PREPROCESS\INTEGRATED_DATASETS\WORD_SCRIPT\JULY\Training\AAC\라벨링데이터',
    'val_memor_july_df': r'D:\DATA_PREPROCESS\INTEGRATED_DATASETS\WORD_SCRIPT\JULY\Validation\AAC\라벨링데이터',
    # 8월 데이터
    'train_point_august_df': r'D:\DATA_PREPROCESS\INTEGRATED_DATASETS\MINUTE_SCRIPT\AUGUST\Training\AAC\라벨링데이터',
    'val_point_august_df': r'D:\DATA_PREPROCESS\INTEGRATED_DATASETS\MINUTE_SCRIPT\AUGUST\Validation\AAC\라벨링데이터',
    'train_memor_august_df': r'D:\DATA_PREPROCESS\INTEGRATED_DATASETS\WORD_SCRIPT\AUGUST\Training\AAC\라벨링데이터',
    'val_memor_august_df': r'D:\DATA_PREPROCESS\INTEGRATED_DATASETS\WORD_SCRIPT\AUGUST\Validation\AAC\라벨링데이터',
    # 9월 데이터
    'train_point_september_df': r'D:\DATA_PREPROCESS\INTEGRATED_DATASETS\MINUTE_SCRIPT\SEPTEMBER\Training\AAC\라벨링데이터',
    'val_point_september_df': r'D:\DATA_PREPROCESS\INTEGRATED_DATASETS\MINUTE_SCRIPT\SEPTEMBER\Validation\AAC\라벨링데이터',
    'train_memor_september_df': r'D:\DATA_PREPROCESS\INTEGRATED_DATASETS\WORD_SCRIPT\SEPTEMBER\Training\AAC\라벨링데이터',
    'val_memor_september_df': r'D:\DATA_PREPROCESS\INTEGRATED_DATASETS\WORD_SCRIPT\SEPTEMBER\Validation\AAC\라벨링데이터',
    # 10월 데이터
    'train_point_october_df': r'D:\DATA_PREPROCESS\INTEGRATED_DATASETS\MINUTE_SCRIPT\OCTOBER\Training\AAC\라벨링데이터',
    'val_point_october_df': r'D:\DATA_PREPROCESS\INTEGRATED_DATASETS\MINUTE_SCRIPT\OCTOBER\Validation\AAC\라벨링데이터',
    'train_memor_october_df': r'D:\DATA_PREPROCESS\INTEGRATED_DATASETS\WORD_SCRIPT\OCTOBER\Training\AAC\라벨링데이터',
    'val_memor_october_df': r'D:\DATA_PREPROCESS\INTEGRATED_DATASETS\WORD_SCRIPT\OCTOBER\Validation\AAC\라벨링데이터'
}

# DataFrame 이름과 객체를 튜플로 매핑
df_tuples = [
    ('train_point_july_df', train_point_july_df), 
    ('val_point_july_df', val_point_july_df),
    ('train_memor_july_df', train_memor_july_df), 
    ('val_memor_july_df', val_memor_july_df),
    ('train_point_august_df', train_point_august_df),
    ('val_point_august_df', val_point_august_df),
    ('train_memor_august_df', train_memor_august_df),
    ('val_memor_august_df', val_memor_august_df),
    ('train_point_september_df', train_point_september_df),
    ('val_point_september_df', val_point_september_df),
    ('train_memor_september_df', train_memor_september_df),
    ('val_memor_september_df', val_memor_september_df),
    ('train_point_october_df', train_point_october_df),
    ('val_point_october_df', val_point_october_df),
    ('train_memor_october_df', train_memor_october_df),
    ('val_memor_october_df', val_memor_october_df)
]

# 각 DataFrame에 대해 반복하며 유저별로 JSON 파일로 저장
for df_name, df in df_tuples:
    # 해당 DataFrame의 경로 가져오기
    base_path = path_mapping[df_name]

    # 유저별로 그룹화
    grouped = df.groupby('user_id')

    # 각 유저에 대해 반복
    for user_id, group in grouped:
        # 저장할 파일 경로 설정
        file_path = os.path.join(base_path, f'{user_id}.json')

        # 데이터를 JSON 형태로 변환하여 파일에 저장
        group.to_json(file_path, orient='records', force_ascii=False)

# 음성파일 위치 옮기기 231215(유저폴더별)

In [27]:
# 기본 파일 경로
base_current_path = r'D:\DATA_PREPROCESS\iOS_DATAS\ios_VOICE_DATA_FROM_7_TO_10\voice_file_from_7_to_10'
base_new_path = r'D:\DATA_PREPROCESS\INTEGRATED_DATASETS'
fixed_new_path_part = r'AAC\원천데이터'

# DataFrame 이름에서 월과 유형을 추출하여 경로를 결정하는 함수
def get_paths(df_name):
    # 월과 유형 추출
    parts = df_name.split('_')
    month = parts[2].upper()

    type_part = 'wordscript' if 'memor' in df_name else 'minutescript'
    section_part = 'validation' if 'val' in df_name else 'training'
    after_type_part = 'WORD_SCRIPT' if 'memor' in df_name else 'MINUTE_SCRIPT'
    after_section_part = 'Validation' if 'val' in df_name else 'Training'

    # 현재 파일 경로
    current_file_path = os.path.join(base_current_path, month, fixed_new_path_part.lstrip('\\'), section_part.capitalize(), section_part + '_' + type_part)
    # 새 파일 경로
    new_file_path = os.path.join(base_new_path, after_type_part, month, after_section_part, fixed_new_path_part.lstrip('\\'))


    print(f"current_file_path = {current_file_path}, new_file_path = {new_file_path}")

    return current_file_path, new_file_path


def move_files_for_user(df, user_id, current_path, new_base_path):
    user_df = df[df['user_id'] == user_id]
    print('hi')
    new_path = os.path.join(new_base_path, user_id)
    print('hello')
    if not os.path.exists(new_path):
        os.makedirs(new_path)

    for participation_id in user_df['participation_id']:
        current_file = os.path.join(current_path, participation_id)
        new_file = os.path.join(new_path, participation_id + '.wav')
        if os.path.exists(current_file):
            os.rename(current_file, new_file)

def move_files_parallel_by_user(df, df_name, paths_function):
    current_path, new_base_path = paths_function(df_name)
    user_ids = df['user_id'].unique()
    
    with ThreadPoolExecutor(max_workers=10) as executor:
        for user_id in user_ids:
            executor.submit(move_files_for_user, df, user_id, current_path, new_base_path)

In [28]:
move_files_parallel_by_user(train_point_july_df, 'train_point_july_df', get_paths)

current_file_path = D:\DATA_PREPROCESS\iOS_DATAS\ios_VOICE_DATA_FROM_7_TO_10\voice_file_from_7_to_10\JULY\AAC\원천데이터\Training\training_minutescript, new_file_path = D:\DATA_PREPROCESS\INTEGRATED_DATASETS\MINUTE_SCRIPT\JULY\Training\AAC\원천데이터
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi


In [12]:
train_point_july_df['participation_id']

0        2023070113223818100
1        2023070201562334300
2        2023071222281229200
3        2023071501293111100
4        2023070210571120400
                ...         
19446    2023071509581287700
19447    2023070914143708000
19448    2023071510005151700
19449    2023071020080467600
19450    2023071020042376500
Name: participation_id, Length: 19451, dtype: object

# 데이터 뽑아와서 분류

In [27]:
# Pandas DataFrame을 Spark DataFrame으로 변환하는 함수
def convert_to_spark_df(pandas_df):
    return spark.createDataFrame(pandas_df)

# Point DataFrame들 변환
train_point_july_df = convert_to_spark_df(train_point_july_df)
val_point_july_df = convert_to_spark_df(val_point_july_df)

train_point_august_df = convert_to_spark_df(train_point_august_df)
val_point_august_df = convert_to_spark_df(val_point_august_df)

train_point_september_df = convert_to_spark_df(train_point_september_df)
val_point_september_df = convert_to_spark_df(val_point_september_df)

train_point_october_df = convert_to_spark_df(train_point_october_df)
val_point_october_df = convert_to_spark_df(val_point_october_df)

# Memor DataFrame들 변환
train_memor_july_df = convert_to_spark_df(train_memor_july_df)
val_memor_july_df = convert_to_spark_df(val_memor_july_df)

train_memor_august_df = convert_to_spark_df(train_memor_august_df)
val_memor_august_df = convert_to_spark_df(val_memor_august_df)

train_memor_september_df = convert_to_spark_df(train_memor_september_df)
val_memor_september_df = convert_to_spark_df(val_memor_september_df)

train_memor_october_df = convert_to_spark_df(train_memor_october_df)
val_memor_october_df = convert_to_spark_df(val_memor_october_df)

In [33]:
def copy_file(source, target):
    try:
        shutil.copy(source, target)
        return 1  # 성공한 경우
    except Exception as e:
        return 0  # 실패한 경우

def find_and_copy_files(source_path, target_path, df):
    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = []

        for row in df.collect():
            participation_id = row['participation_id']
            source_file_path = os.path.join(source_path, f"{participation_id}.wav")  

            if not os.path.exists(target_path):
                os.makedirs(target_path)

            # 병렬 처리를 위한 작업 추가
            if os.path.isfile(source_file_path):
                target_file_path = os.path.join(target_path, f"{participation_id}.wav")  # 타겟 파일 경로에도 파일명 추가
                futures.append(executor.submit(copy_file, source_file_path, target_file_path))

        for future in futures:
            future.result()


In [34]:
point_source_path = "D:\\DATA_PREPROCESS\\iOS_DATAS\\ios_VOICE_DATA_FROM_7_TO_10\\voice_file_from_7_to_10\\point_from_7_to_10"
memor_source_path = "D:\\DATA_PREPROCESS\\iOS_DATAS\\ios_VOICE_DATA_FROM_7_TO_10\\voice_file_from_7_to_10\\memor_from_7_to_10"

train_point_july_target_path = "D:\\DATA_PREPROCESS\\iOS_DATAS\\ios_VOICE_DATA_FROM_7_TO_10\\voice_file_from_7_to_10\\JULY\\training_minutescript"
train_memor_july_target_path = "D:\\DATA_PREPROCESS\\iOS_DATAS\\ios_VOICE_DATA_FROM_7_TO_10\\voice_file_from_7_to_10\\JULY\\training_wordscript"
val_point_july_target_path = "D:\\DATA_PREPROCESS\\iOS_DATAS\\ios_VOICE_DATA_FROM_7_TO_10\\voice_file_from_7_to_10\\JULY\\validation_minutescript"
val_memor_july_target_path = "D:\\DATA_PREPROCESS\\iOS_DATAS\\ios_VOICE_DATA_FROM_7_TO_10\\voice_file_from_7_to_10\\JULY\\validation_wordscript"

find_and_copy_files(point_source_path, train_point_july_target_path, train_point_july_df)
find_and_copy_files(memor_source_path, train_memor_july_target_path, train_memor_july_df)
find_and_copy_files(point_source_path, val_point_july_target_path, val_point_july_df)
find_and_copy_files(memor_source_path, val_memor_july_target_path, val_memor_july_df)

In [35]:
train_point_august_target_path = "D:\\DATA_PREPROCESS\\iOS_DATAS\\ios_VOICE_DATA_FROM_7_TO_10\\voice_file_from_7_to_10\\AUGUST\\training_minutescript"
train_memor_august_target_path = "D:\\DATA_PREPROCESS\\iOS_DATAS\\ios_VOICE_DATA_FROM_7_TO_10\\voice_file_from_7_to_10\\AUGUST\\training_wordscript"
val_point_august_target_path = "D:\\DATA_PREPROCESS\\iOS_DATAS\\ios_VOICE_DATA_FROM_7_TO_10\\voice_file_from_7_to_10\\AUGUST\\validation_minutescript"
val_memor_august_target_path = "D:\\DATA_PREPROCESS\\iOS_DATAS\\ios_VOICE_DATA_FROM_7_TO_10\\voice_file_from_7_to_10\\AUGUST\\validation_wordscript"

find_and_copy_files(point_source_path, train_point_august_target_path, train_point_august_df)
find_and_copy_files(memor_source_path, train_memor_august_target_path, train_memor_august_df)
find_and_copy_files(point_source_path, val_point_august_target_path, val_point_august_df)
find_and_copy_files(memor_source_path, val_memor_august_target_path, val_memor_august_df)

In [36]:
train_point_september_target_path = "D:\\DATA_PREPROCESS\\iOS_DATAS\\ios_VOICE_DATA_FROM_7_TO_10\\voice_file_from_7_to_10\\SEPTEMBER\\training_minutescript"
train_memor_september_target_path = "D:\\DATA_PREPROCESS\\iOS_DATAS\\ios_VOICE_DATA_FROM_7_TO_10\\voice_file_from_7_to_10\\SEPTEMBER\\training_wordscript"
val_point_september_target_path = "D:\\DATA_PREPROCESS\\iOS_DATAS\\ios_VOICE_DATA_FROM_7_TO_10\\voice_file_from_7_to_10\\SEPTEMBER\\validation_minutescript"
val_memor_september_target_path = "D:\\DATA_PREPROCESS\\iOS_DATAS\\ios_VOICE_DATA_FROM_7_TO_10\\voice_file_from_7_to_10\\SEPTEMBER\\validation_wordscript"

find_and_copy_files(point_source_path, train_point_september_target_path, train_point_september_df)
find_and_copy_files(memor_source_path, train_memor_september_target_path, train_memor_september_df)
find_and_copy_files(point_source_path, val_point_september_target_path, val_point_september_df)
find_and_copy_files(memor_source_path, val_memor_september_target_path, val_memor_september_df)

In [37]:
train_point_october_target_path = "D:\\DATA_PREPROCESS\\iOS_DATAS\\ios_VOICE_DATA_FROM_7_TO_10\\voice_file_from_7_to_10\\OCTOBER\\training_minutescript"
train_memor_october_target_path = "D:\\DATA_PREPROCESS\\iOS_DATAS\\ios_VOICE_DATA_FROM_7_TO_10\\voice_file_from_7_to_10\\OCTOBER\\training_wordscript"
val_point_october_target_path = "D:\\DATA_PREPROCESS\\iOS_DATAS\\ios_VOICE_DATA_FROM_7_TO_10\\voice_file_from_7_to_10\\OCTOBER\\validation_minutescript"
val_memor_october_target_path = "D:\\DATA_PREPROCESS\\iOS_DATAS\\ios_VOICE_DATA_FROM_7_TO_10\\voice_file_from_7_to_10\\OCTOBER\\validation_wordscript"

find_and_copy_files(point_source_path, train_point_october_target_path, train_point_october_df)
find_and_copy_files(memor_source_path, train_memor_october_target_path, train_memor_october_df)
find_and_copy_files(point_source_path, val_point_october_target_path, val_point_october_df)
find_and_copy_files(memor_source_path, val_memor_october_target_path, val_memor_october_df)