In [10]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re # 정규식 사용을 위한 모듈
# from scipy.stats import gaussian_kde

import pyarrow.parquet as pq

from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import types as T
from pyspark.sql import Window
from pyspark.sql.functions import lit, to_timestamp, to_date, broadcast, element_at, regexp_extract, udf, col, isnull, count, when, substring, coalesce, from_json, size, avg, expr, concat_ws
from pyspark.sql.types import ArrayType, StringType, StructType, StructField, DoubleType, IntegerType, FloatType

from functools import reduce
from matplotlib.ticker import FuncFormatter
from collections import Counter
from datetime import datetime

In [2]:
# Python interpreter of the conda environment that Spark should launch for its Python workers
python_path = "C:/Users/admin/anaconda3/envs/my_conda_01/python.exe"

# Session settings, applied to the builder in the order listed
spark_settings = {
    "spark.driver.memory": "9g",
    "spark.executor.memory": "9g",
    "spark.driver.maxResultSize": "5g",
    "spark.pyspark.python": python_path,
    "spark.local.dir": "D:/spark_tmp",
}

# Create the Spark session (getOrCreate returns an already-running session if there is one)
session_builder = SparkSession.builder.appName("voice_metadata extract s3 operation")
for setting_name, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)
spark = session_builder.getOrCreate()

# Handle to the SparkContext behind the session
sc = spark.sparkContext

# Report the directory configured for Spark's intermediate files
current_spark_local_dir = spark.conf.get("spark.local.dir", "Not Set")
print("Spark local directory:", current_spark_local_dir)

Spark local directory: D:/spark_tmp


In [3]:
# Glob pattern selecting every .parquet file in the ios-integrated-voice-db-231123 directory
# (previously used input, kept for reference)
# file_path = 'D:\\DATA_PREPROCESS\\FIRESTORE_DATAS\\voice_metadata_parquet_231030_특성엔지니어링_수정후\\*.parquet'
file_path = 'D:\\DATA_PREPROCESS\\iOS_DATAS\\ios-integrated-voice-db-231123\\*.parquet'

# Read all matching Parquet files into a single Spark DataFrame
df = spark.read.format("parquet").load(file_path)

In [7]:
# Print the column schema of the loaded data, then show the total row count as the cell output.
df.printSchema()
df.count()

root
 |-- description: string (nullable = true)
 |-- difficulty: long (nullable = true)
 |-- end_at: string (nullable = true)
 |-- minimum_accuracy: long (nullable = true)
 |-- script: string (nullable = true)
 |-- start_at: string (nullable = true)
 |-- title: string (nullable = true)
 |-- participation_id: long (nullable = true)
 |-- accuracy: long (nullable = true)
 |-- ads_id: long (nullable = true)
 |-- audio_id: long (nullable = true)
 |-- is_passed: long (nullable = true)
 |-- participated_at: string (nullable = true)
 |-- recorded_text: string (nullable = true)
 |-- user_id: long (nullable = true)
 |-- category: integer (nullable = true)
 |-- birth: string (nullable = true)
 |-- dormant_at: string (nullable = true)
 |-- gender: long (nullable = true)
 |-- signed_up_at: string (nullable = true)
 |-- type: long (nullable = true)
 |-- withdrawn_at: string (nullable = true)



1410375

In [8]:
# Parse the string columns below into timestamps, replacing each column under the same name.
# NOTE(review): withdrawn_at is not in this list and therefore stays a string — confirm this is intended.
for ts_column in ("participated_at", "dormant_at", "signed_up_at", "start_at", "end_at"):
    df = df.withColumn(ts_column, to_timestamp(ts_column))

# birth is parsed as a date rather than a timestamp
df = df.withColumn("birth", to_date("birth"))

# Show the schema after the conversions
df.printSchema()

root
 |-- description: string (nullable = true)
 |-- difficulty: long (nullable = true)
 |-- end_at: timestamp (nullable = true)
 |-- minimum_accuracy: long (nullable = true)
 |-- script: string (nullable = true)
 |-- start_at: timestamp (nullable = true)
 |-- title: string (nullable = true)
 |-- participation_id: long (nullable = true)
 |-- accuracy: long (nullable = true)
 |-- ads_id: long (nullable = true)
 |-- audio_id: long (nullable = true)
 |-- is_passed: long (nullable = true)
 |-- participated_at: timestamp (nullable = true)
 |-- recorded_text: string (nullable = true)
 |-- user_id: long (nullable = true)
 |-- category: integer (nullable = true)
 |-- birth: date (nullable = true)
 |-- dormant_at: timestamp (nullable = true)
 |-- gender: long (nullable = true)
 |-- signed_up_at: timestamp (nullable = true)
 |-- type: long (nullable = true)
 |-- withdrawn_at: string (nullable = true)



In [12]:
# Keep rows whose participated_at lies after 2023-07-01 00:00:00 and not later than 2023-11-01 00:00:00.
# NOTE(review): the lower bound is exclusive and the upper bound inclusive — confirm this is intended.
after_window_start = col("participated_at") > lit("2023-07-01 00:00:00")
until_window_end = col("participated_at") <= lit("2023-11-01 00:00:00")
filtered_df = df.filter(after_window_start & until_window_end)

# Filtered rows in ascending participated_at order
sorted_df = filtered_df.orderBy("participated_at")

# participation_id of the earliest row (head of the ascending order) and of the latest row
# (head of the same selection re-ordered by participated_at descending)
participation_ids = sorted_df.select("participation_id")
first_participation_id = participation_ids.first()
last_participation_id = participation_ids.orderBy(col("participated_at").desc()).first()

# Report the two ids, or a message when the filter matched nothing
if first_participation_id is None or last_participation_id is None:
    print("조건에 맞는 행이 없습니다.")
else:
    first_participation_id_value = first_participation_id["participation_id"]
    last_participation_id_value = last_participation_id["participation_id"]
    print("가장 처음 participation_id:", first_participation_id_value)
    print("가장 마지막 participation_id:", last_participation_id_value)

가장 처음 participation_id: 2023070100001057300
가장 마지막 participation_id: 2023103123595306100


In [19]:
# Split the date-filtered rows by category code.
# NOTE(review): category 0 is treated as the point-earning data and category 1 as the
# memorization data, judging by the variable names — confirm the code mapping.
filtered_point_df = filtered_df.where(col('category') == 0)
filtered_memor_df = filtered_df.where(col('category') == 1)

In [31]:
# Number of date-filtered rows with category == 0 (shown as the cell output).
filtered_point_df.count()

157198

In [26]:
# Number of date-filtered rows with category == 1 (shown as the cell output).
filtered_memor_df.count()

983434

In [27]:
# Total number of rows inside the participated_at window, across all categories.
filtered_df.count()

1140632

# S3로부터 데이터 가져오기

In [16]:
import boto3

# AWS credentials are read from the environment instead of being written into the notebook.
# When the variables are unset, None is passed and boto3 falls back to its default
# credential lookup (shared credentials file, configured profile, instance role, ...).
# SECURITY: access keys must never be stored in notebook source; any key pair that has ever
# been saved in a notebook or pushed to version control should be deactivated and rotated.
aws_access_key_id = os.environ.get('AWS_ACCESS_KEY_ID')
aws_secret_access_key = os.environ.get('AWS_SECRET_ACCESS_KEY')

# Initialise the Boto3 S3 client (the session token only matters for temporary credentials)
s3_client = boto3.client(
    's3',
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
    aws_session_token=os.environ.get('AWS_SESSION_TOKEN')
)
# vowing_bucket = s3_client.list_objects_v2(Bucket='vowing')

In [21]:
# # Paginator 초기화
# paginator = s3_client.get_paginator('list_objects_v2')

# # vowing 버킷 내에서 파일명의 날짜에 따라 파일들을 탐색하고 다운로드
# def download_files(bucket, prefix, min_id, max_id):
#     paginator = s3_client.get_paginator('list_objects_v2')

#     for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
#         for content in page.get('Contents', []):
#             key = content['Key']
#             if 'recorded/' in key and key.endswith('.wav'):
#                 file_id = key.split('/')[-1].split('.')[0]
#                 if min_id <= file_id <= max_id:
#                     local_file_path = os.path.join('D:\\DATA_PREPROCESS\\iOS_DATAS\\ios_voice_files_from_7_to_10', key.split('/')[-1])
#                     s3_client.download_file(bucket, key, local_file_path)
#                     # print(f'Downloaded {key} to {local_file_path}')

# # 각 광고 ID 폴더를 탐색
# ad_folders = paginator.paginate(Bucket='vowing', Prefix='ad/', Delimiter='/')

# for folder in ad_folders:
#     for prefix in folder.get('CommonPrefixes', []):
#         ad_id = prefix['Prefix']
#         download_files('vowing', ad_id + 'recorded/', '2023070100001057300', '2023103123595306100')

## 병렬처리 수행으로 S3로부터 다운로드

In [29]:
from concurrent.futures import ThreadPoolExecutor, as_completed

def download_file(bucket, key, local_path):
    """Download a single S3 object to a local file.

    Uses the module-level ``s3_client``.

    Args:
        bucket: Name of the S3 bucket.
        key: Key of the object inside the bucket.
        local_path: Path of the local file to write.

    Returns:
        None.
    """
    s3_client.download_file(Bucket=bucket, Key=key, Filename=local_path)

# Parallel download
def parallel_download_files(bucket, prefix, min_id, max_id,
                            dest_dir='D:\\DATA_PREPROCESS\\iOS_DATAS\\ios_voice_files_from_7_to_10',
                            max_workers=20):
    """Download every matching ``.wav`` object under ``prefix`` using a thread pool.

    An object is downloaded when its key contains ``'recorded/'``, ends with ``'.wav'`` and
    its file id (the file name up to the first ``'.'``) satisfies
    ``min_id <= file_id <= max_id``. The file id is a string, so with string bounds the
    comparison is lexicographic and both bounds are inclusive.

    Args:
        bucket: Name of the S3 bucket to list and download from.
        prefix: Key prefix to list.
        min_id: Lower bound for the file id (inclusive).
        max_id: Upper bound for the file id (inclusive).
        dest_dir: Local directory that receives the files; created if it does not exist.
            Each file is stored under the last path component of its key.
        max_workers: Number of download threads running at the same time.

    Returns:
        None.

    Raises:
        Exception: whatever a failed listing or download raises; download errors surface
            when the corresponding future's result is collected.
    """
    # Make sure the target directory exists before any worker tries to write into it.
    os.makedirs(dest_dir, exist_ok=True)

    # Build the paginator here so the function does not depend on a module-level
    # `paginator` variable having been created by some other cell first.
    pages = s3_client.get_paginator('list_objects_v2').paginate(Bucket=bucket, Prefix=prefix)

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = []

        for page in pages:
            # A page without matching objects has no 'Contents' entry.
            for content in page.get('Contents', []):
                key = content['Key']
                if 'recorded/' not in key or not key.endswith('.wav'):
                    continue
                file_name = key.split('/')[-1]
                file_id = file_name.split('.')[0]
                if min_id <= file_id <= max_id:
                    local_file_path = os.path.join(dest_dir, file_name)
                    futures.append(executor.submit(download_file, bucket, key, local_file_path))

        # Wait for every download to finish; result() re-raises a worker's exception.
        for future in as_completed(futures):
            future.result()

# Module-level paginator for the S3 list_objects_v2 operation; the folder-listing cells
# further down call paginator.paginate(...) on it.
paginator = s3_client.get_paginator('list_objects_v2')

In [None]:
# List the first-level folders under 'ad/' (one per point-earning ad id) and download the
# recordings of each folder whose file id falls inside the participation-id range.
ad_folders = paginator.paginate(Bucket='vowing', Prefix='ad/', Delimiter='/')

ad_prefixes = (
    entry['Prefix']
    for listing in ad_folders
    for entry in listing.get('CommonPrefixes', [])
)
for ad_id in ad_prefixes:
    parallel_download_files('vowing', ad_id + 'recorded/', '2023070100001057300', '2023103123595306100')

In [30]:
# List the first-level folders under 'm10n/' (one per 암기플러스 / memorization-plus ad id) and
# download the recordings of each folder whose file id falls inside the participation-id range.
ad_folders = paginator.paginate(Bucket='vowing', Prefix='m10n/', Delimiter='/')

m10n_prefixes = (
    entry['Prefix']
    for listing in ad_folders
    for entry in listing.get('CommonPrefixes', [])
)
for ad_id in m10n_prefixes:
    parallel_download_files('vowing', ad_id + 'recorded/', '2023070100001057300', '2023103123595306100')

# 라벨링 데이터 정제

## 가져온 파일과 매칭되는 라벨링 데이터들만 필터링

In [56]:
# Build a lookup of the file names found in a directory
def create_file_dict(directory):
    """Map every entry name in ``directory`` (extension removed) to 0.

    Args:
        directory: Path of the directory to list.

    Returns:
        dict: One key per directory entry, with the last extension stripped via
        ``os.path.splitext``; every value is 0.
    """
    return {os.path.splitext(entry)[0]: 0 for entry in os.listdir(directory)}

# Lookup of the file names (without extension) present in the point_from_7_to_10 directory
file_dict = create_file_dict("D:/DATA_PREPROCESS/iOS_DATAS/voice_file_from_7_to_10/point_from_7_to_10")

# Cast participation_id to string so it can be compared with the file names,
# and add an 'exist' flag column initialised to 0
filtered_point_df = (
    filtered_point_df
    .withColumn("participation_id", col("participation_id").cast("string"))
    .withColumn("exist", lit(0))
)

# Flag rows whose participation_id is a key of file_dict
def update_exist(participation_id, exist):
    """Return 1 when ``participation_id`` is a key of the module-level ``file_dict``.

    Args:
        participation_id: Value looked up in ``file_dict``.
        exist: Current flag value, returned unchanged when there is no match.

    Returns:
        1 on a match, otherwise ``exist``.
    """
    return 1 if participation_id in file_dict else exist

# Wrap the lookup as a Spark UDF that returns an integer
update_exist_udf = udf(update_exist, IntegerType())

# Recompute the 'exist' flag for every row
filtered_point_df = filtered_point_df.withColumn(
    "exist",
    update_exist_udf(col("participation_id"), col("exist")),
)

# Keep only the flagged rows and drop the helper column again
exist_point_df = filtered_point_df.filter(filtered_point_df['exist'] == 1).drop('exist')

In [70]:
# Print the schema of the matched point rows, then show how many rows matched a file.
exist_point_df.printSchema()
exist_point_df.count()

root
 |-- description: string (nullable = true)
 |-- difficulty: long (nullable = true)
 |-- end_at: timestamp (nullable = true)
 |-- minimum_accuracy: long (nullable = true)
 |-- script: string (nullable = true)
 |-- start_at: timestamp (nullable = true)
 |-- title: string (nullable = true)
 |-- participation_id: string (nullable = true)
 |-- accuracy: long (nullable = true)
 |-- ads_id: long (nullable = true)
 |-- audio_id: long (nullable = true)
 |-- is_passed: long (nullable = true)
 |-- participated_at: timestamp (nullable = true)
 |-- recorded_text: string (nullable = true)
 |-- user_id: long (nullable = true)
 |-- category: integer (nullable = true)
 |-- birth: date (nullable = true)
 |-- dormant_at: timestamp (nullable = true)
 |-- gender: long (nullable = true)
 |-- signed_up_at: timestamp (nullable = true)
 |-- type: long (nullable = true)
 |-- withdrawn_at: string (nullable = true)



113714

In [69]:
# Lookup of the file names (without extension) present in the memor_from_7_to_10 directory
memor_file_dict = create_file_dict("D:/DATA_PREPROCESS/iOS_DATAS/voice_file_from_7_to_10/memor_from_7_to_10")

# Cast participation_id to string so it can be compared with the file names,
# and add an 'exist' flag column initialised to 0
filtered_memor_df = (
    filtered_memor_df
    .withColumn("participation_id", col("participation_id").cast("string"))
    .withColumn("exist", lit(0))
)

# Flag rows whose participation_id is a key of memor_file_dict
def update_exist_for_memor(participation_id, exist):
    """Return 1 when ``participation_id`` is a key of the module-level ``memor_file_dict``.

    Args:
        participation_id: Value looked up in ``memor_file_dict``.
        exist: Current flag value, returned unchanged when there is no match.

    Returns:
        1 on a match, otherwise ``exist``.
    """
    return 1 if participation_id in memor_file_dict else exist
# Wrap the lookup as a Spark UDF that returns an integer
update_exist_for_memor_udf = udf(update_exist_for_memor, IntegerType())

# Recompute the 'exist' flag for every row
filtered_memor_df = filtered_memor_df.withColumn(
    "exist",
    update_exist_for_memor_udf(col("participation_id"), col("exist")),
)

# Keep only the flagged rows and drop the helper column again
exist_memor_df = filtered_memor_df.filter(filtered_memor_df['exist'] == 1).drop('exist')

# Print the resulting schema, then show the number of matched rows
exist_memor_df.printSchema()
exist_memor_df.count()

root
 |-- description: string (nullable = true)
 |-- difficulty: long (nullable = true)
 |-- end_at: timestamp (nullable = true)
 |-- minimum_accuracy: long (nullable = true)
 |-- script: string (nullable = true)
 |-- start_at: timestamp (nullable = true)
 |-- title: string (nullable = true)
 |-- participation_id: string (nullable = true)
 |-- accuracy: long (nullable = true)
 |-- ads_id: long (nullable = true)
 |-- audio_id: long (nullable = true)
 |-- is_passed: long (nullable = true)
 |-- participated_at: timestamp (nullable = true)
 |-- recorded_text: string (nullable = true)
 |-- user_id: long (nullable = true)
 |-- category: integer (nullable = true)
 |-- birth: date (nullable = true)
 |-- dormant_at: timestamp (nullable = true)
 |-- gender: long (nullable = true)
 |-- signed_up_at: timestamp (nullable = true)
 |-- type: long (nullable = true)
 |-- withdrawn_at: string (nullable = true)



907406

## 라벨링 데이터 저장

In [71]:
# Save both labelling DataFrames as JSON, point rows first and memor rows second.
# Note: Spark creates a directory of part files at each path and, with the default save
# mode, raises an error when the path already exists.
labelling_outputs = (
    (exist_point_df, "D:/DATA_PREPROCESS/iOS_DATAS/labelling_data_from_7_to_10/labelling_point.json"),
    (exist_memor_df, "D:/DATA_PREPROCESS/iOS_DATAS/labelling_data_from_7_to_10/labelling_memor.json"),
)
for labelled_df, output_path in labelling_outputs:
    labelled_df.write.json(output_path)