# 'sample_voice_metadata_230918'을 분석하는 코드
보윙의 음성데이터에 대해서 알리고 홍보하는 문서를 작성하기 위함

In [1]:
import pandas as pd
import numpy as np

import pyarrow.parquet as pq

from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import types as T
from pyspark.sql import Window
from pyspark.sql.functions import col, isnull, count, when, substring, coalesce, from_json, size, avg, expr, concat_ws
from pyspark.sql.types import ArrayType, StringType, StructType, StructField, DoubleType, IntegerType
from functools import reduce

In [2]:
# Python interpreter of the conda virtual environment that PySpark should use.
python_path = "C:/Users/admin/anaconda3/envs/my_conda_01/python.exe"

# Create the Spark session (an already-running session is reused by
# getOrCreate()).
# NOTE(review): memory settings presumably only take effect when no Spark JVM
# is running yet in this kernel -- confirm after a kernel restart.
spark = (
    SparkSession.builder
    .appName("voice_metadata integrating operation")
    # Memory limits for the driver, the executors and collected results.
    .config("spark.driver.memory", "5g")
    .config("spark.executor.memory", "5g")
    .config("spark.driver.maxResultSize", "3g")
    # Use the conda environment's interpreter for PySpark.
    .config("spark.pyspark.python", python_path)
    # Directory where Spark writes its intermediate (scratch) files.
    .config("spark.local.dir", "D:/spark_tmp")
    .getOrCreate()
)

# Handle to the underlying SparkContext.
sc = spark.sparkContext

In [3]:
# Check where Spark stores its intermediate files; "Not Set" is reported when
# the option has no value in the session configuration.
current_spark_local_dir = spark.conf.get(
    key="spark.local.dir",
    default="Not Set",
)
print("Spark local directory:", current_spark_local_dir)

Spark local directory: D:/spark_tmp


In [4]:
# Location of the sample voice-metadata Parquet file (raw string, so the
# Windows backslashes need no escaping).
file_path = r'D:\DATA_PREPROCESS\FIRESTORE_DATAS\sample_voice_metadata_230918\public_sample_voice_metadata_230920.parquet'

# Load the same file twice: once as a pandas DataFrame (pyarrow engine) and
# once as a Spark DataFrame.
df = pd.read_parquet(path=file_path, engine='pyarrow')
spark_df = spark.read.parquet(file_path)

In [5]:
# Read the schema information of the original Parquet file through pyarrow.
# NOTE(review): the schema is only stored in `schema`; this cell does not
# print or display it.

parquet_file = pq.ParquetFile(file_path)
schema = parquet_file.schema

In [13]:
# Print the column names, types and nullability of the Spark DataFrame.
spark_df.printSchema()

root
 |-- ad_name: string (nullable = true)
 |-- attend: string (nullable = true)
 |-- is_passed: string (nullable = true)
 |-- ad_script: string (nullable = true)
 |-- ad_title: string (nullable = true)
 |-- birth_year: string (nullable = true)
 |-- local_code: string (nullable = true)
 |-- is_test: long (nullable = true)
 |-- gender: string (nullable = true)
 |-- job: string (nullable = true)
 |-- language: string (nullable = true)
 |-- perfect: double (nullable = true)
 |-- excepted_age_array: string (nullable = true)
 |-- collection: string (nullable = true)
 |-- ad_duration: string (nullable = true)
 |-- level: string (nullable = true)
 |-- ad_link: string (nullable = true)
 |-- participant_count: long (nullable = true)
 |-- accuracy_array: string (nullable = true)
 |-- stt_text_array: string (nullable = true)
 |-- created_timestamp_array: string (nullable = true)
 |-- average_accuracy_by_ad: double (nullable = true)
 |-- average_accuracy_by_user: double (nullable = true)
 |-- pas

In [7]:
# Number of rows in the Spark DataFrame.
spark_df.count()

10000

# Spark DataFrame의 스키마를 변경

In [14]:
# Disabled experiment (kept for reference): rebuild the schema so that the
# listed columns become non-nullable, then recreate the DataFrame with it.
# # Get the existing schema.
# old_schema = spark_df.schema

# # Build the new schema.
# new_fields = []
# for field in old_schema.fields:
#     if field.name in ['ad_name', 'public_uid', 'accuracy_array', 'video_id']:
#         new_field = StructField(field.name, field.dataType, nullable=False)
#     else:
#         new_field = field
#     new_fields.append(new_field)
# new_schema = StructType(fields=new_fields)

# # Apply the new schema.
# new_spark_df = spark.createDataFrame(spark_df.rdd.map(lambda row: row), schema=new_schema)

# new_spark_df.printSchema()

In [12]:
# Disabled experiment (kept for reference): write the re-schemed DataFrame to
# Parquet and read one of the written part files back to inspect its schema.
# # Save in Parquet format.
# new_spark_df.write.parquet('D:\\DATA_PREPROCESS\\FIRESTORE_DATAS\\sample_voice_metadata_230918\\the_new')

# new_path = 'D:\\DATA_PREPROCESS\\FIRESTORE_DATAS\\sample_voice_metadata_230918\\the_new\\part-00000-0ef7dd07-2178-4775-b8a9-534865cb62c4-c000.snappy.parquet'
# new_df = spark.read.parquet(new_path)
# new_df.printSchema()

Spark DataFrame에서 컬럼의 nullable 속성을 바꾸려면 스키마를 새로 정의해서 DataFrame을 다시 만들어야 하므로, 까다롭고 복잡하거나 반복적인 작업이 된다고 한다.  
일단 이 단계는 건너뛴다(skip).

# 