In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re # 정규식 사용을 위한 모듈
# from scipy.stats import gaussian_kde

import pyarrow.parquet as pq

from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import types as T
from pyspark.sql import Window
from pyspark.sql.functions import lit, row_number, rand, broadcast, element_at, regexp_extract, udf, col, isnull, count, when, substring, coalesce, from_json, size, avg, expr, concat_ws
from pyspark.sql.types import ArrayType, StringType, StructType, StructField, DoubleType, IntegerType, FloatType

from functools import reduce
from matplotlib.ticker import FuncFormatter
from collections import Counter
from datetime import datetime

In [2]:
# Path to the Python interpreter of the conda environment; handed to Spark
# below through the "spark.pyspark.python" setting.
python_path = "C:/Users/admin/anaconda3/envs/my_conda_01/python.exe"

# Session settings, applied to the builder in the order listed.
spark_settings = {
    "spark.driver.memory": "9g",
    "spark.executor.memory": "9g",
    "spark.driver.maxResultSize": "5g",
    "spark.pyspark.python": python_path,
    "spark.local.dir": "D:/spark_tmp",
}

# Create (or reuse) the Spark session.
session_builder = SparkSession.builder.appName("voice_metadata integrating operation")
for setting_name, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)
spark = session_builder.getOrCreate()

# Grab the SparkContext that belongs to the session.
sc = spark.sparkContext

# Report the directory configured for Spark's intermediate files.
current_spark_local_dir = spark.conf.get("spark.local.dir", "Not Set")
print("Spark local directory:", current_spark_local_dir)

Spark local directory: D:/spark_tmp


In [3]:
## Load the JSON table dumps
# Every table lives in the same dump directory as '<table name>.json'.
sql_dump_dir = 'D:/DATA_PREPROCESS/iOS_DATAS/sql_tables_231123'
table_names = (
    'vowing_ad',
    'vowing_ad_participation',
    'vowing_m10n',
    'vowing_m10n_participation',
    'vowing_user',
)

# Read the tables one by one, in the order listed above.
loaded_tables = {}
for table_name in table_names:
    json_file_path = f'{sql_dump_dir}/{table_name}.json'
    loaded_tables[table_name] = spark.read.json(json_file_path)

# Bind each table to the DataFrame name used by the rest of the notebook.
ad_df = loaded_tables['vowing_ad']
ad_participation_df = loaded_tables['vowing_ad_participation']
m10n_df = loaded_tables['vowing_m10n']
m10n_participation_df = loaded_tables['vowing_m10n_participation']
user_df = loaded_tables['vowing_user']

In [8]:
def _drop_corrupt_records(df):
    """Remove rows Spark could not parse and drop the '_corrupt_record' column.

    Spark's JSON reader only adds the '_corrupt_record' column when at least
    one line of the file failed to parse. When the column is absent (a clean
    file, or this cell being re-run after the column was already dropped),
    referencing it would raise an AnalysisException, so the DataFrame is
    returned unchanged in that case.

    Args:
        df: DataFrame produced by ``spark.read.json``.

    Returns:
        A DataFrame containing only the successfully parsed rows, without the
        '_corrupt_record' column.
    """
    if '_corrupt_record' not in df.columns:
        return df
    return df.filter(col('_corrupt_record').isNull()).drop('_corrupt_record')


ad_df = _drop_corrupt_records(ad_df)
ad_participation_df = _drop_corrupt_records(ad_participation_df)
m10n_df = _drop_corrupt_records(m10n_df)
m10n_participation_df = _drop_corrupt_records(m10n_participation_df)
user_df = _drop_corrupt_records(user_df)

In [9]:
# Preview the cleaned ad table (first 20 rows, long cell values truncated).
ad_df.show(n=20, truncate=True)

+---+-----------------------------------+----------+--------------------+----------------+----------------------------------+--------------------+-------------------------+
|_id|                        description|difficulty|              end_at|minimum_accuracy|                            script|            start_at|                    title|
+---+-----------------------------------+----------+--------------------+----------------+----------------------------------+--------------------+-------------------------+
|  2|  닭갈비의 본고장 춘천생산 진짜 ...|         1|2022-09-22 00:00:...|              75| 춘천 홍익 닭갈비는 춘천에서 직...|2022-09-19 00:00:...|               태범프레시|
|  3|  닭갈비의 본고장 춘천생산 진짜 ...|         1|2022-09-22 00:00:...|              75| 춘천 홍익 닭갈비는 춘천에서 직...|2022-09-19 00:00:...|               태범프레시|
|  4|  닭갈비의 본고장 춘천생산 진짜 ...|         1|2022-09-22 00:00:...|              75| 춘천 홍익 닭갈비는 춘천에서 직...|2022-09-19 00:00:...|               태범프레시|
|  5|  닭갈비의 본고장 춘천생산 진짜 ...|         1|2022-09-22 00:00

# 테이블 조인

In [10]:
# Point earning: attach participation records to their ad
# (ad LEFT JOIN ad_participation, so ads without participations are kept).
ad_participation_match = ad_df._id == ad_participation_df.ad_id

point_df = (
    ad_df.join(ad_participation_df, on=ad_participation_match, how="left")
    # Drop the ad table's own '_id'; the same value is still available as 'ad_id'.
    .drop(ad_df._id)
    # The remaining '_id' comes from the participation table.
    .withColumnRenamed("_id", "participation_id")
)

In [11]:
# Preview the joined ad / participation table (first 20 rows, long values truncated).
point_df.show(n=20, truncate=True)

+-----------------------------------+----------+--------------------+----------------+----------------------------------+--------------------+-------------------------+-------------------+--------+-----+--------+---------+--------------------+---------------------------------+-------------------+
|                        description|difficulty|              end_at|minimum_accuracy|                            script|            start_at|                    title|   participation_id|accuracy|ad_id|audio_id|is_passed|     participated_at|                    recorded_text|            user_id|
+-----------------------------------+----------+--------------------+----------------+----------------------------------+--------------------+-------------------------+-------------------+--------+-----+--------+---------+--------------------+---------------------------------+-------------------+
|                슈퍼비전 멀티비타민|         2|2022-09-23 00:01:...|              50|슈퍼비전 멀티비타민 미네랄 로얄...|2022-09-

In [12]:
# Memorization plus: attach participation records to their m10n item
# (m10n LEFT JOIN m10n_participation).

# Rename the m10n key first so it cannot be confused with the participation '_id'.
m10n_keyed_df = m10n_df.withColumnRenamed("_id", "memor_id")

# Perform the join.
m10n_joined_df = m10n_keyed_df.join(
    m10n_participation_df,
    on=(m10n_keyed_df.memor_id == m10n_participation_df.m10n_id),
    how="left",
)

# Drop the temporary 'memor_id' key, then rename the participation table's
# '_id' to 'participation_id'.
m10n_df = (
    m10n_joined_df
    .drop(m10n_joined_df.memor_id)
    .withColumnRenamed("_id", "participation_id")
)

In [13]:
# Number of rows in the joined m10n table (displayed as the cell result).
m10n_row_count = m10n_df.count()
m10n_row_count

1057799

In [15]:
# Print both joined schemas (m10n first, then point) so their columns can be compared.
for joined_frame in (m10n_df, point_df):
    joined_frame.printSchema()

root
 |-- description: string (nullable = true)
 |-- difficulty: long (nullable = true)
 |-- end_at: string (nullable = true)
 |-- minimum_accuracy: long (nullable = true)
 |-- script: string (nullable = true)
 |-- start_at: string (nullable = true)
 |-- title: string (nullable = true)
 |-- participation_id: long (nullable = true)
 |-- accuracy: long (nullable = true)
 |-- audio_id: long (nullable = true)
 |-- is_passed: long (nullable = true)
 |-- m10n_id: long (nullable = true)
 |-- participated_at: string (nullable = true)
 |-- recorded_text: string (nullable = true)
 |-- user_id: long (nullable = true)

root
 |-- description: string (nullable = true)
 |-- difficulty: long (nullable = true)
 |-- end_at: string (nullable = true)
 |-- minimum_accuracy: long (nullable = true)
 |-- script: string (nullable = true)
 |-- start_at: string (nullable = true)
 |-- title: string (nullable = true)
 |-- participation_id: long (nullable = true)
 |-- accuracy: long (nullable = true)
 |-- ad_id: lo

In [20]:
# Mark the user table for caching, then count it; the count is the action that
# actually runs the job.
user_row_count = user_df.cache().count()
print(user_row_count)

8708


In [18]:
# Print the column names and types of the user table.
user_df.printSchema()

root
 |-- _id: long (nullable = true)
 |-- birth: string (nullable = true)
 |-- dormant_at: string (nullable = true)
 |-- gender: long (nullable = true)
 |-- signed_up_at: string (nullable = true)
 |-- type: long (nullable = true)
 |-- withdrawn_at: string (nullable = true)



In [21]:
# Stack the point-earning data (point_df) and the memorization-plus data
# (m10n_df) into a single table. This is a row-wise union, not a join.

# Tag point_df rows with category 0 and rename 'ad_id' to the shared key 'ads_id'.
point_df = point_df.withColumn("category", lit(0)).withColumnRenamed("ad_id", "ads_id")
# Tag m10n_df rows with category 1 and rename 'm10n_id' to the shared key 'ads_id'.
m10n_df = m10n_df.withColumn("category", lit(1)).withColumnRenamed("m10n_id", "ads_id")

# union() pairs columns by POSITION. The two frames hold the same columns in a
# different order ('ads_id' precedes 'audio_id' in point_df but follows
# 'is_passed' in m10n_df), so a positional union would silently shift the m10n
# rows' audio_id / is_passed / ads_id values into the wrong columns.
# unionByName() pairs columns by name; the result keeps point_df's column order.
ads_df = point_df.unionByName(m10n_df)

In [22]:
# Print the column names and types of the combined table.
ads_df.printSchema()

root
 |-- description: string (nullable = true)
 |-- difficulty: long (nullable = true)
 |-- end_at: string (nullable = true)
 |-- minimum_accuracy: long (nullable = true)
 |-- script: string (nullable = true)
 |-- start_at: string (nullable = true)
 |-- title: string (nullable = true)
 |-- participation_id: long (nullable = true)
 |-- accuracy: long (nullable = true)
 |-- ads_id: long (nullable = true)
 |-- audio_id: long (nullable = true)
 |-- is_passed: long (nullable = true)
 |-- participated_at: string (nullable = true)
 |-- recorded_text: string (nullable = true)
 |-- user_id: long (nullable = true)
 |-- category: integer (nullable = false)



In [23]:
# Attach user attributes to every voice record: ads_df LEFT JOIN user_df.
user_match = ads_df.user_id == user_df._id

joined_df = (
    ads_df.join(user_df, on=user_match, how="left")
    # The user table's '_id' duplicates 'user_id', so drop it.
    .drop(user_df._id)
)

# 저장

In [24]:
# Build today's date stamp in 'yymmdd' form.
today = datetime.today()
formatted_date = f"{today:%y%m%d}"

# Output location: one dated directory per run day.
file_path = "D:/DATA_PREPROCESS/iOS_DATAS/ios-integrated-voice-db-" + formatted_date

# Write the DataFrame to local disk as Parquet. "errorifexists" is the writer's
# default mode, spelled out here: the write fails if the path already exists.
joined_df.write.mode("errorifexists").parquet(file_path)