In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re # 정규식 사용을 위한 모듈
# from scipy.stats import gaussian_kde

import pyarrow.parquet as pq

from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import types as T
from pyspark.sql import Window
from pyspark.sql.functions import row_number, rand, broadcast, element_at, regexp_extract, udf, col, isnull, count, when, substring, coalesce, from_json, size, avg, expr, concat_ws
from pyspark.sql.types import ArrayType, StringType, StructType, StructField, DoubleType, IntegerType, FloatType

from functools import reduce
from matplotlib.ticker import FuncFormatter
from collections import Counter
from datetime import datetime

In [2]:
# Python interpreter of the conda environment that PySpark is told to use
python_path = "C:/Users/admin/anaconda3/envs/my_conda_01/python.exe"

# Build (or reuse) the Spark session: 9g for driver and executor, results
# collected to the driver capped at 5g, scratch files under D:/spark_tmp.
spark = (
    SparkSession.builder
    .appName("analyze ios voicemetadata operation")
    .config("spark.driver.memory", "9g")
    .config("spark.executor.memory", "9g")
    .config("spark.driver.maxResultSize", "5g")
    .config("spark.pyspark.python", python_path)
    .config("spark.local.dir", "D:/spark_tmp")
    .getOrCreate()
)

# Handle on the underlying SparkContext
sc = spark.sparkContext

# Report the directory Spark uses for intermediate files ("Not Set" if the key is absent)
current_spark_local_dir = spark.conf.get("spark.local.dir", "Not Set")
print("Spark local directory:", current_spark_local_dir)

Spark local directory: D:/spark_tmp


# 대상 파일

## 231127 ios 통합 라벨링 데이터

In [3]:
# Glob patterns matching every *.json file inside the two labelling export directories
ios_point_path = "D:\\DATA_PREPROCESS\\iOS_DATAS\\ios_VOICE_DATA_FROM_7_TO_10\\labelling_data_from_7_to_10\\labelling_point.json\\*.json"
ios_memor_path = "D:\\DATA_PREPROCESS\\iOS_DATAS\\ios_VOICE_DATA_FROM_7_TO_10\\labelling_data_from_7_to_10\\labelling_memor.json\\*.json"

# Load each export through the generic reader with the JSON format selected;
# the schema is inferred by Spark because none is supplied.
ios_point_df = spark.read.format("json").load(ios_point_path)
ios_memor_df = spark.read.format("json").load(ios_memor_path)

In [5]:
# Print the row count of each labelling frame: point first, then memor.
# Each count() runs a Spark job over the JSON files.
for labelled_df in (ios_point_df, ios_memor_df):
    print(labelled_df.count())

113714
907406


In [6]:
# Combined number of rows across both labelling frames
point_rows = ios_point_df.count()
memor_rows = ios_memor_df.count()
print(point_rows + memor_rows)

1021120


## iOS_암기플러스_231115.csv & iOS_포인트벌기_231115.csv

In [8]:
# Locations of the two CSV exports
memor_file_path = 'D:\\DATA_PREPROCESS\\iOS_DATAS\\iOS_암기플러스_231115.csv'  # ad records joined with user information
point_file_path = 'D:\\DATA_PREPROCESS\\iOS_DATAS\\iOS_포인트벌기_231115.csv'  # ad records joined with user information

# Read both files, taking column names from the first row.
# No schema is given and inferSchema is not enabled, so every column is a string.
memor_df = spark.read.option("header", True).csv(memor_file_path)
point_df = spark.read.option("header", True).csv(point_file_path)

# 유저, 광고별 5개씩 추출, 변형, 저장

In [9]:
# Print the column names and types of the point CSV frame
point_df.printSchema()

root
 |-- _id0: string (nullable = true)
 |-- ad_id: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- audio_id: string (nullable = true)
 |-- recorded_text: string (nullable = true)
 |-- accuracy: string (nullable = true)
 |-- is_passed: string (nullable = true)
 |-- participation_type7: string (nullable = true)
 |-- participated_at: string (nullable = true)
 |-- _id9: string (nullable = true)
 |-- email: string (nullable = true)
 |-- password: string (nullable = true)
 |-- password_salt: string (nullable = true)
 |-- phone: string (nullable = true)
 |-- nickname: string (nullable = true)
 |-- account_name: string (nullable = true)
 |-- recommender_id: string (nullable = true)
 |-- birth: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- type: string (nullable = true)
 |-- sign_in_method: string (nullable = true)
 |-- service_agreement_id: string (nullable = true)
 |-- privacy_agreement_id: string (nullable = true)
 |-- location_agreement_id: strin

In [11]:
# Print the column names and types of the memor CSV frame
memor_df.printSchema()

root
 |-- _id0: string (nullable = true)
 |-- m10n_id: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- audio_id: string (nullable = true)
 |-- recorded_text: string (nullable = true)
 |-- accuracy: string (nullable = true)
 |-- is_passed: string (nullable = true)
 |-- participated_at: string (nullable = true)
 |-- _id8: string (nullable = true)
 |-- email: string (nullable = true)
 |-- password: string (nullable = true)
 |-- password_salt: string (nullable = true)
 |-- phone: string (nullable = true)
 |-- nickname: string (nullable = true)
 |-- account_name: string (nullable = true)
 |-- recommender_id: string (nullable = true)
 |-- birth: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- type: string (nullable = true)
 |-- sign_in_method: string (nullable = true)
 |-- service_agreement_id: string (nullable = true)
 |-- privacy_agreement_id: string (nullable = true)
 |-- location_agreement_id: string (nullable = true)
 |-- commercial_notification_

In [13]:
# Count the rows in each CSV frame whose audio_id is missing, then report both numbers
null_count_point = point_df.where(F.col("audio_id").isNull()).count()
null_count_memor = memor_df.where(F.col("audio_id").isNull()).count()
print(f"point의 audio_id의 null 개수:{null_count_point}, memor의 audio_id의 null 개수:{null_count_memor}")

point의 audio_id의 null 개수:792920, memor의 audio_id의 null 개수:0


In [19]:
# Columns kept from the point CSV. Account fields present in the raw schema
# (email, password, password_salt, phone, ...) are not in this list.
point_columns = [
    "_id0", "ad_id", "user_id", "recorded_text", "accuracy", "is_passed",
    "participation_type7", "participated_at", "birth", "gender", "type",
    "title", "description", "script", "difficulty", "minimum_accuracy",
    "created_at", "start_at", "end_at", "exposed_at",
]

# Columns kept from the memor CSV: keyed by m10n_id instead of ad_id,
# and without participation_type7.
memor_columns = [
    "_id0", "m10n_id", "user_id", "recorded_text", "accuracy", "is_passed",
    "participated_at", "birth", "gender", "type",
    "title", "description", "script", "difficulty", "minimum_accuracy",
    "created_at", "start_at", "end_at", "exposed_at",
]

selected_point_df = point_df.select(*point_columns)
selected_memor_df = memor_df.select(*memor_columns)

In [21]:
# Print the schema of the projected point frame to confirm which columns were kept
selected_point_df.printSchema()

root
 |-- _id0: string (nullable = true)
 |-- ad_id: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- recorded_text: string (nullable = true)
 |-- accuracy: string (nullable = true)
 |-- is_passed: string (nullable = true)
 |-- participation_type7: string (nullable = true)
 |-- participated_at: string (nullable = true)
 |-- birth: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- type: string (nullable = true)
 |-- title: string (nullable = true)
 |-- description: string (nullable = true)
 |-- script: string (nullable = true)
 |-- difficulty: string (nullable = true)
 |-- minimum_accuracy: string (nullable = true)
 |-- created_at: string (nullable = true)
 |-- start_at: string (nullable = true)
 |-- end_at: string (nullable = true)
 |-- exposed_at: string (nullable = true)



In [59]:
# Sampling parameters. The fixed seed makes the draw repeatable; change it to
# draw a different sample.
SAMPLE_SEED = 42
ADS_TO_SAMPLE = 2
ROWS_PER_AD = 5

# Pick ADS_TO_SAMPLE ad ids uniformly at random from each frame.
# orderBy(rand()).limit(n) returns exactly n ids whenever at least n exist;
# the previous sample(False, 0.1).limit(2) could return fewer than two ids
# and simply kept whichever sampled rows limit() happened to read first.
# The picks are cached so later reuse sees the same ids.
point_ads = (
    selected_point_df.select("ad_id").distinct()
    .orderBy(rand(SAMPLE_SEED))
    .limit(ADS_TO_SAMPLE)
    .cache()
)
memor_ads = (
    selected_memor_df.select("m10n_id").distinct()
    .orderBy(rand(SAMPLE_SEED))
    .limit(ADS_TO_SAMPLE)
    .cache()
)

# Window over the rows of one ad, in seeded random order
windowSpec = Window.partitionBy("ad_id").orderBy(rand(SAMPLE_SEED))

# Keep up to ROWS_PER_AD random rows for each chosen ad (point frame).
# NOTE(review): this limits ROWS, not distinct users - a user with several
# participations in the same ad can appear more than once. Deduplicate on
# (ad_id, user_id) first if distinct users are required.
point_5users_df = (
    selected_point_df.join(point_ads, "ad_id")
    .withColumn("row_number", row_number().over(windowSpec))
    .filter(col("row_number") <= ROWS_PER_AD)
    .drop("row_number")
    .cache()
)

# Same window, keyed by the memor ad id
windowSpec = Window.partitionBy("m10n_id").orderBy(rand(SAMPLE_SEED))

# Keep up to ROWS_PER_AD random rows for each chosen ad (memor frame)
memor_5users_df = (
    selected_memor_df.join(memor_ads, "m10n_id")
    .withColumn("row_number", row_number().over(windowSpec))
    .filter(col("row_number") <= ROWS_PER_AD)
    .drop("row_number")
    .cache()
)

# Materialise the cached samples now. DataFrames are lazy, so without this
# every later action (printSchema/show/count/toPandas) would re-run the random
# selection and could display or export different rows each time.
for sampled_df in (point_5users_df, memor_5users_df):
    sampled_df.count()

In [68]:
# # Get the name of the last column
# last_column_name = point_5users_df.columns[-1]

# # Drop the last column
# point_5users_df = point_5users_df.drop(last_column_name)

# Check the result: print the schema, then display rows of the per-ad point sample
point_5users_df.printSchema()
point_5users_df.show()

root
 |-- ad_id: string (nullable = true)
 |-- _id0: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- recorded_text: string (nullable = true)
 |-- accuracy: string (nullable = true)
 |-- is_passed: string (nullable = true)
 |-- participation_type7: string (nullable = true)
 |-- participated_at: string (nullable = true)
 |-- birth: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- type: string (nullable = true)
 |-- title: string (nullable = true)
 |-- description: string (nullable = true)
 |-- script: string (nullable = true)
 |-- difficulty: string (nullable = true)
 |-- minimum_accuracy: string (nullable = true)
 |-- created_at: string (nullable = true)
 |-- start_at: string (nullable = true)
 |-- end_at: string (nullable = true)
 |-- exposed_at: string (nullable = true)

+-----+-------------------+-------------------+----------------------------------+--------+---------+-------------------+--------------------+----------+------+----+------------

In [69]:
# # Get the name of the last column
# memor_last_column_name = memor_5users_df.columns[-1]

# # Drop the last column
# memor_5users_df = memor_5users_df.drop(memor_last_column_name)

# Check the result: print the schema, then display rows of the per-ad memor sample
memor_5users_df.printSchema()
memor_5users_df.show()

root
 |-- m10n_id: string (nullable = true)
 |-- _id0: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- recorded_text: string (nullable = true)
 |-- accuracy: string (nullable = true)
 |-- is_passed: string (nullable = true)
 |-- participated_at: string (nullable = true)
 |-- birth: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- type: string (nullable = true)
 |-- title: string (nullable = true)
 |-- description: string (nullable = true)
 |-- script: string (nullable = true)
 |-- difficulty: string (nullable = true)
 |-- minimum_accuracy: string (nullable = true)
 |-- created_at: string (nullable = true)
 |-- start_at: string (nullable = true)
 |-- end_at: string (nullable = true)
 |-- exposed_at: string (nullable = true)

+-------+-------------------+-------------------+---------------------+--------+---------+--------------------+----------+------+----+-----------------------------------+-----------+--------------------+----------+------------

In [93]:
# Sampling parameters. The fixed seed makes the draw repeatable; change it to
# draw a different sample.
USER_SAMPLE_SEED = 42
USERS_TO_SAMPLE = 2
ROWS_PER_USER = 5

# Only rows that carry a created_at value are eligible for the sample.
# Filtering BEFORE choosing users guarantees every chosen user has at least
# one eligible row; previously a user could be picked and then lose all of
# their rows to the created_at filter.
point_rows_with_created_at = selected_point_df.filter(col("created_at").isNotNull())

# Pick USERS_TO_SAMPLE user ids uniformly at random.
# orderBy(rand()).limit(n) returns exactly n ids whenever at least n exist,
# unlike sample(False, 0.1).limit(2), which could return fewer than two.
selected_users = (
    point_rows_with_created_at.select("user_id").distinct()
    .orderBy(rand(USER_SAMPLE_SEED))
    .limit(USERS_TO_SAMPLE)
    .cache()
)

# Window over the rows of one user, in seeded random order
windowSpec = Window.partitionBy("user_id").orderBy(rand(USER_SAMPLE_SEED))

# Keep up to ROWS_PER_USER random rows for each chosen user.
# NOTE(review): this limits ROWS, not distinct ads - the same ad_id can appear
# more than once for a user who participated in it several times.
user_ads_df = (
    point_rows_with_created_at.join(selected_users, "user_id")
    .withColumn("row_number", row_number().over(windowSpec))
    .filter(col("row_number") <= ROWS_PER_USER)
    .drop("row_number")
    .cache()
)

# Materialise the cached sample now. DataFrames are lazy, so without this the
# later count(), show() and toPandas() calls would each re-run the random
# selection and could report, display and export different rows.
user_ads_row_count = user_ads_df.count()

In [94]:
# Print the schema of the per-user sample; the trailing count() is the cell's displayed value
user_ads_df.printSchema()
user_ads_df.count()

root
 |-- user_id: string (nullable = true)
 |-- _id0: string (nullable = true)
 |-- ad_id: string (nullable = true)
 |-- recorded_text: string (nullable = true)
 |-- accuracy: string (nullable = true)
 |-- is_passed: string (nullable = true)
 |-- participation_type7: string (nullable = true)
 |-- participated_at: string (nullable = true)
 |-- birth: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- type: string (nullable = true)
 |-- title: string (nullable = true)
 |-- description: string (nullable = true)
 |-- script: string (nullable = true)
 |-- difficulty: string (nullable = true)
 |-- minimum_accuracy: string (nullable = true)
 |-- created_at: string (nullable = true)
 |-- start_at: string (nullable = true)
 |-- end_at: string (nullable = true)
 |-- exposed_at: string (nullable = true)



10

In [95]:
# Display rows of the per-user sample as a text table
user_ads_df.show()

+-------------------+-------------------+-----+----------------------------------+--------+---------+-------------------+--------------------+----------+------+----+--------------------------------+--------------------+-----------------------------------+----------+----------------+--------------------+--------------------+--------------------+--------------------+
|            user_id|               _id0|ad_id|                     recorded_text|accuracy|is_passed|participation_type7|     participated_at|     birth|gender|type|                           title|         description|                             script|difficulty|minimum_accuracy|          created_at|            start_at|              end_at|          exposed_at|
+-------------------+-------------------+-----+----------------------------------+--------+---------+-------------------+--------------------+----------+------+----+--------------------------------+--------------------+-----------------------------------+---------

In [96]:
# # Save the Spark DataFrames as CSV
# point_5users_df.write.csv("D:/temp_point_5users_df.csv", header=True)
# memor_5users_df.write.csv("D:/temp_memor_5users_df.csv", header=True)
# user_ads_df.write.csv("D:/temp_user_ads_df.csv", header=True)

# Convert the Spark DataFrame to a pandas DataFrame (collects all rows to the driver)
# point_5users_pd_df = point_5users_df.toPandas()
# memor_5users_pd_df = memor_5users_df.toPandas()
user_ads_pd_df = user_ads_df.toPandas()

# Save the pandas DataFrame as an Excel file, without the index column
# point_5users_pd_df.to_excel("D:/temp_point_5users_df.xlsx", index=False)
# memor_5users_pd_df.to_excel("D:/temp_memor_5users_df.xlsx", index=False)
user_ads_pd_df.to_excel("D:/temp_user_ads_df.xlsx", index=False)

# ios 라벨링 데이터 구조 파악

In [5]:
# Compare the inferred schemas of the two labelling frames
point_schema = ios_point_df.schema
memor_schema = ios_memor_df.schema
schemas_equal = point_schema == memor_schema

# Report the outcome
print(f"두 DataFrame의 스키마가 동일한가? {schemas_equal}")

두 DataFrame의 스키마가 동일한가? True


In [6]:
# Print the schema Spark inferred for the point labelling JSON
ios_point_df.printSchema()

root
 |-- accuracy: long (nullable = true)
 |-- ads_id: long (nullable = true)
 |-- audio_id: long (nullable = true)
 |-- birth: string (nullable = true)
 |-- category: long (nullable = true)
 |-- description: string (nullable = true)
 |-- difficulty: long (nullable = true)
 |-- end_at: string (nullable = true)
 |-- gender: long (nullable = true)
 |-- is_passed: long (nullable = true)
 |-- minimum_accuracy: long (nullable = true)
 |-- participated_at: string (nullable = true)
 |-- participation_id: string (nullable = true)
 |-- recorded_text: string (nullable = true)
 |-- script: string (nullable = true)
 |-- signed_up_at: string (nullable = true)
 |-- start_at: string (nullable = true)
 |-- title: string (nullable = true)
 |-- type: long (nullable = true)
 |-- user_id: long (nullable = true)
 |-- withdrawn_at: string (nullable = true)



In [8]:
# Display rows of the memor labelling frame as a text table
ios_memor_df.show()

+--------+------+--------+----------+--------+-----------+----------+--------------------+------+---------+----------------+--------------------+-------------------+-------------+--------+--------------------+--------------------+----------------------------------+----+-------------------+------------+
|accuracy|ads_id|audio_id|     birth|category|description|difficulty|              end_at|gender|is_passed|minimum_accuracy|     participated_at|   participation_id|recorded_text|  script|        signed_up_at|            start_at|                             title|type|            user_id|withdrawn_at|
+--------+------+--------+----------+--------+-----------+----------+--------------------+------+---------+----------------+--------------------+-------------------+-------------+--------+--------------------+--------------------+----------------------------------+----+-------------------+------------+
|     100|     0|       1|1900-01-01|       1|          1|         0|2023-07-17T12:00:..