In [14]:
import pandas as pd
import glob
import os

# Collect every parquet file under the payments folder (searched recursively).
# glob returns matches in a filesystem-dependent order, so the list is sorted
# to keep the row order of the combined frame (and the previews) reproducible.
payment_pattern = "raw/payments/**/*.parquet"
payment_files = sorted(glob.glob(payment_pattern, recursive=True))

# pd.concat fails with an opaque "No objects to concatenate" on an empty list;
# stop early with a message that points at the path instead.
if not payment_files:
    raise FileNotFoundError(
        f"No parquet files match {payment_pattern!r} (cwd: {os.getcwd()})"
    )

# Read every file and stack the results into a single frame.
# NOTE(review): concat keeps each file's own index by default, so index labels
# may repeat across files -- confirm before relying on label-based lookups.
df_payments = pd.concat([pd.read_parquet(f) for f in payment_files])

# 1. Check column dtypes -- pay attention to the amount_usd column.
print("--- Tipe Data Kolom ---")
print(df_payments.dtypes)

# 2. Check whether amount_usd holds a mix of Python types per value
#    (the recorded run shows both str and int values).
print("\n--- Deteksi Mixed Types di amount_usd ---")
print(df_payments['amount_usd'].apply(type).value_counts())

# 3. Preview rows from the December batch (batch_month == '2024-12').
#    The total printed here counts the whole snapshot, not only December.
print("\n--- Contoh Data Bulan Desember ---")
print(f"Total events di snapshot: {len(df_payments)}")
print(df_payments[df_payments['batch_month'] == '2024-12'].head())

--- Tipe Data Kolom ---
payment_id                            str
user_id                               str
amount_usd                         object
status                                str
attempt_number                      int64
payment_timestamp_local    datetime64[ms]
payment_timestamp_utc      datetime64[ms]
batch_month                           str
ingestion_source                      str
promo_code                            str
dtype: object

--- Deteksi Mixed Types di amount_usd ---
amount_usd
<class 'str'>    3291
<class 'int'>    2934
Name: count, dtype: int64

--- Contoh Data Bulan Desember ---
Total events di snapshot: 6225
                             payment_id                               user_id  \
0  12175d8e-4b47-4a5f-ae9f-1dfb253a379e  92df9574-7aaa-472d-86fd-7bf09874132b   
1  88060ccb-b0da-4280-9550-46bb40dc85fd  602b7b62-1662-46e8-ba8e-04878334e842   
2  009a5169-9406-4e00-8cd9-25a99b60576f  8f8eb58a-9fa5-45be-bb4e-1945cbccd7c3   
3  6f43f12d-ab2a-4caa-b4b4-

In [15]:
import pandas as pd
import glob
import os

# Collect every parquet file under the product_events folder (recursively).
# glob returns matches in a filesystem-dependent order, so the list is sorted
# to keep the row order of the combined frame (and the previews) reproducible.
product_pattern = "raw/product_events/**/*.parquet"
product_files = sorted(glob.glob(product_pattern, recursive=True))

# pd.concat fails with an opaque "No objects to concatenate" on an empty list;
# stop early with a message that points at the path instead.
if not product_files:
    raise FileNotFoundError(
        f"No parquet files match {product_pattern!r} (cwd: {os.getcwd()})"
    )

# Read every file and stack the results into a single frame.
# NOTE(review): concat keeps each file's own index by default, so index labels
# may repeat across files -- confirm before relying on label-based lookups.
df_product = pd.concat([pd.read_parquet(f) for f in product_files])

# 1. Check column dtypes.
print("--- Tipe Data Kolom ---")
print(df_product.dtypes)

# 2. Preview rows from the December batch (batch_month == '2024-12').
#    The total printed here counts the whole snapshot, not only December.
print("\n--- Contoh Data Bulan Desember ---")
print(f"Total events di snapshot: {len(df_product)}")
print(df_product[df_product['batch_month'] == '2024-12'].head())

--- Tipe Data Kolom ---
event_id                            str
user_id                             str
event_type                          str
plan                                str
event_timestamp_local    datetime64[ms]
event_timestamp_utc      datetime64[ms]
batch_month                         str
ingestion_source                    str
promo_code                          str
dtype: object

--- Contoh Data Bulan Desember ---
Total events di snapshot: 515816
                                event_id  \
24  09c11b3d-0ed2-4eec-b33c-91ecac2beffd   
25  40fcb715-2b5c-4826-9d84-256e02157302   
26  44cfcc48-3624-4166-83ec-348907c5e91b   
27  5e557408-69e6-4a1d-a013-159e594111a4   
28  bc2e5891-c718-45e1-9816-9576c2652fc5   

                                 user_id     event_type plan  \
24  12f2c98d-e894-4bfb-9468-f6300176a497  product_usage  Pro   
25  57a2c12c-ef5b-4d71-9bc0-5dc16dc50a7f  product_usage  Pro   
26  0c5061ac-03c7-4f9c-9708-6afcd3005683  product_usage  Pro   
27  0c5061ac

In [13]:
import pandas as pd
import glob
import os

# Collect every parquet file under the subscription_events folder (recursively).
# glob returns matches in a filesystem-dependent order, so the list is sorted
# to keep the row order of the combined frame (and the previews) reproducible.
subscription_pattern = "raw/subscription_events/**/*.parquet"
subscription_files = sorted(glob.glob(subscription_pattern, recursive=True))

# pd.concat fails with an opaque "No objects to concatenate" on an empty list;
# stop early with a message that points at the path instead.
if not subscription_files:
    raise FileNotFoundError(
        f"No parquet files match {subscription_pattern!r} (cwd: {os.getcwd()})"
    )

# Read every file and stack the results into a single frame.
# NOTE(review): concat keeps each file's own index by default, so index labels
# may repeat across files -- confirm before relying on label-based lookups.
df_subscription = pd.concat([pd.read_parquet(f) for f in subscription_files])

# 1. Check column dtypes.
print("--- Tipe Data Kolom ---")
print(df_subscription.dtypes)

# 2. Preview rows from the December batch (batch_month == '2024-12').
#    The total printed here counts the whole snapshot, not only December.
print("\n--- Contoh Data Bulan Desember ---")
print(f"Total events di snapshot: {len(df_subscription)}")
print(df_subscription[df_subscription['batch_month'] == '2024-12'].head())

--- Tipe Data Kolom ---
event_id                            str
user_id                             str
event_type                          str
plan                                str
event_timestamp_local    datetime64[ms]
event_timestamp_utc      datetime64[ms]
country                             str
batch_month                         str
ingestion_source                    str
promo_code                          str
dtype: object

--- Contoh Data Bulan Desember ---
Total events di snapshot: 6728
                               event_id                               user_id  \
0  1a0e4dc6-e314-4106-8813-44d43587ba67  6e8b6581-7fbe-4adb-b331-e35e8a8b3e52   
1  927454fe-d332-446d-9956-5a4665523a2f  b043fc39-3a26-4521-8ee1-d1de075a2e51   
2  6622ca82-44f4-4f4c-8fcf-7e20a23549dd  62e662b1-4ee2-4958-8678-59fa719828ef   
3  e385fc94-10d7-48c8-93aa-6711e4150042  c01c3b0a-084d-44c1-ae94-e519035d7a13   
4  0e796a8d-b36d-4a0f-b601-eb161d20e86d  616d41e6-46a4-4c96-b2d0-3275c9d598e7   

    even

In [11]:
import pandas as pd
import glob
import os

# Collect every parquet file under the users folder (searched recursively).
# glob returns matches in a filesystem-dependent order, so the list is sorted
# to keep the row order of the combined frame reproducible.
users_pattern = "raw/users/**/*.parquet"
users_files = sorted(glob.glob(users_pattern, recursive=True))

# pd.concat fails with an opaque "No objects to concatenate" on an empty list;
# stop early with a message that points at the path instead.
if not users_files:
    raise FileNotFoundError(
        f"No parquet files match {users_pattern!r} (cwd: {os.getcwd()})"
    )

# Read every file and stack the results into a single frame.
# NOTE(review): concat keeps each file's own index by default, so index labels
# may repeat across files -- confirm before relying on label-based lookups.
df_users = pd.concat([pd.read_parquet(f) for f in users_files])

# 1. Check column dtypes.
print("--- Tipe Data Kolom ---")
print(df_users.dtypes)

# 2. Summarise the user snapshot: total row count and the number of users
#    per current_plan value.
print("\n--- Seluruh Data User ---")
print(f"Total user di snapshot: {len(df_users)}")
print(df_users['current_plan'].value_counts())

--- Tipe Data Kolom ---
user_id                      str
country                      str
timezone                     str
current_status               str
current_plan                 str
created_at_utc    datetime64[ms]
dtype: object

--- Seluruh Data User ---
Total user di snapshot: 1405
current_plan
Expired     857
Pro         188
Canceled    164
Business    136
Free         60
Name: count, dtype: int64
