# Transactions

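- Combine transactions.csv and transactions_v2.csv, dropping rows that are duplicated across the two files.
- Add year & month columns (derived from transaction_date).
- Then write the result as Parquet, partitioned by year and month.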

In [1]:
import os
import glob
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import random
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
import pprint
import pyspark
import pyspark.sql.functions as F

from pyspark.sql.functions import col
from pyspark.sql.types import StringType, IntegerType, FloatType, DateType


In [2]:
from pyspark.sql.functions import countDistinct

In [3]:
from pyspark.sql.functions import lit

In [4]:
from pyspark.sql.functions import substring

In [5]:
# Show the current working directory; the relative paths used later in this
# notebook (e.g. "data/transactions.csv") are presumably resolved against it.
os.getcwd()

'/app'

In [6]:
# Local development session: "local[12]" plus 12 shuffle partitions are sized
# for a machine with 12 logical cores, and the driver is given ~10 GB of memory.
session_settings = {
    "spark.driver.memory": "10g",
    "spark.sql.shuffle.partitions": "12",
}

session_builder = pyspark.sql.SparkSession.builder.appName("dev").master("local[12]")
for setting_name, setting_value in session_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)

spark = session_builder.getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/10/28 16:20:18 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [7]:
# Print the address of the Spark web UI that belongs to this session.
print(spark.sparkContext.uiWebUrl)

http://c88f750897f2:4040


In [8]:
# Load the first transactions extract. The first line is treated as the header
# and column types are inferred from the data.
v1 = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("data/transactions.csv")
)

                                                                                

In [9]:
# Load the second transactions extract with the same reader settings
# (header row present, column types inferred from the data).
v2 = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("data/transactions_v2.csv")
)

                                                                                

In [10]:
#v1.show(5)

In [11]:
#v2.show(5)

In [12]:
# Tag every row with the name of the file it was read from, so the origin of a
# row is still known after the two extracts are combined.
v1_tagged, v2_tagged = (
    frame.withColumn("source_file", F.lit(file_name))
    for frame, file_name in (
        (v1, "transactions.csv"),
        (v2, "transactions_v2.csv"),
    )
)

In [13]:
# Stack both extracts into a single DataFrame.
# unionByName() matches columns by name. Plain union() matches them by
# position, which would silently put values under the wrong column if the two
# CSV files ever listed their columns in a different order.
combined = v1_tagged.unionByName(v2_tagged)

In [14]:
# Columns that identify a duplicate transaction: everything read from the CSV,
# i.e. all columns except the added "source_file" tag.
data_columns = list(v1.columns)

In [15]:
# Collapse rows that are identical on every original data column.
# dropDuplicates() keeps an arbitrary row from each group, so for a transaction
# present in both files the surviving "source_file" value could differ from run
# to run. Grouping on the data columns and taking min("source_file") keeps
# exactly one row per transaction and always yields the same tag
# ("transactions.csv" sorts before "transactions_v2.csv").
# The resulting column order is unchanged: data columns, then source_file.
merged = combined.groupBy(*data_columns).agg(
    F.min("source_file").alias("source_file")
)

In [16]:
#merged.show(10)

In [17]:
# Number of rows left after de-duplication. count() is an action, so this
# executes the whole read / union / de-duplicate pipeline.
merged.count()

                                                                                

22975416

In [18]:
# Peek at the distinct transaction dates in ascending order
# (show() prints only the first 20 rows).
distinct_dates = merged.select("transaction_date").distinct()
distinct_dates.orderBy("transaction_date").show()



+----------------+
|transaction_date|
+----------------+
|        20150101|
|        20150102|
|        20150103|
|        20150104|
|        20150105|
|        20150106|
|        20150107|
|        20150108|
|        20150109|
|        20150110|
|        20150111|
|        20150112|
|        20150113|
|        20150114|
|        20150115|
|        20150116|
|        20150117|
|        20150118|
|        20150119|
|        20150120|
+----------------+
only showing top 20 rows



                                                                                

In [19]:
# transaction_date holds values such as 20150101: characters 1-4 are the year
# and characters 5-6 are the month. Both new columns are strings.
txn_date = F.col("transaction_date")
year_part = F.substring(txn_date, 1, 4)
month_part = F.substring(txn_date, 5, 2)
df_with_ym = merged.withColumn("year", year_part).withColumn("month", month_part)

# Sanity check: number of distinct years and distinct months in the data.
n_years = df_with_ym.select("year").distinct().count()
n_months = df_with_ym.select("month").distinct().count()
n_years, n_months

                                                                                

(3, 12)

In [20]:
# Evaluate the DataFrame on its own to display its column names and types.
df_with_ym

DataFrame[msno: string, payment_method_id: int, payment_plan_days: int, plan_list_price: int, actual_amount_paid: int, is_auto_renew: int, transaction_date: int, membership_expire_date: int, is_cancel: int, source_file: string, year: string, month: string]

In [21]:
# Persist the merged transactions to the bronze layer as Parquet, written into
# one folder per year/month value and replacing any previous output.
# No "header" option is set: that is a CSV setting, whereas Parquet always
# stores column names in the file schema.
(
    df_with_ym.write
    .mode("overwrite")
    .partitionBy("year", "month")
    .parquet("datamart/bronze/transactions")
)

                                                                                

In [22]:
# Number of unique members (msno) appearing in the merged transactions.
distinct_members = F.countDistinct("msno").alias("distinct_msno_count")
result = df_with_ym.agg(distinct_members)
result.show()



+-------------------+
|distinct_msno_count|
+-------------------+
|            2426143|
+-------------------+



                                                                                

# Members

- Add year & month columns (derived from registration_init_time).
- Then write the result as Parquet.

In [8]:
# Load the members file. The first line is treated as the header and column
# types are inferred from the data.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("data/members_v3.csv")
)

                                                                                

In [9]:
# Preview the first 10 member rows to check the columns and value formats.
df.show(10)

+--------------------+----+---+------+--------------+----------------------+
|                msno|city| bd|gender|registered_via|registration_init_time|
+--------------------+----+---+------+--------------+----------------------+
|Rb9UwLQTrxzBVwCB6...|   1|  0|  NULL|            11|              20110911|
|+tJonkh+O1CA796Fm...|   1|  0|  NULL|             7|              20110914|
|cV358ssn7a0f7jZOw...|   1|  0|  NULL|            11|              20110915|
|9bzDeJP6sQodK73K5...|   1|  0|  NULL|            11|              20110915|
|WFLY3s7z4EZsieHCt...|   6| 32|female|             9|              20110915|
|yLkV2gbZ4GLFwqTOX...|   4| 30|  male|             9|              20110916|
|jNCGK78YkTyId3H3w...|   1|  0|  NULL|             7|              20110916|
|WH5Jq4mgtfUFXh2yz...|   5| 34|  male|             9|              20110916|
|tKmbR4X5VXjHmxERr...|   5| 19|  male|             9|              20110917|
|I0yFvqMoNkM8ZNHb6...|  13| 63|  male|             9|              20110918|

In [10]:
# Total number of rows in the members file (runs a Spark job over the CSV).
df.count()

                                                                                

6769473

In [11]:
# Peek at the distinct registration dates in ascending order
# (show() prints only the first 20 rows).
registration_dates = df.select("registration_init_time").distinct()
registration_dates.orderBy("registration_init_time").show()



+----------------------+
|registration_init_time|
+----------------------+
|              20040326|
|              20040327|
|              20040328|
|              20040329|
|              20040330|
|              20040331|
|              20040401|
|              20040402|
|              20040403|
|              20040404|
|              20040405|
|              20040406|
|              20040407|
|              20040408|
|              20040409|
|              20040410|
|              20040411|
|              20040412|
|              20040413|
|              20040414|
+----------------------+
only showing top 20 rows



                                                                                

In [12]:
# registration_init_time holds values such as 20110911: characters 1-4 are the
# year and characters 5-6 are the month.
registered_on = F.col("registration_init_time")
year_part = F.substring(registered_on, 1, 4)
month_part = F.substring(registered_on, 5, 2)
dfm_with_ym = df.withColumn("year", year_part).withColumn("month", month_part)

# Sanity check: number of distinct years and distinct months in the data.
n_years = dfm_with_ym.select("year").distinct().count()
n_months = dfm_with_ym.select("month").distinct().count()
n_years, n_months

                                                                                

(14, 12)

In [13]:
# Preview the first 10 rows to confirm the new year and month columns.
dfm_with_ym.show(10)

+--------------------+----+---+------+--------------+----------------------+----+-----+
|                msno|city| bd|gender|registered_via|registration_init_time|year|month|
+--------------------+----+---+------+--------------+----------------------+----+-----+
|Rb9UwLQTrxzBVwCB6...|   1|  0|  NULL|            11|              20110911|2011|   09|
|+tJonkh+O1CA796Fm...|   1|  0|  NULL|             7|              20110914|2011|   09|
|cV358ssn7a0f7jZOw...|   1|  0|  NULL|            11|              20110915|2011|   09|
|9bzDeJP6sQodK73K5...|   1|  0|  NULL|            11|              20110915|2011|   09|
|WFLY3s7z4EZsieHCt...|   6| 32|female|             9|              20110915|2011|   09|
|yLkV2gbZ4GLFwqTOX...|   4| 30|  male|             9|              20110916|2011|   09|
|jNCGK78YkTyId3H3w...|   1|  0|  NULL|             7|              20110916|2011|   09|
|WH5Jq4mgtfUFXh2yz...|   5| 34|  male|             9|              20110916|2011|   09|
|tKmbR4X5VXjHmxERr...|   5| 19| 

In [14]:
# Registrations per calendar month, oldest first.
month_keys = ["year", "month"]
result = dfm_with_ym.groupBy(*month_keys).count().orderBy(*month_keys)
result.show()

[Stage 22:>                                                       (0 + 12) / 12]

+----+-----+-----+
|year|month|count|
+----+-----+-----+
|2004|   03| 4258|
|2004|   04| 1417|
|2004|   05|  436|
|2004|   06|  550|
|2004|   07| 4713|
|2004|   08| 2474|
|2004|   09| 3323|
|2004|   10| 4547|
|2004|   11| 3075|
|2004|   12| 1441|
|2005|   01| 1382|
|2005|   02| 2520|
|2005|   03| 2752|
|2005|   04| 2132|
|2005|   05| 1939|
|2005|   06| 1993|
|2005|   07| 2177|
|2005|   08| 2053|
|2005|   09| 5054|
|2005|   10| 8642|
+----+-----+-----+
only showing top 20 rows



                                                                                

In [15]:
# Registrations per year, oldest first.
yearly_counts = dfm_with_ym.groupBy("year").count()
result = yearly_counts.orderBy("year")
result.show()

[Stage 25:>                                                       (0 + 12) / 12]

+----+-------+
|year|  count|
+----+-------+
|2004|  26234|
|2005|  41349|
|2006|  53953|
|2007|  89830|
|2008|  67690|
|2009|  63633|
|2010| 115075|
|2011| 179051|
|2012| 283190|
|2013| 524722|
|2014| 975776|
|2015|1620525|
|2016|2246761|
|2017| 481684|
+----+-------+



                                                                                

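The members table is comparatively small (about 6.8 million rows), so there is no need to split the Parquet output into year/month folders.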

In [16]:
# Row count after adding the year/month columns; presumably a sanity check
# that it still equals the row count of the raw members file.
dfm_with_ym.count()

                                                                                

6769473

In [21]:
# Persist the members table to the bronze layer as a single, unpartitioned
# Parquet dataset, replacing any previous output.
# No "header" option is set: that is a CSV setting, whereas Parquet always
# stores column names in the file schema.
(
    dfm_with_ym.write
    .mode("overwrite")
    .parquet("datamart/bronze/members")
)

                                                                                