# Process user_logs.csv (bronze) DO NOT RUN UNLESS NECESSARY
This notebook creates the user-logs Parquet files, partitioned by year and month, from user_logs.csv and user_logs_v2.csv.

In [1]:
import os
import glob
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import random
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
import pprint
import pyspark
import pyspark.sql.functions as F

from pyspark.sql.functions import col
from pyspark.sql.types import StringType, IntegerType, FloatType, DateType


In [2]:
# Build (or reuse) a local SparkSession named "dev" with 8g of driver memory.
spark = (
    pyspark.sql.SparkSession.builder
    .appName("dev")
    .config("spark.driver.memory", "8g")
    .master("local[*]")
    .getOrCreate()
)

# From here on, only surface ERROR-level Spark log messages
spark.sparkContext.setLogLevel("ERROR")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/10/17 09:58:39 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


# Read Data from CSV

In [3]:
# Load the raw v1 user-log CSV: the first row supplies the column names and
# Spark infers each column's type from the data.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("data/user_logs.csv")
)

# Preview the first 20 rows
df.show(n=20)

                                                                                

+--------------------+--------+------+------+------+-------+-------+-------+----------+
|                msno|    date|num_25|num_50|num_75|num_985|num_100|num_unq|total_secs|
+--------------------+--------+------+------+------+-------+-------+-------+----------+
|rxIP2f2aN0rYNp+to...|20150513|     0|     0|     0|      0|      1|      1|   280.335|
|rxIP2f2aN0rYNp+to...|20150709|     9|     1|     0|      0|      7|     11|  1658.948|
|yxiEWwE9VR5utpUec...|20150105|     3|     3|     0|      0|     68|     36| 17364.956|
|yxiEWwE9VR5utpUec...|20150306|     1|     0|     1|      1|     97|     27| 24667.317|
|yxiEWwE9VR5utpUec...|20150501|     3|     0|     0|      0|     38|     38|  9649.029|
|yxiEWwE9VR5utpUec...|20150702|     4|     0|     1|      1|     33|     10|  10021.52|
|yxiEWwE9VR5utpUec...|20150830|     3|     1|     0|      0|      4|      7|  1119.555|
|yxiEWwE9VR5utpUec...|20151107|     1|     0|     0|      0|      4|      5|   938.022|
|yxiEWwE9VR5utpUec...|20160110| 

In [4]:
# Total number of rows in the v1 logs (an action: runs a Spark job over the CSV)
df.count()

                                                                                

392106543

In [5]:
# Distinct log dates, sorted ascending; show() prints only the first 20
log_dates = df.select("date").distinct()
log_dates.orderBy("date").show()




+--------+
|    date|
+--------+
|20150101|
|20150102|
|20150103|
|20150104|
|20150105|
|20150106|
|20150107|
|20150108|
|20150109|
|20150110|
|20150111|
|20150112|
|20150113|
|20150114|
|20150115|
|20150116|
|20150117|
|20150118|
|20150119|
|20150120|
+--------+
only showing top 20 rows



                                                                                

In [4]:
# Derive "year" (YYYY) and "month" (MM) string columns from the YYYYMMDD
# `date` column. The cast to string is explicit because `date` was read with
# inferSchema and is presumably numeric (TODO confirm with df.printSchema()).
# Uses the `F` alias from the import cell instead of a mid-notebook import.
date_str = F.col("date").cast("string")
df_with_ym = (
    df
    .withColumn("year", F.substring(date_str, 1, 4))
    .withColumn("month", F.substring(date_str, 5, 2))
)

# Count distinct years and months with a single pass over the data: collect
# the distinct (year, month) pairs once (at most one row per calendar month
# present) and count each component on the driver, instead of launching one
# full scan per column.
year_month_pairs = df_with_ym.select("year", "month").distinct().collect()
len({row["year"] for row in year_month_pairs}), len({row["month"] for row in year_month_pairs})

                                                                                

(3, 12)

In [5]:
# Preview 10 rows to check the derived year/month columns
df_with_ym.show(10)

+--------------------+--------+------+------+------+-------+-------+-------+----------+----+-----+
|                msno|    date|num_25|num_50|num_75|num_985|num_100|num_unq|total_secs|year|month|
+--------------------+--------+------+------+------+-------+-------+-------+----------+----+-----+
|rxIP2f2aN0rYNp+to...|20150513|     0|     0|     0|      0|      1|      1|   280.335|2015|   05|
|rxIP2f2aN0rYNp+to...|20150709|     9|     1|     0|      0|      7|     11|  1658.948|2015|   07|
|yxiEWwE9VR5utpUec...|20150105|     3|     3|     0|      0|     68|     36| 17364.956|2015|   01|
|yxiEWwE9VR5utpUec...|20150306|     1|     0|     1|      1|     97|     27| 24667.317|2015|   03|
|yxiEWwE9VR5utpUec...|20150501|     3|     0|     0|      0|     38|     38|  9649.029|2015|   05|
|yxiEWwE9VR5utpUec...|20150702|     4|     0|     1|      1|     33|     10|  10021.52|2015|   07|
|yxiEWwE9VR5utpUec...|20150830|     3|     1|     0|      0|      4|      7|  1119.555|2015|   08|
|yxiEWwE9V

Three years' worth of data: the `date` column covers 3 distinct years and all 12 months.

In [6]:
# Write the v1 logs to the bronze layer as Parquet, partitioned by the
# derived year and month columns.
# mode("overwrite") replaces whatever already exists at this path, so
# re-running this cell rewrites the whole dataset.
# The former .option("header", "true") was removed: "header" is a CSV option
# and has no meaning for the Parquet writer.
(
    df_with_ym.write
    .mode("overwrite")
    .partitionBy("year", "month")
    .parquet("datamart/bronze/user_logs")
)


                                                                                

# Repeat the same steps for user_logs_v2.csv

In [3]:
# Load the raw v2 user-log CSV: the first row supplies the column names and
# Spark infers each column's type from the data.
df2 = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("data/user_logs_v2.csv/user_logs_v2.csv")
)

# Preview the first 20 rows
df2.show(n=20)

                                                                                

+--------------------+--------+------+------+------+-------+-------+-------+----------+
|                msno|    date|num_25|num_50|num_75|num_985|num_100|num_unq|total_secs|
+--------------------+--------+------+------+------+-------+-------+-------+----------+
|u9E91QDTvHLq6NXjE...|20170331|     8|     4|     0|      1|     21|     18|  6309.273|
|nTeWW/eOZA/UHKdD5...|20170330|     2|     2|     1|      0|      9|     11|  2390.699|
|2UqkWXwZbIjs03dHL...|20170331|    52|     3|     5|      3|     84|    110| 23203.337|
|ycwLc+m2O0a85jSLA...|20170331|   176|     4|     2|      2|     19|    191|  7100.454|
|EGcbTofOSOkMmQyN1...|20170331|     2|     1|     0|      1|    112|     93| 28401.558|
|qR/ndQ5B+1cY+c9ih...|20170331|     3|     0|     0|      0|     39|     41|  9786.842|
|N6ch5ArfJixq9mvAR...|20170330|     9|     1|     0|      0|     18|     26|  4920.255|
|JEjl2W1ivEI6epeob...|20170331|   181|    68|     5|      3|     54|    291| 22433.105|
|lPK4IYIFdfTT6pq7x...|20170331| 

In [4]:
# Total number of rows in the v2 logs (an action: runs a Spark job over the CSV)
df2.count()

                                                                                

18396362

In [5]:
# Distinct v2 log dates, sorted ascending; show() prints only the first 20
log_dates_v2 = df2.select("date").distinct()
log_dates_v2.orderBy("date").show()




+--------+
|    date|
+--------+
|20170301|
|20170302|
|20170303|
|20170304|
|20170305|
|20170306|
|20170307|
|20170308|
|20170309|
|20170310|
|20170311|
|20170312|
|20170313|
|20170314|
|20170315|
|20170316|
|20170317|
|20170318|
|20170319|
|20170320|
+--------+
only showing top 20 rows



                                                                                

In [6]:
# Derive "year" (YYYY) and "month" (MM) string columns from the YYYYMMDD
# `date` column. The cast to string is explicit because `date` was read with
# inferSchema and is presumably numeric (TODO confirm with df2.printSchema()).
# Uses the `F` alias from the import cell instead of a mid-notebook import.
date_str_v2 = F.col("date").cast("string")
df2_with_ym = (
    df2
    .withColumn("year", F.substring(date_str_v2, 1, 4))
    .withColumn("month", F.substring(date_str_v2, 5, 2))
)

# Count distinct years and months with a single pass over the data: collect
# the distinct (year, month) pairs once (at most one row per calendar month
# present) and count each component on the driver, instead of launching one
# full scan per column.
year_month_pairs_v2 = df2_with_ym.select("year", "month").distinct().collect()
len({row["year"] for row in year_month_pairs_v2}), len({row["month"] for row in year_month_pairs_v2})

                                                                                

(1, 1)

In [7]:
# Preview 10 rows to check the derived year/month columns
df2_with_ym.show(10)

+--------------------+--------+------+------+------+-------+-------+-------+----------+----+-----+
|                msno|    date|num_25|num_50|num_75|num_985|num_100|num_unq|total_secs|year|month|
+--------------------+--------+------+------+------+-------+-------+-------+----------+----+-----+
|u9E91QDTvHLq6NXjE...|20170331|     8|     4|     0|      1|     21|     18|  6309.273|2017|   03|
|nTeWW/eOZA/UHKdD5...|20170330|     2|     2|     1|      0|      9|     11|  2390.699|2017|   03|
|2UqkWXwZbIjs03dHL...|20170331|    52|     3|     5|      3|     84|    110| 23203.337|2017|   03|
|ycwLc+m2O0a85jSLA...|20170331|   176|     4|     2|      2|     19|    191|  7100.454|2017|   03|
|EGcbTofOSOkMmQyN1...|20170331|     2|     1|     0|      1|    112|     93| 28401.558|2017|   03|
|qR/ndQ5B+1cY+c9ih...|20170331|     3|     0|     0|      0|     39|     41|  9786.842|2017|   03|
|N6ch5ArfJixq9mvAR...|20170330|     9|     1|     0|      0|     18|     26|  4920.255|2017|   03|
|JEjl2W1iv

In [None]:
# Write the v2 logs to the bronze layer as Parquet, partitioned by the
# derived year and month columns.
# mode("overwrite") replaces whatever already exists at this path, so
# re-running this cell rewrites the whole dataset.
# The former .option("header", "true") was removed: "header" is a CSV option
# and has no meaning for the Parquet writer.
(
    df2_with_ym.write
    .mode("overwrite")
    .partitionBy("year", "month")
    .parquet("datamart/bronze/user_logs_v2")
)

In [8]:
# Cleanup: shut the Spark session down, then confirm on stdout
stopped_msg = "\n✓ Spark session stopped"
spark.stop()
print(stopped_msg)


✓ Spark session stopped
