# Create datasets and import them into Hive

## Initial imports

In [1]:
import java.io.File
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.hive.HiveContext
// HiveContext built on the notebook's SparkContext (`sc`).
val hiveContext = new HiveContext(sc)

// HDFS root under which each cell below stores its table data.
val warehouseLocation = "hdfs://namenode:8020/user/hive/warehouse"

// Local-filesystem alternative, kept for reference:
// val warehouseLocation = new File("spark-warehouse").getAbsolutePath
// println(warehouseLocation)

// Make sure the target database exists and select it for this session.
spark.sql("CREATE DATABASE IF NOT EXISTS db_nubank")
spark.sql("USE db_nubank")

// Notebook-level state shared by the cells below: each cell assigns `table`
// and then recomputes `dir_path` and `db_table` from it.
val data_base = "db_nubank"
var table = ""
var dir_path = s"$warehouseLocation/${data_base}.db/$table"
var db_table = s"${data_base}.$table"

Intitializing Scala interpreter ...

Spark Web UI available at http://2a03b9dad4f2:4041
SparkContext available as 'sc' (version = 3.0.2, master = local[*], app id = local-1615077938167)
SparkSession available as 'spark'


import java.io.File
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.hive.HiveContext
hiveContext: org.apache.spark.sql.hive.HiveContext = org.apache.spark.sql.hive.HiveContext@64fdba00
warehouseLocation: String = hdfs://namenode:8020/user/hive/warehouse
data_base: String = db_nubank
table: String = ""
dir_path: String = hdfs://namenode:8020/user/hive/warehouse/db_nubank.db/
db_table: String = db_nubank.


### Create the datasets and save them in Hive

In [3]:
// Read the accounts CSV extract; printSchema below reports every column as a string.
var df_accounts = spark.read
  .format("csv")
  .option("header", "true")
  .load("file:///dataset/nubank/accounts/")

df_accounts.printSchema()

// Fix the column order and parse created_at as a timestamp; the other columns stay strings.
df_accounts = df_accounts.selectExpr(
  "account_id",
  "customer_id",
  "cast(created_at as timestamp)",
  "status",
  "account_branch",
  "account_check_digit",
  "account_number"
)
df_accounts.show(2, truncate = true)

// Point the shared notebook variables at the target table.
table = "accounts"
dir_path = s"$warehouseLocation/${data_base}.db/$table"
db_table = s"${data_base}.$table"

// Drop any previous definition, then write the data under dir_path and register it as db_table.
spark.sql(s"DROP TABLE IF EXISTS $db_table")
df_accounts.toDF.write
  .option("path", dir_path)
  .mode("Overwrite")
  .saveAsTable(db_table)

root
 |-- account_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- created_at: string (nullable = true)
 |-- status: string (nullable = true)
 |-- account_branch: string (nullable = true)
 |-- account_check_digit: string (nullable = true)
 |-- account_number: string (nullable = true)

+-------------------+-------------------+--------------------+------+--------------+-------------------+--------------+
|         account_id|        customer_id|          created_at|status|account_branch|account_check_digit|account_number|
+-------------------+-------------------+--------------------+------+--------------+-------------------+--------------+
| 509281836645315264|3287830764476260864|2019-11-27 22:02:...|active|          7763|                  9|         38218|
|1464307209104691456|2739905374464312320|2019-04-02 12:42:...|active|          7183|                  2|          1684|
+-------------------+-------------------+--------------------+------+--------------+--

df_accounts: org.apache.spark.sql.DataFrame = [account_id: string, customer_id: string ... 5 more fields]
df_accounts: org.apache.spark.sql.DataFrame = [account_id: string, customer_id: string ... 5 more fields]
table: String = accounts
dir_path: String = hdfs://namenode:8020/user/hive/warehouse/db_nubank.db/accounts
db_table: String = db_nubank.accounts


In [4]:
// Read the city CSV extract (header row supplies the column names).
var df_city = spark.read
  .format("csv")
  .option("header", "true")
  .load("file:///dataset/nubank/city/")

df_city.printSchema()

// Preview the raw rows before reshaping.
df_city.show(2, truncate = false)

// Put city_id first and store it as bigint; city and state_id stay strings.
df_city = df_city.selectExpr(
  "cast(city_id as bigint)",
  "city",
  "state_id"
)
df_city.show(2, truncate = false)

// Point the shared notebook variables at the target table.
table = "city"
dir_path = s"$warehouseLocation/${data_base}.db/$table"
db_table = s"${data_base}.$table"

// Drop any previous definition, then write the data under dir_path and register it as db_table.
spark.sql(s"DROP TABLE IF EXISTS $db_table")
df_city.toDF.write
  .option("path", dir_path)
  .mode("Overwrite")
  .saveAsTable(db_table)

root
 |-- city: string (nullable = true)
 |-- state_id: string (nullable = true)
 |-- city_id: string (nullable = true)

+-------------------+-------------------+-------------------+
|city               |state_id           |city_id            |
+-------------------+-------------------+-------------------+
|São José do Goiabal|2755422274446512640|670134511382200832 |
|Pederneiras        |2066774635771587840|1734898293086970880|
+-------------------+-------------------+-------------------+
only showing top 2 rows

+-------------------+-------------------+-------------------+
|city_id            |city               |state_id           |
+-------------------+-------------------+-------------------+
|670134511382200832 |São José do Goiabal|2755422274446512640|
|1734898293086970880|Pederneiras        |2066774635771587840|
+-------------------+-------------------+-------------------+
only showing top 2 rows



df_city: org.apache.spark.sql.DataFrame = [city_id: bigint, city: string ... 1 more field]
df_city: org.apache.spark.sql.DataFrame = [city_id: bigint, city: string ... 1 more field]
table: String = city
dir_path: String = hdfs://namenode:8020/user/hive/warehouse/db_nubank.db/city
db_table: String = db_nubank.city


In [7]:
// Read the country CSV extract (header row supplies the column names).
var df_country = spark.read
  .format("csv")
  .option("header", "true")
  .load("file:///dataset/nubank/country/")

df_country.printSchema()

// Preview the raw rows before reshaping.
df_country.show(2)

// Only reorder the columns (country_id first); both remain strings.
df_country = df_country.selectExpr(
  "country_id",
  "country"
)
df_country.show(2, truncate = false)

// Point the shared notebook variables at the target table.
table = "country"
dir_path = s"$warehouseLocation/${data_base}.db/$table"
db_table = s"${data_base}.$table"

// Drop any previous definition, then write the data under dir_path and register it as db_table.
spark.sql(s"DROP TABLE IF EXISTS $db_table")
df_country.toDF.write
  .option("path", dir_path)
  .mode("Overwrite")
  .saveAsTable(db_table)

root
 |-- country: string (nullable = true)
 |-- country_id: string (nullable = true)

+-------+------------------+
|country|        country_id|
+-------+------------------+
| Brasil|465471097668177088|
+-------+------------------+

+------------------+-------+
|country_id        |country|
+------------------+-------+
|465471097668177088|Brasil |
+------------------+-------+



df_country: org.apache.spark.sql.DataFrame = [country_id: string, country: string]
df_country: org.apache.spark.sql.DataFrame = [country_id: string, country: string]
table: String = country
dir_path: String = hdfs://namenode:8020/user/hive/warehouse/db_nubank.db/country
db_table: String = db_nubank.country


In [8]:
// Read the customers CSV extract (header row supplies the column names).
var df_customers = spark.read
  .format("csv")
  .option("header", "true")
  .load("file:///dataset/nubank/customers/")

df_customers.printSchema()

// Preview the raw rows before reshaping.
df_customers.show(2)

// Reorder the columns and store customer_city and cpf as bigint.
// NOTE(review): casting cpf to bigint would drop any leading zeros — confirm that
// consumers of this table do not need the original zero-padded text.
df_customers = df_customers.selectExpr(
  "customer_id",
  "first_name",
  "last_name",
  "cast(customer_city as bigint)",
  "country_name",
  "cast(cpf as bigint)"
)
df_customers.show(2, truncate = false)

// Point the shared notebook variables at the target table.
table = "customers"
dir_path = s"$warehouseLocation/${data_base}.db/$table"
db_table = s"${data_base}.$table"

// Drop any previous definition, then write the data under dir_path and register it as db_table.
spark.sql(s"DROP TABLE IF EXISTS $db_table")
df_customers.toDF.write
  .option("path", dir_path)
  .mode("Overwrite")
  .saveAsTable(db_table)

root
 |-- customer_id: string (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- customer_city: string (nullable = true)
 |-- cpf: string (nullable = true)
 |-- country_name: string (nullable = true)

+-------------------+----------+---------+-------------------+-----------+------------+
|        customer_id|first_name|last_name|      customer_city|        cpf|country_name|
+-------------------+----------+---------+-------------------+-----------+------------+
|3287830764476260864|     Janet|   Ritter|1512698933146656256|85974914067|      Brasil|
|2739905374464312320|      Tina| Lamaster|2215180425815138560|78347385617|      Brasil|
+-------------------+----------+---------+-------------------+-----------+------------+
only showing top 2 rows

+-------------------+----------+---------+-------------------+------------+-----------+
|customer_id        |first_name|last_name|customer_city      |country_name|cpf        |
+--------------

df_customers: org.apache.spark.sql.DataFrame = [customer_id: string, first_name: string ... 4 more fields]
df_customers: org.apache.spark.sql.DataFrame = [customer_id: string, first_name: string ... 4 more fields]
table: String = customers
dir_path: String = hdfs://namenode:8020/user/hive/warehouse/db_nubank.db/customers
db_table: String = db_nubank.customers


In [None]:
// Load the five d_* CSV extracts up front (header row supplies the column names).
var df_d_month = spark.read.format("csv").option("header", "true").load("file:///dataset/nubank/d_month/")
var df_d_time = spark.read.format("csv").option("header", "true").load("file:///dataset/nubank/d_time/")
var df_d_week = spark.read.format("csv").option("header", "true").load("file:///dataset/nubank/d_week/")
var df_d_weekday = spark.read.format("csv").option("header", "true").load("file:///dataset/nubank/d_weekday/")
var df_d_year = spark.read.format("csv").option("header", "true").load("file:///dataset/nubank/d_year/")

/** Saves `df` as table `name` in the `data_base` database.
  *
  * Performs the same steps, in the same order, that this cell previously repeated
  * inline for every table: it updates the notebook-level `table`, `dir_path` and
  * `db_table` variables, drops any existing table definition, and then writes the
  * data under `dir_path` in overwrite mode, registering it as `db_table`.
  *
  * @param df   the already-projected DataFrame to persist
  * @param name the unqualified table name (e.g. "d_month")
  */
def saveDimensionTable(df: org.apache.spark.sql.DataFrame, name: String): Unit = {
  table = name
  dir_path = warehouseLocation + "/" + data_base + ".db/" + table
  db_table = data_base + "." + table

  spark.sql("DROP TABLE IF EXISTS " + db_table)
  df.toDF.write
    .option("path", dir_path)
    .mode("Overwrite")
    .saveAsTable(db_table)
}

// ---- d_month: month_id as bigint, action_month as int ----
df_d_month.printSchema()
df_d_month = df_d_month.selectExpr(
  "cast(month_id as bigint)",
  "cast(action_month as int)"
)
df_d_month.show(2, false)
saveDimensionTable(df_d_month, "d_month")

// ---- d_time: surrogate keys as bigint, action_timestamp as timestamp ----
df_d_time.printSchema()
df_d_time = df_d_time.selectExpr(
  "cast(time_id as bigint)",
  "cast(action_timestamp as timestamp)",
  "cast(week_id as bigint)",
  "cast(month_id as bigint)",
  "cast(year_id as bigint)",
  "cast(weekday_id as bigint)"
)
df_d_time.show(2, false)
saveDimensionTable(df_d_time, "d_time")

// ---- d_week: week_id as bigint, action_week as int ----
df_d_week.printSchema()
df_d_week = df_d_week.selectExpr(
  "cast(week_id as bigint)",
  "cast(action_week as int)"
)
df_d_week.show(2, false)
saveDimensionTable(df_d_week, "d_week")

// ---- d_weekday: weekday_id as bigint, action_weekday as string ----
df_d_weekday.printSchema()
df_d_weekday = df_d_weekday.selectExpr(
  "cast(weekday_id as bigint)",
  "cast(action_weekday as string)"
)
df_d_weekday.show(2, false)
saveDimensionTable(df_d_weekday, "d_weekday")

// ---- d_year: year_id as bigint, action_year as string ----
df_d_year.printSchema()
df_d_year = df_d_year.selectExpr(
  "cast(year_id as bigint)",
  "cast(action_year as string)"
)
df_d_year.show(2, false)
saveDimensionTable(df_d_year, "d_year")

In [19]:
// Read the pix_movements CSV extract; printSchema below reports every column as a string.
var df_pix_movements = spark.read
  .format("csv")
  .option("header", "true")
  .load("file:///dataset/nubank/pix_movements/")

df_pix_movements.printSchema()

// Store the amount as float and the two request/completion columns as bigint;
// the identifiers, direction and status stay strings.
// NOTE(review): float is a lossy type for monetary amounts — consider a decimal
// type, after confirming with the consumers of this table.
df_pix_movements = df_pix_movements.selectExpr(
  "id",
  "account_id",
  "in_or_out",
  "cast(pix_amount as float)",
  "cast(pix_requested_at as bigint)",
  "cast(pix_completed_at as bigint)",
  "status"
)
df_pix_movements.show(2, truncate = false)

// Point the shared notebook variables at the target table.
table = "pix_movements"
dir_path = s"$warehouseLocation/${data_base}.db/$table"
db_table = s"${data_base}.$table"

// Drop any previous definition, then write the data under dir_path and register it as db_table.
spark.sql(s"DROP TABLE IF EXISTS $db_table")
df_pix_movements.toDF.write
  .option("path", dir_path)
  .mode("Overwrite")
  .saveAsTable(db_table)

root
 |-- id: string (nullable = true)
 |-- account_id: string (nullable = true)
 |-- in_or_out: string (nullable = true)
 |-- pix_amount: string (nullable = true)
 |-- pix_requested_at: string (nullable = true)
 |-- pix_completed_at: string (nullable = true)
 |-- status: string (nullable = true)

+-------------------+------------------+---------+----------+----------------+----------------+---------+
|id                 |account_id        |in_or_out|pix_amount|pix_requested_at|pix_completed_at|status   |
+-------------------+------------------+---------+----------+----------------+----------------+---------+
|1362907709468179968|509281836645315264|pix_out  |1894.77   |1579693633580   |1579693646070   |completed|
|2246794118022659072|509281836645315264|pix_out  |419.79    |1587309244550   |1587309252480   |completed|
+-------------------+------------------+---------+----------+----------------+----------------+---------+
only showing top 2 rows



df_pix_movements: org.apache.spark.sql.DataFrame = [id: string, account_id: string ... 5 more fields]
df_pix_movements: org.apache.spark.sql.DataFrame = [id: string, account_id: string ... 5 more fields]
table: String = pix_movements
dir_path: String = hdfs://namenode:8020/user/hive/warehouse/db_nubank.db/pix_movements
db_table: String = db_nubank.pix_movements


In [20]:
// Read the state CSV extract; printSchema below reports every column as a string.
var df_state = spark.read
  .format("csv")
  .option("header", "true")
  .load("file:///dataset/nubank/state/")

df_state.printSchema()

// Put state_id first and store country_id as bigint; state_id and state stay strings.
df_state = df_state.selectExpr(
  "state_id",
  "state",
  "cast(country_id as bigint)"
)
df_state.show(2, truncate = false)

// Point the shared notebook variables at the target table.
table = "state"
dir_path = s"$warehouseLocation/${data_base}.db/$table"
db_table = s"${data_base}.$table"

// Drop any previous definition, then write the data under dir_path and register it as db_table.
spark.sql(s"DROP TABLE IF EXISTS $db_table")
df_state.toDF.write
  .option("path", dir_path)
  .mode("Overwrite")
  .saveAsTable(db_table)

root
 |-- state: string (nullable = true)
 |-- country_id: string (nullable = true)
 |-- state_id: string (nullable = true)

+-------------------+-----+------------------+
|state_id           |state|country_id        |
+-------------------+-----+------------------+
|2755422274446512640|MG   |465471097668177088|
|2066774635771587840|SP   |465471097668177088|
+-------------------+-----+------------------+
only showing top 2 rows



df_state: org.apache.spark.sql.DataFrame = [state_id: string, state: string ... 1 more field]
df_state: org.apache.spark.sql.DataFrame = [state_id: string, state: string ... 1 more field]
table: String = state
dir_path: String = hdfs://namenode:8020/user/hive/warehouse/db_nubank.db/state
db_table: String = db_nubank.state


In [21]:
// Read the transfer_ins CSV extract; printSchema below reports every column as a string.
var df_transfer_ins = spark.read
  .format("csv")
  .option("header", "true")
  .load("file:///dataset/nubank/transfer_ins/")

df_transfer_ins.printSchema()

// Store the amount as float and the two request/completion columns as bigint;
// id, account_id and status stay strings.
// NOTE(review): float is a lossy type for monetary amounts — consider a decimal
// type, after confirming with the consumers of this table.
df_transfer_ins = df_transfer_ins.selectExpr(
  "id",
  "account_id",
  "cast(amount as float)",
  "cast(transaction_requested_at as bigint)",
  "cast(transaction_completed_at as bigint)",
  "status"
)
df_transfer_ins.show(2, truncate = false)

// Point the shared notebook variables at the target table.
table = "transfer_ins"
dir_path = s"$warehouseLocation/${data_base}.db/$table"
db_table = s"${data_base}.$table"

// Drop any previous definition, then write the data under dir_path and register it as db_table.
spark.sql(s"DROP TABLE IF EXISTS $db_table")
df_transfer_ins.toDF.write
  .option("path", dir_path)
  .mode("Overwrite")
  .saveAsTable(db_table)

root
 |-- id: string (nullable = true)
 |-- account_id: string (nullable = true)
 |-- amount: string (nullable = true)
 |-- transaction_requested_at: string (nullable = true)
 |-- transaction_completed_at: string (nullable = true)
 |-- status: string (nullable = true)

+-------------------+------------------+-------+------------------------+------------------------+---------+
|id                 |account_id        |amount |transaction_requested_at|transaction_completed_at|status   |
+-------------------+------------------+-------+------------------------+------------------------+---------+
|652457358030649600 |509281836645315264|1481.33|1579073609010           |1579073618020           |completed|
|1622746437516706816|509281836645315264|995.36 |1595780510110           |1595780513170           |completed|
+-------------------+------------------+-------+------------------------+------------------------+---------+
only showing top 2 rows



df_transfer_ins: org.apache.spark.sql.DataFrame = [id: string, account_id: string ... 4 more fields]
df_transfer_ins: org.apache.spark.sql.DataFrame = [id: string, account_id: string ... 4 more fields]
table: String = transfer_ins
dir_path: String = hdfs://namenode:8020/user/hive/warehouse/db_nubank.db/transfer_ins
db_table: String = db_nubank.transfer_ins


In [22]:
// Read the transfer_outs CSV extract; printSchema below reports every column as a string.
var df_transfer_outs = spark.read
  .format("csv")
  .option("header", "true")
  .load("file:///dataset/nubank/transfer_outs/")

df_transfer_outs.printSchema()

// Store the amount as float and the two request/completion columns as bigint;
// id, account_id and status stay strings.
// NOTE(review): float is a lossy type for monetary amounts — consider a decimal
// type, after confirming with the consumers of this table.
df_transfer_outs = df_transfer_outs.selectExpr(
  "id",
  "account_id",
  "cast(amount as float)",
  "cast(transaction_requested_at as bigint)",
  "cast(transaction_completed_at as bigint)",
  "status"
)
df_transfer_outs.show(2, truncate = false)

// Point the shared notebook variables at the target table.
table = "transfer_outs"
dir_path = s"$warehouseLocation/${data_base}.db/$table"
db_table = s"${data_base}.$table"

// Drop any previous definition, then write the data under dir_path and register it as db_table.
spark.sql(s"DROP TABLE IF EXISTS $db_table")
df_transfer_outs.toDF.write
  .option("path", dir_path)
  .mode("Overwrite")
  .saveAsTable(db_table)

root
 |-- id: string (nullable = true)
 |-- account_id: string (nullable = true)
 |-- amount: string (nullable = true)
 |-- transaction_requested_at: string (nullable = true)
 |-- transaction_completed_at: string (nullable = true)
 |-- status: string (nullable = true)

+-------------------+------------------+-------+------------------------+------------------------+---------+
|id                 |account_id        |amount |transaction_requested_at|transaction_completed_at|status   |
+-------------------+------------------+-------+------------------------+------------------------+---------+
|327917790912071680 |509281836645315264|1794.11|1578150053870           |1578150055960           |completed|
|3029924863588401664|509281836645315264|1197.53|1602776874060           |1602776879440           |completed|
+-------------------+------------------+-------+------------------------+------------------------+---------+
only showing top 2 rows



df_transfer_outs: org.apache.spark.sql.DataFrame = [id: string, account_id: string ... 4 more fields]
df_transfer_outs: org.apache.spark.sql.DataFrame = [id: string, account_id: string ... 4 more fields]
table: String = transfer_outs
dir_path: String = hdfs://namenode:8020/user/hive/warehouse/db_nubank.db/transfer_outs
db_table: String = db_nubank.transfer_outs
