In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sbn
import os
from datetime import datetime
import random
import gc
import pickle

In [2]:
import findspark
findspark.init()
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType,IntegerType,StringType,DateType,FloatType,StructField,TimestampType
from pyspark.sql.functions import *
import pyspark_jobs

In [3]:
spark = SparkSession \
        .builder \
        .appName("sample balanced") \
        .config("spark.driver.memory", "2g") \
        .config("spark.executor.memory","4g") \
        .getOrCreate()

In [6]:
schema = StructType([
   StructField("event_time", StringType(), True),
   StructField("event_type", StringType(), True),
   StructField("product_id", IntegerType(), True),
   StructField("category_id", StringType(), True),
   StructField("category_code", StringType(), True),
   StructField("brand", StringType(), True),
   StructField("price", FloatType(), True),
   StructField("user_id", IntegerType(), True),
   StructField("user_session", StringType(), True)])

In [7]:
df = spark.read.format('csv').load('Data/2019-Oct.csv',header=True,schema=schema)

In [8]:
df = df.na.drop()

In [9]:
df.groupBy('event_type').count().show()

+----------+--------+
|event_type|   count|
+----------+--------+
|  purchase|  549507|
|      view|25201706|
|      cart|  809407|
+----------+--------+



In [10]:
purchased = df.where(df.event_type == 'purchase')

In [11]:
cart = df.where(df.event_type == 'cart').take(549507)

In [12]:
data = spark.createDataFrame(cart)
data.show(5)

+--------------------+----------+----------+-------------------+--------------------+-------+------------------+---------+--------------------+
|          event_time|event_type|product_id|        category_id|       category_code|  brand|             price|  user_id|        user_session|
+--------------------+----------+----------+-------------------+--------------------+-------+------------------+---------+--------------------+
|2019-10-01 00:09:...|      cart|   1002524|2053013555631882655|electronics.smart...|  apple| 515.6699829101562|524325294|0b74a829-f9d7-465...|
|2019-10-01 00:11:...|      cart|   4804056|2053013554658804075|electronics.audio...|  apple|161.97999572753906|533624186|e5ac3caa-e6d5-4d6...|
|2019-10-01 02:17:...|      cart|   1004833|2053013555631882655|electronics.smart...|samsung|174.75999450683594|536415846|685b5b42-f597-4a6...|
|2019-10-01 02:19:...|      cart|   1005003|2053013555631882655|electronics.smart...| huawei| 258.2099914550781|513632293|f2cc68f7-39d1-

In [13]:
data_balanced = data.union(purchased)
data_balanced.count()

1099014

In [14]:
data_balanced.repartition(1).write.csv("Data/balanced", sep=',')

In [28]:
from pyspark import StorageLevel

In [None]:
data_balanced.persist(StorageLevel.MEMORY_AND_DISK_2)
data_balanced.count()

In [5]:
data_balanced.show(5)

NameError: name 'data_balanced' is not defined

In [15]:
data_balanced.unpersist()

DataFrame[event_time: string, event_type: string, product_id: bigint, category_id: string, category_code: string, brand: string, price: double, user_id: bigint, user_session: string]

In [18]:
1099014*100/(549507+25201706+809407)

4.137757326447952

In [19]:
spark.stop()