In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions
from pyspark.sql.functions import udf

In [2]:
def get_session():
    return SparkSession \
        .builder \
        .appName("Market Analytics") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()

In [3]:
sess = get_session()

In [4]:
def load_data(session):
    return session.read.format("csv").option("header", "true").load("dataset/raw.csv")

In [5]:
load = load_data(sess)
load.show(5)

+--------------------+---------+---------+---------+--------------------+--------------------+
|          breadcrumb|item_sold|      pid|    price|               title|                 url|
+--------------------+---------+---------+---------+--------------------+--------------------+
|Dapur ;Bekal ;Cup...|        2|389668528|   70.000|Mayonnaise & Ketc...|https://www.tokop...|
|Rumah Tangga ;Keb...|        0|390053120|  160.000|BOLDe Super BROOM...|https://www.tokop...|
|Perawatan Hewan ;...|        0|390055891|1.100.000|Two Tone Brown (S...|https://www.tokop...|
|Fashion Muslim ;P...|        0|389610348|  325.000|MUKENA ALDIVA har...|https://www.tokop...|
|     Produk Lainnya |        1|389666117|  179.900|terpal kolam kota...|https://www.tokop...|
+--------------------+---------+---------+---------+--------------------+--------------------+
only showing top 5 rows



In [6]:
def clean_dataframe(df):
    return df.where(
        df['item_sold'].isNotNull() & df['pid'].isNotNull() & \
        df['pid'].isNotNull() & df['price'].isNotNull() & \
        df['title'].isNotNull() & df['url'].isNotNull() & \
        ~df['item_sold'].like("http%") & \
        df['url'].like("http%")
    )

In [7]:
clean = clean_dataframe(load)

In [8]:
@udf
def extract_seller(x):
    return x.split("/")[3]

@udf
def extract_seller_url(seller):
    return 'https://www.tokopedia.com/' + seller

@udf
def price(x):
    return int(x.replace(".", ""))


def extract_dataframe(df):
    df = df.withColumn('seller', extract_seller(df.url))
    df = df.withColumn('seller_url', extract_seller_url(df.seller))
    df = df.withColumn('item_sold', df.item_sold.cast("int"))
    df = df.withColumn('price', price(df.price))
    return df

In [9]:
extract = extract_dataframe(clean)

## Total Transaction (QTY)

In [10]:
def get_total_transaction(df):
    return df.agg(functions.sum('item_sold')).first()[0]

In [11]:
get_total_transaction(extract)

12298303

## Total Transaction (Rupiah)

In [12]:
def get_omset(df):
    return df.agg(functions.sum('price')).first()[0]

In [13]:
get_omset(extract)

862413587916.0

## Total Seller

In [14]:
def get_total_seller(df):
    return df.select('seller').distinct().count()

In [15]:
get_total_seller(extract)

90144

## Total Product

In [16]:
def get_total_product(df):
    return df.count()

In [17]:
get_total_product(extract)

1281242

## 10 Best Selling

In [18]:
def get_best_selling(df):
    return df.groupBy('pid').agg(functions.sum('item_sold').alias('sale')).orderBy(functions.desc('sale'))

In [19]:
get_best_selling(extract).show(5)

+---------+------+
|      pid|  sale|
+---------+------+
|315500698|105400|
|106556412| 97155|
|110892467| 88119|
|107184722| 77750|
|101040765| 73713|
+---------+------+
only showing top 5 rows



## 10 Best Seller

In [20]:
def get_best_seller(df):
    return df.groupBy('seller').agg(functions.sum('item_sold').alias('sale')).orderBy(functions.desc('sale'))

In [21]:
get_best_seller(extract).show(5)

+--------------+------+
|        seller|  sale|
+--------------+------+
| rgaksesorishp|215670|
| BAZAARFASHION|171702|
|     reawstore|142616|
|   tokosabilah|117575|
|mode-packaging|115712|
+--------------+------+
only showing top 5 rows

