In [36]:
from pyspark.sql import SparkSession
from pyspark.sql import functions
from pyspark.sql.functions import udf

In [37]:
def get_session():
    return SparkSession \
        .builder \
        .appName("Market Analytics") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()

In [38]:
sess = get_session()

In [39]:
def load_data(session):
    return session.read.format("csv").option("header", "true").load("dataset/raw.csv")

In [40]:
load = load_data(sess)
load.show(5)

+--------------------+---------+---------+---------+--------------------+--------------------+
|          breadcrumb|item_sold|      pid|    price|               title|                 url|
+--------------------+---------+---------+---------+--------------------+--------------------+
|Dapur ;Bekal ;Cup...|        2|389668528|   70.000|Mayonnaise & Ketc...|https://www.tokop...|
|Rumah Tangga ;Keb...|        0|390053120|  160.000|BOLDe Super BROOM...|https://www.tokop...|
|Perawatan Hewan ;...|        0|390055891|1.100.000|Two Tone Brown (S...|https://www.tokop...|
|Fashion Muslim ;P...|        0|389610348|  325.000|MUKENA ALDIVA har...|https://www.tokop...|
|     Produk Lainnya |        1|389666117|  179.900|terpal kolam kota...|https://www.tokop...|
+--------------------+---------+---------+---------+--------------------+--------------------+
only showing top 5 rows



In [41]:
def clean_dataframe(df):
    return df.where(
        df['item_sold'].isNotNull() & df['pid'].isNotNull() & \
        df['pid'].isNotNull() & df['price'].isNotNull() & \
        df['title'].isNotNull() & df['url'].isNotNull() & \
        ~df['item_sold'].like("http%") & \
        df['url'].like("http%")
    )

In [42]:
clean = clean_dataframe(load)

In [43]:
@udf
def extract_seller(x):
    return x.split("/")[3]

@udf
def extract_seller_url(seller):
    return 'https://www.tokopedia.com/' + seller

@udf
def price(x):
    return int(x.replace(".", ""))


def extract_dataframe(df):
    df = df.withColumn('seller', extract_seller(df.url))
    df = df.withColumn('seller_url', extract_seller_url(df.seller))
    df = df.withColumn('item_sold', df.item_sold.cast("int"))
    df = df.withColumn('price', price(df.price))
    return df

In [44]:
extract = extract_dataframe(clean)

In [45]:
extract.count()

1281242

In [49]:
extract.select("pid").drop_duplicates().count()

1281242

## Total Transaction (QTY)

In [13]:
def get_total_transaction(df):
    return df.agg(functions.sum('item_sold')).first()[0]

In [1]:
get_total_transaction(extract)

NameError: name 'get_total_transaction' is not defined

## Total Transaction (Rupiah)

In [12]:
def get_omset(df):
    return df.agg(functions.sum('price')).first()[0]

In [13]:
get_omset(extract)

862413587916.0

## Total Seller

In [14]:
def get_total_seller(df):
    return df.select('seller').distinct().count()

In [15]:
get_total_seller(extract)

90144

## Total Product

In [16]:
def get_total_product(df):
    return df.count()

In [17]:
get_total_product(extract)

1281242

## 10 Best Selling

In [27]:
def get_best_selling(df):
    return df.groupBy('pid').agg(functions.sum('item_sold').alias('sale')).orderBy(functions.desc('sale'))

In [28]:
def selling_to_dict(df):
    rows = df.collect()
    
    result = []
    for r in rows:
        result.append({
            "pid": r.pid,
            "sale": r.sale
        })
    return result

In [29]:
best_selling = get_best_selling(extract).limit(10)
selling_to_dict(best_selling)

[{'pid': '315500698', 'sale': 105400},
 {'pid': '106556412', 'sale': 97155},
 {'pid': '110892467', 'sale': 88119},
 {'pid': '107184722', 'sale': 77750},
 {'pid': '101040765', 'sale': 73713},
 {'pid': '207550504', 'sale': 72713},
 {'pid': '102077448', 'sale': 65455},
 {'pid': '105978964', 'sale': 64986},
 {'pid': '105110726', 'sale': 56123},
 {'pid': '106156810', 'sale': 49900}]

## 10 Best Seller

In [20]:
def get_best_seller(df):
    return df.groupBy('seller').agg(functions.sum('item_sold').alias('sale')).orderBy(functions.desc('sale'))

In [30]:
def seller_to_dict(df):
    rows = df.collect()
    
    result = []
    for r in rows:
        result.append({
            "seller": r.seller,
            "sale": r.sale
        })
    return result

In [31]:
best_seller = get_best_seller(extract).limit(10)
seller_to_dict(best_seller)

[{'seller': 'rgaksesorishp', 'sale': 215670},
 {'seller': 'BAZAARFASHION', 'sale': 171702},
 {'seller': 'reawstore', 'sale': 142616},
 {'seller': 'tokosabilah', 'sale': 117575},
 {'seller': 'mode-packaging', 'sale': 115712},
 {'seller': 'agenkurma', 'sale': 112732},
 {'seller': 'ikadella', 'sale': 105400},
 {'seller': 'tokokainflanel1', 'sale': 104590},
 {'seller': 'platinumpack', 'sale': 100852},
 {'seller': '190spt', 'sale': 100680}]