# 0.&nbsp; Connect Google Drive

In [1]:
from google.colab import drive

# Mount Google Drive into the Colab filesystem; the rest of the notebook
# reads its files from paths below this mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import os

# Project folder on the mounted Drive. Making it the current working
# directory lets later cells refer to files by relative name.
default_dir = "/content/drive/MyDrive/DE Digital Skola/hw_apache_spark"
os.chdir(default_dir)

# 1.&nbsp; Setting up PySpark in Colab

In [3]:
!sudo apt update

[33m0% [Working][0m            Hit:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
[33m0% [Connecting to archive.ubuntu.com (91.189.91.83)] [Connecting to security.ub[0m                                                                               Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
[33m0% [Connecting to archive.ubuntu.com (91.189.91.83)] [Connecting to security.ub[0m                                                                               Hit:3 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:4 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [119 kB]
Get:5 http://security.ubuntu.com/ubuntu jammy-security InRelease [110 kB]
Hit:6 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:7 https://ppa.launchpadcontent.net/c2d4u.team/c2d4u4.0+/ubuntu jammy InRelease
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcont

In [4]:
# download Java
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

In [None]:
# Download the Spark 3.4.2 (Hadoop 3 build) archive into the current working directory.
# The command is left commented out; the next cell extracts
# spark-3.4.2-bin-hadoop3.tgz from the working directory, so uncomment this line
# and run it once if that archive is not there yet.
# !wget -q https://dlcdn.apache.org/spark/spark-3.4.2/spark-3.4.2-bin-hadoop3.tgz

In [5]:
# unzip that folder.
!tar xf spark-3.4.2-bin-hadoop3.tgz

In [6]:
# Tell Spark where to find the JDK and the extracted Spark distribution.
os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
    "SPARK_HOME": "/content/drive/MyDrive/DE Digital Skola/hw_apache_spark/spark-3.4.2-bin-hadoop3",
})

In [7]:
!pip install pyspark
!pip install -q findspark



In [8]:
import findspark

# Make the Spark installation importable from this kernel, then display the
# Spark home that findspark resolved as a sanity check.
findspark.init()
spark_home = findspark.find()
spark_home

'/content/drive/MyDrive/DE Digital Skola/hw_apache_spark/spark-3.4.2-bin-hadoop3'

# 2.&nbsp; Create a SparkSession

In [9]:
from pyspark.sql import SparkSession

# Reuse an already-running session if there is one, otherwise start a new one.
spark = (
    SparkSession.builder
    .appName("Laptop Store Data Analysis")
    .getOrCreate()
)

In [10]:
# Evaluate the session object so the notebook displays it — a quick check that
# the SparkSession was created.
spark

# 3.&nbsp; Import and Read the Dataset as a Spark DataFrame

In [11]:
# Load the laptop pricing CSV (relative to the working directory) into a Spark
# DataFrame: the first row supplies the column names and Spark infers the types.
file_path = "laptop_pricing_dataset.csv"
csv_reader = spark.read.options(header=True, inferSchema=True)
laptop_df = csv_reader.csv(file_path)

# 4.&nbsp;  Answer the Business Questions

In [21]:
# Column functions used by the aggregation cells below.
from pyspark.sql.functions import avg, count, desc, when

## a. Q1: What is the average RAM (GB) per manufacturer? Which manufacturer has the highest average RAM?


In [15]:
# Mean RAM (GB) for each manufacturer, one output row per manufacturer.
avg_ram_per_manufacturer = (
    laptop_df
    .groupBy("Manufacturer")
    .agg(avg("RAM_GB").alias("avg_ram_gb"))
)

avg_ram_per_manufacturer.show()

+------------+-----------------+
|Manufacturer|       avg_ram_gb|
+------------+-----------------+
|       Razer|             16.0|
|      Huawei|              8.0|
|      Xiaomi|              8.0|
|          HP|7.346938775510204|
|        Dell|8.225352112676056|
|        Acer|7.368421052631579|
|        Asus|7.888888888888889|
|      Lenovo|7.576923076923077|
|     Samsung|             11.2|
|         MSI|              8.0|
|     Toshiba|              8.0|
+------------+-----------------+



In [16]:
# Identify the manufacturer with the highest average RAM: sort the per-manufacturer
# averages in descending order and take the "Manufacturer" field of the top row.
manufacturer_with_highest_avg_ram = (
    avg_ram_per_manufacturer
    .orderBy("avg_ram_gb", ascending=False)
    .first()["Manufacturer"]
)

# The value is a plain Python string (a field of the first Row), not a DataFrame,
# so it has no .show(); end the cell with the bare name to display it.
manufacturer_with_highest_avg_ram

'Razer'

Answer: Razer, with an average RAM of 16.0 GB.

## b. Q2: Calculate the average screen size per screen type, ordered by the average from highest to lowest.


In [17]:
# Mean screen size (cm) for each screen type, largest average first.
avg_screen_size_per_type = (
    laptop_df
    .groupBy("Screen")
    .agg(avg("Screen_Size_cm").alias("avg_screen_size_cm"))
    .orderBy("avg_screen_size_cm", ascending=False)
)

avg_screen_size_per_type.show()

+---------+------------------+
|   Screen|avg_screen_size_cm|
+---------+------------------+
|  Full HD|37.968955414012704|
|IPS Panel| 35.84368831168828|
+---------+------------------+



## c. Q3: Profile the data by manufacturer and screen type: count the records for each combination of those two columns to see which manufacturer has the most IPS Panel products, Full HD products, and so on.


In [31]:
# Row-level predicates for the two screen types that get their own count column.
is_ips_panel = laptop_df["Screen"] == "IPS Panel"
is_full_hd = laptop_df["Screen"] == "Full HD"

# Count records per (Manufacturer, Screen) pair. count() skips the NULLs that
# when() yields for non-matching rows, and because each group holds a single
# Screen value, at most one of the two count columns is non-zero per row.
profiling_results = (
    laptop_df
    .groupBy("Manufacturer", "Screen")
    .agg(
        count(when(is_ips_panel, True)).alias("ips_panel_count"),
        count(when(is_full_hd, True)).alias("full_hd_count"),
    )
    .orderBy(desc("ips_panel_count"), desc("full_hd_count"))
)

# Display the profile, IPS Panel counts first, then Full HD counts.
profiling_results.show()

+------------+---------+---------------+-------------+
|Manufacturer|   Screen|ips_panel_count|full_hd_count|
+------------+---------+---------------+-------------+
|      Lenovo|IPS Panel|             32|            0|
|        Acer|IPS Panel|             14|            0|
|     Toshiba|IPS Panel|             10|            0|
|        Dell|IPS Panel|              9|            0|
|          HP|IPS Panel|              8|            0|
|        Asus|IPS Panel|              2|            0|
|      Huawei|IPS Panel|              1|            0|
|      Xiaomi|IPS Panel|              1|            0|
|        Dell|  Full HD|              0|           62|
|          HP|  Full HD|              0|           41|
|      Lenovo|  Full HD|              0|           20|
|        Asus|  Full HD|              0|           16|
|     Toshiba|  Full HD|              0|            7|
|     Samsung|  Full HD|              0|            5|
|        Acer|  Full HD|              0|            5|
|         