<p style="text-align:center">
        <img src="https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/assets/logos/SN_web_lightmode.png" width="300" alt="Skills Network Logo">
</p>


### Analyse search terms on the e-commerce web server


##### In this assignment you will download the search term data set for the e-commerce web server and run analytic queries on it.


In [1]:
# Install Spark's Python bindings and findspark.
# %pip (rather than !pip) installs into the environment of the running kernel,
# and a single invocation lets pip resolve both packages together.
%pip install pyspark findspark



In [2]:
import findspark
findspark.init()
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import desc

In [3]:
# Start session
# Create the Spark context, or reuse the one that is already running.
# A bare SparkContext() raises ValueError when a context already exists
# (e.g. when this cell is executed a second time); getOrCreate() does not.
sc = SparkContext.getOrCreate()

# Create the Spark session, or reuse the existing one
spark = (
    SparkSession.builder
    .appName("Search Term Data Set E-commerce Analytics")
    .getOrCreate()
)

23/10/26 14:17:20 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/10/26 14:17:23 WARN util.Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [4]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression

In [5]:
# Download the search term dataset from the URL below
# https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DB0321EN-SkillsNetwork/Bigdata%20and%20Spark/searchterms.csv
# -O pins the output file name: without it, re-running this cell makes wget
# save a numbered copy (searchterms.csv.1) while the load step keeps reading
# the older searchterms.csv.
!wget -O searchterms.csv https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DB0321EN-SkillsNetwork/Bigdata%20and%20Spark/searchterms.csv

--2023-10-26 14:17:32--  https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DB0321EN-SkillsNetwork/Bigdata%20and%20Spark/searchterms.csv
Resolving cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud (cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud)... 169.63.118.104
Connecting to cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud (cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud)|169.63.118.104|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 233457 (228K) [text/csv]
Saving to: ‘searchterms.csv.1’


2023-10-26 14:17:33 (27.8 MB/s) - ‘searchterms.csv.1’ saved [233457/233457]



In [6]:
# Load the CSV into a Spark DataFrame

In [6]:
# Read searchterms.csv with its header row as column names and let Spark
# infer each column's datatype from the data.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("searchterms.csv")
)

In [7]:
# Report the shape of the DataFrame: number of rows, then number of columns
# (Take a screenshot of the code and name it shape.jpg)
row_count = df.count()
column_count = len(df.columns)
print(f"Number of rows: {row_count}\nNumber of columns: {column_count}")

Number of rows: 10000
Number of columns: 4


In [11]:
# Preview the first 5 rows of the DataFrame
# (Take a screenshot of the code and name it top5rows.jpg)
df.show(n=5)

+---+-----+----+--------------+
|day|month|year|    searchterm|
+---+-----+----+--------------+
| 12|   11|2021| mobile 6 inch|
| 12|   11|2021| mobile latest|
| 12|   11|2021|   tablet wifi|
| 12|   11|2021|laptop 14 inch|
| 12|   11|2021|     mobile 5g|
+---+-----+----+--------------+
only showing top 5 rows



In [12]:
# List every column with its datatype; the entry for `searchterm` answers
# the question about that column's type.
# (Take a screenshot of the code and name it datatype.jpg)
column_types = df.dtypes
print(column_types)

[('day', 'int'), ('month', 'int'), ('year', 'int'), ('searchterm', 'string')]


In [13]:
# Count the rows whose search term is exactly `gaming laptop`
# (Take a screenshot of the code and name it gaminglaptop.jpg)
gaming_laptop_rows = df.where(df["searchterm"] == "gaming laptop")
print(gaming_laptop_rows.count())

499


In [15]:
# Rank the search terms by how often they occur and display the 5 most frequent
# (Take a screenshot of the code and name it top5terms.jpg)
search_term_counts = df.groupBy("searchterm").count()
top_search_terms = (
    search_term_counts
    .sort(desc("count"))
    .limit(5)
)
top_search_terms.show()



+-------------+-----+
|   searchterm|count|
+-------------+-----+
|mobile 6 inch| 2312|
|    mobile 5g| 2301|
|mobile latest| 1327|
|       laptop|  935|
|  tablet wifi|  896|
+-------------+-----+



                                                                                

In [16]:
# The pretrained sales forecasting model is available at the URL below
# https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DB0321EN-SkillsNetwork/Bigdata%20and%20Spark/model.tar.gz
# -O pins the output file name so that re-running this cell overwrites
# model.tar.gz instead of saving a numbered copy (model.tar.gz.1) that the
# extraction step would not open.
!wget -O model.tar.gz https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DB0321EN-SkillsNetwork/Bigdata%20and%20Spark/model.tar.gz

--2023-10-26 14:20:37--  https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DB0321EN-SkillsNetwork/Bigdata%20and%20Spark/model.tar.gz
Resolving cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud (cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud)... 169.63.118.104
Connecting to cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud (cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud)|169.63.118.104|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1490 (1.5K) [application/x-tar]
Saving to: ‘model.tar.gz’


2023-10-26 14:20:37 (11.7 MB/s) - ‘model.tar.gz’ saved [1490/1490]



In [18]:
# Load the sales forecast model: unpack the downloaded archive first.
import tarfile
import os

# Path to the tar.gz file
tar_file_path = "model.tar.gz"

# Directory where the archive contents are extracted
extracted_dir = "extracted_model"

# Create the directory if it doesn't exist
os.makedirs(extracted_dir, exist_ok=True)

# Extract the contents of the tar.gz file.
# The archive was downloaded from the network, so apply the "data" extraction
# filter when this Python provides it: it refuses members that would be
# written outside extracted_dir (absolute paths, "..", escaping links).
# Python versions without extraction filters keep the previous unfiltered
# behaviour.
with tarfile.open(tar_file_path, "r:gz") as tar:
    if hasattr(tarfile, "data_filter"):
        tar.extractall(extracted_dir, filter="data")
    else:
        tar.extractall(extracted_dir)

# List the contents of the extracted directory
extracted_contents = os.listdir(extracted_dir)
print("Extracted contents:", extracted_contents)

Extracted contents: ['sales_prediction.model']


In [25]:
from pyspark.ml.regression import LinearRegressionModel

# The archive was unpacked into extracted_dir, so the saved model lives inside
# that directory, not directly in the working directory.
sales_model = LinearRegressionModel.load(
    os.path.join(extracted_dir, "sales_prediction.model")
)

# Using the sales forecast model, predict the sales for the year of 2023.
def predict_sales(year):
    """Display the sales forecast that ``sales_model`` produces for ``year``.

    A one-row DataFrame is built for the requested year, its ``year`` column
    is assembled into a ``features`` vector, and the model's ``prediction``
    column for that row is printed with ``show()``. Nothing is returned.
    """
    # Single input row; the sales value is only a placeholder (0).
    input_rows = [[year, 0]]
    input_df = spark.createDataFrame(input_rows, ["year", "sales"])

    # Pack the year into the vector column passed on to the model.
    feature_assembler = VectorAssembler(inputCols=["year"], outputCol="features")
    model_input = feature_assembler.transform(input_df).select("features", "sales")

    sales_model.transform(model_input).select("prediction").show()

In [26]:
predict_sales(2023)

+------------------+
|        prediction|
+------------------+
|175.16564294006457|
+------------------+

