### Analyse search terms on the e-commerce web server


##### We will download the search term data set for the e-commerce web server and run analytic queries on it.


In [1]:
# Install spark
!pip install pyspark
!pip install findspark



In [2]:
import findspark
# findspark.init() is called before the pyspark imports below on purpose:
# the imports rely on it having run first.
findspark.init()

from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

In [3]:
# Start session
# Reuse the active SparkContext if one already exists; a bare SparkContext()
# raises "ValueError: Cannot run multiple SparkContexts at once" when this
# cell is re-run in a live kernel.
sc = SparkContext.getOrCreate()

# Create (or fetch) the SparkSession used by every DataFrame operation below.
# NOTE(review): the application name looks copied from a different lab; it is
# kept unchanged here so the Spark UI label stays the same.
spark = SparkSession \
    .builder \
    .appName("Saving and Loading a SparkML Model").getOrCreate()

23/12/11 14:12:24 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [4]:
# Download the search term dataset from the URL below
# https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DB0321EN-SkillsNetwork/Bigdata%20and%20Spark/searchterms.csv

In [5]:
!wget https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DB0321EN-SkillsNetwork/Bigdata%20and%20Spark/searchterms.csv

--2023-12-11 14:12:30--  https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DB0321EN-SkillsNetwork/Bigdata%20and%20Spark/searchterms.csv
Resolving cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud (cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud)... 169.63.118.104, 169.63.118.104
Connecting to cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud (cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud)|169.63.118.104|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 233457 (228K) [text/csv]
Saving to: ‘searchterms.csv.2’


2023-12-11 14:12:30 (38.8 MB/s) - ‘searchterms.csv.2’ saved [233457/233457]



In [6]:
# Load the csv into a spark dataframe

In [7]:
# Read searchterms.csv into a Spark DataFrame: the first line provides the
# column names and the column types are inferred from the data.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("searchterms.csv")
)

# Preview the DataFrame (show() prints the first 20 rows by default)
df.show()

+---+-----+----+--------------+
|day|month|year|    searchterm|
+---+-----+----+--------------+
| 12|   11|2021| mobile 6 inch|
| 12|   11|2021| mobile latest|
| 12|   11|2021|   tablet wifi|
| 12|   11|2021|laptop 14 inch|
| 12|   11|2021|     mobile 5g|
| 12|   11|2021| mobile 6 inch|
| 12|   11|2021|        laptop|
| 12|   11|2021|        laptop|
| 12|   11|2021|     mobile 5g|
| 12|   11|2021|   tablet wifi|
| 12|   11|2021|     mobile 5g|
| 12|   11|2021| gaming laptop|
| 12|   11|2021|     mobile 5g|
| 12|   11|2021|     mobile 5g|
| 12|   11|2021| mobile 6 inch|
| 12|   11|2021| mobile latest|
| 12|   11|2021| mobile 6 inch|
| 12|   11|2021|   tablet wifi|
| 12|   11|2021|     mobile 5g|
| 12|   11|2021|        laptop|
+---+-----+----+--------------+
only showing top 20 rows



                                                                                

In [8]:
# Print the number of rows and columns

In [9]:
# Row count comes from df.count(); column count from the length of df.columns
num_rows, num_columns = df.count(), len(df.columns)

# Report both dimensions
print(f"Number of rows: {num_rows}")
print(f"Number of columns: {num_columns}")

Number of rows: 10000
Number of columns: 4


In [10]:
# Print the top 5 rows

In [11]:
# Display only the first 5 rows of the DataFrame
df.show(5)

+---+-----+----+--------------+
|day|month|year|    searchterm|
+---+-----+----+--------------+
| 12|   11|2021| mobile 6 inch|
| 12|   11|2021| mobile latest|
| 12|   11|2021|   tablet wifi|
| 12|   11|2021|laptop 14 inch|
| 12|   11|2021|     mobile 5g|
+---+-----+----+--------------+
only showing top 5 rows



In [12]:
# What is the datatype of the column `searchterm`?

In [13]:
# printSchema() lists every column with its type; read the `searchterm` entry
df.printSchema()

root
 |-- day: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- searchterm: string (nullable = true)



In [14]:
# How many times was the term `gaming laptop` searched?

In [15]:
# Keep only the rows whose search term is exactly "gaming laptop"
gaming_laptop_df = df.where(df.searchterm == 'gaming laptop')

# Number of matching rows = number of times the term was searched
num_searches = gaming_laptop_df.count()

# Report the result
print(f"The term 'gaming laptop' was searched {num_searches} times.")

The term 'gaming laptop' was searched 499 times.


In [16]:
# Print the top 5 most frequently used search terms

In [17]:
from pyspark.sql.functions import desc

# One row per distinct value of the `searchterm` column, with its number of occurrences
term_counts = df.groupBy('searchterm').count()

# Most frequent terms first
term_counts_ordered = term_counts.sort(desc('count'))

# Keep the five most frequent terms and display them
top_terms = term_counts_ordered.limit(5)
top_terms.show()



+-------------+-----+
|   searchterm|count|
+-------------+-----+
|mobile 6 inch| 2312|
|    mobile 5g| 2301|
|mobile latest| 1327|
|       laptop|  935|
|  tablet wifi|  896|
+-------------+-----+



                                                                                

In [18]:
# The pretrained sales forecasting model is available at the URL below
# https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DB0321EN-SkillsNetwork/Bigdata%20and%20Spark/model.tar.gz

In [19]:
import os
import tarfile
import urllib.request

# Where the pretrained sales forecasting model archive is published
model_url = 'https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DB0321EN-SkillsNetwork/Bigdata%20and%20Spark/model.tar.gz'

# Path to your .tar.gz file
file_path = 'model.tar.gz'

# Specify the folder where you want to extract the contents
extract_folder = 'model'

# Fetch the archive if it is not already present, so the notebook also works
# from a fresh checkout instead of relying on a file downloaded out of band.
if not os.path.exists(file_path):
    urllib.request.urlretrieve(model_url, file_path)

# Open the .tar.gz file
with tarfile.open(file_path, 'r:gz') as tar:
    # The archive comes from the network: refuse any member whose path would
    # resolve outside the extraction folder (e.g. "../x" or an absolute path).
    dest_root = os.path.realpath(extract_folder)
    for member in tar.getmembers():
        target = os.path.realpath(os.path.join(dest_root, member.name))
        if os.path.commonpath([dest_root, target]) != dest_root:
            raise ValueError(f"Unsafe path in archive: {member.name}")

    # Extract all contents to the specified folder
    tar.extractall(path=extract_folder)

print(f"Contents extracted to {extract_folder}")

Contents extracted to model


In [20]:
# Load the sales forecast model.

In [21]:
from pyspark.ml.regression import LinearRegressionModel
# Load the saved linear regression model from the 'model' folder
model = LinearRegressionModel.load('model/sales_prediction.model')
# Last expression of the cell: displays the model's list of parameters
model.params

                                                                                

[Param(parent='LinearRegression_6d5736f3dbe7', name='aggregationDepth', doc='suggested depth for treeAggregate (>= 2)'),
 Param(parent='LinearRegression_6d5736f3dbe7', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty'),
 Param(parent='LinearRegression_6d5736f3dbe7', name='epsilon', doc='The shape parameter to control the amount of robustness. Must be > 1.0.'),
 Param(parent='LinearRegression_6d5736f3dbe7', name='featuresCol', doc='features column name'),
 Param(parent='LinearRegression_6d5736f3dbe7', name='fitIntercept', doc='whether to fit an intercept term'),
 Param(parent='LinearRegression_6d5736f3dbe7', name='labelCol', doc='label column name'),
 Param(parent='LinearRegression_6d5736f3dbe7', name='loss', doc='The loss function to be optimized. Supported options: squaredError, huber. (Default squaredError)'),
 Param(parent='LinearRegression_6d5736f3dbe7', name='maxIter', doc

In [22]:
# Using the sales forecast model, predict the sales for the year of 2023.

In [23]:
from pyspark.ml.feature import VectorAssembler
def predict(year):
    """Display the sales forecast for ``year``.

    Builds a one-row DataFrame holding ``year``, assembles it into the
    ``features`` vector column, runs the notebook-level ``model`` on it and
    prints the resulting ``prediction`` column.

    Args:
        year: The year to forecast sales for.

    Returns:
        None. The prediction is printed via ``show()``.
    """
    # Single input row; 'sales' is only a placeholder value of 0
    frame = spark.createDataFrame([[year, 0]], ["year", "sales"])

    # Pack the 'year' column into the 'features' vector column
    to_features = VectorAssembler(inputCols=["year"], outputCol="features")
    model_input = to_features.transform(frame).select('features', 'sales')

    # Score the row and print only the predicted value
    model.transform(model_input).select('prediction').show()

In [24]:
# Show the model's sales prediction for the year 2023
predict(2023)

                                                                                

+------------------+
|        prediction|
+------------------+
|175.16564294006457|
+------------------+



23/12/11 14:13:05 WARN netlib.BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
23/12/11 14:13:05 WARN netlib.BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
