## Set Up:

In [1]:
# Mount Google Drive so files stored there are reachable under /content/drive.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
# Install a headless Java 8 JDK quietly (-qq, output discarded); JAVA_HOME is pointed at it in a later cell.
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

Downloading and unpacking Apache Spark and Hadoop:

In [3]:
!wget https://archive.apache.org/dist/spark/spark-3.5.3/spark-3.5.3-bin-hadoop3.tgz
!tar xf spark-3.5.3-bin-hadoop3.tgz
!rm spark-3.5.3-bin-hadoop3.tgz   # Tidying up

--2025-03-15 11:21:53--  https://archive.apache.org/dist/spark/spark-3.5.3/spark-3.5.3-bin-hadoop3.tgz
Resolving archive.apache.org (archive.apache.org)... 65.108.204.189, 2a01:4f9:1a:a084::2
Connecting to archive.apache.org (archive.apache.org)|65.108.204.189|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 400864419 (382M) [application/x-gzip]
Saving to: ‘spark-3.5.3-bin-hadoop3.tgz’


2025-03-15 11:22:17 (16.6 MB/s) - ‘spark-3.5.3-bin-hadoop3.tgz’ saved [400864419/400864419]



In [None]:
# Tell downstream tooling where the Java and Spark installations live.
import os

os.environ.update(
    {
        "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
        "SPARK_HOME": "/content/spark-3.5.3-bin-hadoop3",
    }
)

Install `findspark`, which locates the Spark installation on this system and makes `pyspark` importable:

In [None]:
# Install findspark quietly, then initialise it so that the pyspark imports in
# later cells resolve (presumably via the SPARK_HOME variable set earlier —
# confirm against the findspark documentation).
!pip install -q findspark
import findspark
findspark.init()

We can now import SparkSession from `pyspark.sql` to create our entry point to Spark.

**Note:** If we were running our session on a cluster we would pass the cluster manager's master URL as the argument to `master()` - e.g. `yarn`. However, we'll be running Spark in local mode on a single machine, so we'll just use `local[x]` where x is an int value > 0. This is the number of worker threads Spark runs locally (which in turn determines the default number of partitions used when creating an RDD, DataFrame etc.). This should ideally be the number of CPU cores we have, so we'll use `local[*]` here to indicate that we want to use all cores.

In [None]:
from pyspark.sql import SparkSession

# Stop any session left over from a previous run of this cell so that the
# builder below does not hand back a stale session.
try:
  spark.stop()
except NameError:
  # First run on a fresh kernel: `spark` has not been defined yet.
  pass
except Exception as exc:
  # Best effort only: a failed stop must not prevent creating a new session,
  # but report it instead of silently swallowing every error.
  print(f"Could not stop the previous Spark session: {exc}")

# local[*] runs Spark locally using all available cores.
spark = SparkSession.builder.master("local[*]").getOrCreate()
# Eager evaluation formats output tables more nicely when not using the show() method.
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)
spark

**Note:** You can check how many CPU cores are available to you in the cell below. (This will likely be just 2 on Google Colab but may differ if you have chosen to run this lab via Jupyter Notebooks on your own system).


In [None]:
# Report how many CPU cores this runtime exposes.
import multiprocessing

available_cores = multiprocessing.cpu_count()
print(available_cores)

2


## Basic collections / operations

In [None]:
# A SparkContext represents the connection to a Spark cluster and can be used
# to create RDDs and broadcast variables on that cluster.
sc = spark.sparkContext

# Create some basic data: the integers 1 to 100.
data = list(range(1, 101))

# Create our RDD from a list/collection:
rdd = sc.parallelize(data)

# collect() retrieves the content of the RDD as a single list.
rddCollect = rdd.collect()
print(rddCollect)

# Print some basic information about our data, starting with the number of partitions:
print("No partitions: {}".format(rdd.getNumPartitions()))

# first() and max() are RDD methods, so they are evaluated by Spark rather
# than by Python's built-ins on a collected list.
print("First element: {}".format(rdd.first()))
print("Max element: {}".format(rdd.max()))

# Applying a filter: filter() keeps the elements for which the lambda is true,
# and collect() then returns the surviving elements as a list.
filtered_rdd = rdd.filter(lambda x: x < 20).collect()
print("Values less than 20: {}".format(filtered_rdd))

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100]
No partitions: 2
First element: 1
Max element: 100
Values less than 20: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]


In [None]:
# Install the MySQL connector package and import it; later cells use
# mysql.connector.connect() to talk to the local MySQL server.
!pip install mysql-connector-python
import mysql.connector

Collecting mysql-connector-python
  Downloading mysql_connector_python-9.2.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (6.0 kB)
Downloading mysql_connector_python-9.2.0-cp311-cp311-manylinux_2_28_x86_64.whl (34.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.0/34.0 MB[0m [31m45.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: mysql-connector-python
Successfully installed mysql-connector-python-9.2.0


Improving import of big_five database

In [None]:
# Show the current working directory of the notebook's shell commands.
!pwd

/content


In [4]:
# %cd /content/drive/Shareddrives/big-data-drive/big-data-project/

In [5]:
! apt-get update
! apt-get install mysql-server

0% [Working]            Get:1 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
0% [Waiting for headers] [1 InRelease 0 B/129 kB 0%] [Connected to cloud.r-project.org (108.157.173.                                                                                                    Hit:2 http://archive.ubuntu.com/ubuntu jammy InRelease
0% [1 InRelease 54.7 kB/129 kB 42%] [Connected to cloud.r-project.org (108.157.173.89)] [Connecting                                                                                                     Get:3 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
0% [3 InRelease 22.9 kB/128 kB 18%] [1 InRelease 54.7 kB/129 kB 42%] [Waiting for headers] [Connecte                                                                                                    Get:4 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Get:5 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_

In [6]:
# Confirm the MySQL client is installed by printing its version, then start the server.
!mysql --version
!service mysql start

mysql  Ver 8.0.41-0ubuntu0.22.04.1 for Linux on x86_64 ((Ubuntu))
 * Starting MySQL database server mysqld
   ...done.


In [None]:
# Stop MySQL, set the `mysql` system user's home directory to /var/lib/mysql/,
# then start the server again.
!sudo service mysql stop
!sudo usermod -d /var/lib/mysql/ mysql
!sudo service mysql start
# Force root@localhost to authenticate with mysql_native_password and set its
# password to 'root'; FLUSH PRIVILEGES makes the change take effect immediately.
# NOTE(review): hardcoded password — acceptable only for a throwaway environment.
!mysql -e "ALTER USER 'root'@'localhost' IDENTIFIED WITH mysql_native_password BY 'root'; FLUSH PRIVILEGES;"

 * Stopping MySQL database server mysqld
   ...done.
 * Starting MySQL database server mysqld
   ...done.


In [None]:
# Name of the MySQL database that the import creates (if missing) and selects.
db_name = "big_five"

def connect_to_mysql(user="root", host="localhost", password="root"):
  """Open and return a connection to the MySQL server.

  No database is selected by this function; the caller chooses one afterwards
  (for example with a USE statement on a cursor).

  Args:
    user: MySQL account name. Defaults to "root".
    host: Server host name. Defaults to "localhost".
    password: Password for `user`. Defaults to "root", matching the password
      set when the server was configured earlier in this notebook.

  Returns:
    The connection object returned by mysql.connector.connect().
  """
  return mysql.connector.connect(user=user, host=host, password=password)

def create_cursor(conn):
  """Return a new cursor for issuing statements over the connection `conn`."""
  return conn.cursor()

# Create the database when it is missing, then select it
def connect_to_db(cursor, db_name):
  """Ensure `db_name` exists and make it the cursor's current database.

  Args:
    cursor: Object with an execute(sql) method, e.g. a MySQL cursor.
    db_name: Database name, interpolated into the SQL text unchanged.
  """
  for sql in (f"CREATE DATABASE IF NOT EXISTS {db_name}", f"USE {db_name}"):
    cursor.execute(sql)

In [None]:
# Load the SQL dump from the shared drive as an RDD with one element per line of text
sql_dump_path = "/content/drive/Shareddrives/big-data-drive/big-data-project/big_five_300k.sql"
sql_lines_rdd = sc.textFile(sql_dump_path)

# Combine raw SQL dump lines into complete, executable statements
def combine_statements(lines):
    """Yield complete SQL statements assembled from an iterable of text lines.

    Each line is stripped; blank lines and lines that start with "--" or "/*"
    are skipped. The remaining lines are accumulated and joined with single
    spaces, and a statement is emitted whenever a line ends with ";".

    A final statement that is not terminated by ";" is emitted as well rather
    than being silently discarded. For the result to be meaningful the lines
    must arrive in file order and must not stop in the middle of a statement
    unless the input itself does.

    Args:
        lines: Iterable of strings, one per line of the SQL file.

    Yields:
        str: One SQL statement per item.
    """
    statement = []
    for line in lines:
        line = line.strip()
        # Skip empty lines and lines that open with a comment marker.
        if not line or line.startswith(("--", "/*")):
            continue
        statement.append(line)
        if line.endswith(";"):  # Full statement detected
            yield " ".join(statement)
            statement = []  # Reset for the next statement
    # Flush whatever is left so an unterminated last statement is not lost.
    if statement:
        yield " ".join(statement)

# Convert the RDD of SQL lines into an RDD of full SQL statements.
# coalesce(1) first merges all lines into a single partition. combine_statements
# runs once per partition, so with several partitions any statement whose lines
# straddle a partition boundary would be cut in two, and the partitions would be
# processed independently of file order (e.g. an INSERT could run before the
# CREATE TABLE it depends on). A SQL dump has to be replayed as one ordered stream.
sql_statements_rdd = sql_lines_rdd.coalesce(1).mapPartitions(combine_statements)

# Execute every SQL statement of one partition over a single MySQL connection
def execute_sql_partition(statements):
    """Run each statement in `statements` against the `db_name` database.

    One connection and cursor are opened for the whole iterable. Every
    statement is executed and committed on its own; when a statement fails the
    error is printed, that statement is rolled back and processing continues
    with the next one. Errors outside the per-statement handling (connecting,
    selecting the database, a failing rollback) are printed as a critical
    error. The cursor and connection are always closed exactly once.

    Args:
        statements: Iterable of SQL statement strings.
    """
    conn = None
    cursor = None
    try:
        conn = connect_to_mysql()
        cursor = conn.cursor()
        connect_to_db(cursor, db_name)

        for statement in statements:
            try:
                print(f"Executing statement: {statement}")
                cursor.execute(statement)  # Execute each statement separately
                conn.commit()  # Commit per statement
            except Exception as e:
                print(f"Error executing statement: {statement}\nError: {e}")
                conn.rollback()  # Roll back only the failed statement
    except Exception as e:
        print(f"Critical error: {e}")
    finally:
        # Release the cursor before the connection; skip whatever was never
        # opened and keep cleanup best-effort so it cannot mask earlier output.
        for resource in (cursor, conn):
            if resource is not None:
                try:
                    resource.close()
                except Exception as e:
                    print(f"Error closing MySQL resource: {e}")

# Trigger the import: Spark calls execute_sql_partition once for each partition
# of sql_statements_rdd, passing it an iterator over that partition's statements.
sql_statements_rdd.foreachPartition(execute_sql_partition)

In [None]:
# Sanity-check the import: list the databases and tables, then count the rows
# of each expected table. The queries run inside try/finally so the cursor and
# connection are released even when a query fails (e.g. a table is missing).
conn = connect_to_mysql()
cursor = conn.cursor()
try:
    cursor.execute("SHOW DATABASES")
    databases = cursor.fetchall()
    print(databases)

    cursor.execute("USE big_five")
    cursor.execute("SHOW TABLES")
    tables = cursor.fetchall()
    print(tables)

    cursor.execute("SELECT COUNT(*) FROM response")
    responses = cursor.fetchall()
    print(responses)

    cursor.execute("SELECT COUNT(*) FROM user")
    users = cursor.fetchall()
    print(users)

    cursor.execute("SELECT COUNT(*) FROM question")
    rows = cursor.fetchall()
    print(rows)
finally:
    cursor.close()
    conn.close()

[('big_five',), ('information_schema',), ('mysql',), ('performance_schema',), ('sys',)]
[('question',), ('user',)]


ProgrammingError: 1146 (42S02): Table 'big_five.response' doesn't exist