In [1]:
# Load Amazon S3 data into Spark dataframe


import os
# Find the latest version of spark 3.2 from http://www.apache.org/dist/spark/ and enter as the spark version
# For example:
# spark_version = 'spark-3.2.3'
spark_version = 'spark-3.2.3'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q http://www.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop2.7.tgz
!tar xf $SPARK_VERSION-bin-hadoop2.7.tgz
!pip install -q findspark

# Set Environment Variables
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop2.7"

# Start a SparkSession
import findspark
findspark.init()

0% [Working]            Get:1 https://cloud.r-project.org/bin/linux/ubuntu focal-cran40/ InRelease [3,622 B]
Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64  InRelease
Hit:3 http://archive.ubuntu.com/ubuntu focal InRelease
Get:4 http://archive.ubuntu.com/ubuntu focal-updates InRelease [114 kB]
Get:5 http://security.ubuntu.com/ubuntu focal-security InRelease [114 kB]
Hit:6 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu focal InRelease
Get:7 http://archive.ubuntu.com/ubuntu focal-backports InRelease [108 kB]
Hit:8 http://ppa.launchpad.net/cran/libgit2/ubuntu focal InRelease
Hit:9 http://ppa.launchpad.net/deadsnakes/ppa/ubuntu focal InRelease
Hit:10 http://ppa.launchpad.net/graphics-drivers/ppa/ubuntu focal InRelease
Hit:11 http://ppa.launchpad.net/ubuntugis/ppa/ubuntu focal InRelease
Get:12 http://archive.ubuntu.com/ubuntu focal-updates/main amd64 Packages [3,073 kB]
Get:13 http://archive.ubuntu.com/ubuntu focal-updates/universe amd64 Packages [1

In [2]:
# Download the Postgres driver that will allow Spark to interact with Postgres.
!wget https://jdbc.postgresql.org/download/postgresql-42.2.16.jar

--2023-04-10 23:18:03--  https://jdbc.postgresql.org/download/postgresql-42.2.16.jar
Resolving jdbc.postgresql.org (jdbc.postgresql.org)... 72.32.157.228, 2001:4800:3e1:1::228
Connecting to jdbc.postgresql.org (jdbc.postgresql.org)|72.32.157.228|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1002883 (979K) [application/java-archive]
Saving to: ‘postgresql-42.2.16.jar’


2023-04-10 23:18:04 (6.00 MB/s) - ‘postgresql-42.2.16.jar’ saved [1002883/1002883]



In [3]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Final-Project").config("spark.driver.extraClassPath","/content/postgresql-42.2.16.jar").getOrCreate()

In [6]:
# Read data from S3 bucket

from pyspark import SparkFiles
url = "https://highered-bucket.s3.amazonaws.com/highered_merged_data.csv"
spark.sparkContext.addFile(url)
highered_data_df = spark.read.option("encoding", "UTF-8").csv(SparkFiles.get("highered_merged_data.csv"), header=True, inferSchema=True)
highered_data_df.show()

+----+------+-------+-----+-------+-------+-------+--------------------+------+------+-----+-------+-------+--------+----------+---------+
|Year|UNITID|FACSTAT|ARANK|HRTOTLT|HRTOTLM|HRTOTLW|              INSTNM|  CITY|STABBR|  ZIP|CONTROL|HLOFFER|INSTSIZE|  LONGITUD| LATITUDE|
+----+------+-------+-----+-------+-------+-------+--------------------+------+------+-----+-------+-------+--------+----------+---------+
|2019|100654|      0|    0|    242|    131|    111|Alabama A & M Uni...|Normal|    AL|35762|      1|      9|       3|-86.568502|34.783368|
|2019|100654|     10|    0|    242|    131|    111|Alabama A & M Uni...|Normal|    AL|35762|      1|      9|       3|-86.568502|34.783368|
|2019|100654|     10|    1|     49|     37|     12|Alabama A & M Uni...|Normal|    AL|35762|      1|      9|       3|-86.568502|34.783368|
|2019|100654|     10|    2|     50|     34|     16|Alabama A & M Uni...|Normal|    AL|35762|      1|      9|       3|-86.568502|34.783368|
|2019|100654|     10|    3|

In [10]:
# Create the appointment_type table
appointment_type_df = highered_data_df.select(["Year", "UNITID", "FACSTAT", "ARANK", "HRTOTLT", "HRTOTLM", "HRTOTLW"]).drop_duplicates()
appointment_type_df.show(10)

+----+------+-------+-----+-------+-------+-------+
|Year|UNITID|FACSTAT|ARANK|HRTOTLT|HRTOTLM|HRTOTLW|
+----+------+-------+-----+-------+-------+-------+
|2019|100654|     20|    0|     99|     68|     31|
|2019|101286|     10|    6|    139|     49|     90|
|2019|101587|     40|    0|     36|     10|     26|
|2019|101602|     20|    6|     39|     15|     24|
|2019|101675|     20|    0|      7|      6|      1|
|2019|102094|     10|    4|    129|     38|     91|
|2019|102234|     40|    2|      1|      0|      1|
|2019|102298|     20|    0|      8|      6|      2|
|2019|102298|     30|    4|      7|      2|      5|
|2019|102377|     10|    2|     83|     44|     39|
+----+------+-------+-----+-------+-------+-------+
only showing top 10 rows



In [13]:
# Create the institution size table
institution_size_df = highered_data_df.select(["Year", "UNITID", "HRTOTLT", "HRTOTLM", "HRTOTLW", "INSTSIZE"]).drop_duplicates()
institution_size_df.show(10)

+----+------+-------+-------+-------+--------+
|Year|UNITID|HRTOTLT|HRTOTLM|HRTOTLW|INSTSIZE|
+----+------+-------+-------+-------+--------+
|2019|100663|    397|    205|    192|       5|
|2019|100751|    408|    231|    177|       5|
|2019|102049|     49|     14|     35|       3|
|2019|102313|     31|     18|     13|       2|
|2019|104179|    781|    536|    245|       5|
|2019|104717|     89|     41|     48|       5|
|2019|107600|     20|      8|     12|       1|
|2019|107974|      8|      2|      6|       2|
|2019|107983|     41|     24|     17|       2|
|2019|110404|      2|      1|      1|       2|
+----+------+-------+-------+-------+--------+
only showing top 10 rows



In [29]:
# Create the gender table
test_df = highered_data_df.select(["Year", "UNITID", "HRTOTLT", "HRTOTLM", "HRTOTLW"]).drop_duplicates()
test_df.show(10)

+----+------+-------+-------+-------+
|Year|UNITID|HRTOTLT|HRTOTLM|HRTOTLW|
+----+------+-------+-------+-------+
|2019|101435|      3|      2|      1|
|2019|101693|      1|      1|      0|
|2019|104708|    170|     81|     89|
|2019|106467|     74|     46|     28|
|2019|109350|    162|     93|     69|
|2019|110510|    125|     59|     66|
|2019|111948|     19|     10|      9|
|2019|113218|     10|      6|      4|
|2019|114859|    227|    115|    112|
|2019|117627|      6|      2|      4|
+----+------+-------+-------+-------+
only showing top 10 rows



In [41]:
test_df.groupby("UNITID").agg(sum("HRTOTLT").alias("sum_HRTOTLT")).show()


TypeError: ignored

In [39]:
test_df.dtypes

[('Year', 'int'),
 ('UNITID', 'int'),
 ('HRTOTLT', 'int'),
 ('HRTOTLM', 'int'),
 ('HRTOTLW', 'int')]