# Data analysis of the Kaggle dataset ["Where it Pays to Attend College"](https://www.kaggle.com/datasets/wsj/college-salaries)

In [None]:
# Install all dependencies
!pip install pandas
!pip install scikit-learn
!pip install matplotlib
!pip install pyspark



In [None]:
# Initialize Spark
import pandas as pd
from pyspark import SparkContext
from pyspark.sql import SparkSession
import os
import sys

# Point both the PySpark worker and driver interpreter settings at the
# Python executable that is running this notebook.
for _pyspark_env_var in ('PYSPARK_PYTHON', 'PYSPARK_DRIVER_PYTHON'):
    os.environ[_pyspark_env_var] = sys.executable

Create Spark session

In [None]:
# Get the Spark context, reusing the active one when it already exists.
# A bare SparkContext() raises ("Cannot run multiple SparkContexts at once")
# when this cell is executed a second time in the same kernel.
sc = SparkContext.getOrCreate()

# Create the Spark session (or fetch the existing one) used by every cell below.
# NOTE(review): "spark.some.config.option" looks like a placeholder setting — confirm it is needed.
spark = (
    SparkSession.builder
    .appName("Python Spark DataFrames")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

Load datasets

In [None]:
# All three CSV files are read the same way: first row is the header and
# column types are inferred by Spark.
_csv_read_options = {'header': True, 'inferSchema': True}

degrees_that_pay_back_df = spark.read.csv('data/degrees-that-pay-back.csv', **_csv_read_options)
salaries_by_college_type_df = spark.read.csv('data/salaries-by-college-type.csv', **_csv_read_options)
salaries_by_region_df = spark.read.csv('data/salaries-by-region.csv', **_csv_read_options)

Preprocess Data

In [None]:
from pyspark.sql.functions import regexp_replace
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
from pyspark.ml import Pipeline

# Salary columns used as regression targets: strip every '$' and ',' from
# their values and cast the result to float.
salary_columns = ('Starting Median Salary', 'Mid-Career Median Salary')
for salary_column in salary_columns:
    numeric_salary = regexp_replace(salary_column, '[$,]', '').cast('float')
    salaries_by_college_type_df = salaries_by_college_type_df.withColumn(salary_column, numeric_salary)

# Feature pipeline for the categorical 'School Type' column:
#   1. StringIndexer  - map each school type to a numeric index (input required by OneHotEncoder)
#   2. OneHotEncoder  - turn the index into a one-hot vector
#   3. VectorAssembler - pack the encoded vector into the single 'features' column MLlib models read
school_type_indexer = StringIndexer(inputCol='School Type', outputCol='School Type Index')
encoder = OneHotEncoder(inputCols=['School Type Index'], outputCols=['School Type Vec'])
assembler = VectorAssembler(inputCols=['School Type Vec'], outputCol='features')
pipeline = Pipeline(stages=[school_type_indexer, encoder, assembler])

# Fit the pipeline on the cleaned data, then run the same data through it
fitted_pipeline = pipeline.fit(salaries_by_college_type_df)
processed_data = fitted_pipeline.transform(salaries_by_college_type_df)

# Keep only the feature vector and the two salary target columns
final_data = processed_data.select('features', *salary_columns)


Split data into training and test sets

In [None]:
# Randomly split the rows with weights 0.8 (training) and 0.2 (test), using a fixed seed
split_weights = [0.8, 0.2]
train_data, test_data = final_data.randomSplit(split_weights, seed=42)

Train Linear Regression Model

In [None]:
from pyspark.ml.regression import LinearRegression

# Configure a linear regression that predicts 'Starting Median Salary' from
# the 'features' vector and fit it on the training split in one step.
linear_model = LinearRegression(
    featuresCol='features',
    labelCol='Starting Median Salary',
).fit(train_data)

# Score the held-out test split; the fitted model's output goes to the 'prediction' column
# that the evaluator reads.
predictions = linear_model.transform(test_data)

Evaluate Model

In [None]:
from pyspark.ml.evaluation import RegressionEvaluator

# Evaluator comparing the 'prediction' column against the true
# 'Starting Median Salary' using root mean squared error.
evaluator = RegressionEvaluator(
    metricName='rmse',
    labelCol='Starting Median Salary',
    predictionCol='prediction',
)

# Compute the RMSE on the test-set predictions and report it
rmse = evaluator.evaluate(predictions)
print(f"Root Mean Squared Error (RMSE): {rmse}")

Root Mean Squared Error (RMSE): 5378.584450673772


Predicting Future Salaries by School Type

In [None]:
# Example new data. Only 'School Type' is consumed by the feature pipeline;
# the remaining columns are carried through unchanged and shown in the output.
new_data = spark.createDataFrame(
    [
        ('MIT', 'Engineering', 'Northeast', 'Computer Science'),
        ('Harvard', 'Ivy League', 'Northeast', 'English'),
    ],
    ['School Name', 'School Type', 'Region', 'Undergraduate Major'],
)

# Apply the same transformations as the training data: the pipeline is fitted
# on the full college-type dataset so the school-type encoding matches the one
# the model was trained on.
new_data_transformed = pipeline.fit(salaries_by_college_type_df).transform(new_data)

# Predict salaries. The result is stored under its own name so the test-set
# `predictions` DataFrame used by the evaluation cell is not overwritten
# (re-running the evaluation after this cell would otherwise fail, because
# the new data has no 'Starting Median Salary' label column).
new_predictions = linear_model.transform(new_data_transformed)

# Show predictions
new_predictions.show()

+-----------+-----------+---------+-------------------+-----------------+---------------+-------------+------------------+
|School Name|School Type|   Region|Undergraduate Major|School Type Index|School Type Vec|     features|        prediction|
+-----------+-----------+---------+-------------------+-----------------+---------------+-------------+------------------+
|        MIT|Engineering|Northeast|   Computer Science|              3.0|  (4,[3],[1.0])|(4,[3],[1.0])|56992.307692307695|
|    Harvard| Ivy League|Northeast|            English|              4.0|      (4,[],[])|    (4,[],[])| 60216.66666666691|
+-----------+-----------+---------+-------------------+-----------------+---------------+-------------+------------------+

