In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error

# Load the data from the files
college_type_df = pd.read_csv('sample_data/salaries-by-college-type.csv')
region_df = pd.read_csv('sample_data/salaries-by-region.csv')
degrees_df = pd.read_csv('sample_data/degrees-that-pay-back.csv')

# Convert salary columns to numeric values
def convert_salary_to_numeric(df, columns):
    for column in columns:
        df[column] = pd.to_numeric(df[column].replace('[\$,]', '', regex=True))

salary_columns = [
    'Starting Median Salary', 'Mid-Career Median Salary',
    'Mid-Career 10th Percentile Salary', 'Mid-Career 25th Percentile Salary',
    'Mid-Career 75th Percentile Salary', 'Mid-Career 90th Percentile Salary'
]
convert_salary_to_numeric(college_type_df, salary_columns)
convert_salary_to_numeric(region_df, salary_columns)
convert_salary_to_numeric(degrees_df, salary_columns[:-2] + ['Percent change from Starting to Mid-Career Salary'])

# Merge and clean the data
combined_df = pd.merge(college_type_df, region_df, on='School Name', suffixes=('_type', '_region'))
columns_to_drop = ['Starting Median Salary_region', 'Mid-Career Median Salary_region',
                   'Mid-Career 10th Percentile Salary_region', 'Mid-Career 25th Percentile Salary_region',
                   'Mid-Career 75th Percentile Salary_region', 'Mid-Career 90th Percentile Salary_region']
combined_cleaned_df = combined_df.drop(columns=columns_to_drop)

# Prepare data for modeling
features = ['School Name', 'School Type', 'Region', 'Starting Median Salary_type']
target = 'Mid-Career Median Salary_type'
model_data = combined_cleaned_df[features + [target]]

# Encode categorical variables but do not overwrite original columns
encoder_school_name = LabelEncoder()
encoder_school_type = LabelEncoder()
encoder_region = LabelEncoder()

# Create separate encoded features for modeling
encoded_school_name = encoder_school_name.fit_transform(model_data['School Name'])
encoded_school_type = encoder_school_type.fit_transform(model_data['School Type'])
encoded_region = encoder_region.fit_transform(model_data['Region'])

# Create the model dataset with encoded features
X = pd.DataFrame({
    'School Name': encoded_school_name,
    'School Type': encoded_school_type,
    'Region': encoded_region,
})
y = model_data[target]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the Random Forest Regressor
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predict on the test set and calculate the mean absolute error
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error: ${mae:.2f}")

# Function to predict mid-career median salary based on user input
def predict_salary(school_name, school_type, region):
    encoded_school_name = encoder_school_name.transform([school_name])[0]
    encoded_school_type = encoder_school_type.transform([school_type])[0]
    encoded_region = encoder_region.transform([region])[0]

    # Make prediction
    features = [encoded_school_name, encoded_school_type, encoded_region]
    predicted_salary = model.predict([features])[0]
    return f"Predicted Mid-Career Median Salary: ${predicted_salary:.2f}"

# Example of how to use the function within the same script
print(predict_salary("Harvey Mudd College", "Engineering", "Southern"))


Mean Absolute Error: $7724.70
Predicted Mid-Career Median Salary: $108656.00




In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error

# Load the data
college_type_df = pd.read_csv('sample_data/salaries-by-college-type.csv')
region_df = pd.read_csv('sample_data/salaries-by-region.csv')
degrees_df = pd.read_csv('sample_data/degrees-that-pay-back.csv')

# Convert salary columns to numeric values
def convert_salary_to_numeric(df, columns):
    for column in columns:
        df[column] = pd.to_numeric(df[column].replace('[\$,]', '', regex=True))

salary_columns = [
    'Mid-Career Median Salary',
    'Mid-Career 10th Percentile Salary', 'Mid-Career 25th Percentile Salary',
    'Mid-Career 75th Percentile Salary', 'Mid-Career 90th Percentile Salary'
]
convert_salary_to_numeric(college_type_df, salary_columns)
convert_salary_to_numeric(region_df, salary_columns)
convert_salary_to_numeric(degrees_df, ['Mid-Career Median Salary'])

# Artificial assumption: All majors are available at all schools
expanded_school_data = pd.concat([college_type_df.loc[college_type_df.index.repeat(degrees_df.shape[0])].reset_index(drop=True),
                                  degrees_df.loc[degrees_df.index.repeat(college_type_df.shape[0])].reset_index(drop=True)], axis=1)

# Merge with region data
combined_df = pd.merge(expanded_school_data, region_df, on='School Name', suffixes=('', '_region'))

# Drop redundant columns
columns_to_drop = ['Starting Median Salary', 'Starting Median Salary_region']
combined_cleaned_df = combined_df.drop(columns=columns_to_drop)

# Prepare data for modeling
features = ['School Name', 'School Type', 'Region', 'Undergraduate Major']
target = 'Mid-Career Median Salary'
model_data = combined_cleaned_df[features + [target]]

# Encode categorical variables
encoder_school_name = LabelEncoder()
encoder_school_type = LabelEncoder()
encoder_region = LabelEncoder()
encoder_major = LabelEncoder()

model_data['School Name'] = encoder_school_name.fit_transform(model_data['School Name'])
model_data['School Type'] = encoder_school_type.fit_transform(model_data['School Type'])
model_data['Region'] = encoder_region.fit_transform(model_data['Region'])
model_data['Undergraduate Major'] = encoder_major.fit_transform(model_data['Undergraduate Major'])

# Split the data into training and testing sets
X = model_data[features]
y = model_data[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the RandomForestRegressor
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Calculate the mean absolute error
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error: ${mae:.2f}")

# Function to predict mid-career median salary based on user input
def predict_salary(school_name, school_type, region, major):
    encoded_school_name = encoder_school_name.transform([school_name])[0]
    encoded_school_type = encoder_school_type.transform([school_type])[0]
    encoded_region = encoder_region.transform([region])[0]
    encoded_major = encoder_major.transform([major])[0]

    # Make prediction
    features = [encoded_school_name, encoded_school_type, encoded_region, encoded_major]
    predicted_salary = model.predict([features])[0]  # Access the first element if predicting multiple
    # Ensure the output is a single value and format the result
    if isinstance(predicted_salary, np.ndarray):
        predicted_salary = predicted_salary[0]  # In case of multiple outputs, take the first one
    return f"Predicted Mid-Career Median Salary: ${predicted_salary:.2f}"

# Example of using the function
print(predict_salary("Harvey Mudd College", "Engineering", "California", "Computer Science"))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  model_data['School Name'] = encoder_school_name.fit_transform(model_data['School Name'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  model_data['School Type'] = encoder_school_type.fit_transform(model_data['School Type'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  model_data['Region'] = enco

Mean Absolute Error: $1.26
Predicted Mid-Career Median Salary: $107846.00


