# **Streamlit Application**
### Frances LeMond-Glasser

In [11]:
# Import libraries
import pickle
import pandas as pd
import numpy as np
import gradio as gr

# Modeling
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# Display setting only: show every column when a DataFrame is rendered
# (None removes pandas' column limit), since the frames below are wide.
pd.set_option('display.max_columns', None)

Here I cut down the features fed into the decision tree so that the user interface stays simple and user-friendly. Using fewer features also reduces the computational cost of rendering the web app.

**Read-in-Data**

In [12]:
# Load the cleaned survey data; `df` stays untouched and all filtering is done on a copy.
df = pd.read_csv("Cleaned_Salary_Data.csv")
data = df.copy()

# Columns kept by exact name: education (Q8), programming experience (Q11),
# ML research flag (Q10_combined), ML experience (Q16) and the salary target (Q29_midpoint).
cols_to_keep = ['Q8', 'Q11', 'Q10_combined', 'Q16', 'Q29_midpoint']

# One-hot column groups, selected by question prefix:
# Q12 = programming language, Q4 = region, Q23 = job role, Q24 = industry, Q25 = company size.
# The trailing underscore is deliberate: a bare 'Q4' prefix would also match
# Q40-Q49 columns (and 'Q12' would match e.g. 'Q120...') if the file contains any.
# Groups are appended prefix by prefix (not in file order) so the resulting column
# order stays the same as before.
onehot_prefixes = ('Q12_', 'Q4_', 'Q23_', 'Q24_', 'Q25_')
for prefix in onehot_prefixes:
    cols_to_keep += [col for col in data.columns if col.startswith(prefix)]

# Keep only the selected columns and preview the result
data = data[cols_to_keep]
data.head()

Unnamed: 0,Q8,Q11,Q10_combined,Q16,Q29_midpoint,Q12_Python,Q12_R,Q12_SQL,Q12_C,Q12_C++,Q12_Java,Q12_Javascript,Q12_Julia,Q12_Bash,Q12_MATLAB,Q12_None,Q12_Other,Q12_C#,Q12_PHP,Q12_Go,Q4_Africa,Q4_Americas,Q4_Central-South Asia,Q4_East Asia,Q4_Europe,Q4_India,Q4_Middle East,Q4_Other,Q4_USA,Q23_Data Administrator,"Q23_Data Analyst (Business, Marketing, Financial, Quantitative, etc)",Q23_Data Architect,Q23_Data Engineer,Q23_Data Scientist,Q23_Developer Advocate,Q23_Engineer (non-software),Q23_Machine Learning/ MLops Engineer,"Q23_Manager (Program, Project, Operations, Executive-level, etc)",Q23_Other,Q23_Research Scientist,Q23_Software Engineer,Q23_Statistician,Q23_Teacher / professor,Q24_Academics/Education,Q24_Accounting/Finance,Q24_Broadcasting/Communications,Q24_Computers/Technology,Q24_Energy/Mining,Q24_Government/Public Service,Q24_Insurance/Risk Assessment,Q24_Manufacturing/Fabrication,Q24_Marketing/CRM,Q24_Medical/Pharmaceutical,Q24_Non-profit/Service,Q24_Online Service/Internet-based Services,Q24_Other,Q24_Retail/Sales,Q24_Shipping/Transportation,Q25_0-49 employees,"Q25_10,000 or more employees","Q25_1000-9,999 employees",Q25_250-999 employees,Q25_50-249 employees
0,2,15,0,2,27499.5,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0
1,3,15,0,5,112499.5,1,0,1,1,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0
2,5,25,1,7,112499.5,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,5,15,1,7,224999.5,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
4,4,15,1,7,224999.5,1,0,1,1,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0


Now I will rename the columns so that the Gradio code is easier to understand.

**Renaming**

In [13]:
# Scalar features and the prediction target
core_names = {
    'Q8': 'education_level',
    'Q10_combined': 'ML Research',
    'Q11': 'Programming Experience',
    'Q16': 'Machine Learning Experience',
    'Q29_midpoint': 'Salary',
}

# Company size buckets (startup, small, medium, large, enormous)
company_names = {
    'Q25_0-49 employees': 'company_startup',
    'Q25_50-249 employees': 'company_small',
    'Q25_250-999 employees': 'company_medium',
    'Q25_1000-9,999 employees': 'company_large',
    'Q25_10,000 or more employees': 'company_enormous',
}

# Job roles
job_names = {
    'Q23_Data Administrator': 'job_DataAdmin',
    'Q23_Data Analyst (Business, Marketing, Financial, Quantitative, etc)': 'job_DataAnalyst',
    'Q23_Data Architect': 'job_DataArchitect',
    'Q23_Data Engineer': 'job_DataEngineer',
    'Q23_Data Scientist': 'job_DataScientist',
    'Q23_Developer Advocate': 'job_DevAdvocate',
    'Q23_Engineer (non-software)': 'job_Engineer',
    'Q23_Machine Learning/ MLops Engineer': 'job_MLEngineer',
    'Q23_Manager (Program, Project, Operations, Executive-level, etc)': 'job_Manager',
    'Q23_Other': 'job_Other',
    'Q23_Research Scientist': 'job_Research',
    'Q23_Software Engineer': 'job_SoftwareEng',
    'Q23_Statistician': 'job_Statistician',
    'Q23_Teacher / professor': 'job_Teacher',
}

# Industries
industry_names = {
    'Q24_Academics/Education': 'industry_Education',
    'Q24_Accounting/Finance': 'industry_Finance',
    'Q24_Broadcasting/Communications': 'industry_Communications',
    'Q24_Computers/Technology': 'industry_Tech',
    'Q24_Energy/Mining': 'industry_Energy',
    'Q24_Government/Public Service': 'industry_Government',
    'Q24_Insurance/Risk Assessment': 'industry_Insurance',
    'Q24_Manufacturing/Fabrication': 'industry_Manufacturing',
    'Q24_Marketing/CRM': 'industry_Marketing',
    'Q24_Medical/Pharmaceutical': 'industry_Medical',
    'Q24_Non-profit/Service': 'industry_NonProfit',
    'Q24_Online Service/Internet-based Services': 'industry_OnlineService',
    'Q24_Other': 'industry_Other',
    'Q24_Retail/Sales': 'industry_Retail',
    'Q24_Shipping/Transportation': 'industry_Transportation',
}


def _drop_prefix(prefix):
    """Return a column mapper that removes `prefix` from names that start with it."""
    return lambda name: name.replace(prefix, '') if name.startswith(prefix) else name


# Apply the one-for-one renames first, then strip the region (Q4_) and
# programming-language (Q12_) prefixes so those columns carry just the bare label.
# NOTE(review): both 'Q12_Other' and 'Q4_Other' end up as a column called 'Other',
# so the frame has a duplicated column label after this step.
data = (
    data
    .rename(columns={**core_names, **company_names, **job_names, **industry_names})
    .rename(columns=_drop_prefix('Q4_'))
    .rename(columns=_drop_prefix('Q12_'))
)

# View
data.head()

Unnamed: 0,education_level,Programming Experience,ML Research,Machine Learning Experience,Salary,Python,R,SQL,C,C++,Java,Javascript,Julia,Bash,MATLAB,None,Other,C#,PHP,Go,Africa,Americas,Central-South Asia,East Asia,Europe,India,Middle East,Other.1,USA,job_DataAdmin,job_DataAnalyst,job_DataArchitect,job_DataEngineer,job_DataScientist,job_DevAdvocate,job_Engineer,job_MLEngineer,job_Manager,job_Other,job_Research,job_SoftwareEng,job_Statistician,job_Teacher,industry_Education,industry_Finance,industry_Communications,industry_Tech,industry_Energy,industry_Government,industry_Insurance,industry_Manufacturing,industry_Marketing,industry_Medical,industry_NonProfit,industry_OnlineService,industry_Other,industry_Retail,industry_Transportation,company_startup,company_enormous,company_large,company_medium,company_small
0,2,15,0,2,27499.5,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0
1,3,15,0,5,112499.5,1,0,1,1,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0
2,5,25,1,7,112499.5,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,5,15,1,7,224999.5,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
4,4,15,1,7,224999.5,1,0,1,1,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0


**Make Decision Tree Model**

In [14]:
# Split the target off from the predictors, then hold out 20% of the rows for testing
y = data['Salary']
X = data.drop(columns='Salary')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Constrained decision tree (depth <= 5, at least 15 samples per leaf), fitted on the training split
regressor = DecisionTreeRegressor(
    max_depth=5,
    min_samples_leaf=15,
    random_state=42,
).fit(X_train, y_train)

# Score the fitted tree on the held-out rows
y_pred = regressor.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # RMSE is in the same units as Salary
r2 = r2_score(y_test, y_pred)

# Report the evaluation metrics
print(f'Root Mean Squared Error (RMSE): {rmse:.2f}')
print(f'R^2 Score: {r2:.2f}')

Root Mean Squared Error (RMSE): 69326.15
R^2 Score: 0.40


In [15]:
# Persist the fitted regressor to disk as a pickle
model_path = 'decision_tree_regressor.pkl'
with open(model_path, mode='wb') as fh:
    pickle.dump(regressor, fh)
print("Model saved successfully!")

Model saved successfully!


# Gradio

In [16]:
# Use the decision tree trained above as the app's model.
# Nothing is loaded from disk here: `model` is just another name for the
# in-memory `regressor`; the pickle file written earlier is not read.
model = regressor

The following cells define the lookup tables, the feature list, and a function that converts user input into an array of features, from which the model predicts salary.

In [17]:
# Education label shown in the UI -> ordinal code passed to the model
education_mapping = {
    "No formal education past high school": 1,
    "Some college/university study without earning a bachelor’s degree": 2,
    "Bachelor’s degree": 3,
    "Master’s degree": 4,
    "Doctoral degree": 5,
    "Professional doctorate": 5,
    "I prefer not to answer": 0,
}

# Region choices (drop-down)
regions = [
    "Africa",
    "Americas",
    "Central-South Asia",
    "East Asia",
    "Europe",
    "India",
    "Middle East",
    "Other",
    "USA",
]

# Company size choices (pick one)
company_sizes = [
    "startup",
    "enormous",
    "large",
    "medium",
    "small",
]

# Programming languages (select all that apply)
languages = [
    "Python", "R", "SQL", "C", "C++",
    "Java", "Javascript", "Julia", "Bash", "MATLAB",
    "None", "Other", "C#", "PHP", "Go",
]

# Job role choices (drop-down, pick one)
jobs = [
    "Database Administrator",
    "Data Analyst",
    "Data Architect",
    "Data Engineer",
    "Data Scientist",
    "Developer Advocate",
    "Engineer",
    "ML Engineer",
    "Manager",
    "Other",
    "Researcher",
    "Software Engineer",
    "Statistician",
    "Teacher",
]

# Industry choices (drop-down, pick one)
industries = [
    "Education",
    "Finance",
    "Communications",
    "Tech",
    "Energy",
    "Government",
    "Insurance",
    "Manufacturing",
    "Marketing",
    "Medical",
    "NonProfit",
    "OnlineService",
    "Other",
    "Retail",
    "Transportation",
]

In [18]:
# Model input columns, assembled group by group.
# Note that 'Other' occurs twice: once among the languages and once among the regions.
_scalar_features = [
    'education_level',
    'Programming Experience',
    'ML Research',
    'Machine Learning Experience',
]
_language_features = [
    'Python', 'R', 'SQL', 'C', 'C++', 'Java', 'Javascript', 'Julia',
    'Bash', 'MATLAB', 'None', 'Other', 'C#', 'PHP', 'Go',
]
_region_features = [
    'Africa', 'Americas', 'Central-South Asia', 'East Asia', 'Europe',
    'India', 'Middle East', 'Other', 'USA',
]
_job_features = [
    'job_DataAdmin', 'job_DataAnalyst', 'job_DataArchitect', 'job_DataEngineer',
    'job_DataScientist', 'job_DevAdvocate', 'job_Engineer', 'job_MLEngineer',
    'job_Manager', 'job_Other', 'job_Research', 'job_SoftwareEng',
    'job_Statistician', 'job_Teacher',
]
_industry_features = [
    'industry_Education', 'industry_Finance', 'industry_Communications',
    'industry_Tech', 'industry_Energy', 'industry_Government',
    'industry_Insurance', 'industry_Manufacturing', 'industry_Marketing',
    'industry_Medical', 'industry_NonProfit', 'industry_OnlineService',
    'industry_Other', 'industry_Retail', 'industry_Transportation',
]
_company_features = [
    'company_startup', 'company_enormous', 'company_large',
    'company_medium', 'company_small',
]

# Full feature list, in the column order of the training data
feature_names = (
    _scalar_features
    + _language_features
    + _region_features
    + _job_features
    + _industry_features
    + _company_features
)

In [19]:
# Job label as shown in the UI drop-down -> one-hot feature column.
# The display labels differ from the column suffixes (e.g. "Data Scientist" vs
# "job_DataScientist"), so f"job_{label}" alone would never hit a real column.
JOB_LABEL_TO_FEATURE = {
    "Database Administrator": "job_DataAdmin",
    "Data Analyst": "job_DataAnalyst",
    "Data Architect": "job_DataArchitect",
    "Data Engineer": "job_DataEngineer",
    "Data Scientist": "job_DataScientist",
    "Developer Advocate": "job_DevAdvocate",
    "Engineer": "job_Engineer",
    "ML Engineer": "job_MLEngineer",
    "Manager": "job_Manager",
    "Other": "job_Other",
    "Researcher": "job_Research",
    "Software Engineer": "job_SoftwareEng",
    "Statistician": "job_Statistician",
    "Teacher": "job_Teacher",
}


def predict(education, region, years_coding, years_ml, ml_research, company_size, langs, job_roles, industry):
    """Turn the UI selections into one model input row and return the predicted salary.

    Parameters
    ----------
    education : str or None
        Key of ``education_mapping``; unknown/missing values are encoded as 0.
    region : str or None
        Region label; must match a region column in ``feature_names`` to have an effect.
    years_coding, years_ml : number
        Written to 'Programming Experience' and 'Machine Learning Experience'.
    ml_research : bool
        Encoded as 1/0 in 'ML Research'.
    company_size : str or None
        Suffix of a ``company_*`` column (e.g. "startup").
    langs : list of str or None
        Programming-language labels; each one switches on its language column.
    job_roles : str, list of str, or None
        One job label (what a single-select drop-down sends) or several.
        Labels are translated through ``JOB_LABEL_TO_FEATURE``; a value that is
        already a column suffix (e.g. "DataScientist") is accepted too.
    industry : str or None
        Suffix of an ``industry_*`` column (e.g. "Tech").

    Returns
    -------
    str
        "Predicted outcome: $<amount>" with the amount rounded to whole dollars.

    Selections that do not correspond to any column (including None, i.e. nothing
    selected) are ignored, leaving that one-hot group at all zeros.
    """

    def as_list(selection):
        # A single-select widget passes a bare string; iterating it directly
        # would walk over its characters, so wrap it in a list first.
        if selection is None:
            return []
        if isinstance(selection, str):
            return [selection]
        return list(selection)

    # The row is built positionally rather than as a name-keyed dict because
    # feature_names contains 'Other' twice (language and region); a dict has a
    # single 'Other' key and so cannot set the two columns independently.
    row = [0] * len(feature_names)

    def switch_on(column, value=1, last=False):
        # Unknown columns are skipped on purpose (e.g. nothing selected in the UI).
        if column not in feature_names:
            return
        if last:
            position = len(feature_names) - 1 - feature_names[::-1].index(column)
        else:
            position = feature_names.index(column)
        row[position] = value

    # Scalar features
    switch_on('education_level', education_mapping.get(education, 0))
    switch_on('Programming Experience', years_coding)
    switch_on('Machine Learning Experience', years_ml)
    switch_on('ML Research', 1 if ml_research else 0)

    # Programming languages: language columns come before region columns in
    # feature_names, so the FIRST match is the language column ...
    for lang in as_list(langs):
        switch_on(lang)

    # ... and the LAST match is the region column (only differs for 'Other').
    switch_on(region, last=True)

    # Company size one-hot
    switch_on(f"company_{company_size}")

    # Job role(s): translate the display label to its feature column
    for role in as_list(job_roles):
        switch_on(JOB_LABEL_TO_FEATURE.get(role, f"job_{role}"))

    # Industry one-hot
    switch_on(f"industry_{industry}")

    # Single-row frame with the columns in training order
    features = pd.DataFrame([row], columns=feature_names)

    # Predict
    pred = model.predict(features)[0]
    return f"Predicted outcome: ${pred:,.0f}"

In [21]:
with gr.Blocks() as demo:
    # Page header: title, author and a short welcome/instruction line
    gr.Markdown(
        "\n # 💼 Data Scientist Salary Predictor "
        "\n ## **Created by: Frances LeMond-Glasser** "
        "\n ### **Welcome to the BUS 458 Final Project (Part Two!) Please fill out the following information in order to have your data science salary estimated.**"
    )

    # Top row: two side-by-side columns, each holding two drop-downs
    with gr.Row():
        with gr.Column():
            education_input = gr.Dropdown(choices=list(education_mapping), label="Education Level")
            region_input = gr.Dropdown(choices=regions, label="Region")
        with gr.Column():
            job_input = gr.Dropdown(choices=jobs, label="Job Role Applying For")
            industry_input = gr.Dropdown(choices=industries, label="Desired Industry")

    # ML-in-research checkbox with its prompt
    gr.Markdown("**Please click the box if you have used machine learning in your research.**")
    with gr.Row():
        with gr.Column():
            ml_research_input = gr.Checkbox(label="Machine Learning in Research?")

    # Company size, experience sliders, known languages and the Predict button
    with gr.Row():
        with gr.Column():
            company_size_input = gr.Radio(choices=company_sizes, label="Company Size")
            years_coding_input = gr.Slider(
                minimum=0, maximum=30, step=1, value=2,
                label="Years of Coding Experience",
            )
            years_ml_input = gr.Slider(
                minimum=0, maximum=30, step=1, value=2,
                label="Years of Machine Learning Experience",
            )
            lang_input = gr.CheckboxGroup(choices=languages, label="Programming Languages Known")
            predict_btn = gr.Button(value="Predict")

    # Text box that receives the prediction
    output = gr.Textbox(label="Prediction Result")

    # Components are passed to predict() positionally, so this order must
    # follow predict's parameter order.
    predict_inputs = [
        education_input,
        region_input,
        years_coding_input,
        years_ml_input,
        ml_research_input,
        company_size_input,
        lang_input,
        job_input,
        industry_input,
    ]
    predict_btn.click(fn=predict, inputs=predict_inputs, outputs=output)

if __name__ == "__main__":
    demo.launch()

* Running on local URL:  http://127.0.0.1:7862

To create a public link, set `share=True` in `launch()`.
