In [29]:
#Loading libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
from sklearn.impute import KNNImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
import math
from sklearn.model_selection import train_test_split

In [30]:
datas = pd.read_csv(r"C:\Users\Elanur\Desktop\Software Engineer Salaries.csv")

In [31]:
#Copying data just in case
data = datas.copy()

In [32]:
#Converting data to dataframe
data = pd.DataFrame(data)

In [33]:
data.head()

Unnamed: 0,Company,Company Score,Job Title,Location,Date,Salary
0,ViewSoft,4.8,Software Engineer,"Manassas, VA",8d,$68K - $94K (Glassdoor est.)
1,Workiva,4.3,Software Support Engineer,Remote,2d,$61K - $104K (Employer est.)
2,"Garmin International, Inc.",3.9,C# Software Engineer,"Cary, NC",2d,$95K - $118K (Glassdoor est.)
3,Snapchat,3.5,"Software Engineer, Fullstack, 1+ Years of Expe...","Los Angeles, CA",2d,$97K - $145K (Employer est.)
4,Vitesco Technologies Group AG,3.1,Software Engineer,"Seguin, TX",2d,$85K - $108K (Glassdoor est.)


In [34]:
data.isnull().sum()

Company            2
Company Score     81
Job Title          0
Location          13
Date               0
Salary           106
dtype: int64

In [35]:
data.dtypes

Company           object
Company Score    float64
Job Title         object
Location          object
Date              object
Salary            object
dtype: object

In [36]:
# Function to convert range to average value
def extract_salary(salary_str):
    if isinstance(salary_str, str):
        # Get forecast type and salary range
        salary_range = re.findall(r'\$([\dK]+)', salary_str)  # Get salary ranges
        salary_type = 'Employer est.' if 'Employer est.' in salary_str else 'Glassdoor est.'
        # Convert the salaries indicated by 'K' to numerical value and multiply by 1000
        salary_range = [int(s.replace('K', '')) * 1000 for s in salary_range]
        # If there are 2 salary values, calculate the average, otherwise return "none"
        if len(salary_range) == 2:
            return sum(salary_range) / 2, salary_type
        else:
            return None, salary_type
    # If data is not available return none
    return None, None  

# Calculate the average and forecast type in the salary column and add them as new columns to the data
data[['Average Salary', 'Estimate Type']] = data['Salary'].apply(
    lambda x: pd.Series(extract_salary(x))
)

In [37]:
data

Unnamed: 0,Company,Company Score,Job Title,Location,Date,Salary,Average Salary,Estimate Type
0,ViewSoft,4.8,Software Engineer,"Manassas, VA",8d,$68K - $94K (Glassdoor est.),81000.0,Glassdoor est.
1,Workiva,4.3,Software Support Engineer,Remote,2d,$61K - $104K (Employer est.),82500.0,Employer est.
2,"Garmin International, Inc.",3.9,C# Software Engineer,"Cary, NC",2d,$95K - $118K (Glassdoor est.),106500.0,Glassdoor est.
3,Snapchat,3.5,"Software Engineer, Fullstack, 1+ Years of Expe...","Los Angeles, CA",2d,$97K - $145K (Employer est.),121000.0,Employer est.
4,Vitesco Technologies Group AG,3.1,Software Engineer,"Seguin, TX",2d,$85K - $108K (Glassdoor est.),96500.0,Glassdoor est.
...,...,...,...,...,...,...,...,...
865,RXO,,"Software Engineer, Machine Learning Compute","San Francisco, CA",6d,,,
866,Infosys,,Software Engineer - 3 (Apache NiFi),"Annapolis Junction, MD",18d,,,
867,Medtronic,,Senior Software Engineer,"Southfield, MI",19d,,,
868,,,Junior Python Developer,"Charlotte, NC",2d,,,


In [38]:
# Calculate IQR 
Q1 = data['Average Salary'].quantile(0.25)  # 1st quarter
Q3 = data['Average Salary'].quantile(0.75)  # 3rd quarter
IQR = Q3 - Q1  # IQR

# Detection outliers
outliers_iqr = (data['Average Salary'] < (Q1 - 1.5 * IQR)) | (data['Average Salary'] > (Q3 + 1.5 * IQR))

# Calculate the total number of outliers
num_outliers_iqr = outliers_iqr.sum().sum()  
print(f"total number of outliers : {num_outliers_iqr}")


total number of outliers : 30


In [39]:
# Fill missing values ​​with Knn imputer
imputer = KNNImputer(n_neighbors=2)
data[['Average Salary']] = imputer.fit_transform(data[['Average Salary']])


In [40]:
data

Unnamed: 0,Company,Company Score,Job Title,Location,Date,Salary,Average Salary,Estimate Type
0,ViewSoft,4.8,Software Engineer,"Manassas, VA",8d,$68K - $94K (Glassdoor est.),81000.000000,Glassdoor est.
1,Workiva,4.3,Software Support Engineer,Remote,2d,$61K - $104K (Employer est.),82500.000000,Employer est.
2,"Garmin International, Inc.",3.9,C# Software Engineer,"Cary, NC",2d,$95K - $118K (Glassdoor est.),106500.000000,Glassdoor est.
3,Snapchat,3.5,"Software Engineer, Fullstack, 1+ Years of Expe...","Los Angeles, CA",2d,$97K - $145K (Employer est.),121000.000000,Employer est.
4,Vitesco Technologies Group AG,3.1,Software Engineer,"Seguin, TX",2d,$85K - $108K (Glassdoor est.),96500.000000,Glassdoor est.
...,...,...,...,...,...,...,...,...
865,RXO,,"Software Engineer, Machine Learning Compute","San Francisco, CA",6d,,127957.937585,
866,Infosys,,Software Engineer - 3 (Apache NiFi),"Annapolis Junction, MD",18d,,127957.937585,
867,Medtronic,,Senior Software Engineer,"Southfield, MI",19d,,127957.937585,
868,,,Junior Python Developer,"Charlotte, NC",2d,,127957.937585,


In [41]:
# Drop rows with NaN in 'Company' column
data = data.dropna(subset=['Company'])

In [42]:
data.isnull().sum()

Company             0
Company Score      79
Job Title           0
Location           13
Date                0
Salary            104
Average Salary      0
Estimate Type     104
dtype: int64

In [43]:
data

Unnamed: 0,Company,Company Score,Job Title,Location,Date,Salary,Average Salary,Estimate Type
0,ViewSoft,4.8,Software Engineer,"Manassas, VA",8d,$68K - $94K (Glassdoor est.),81000.000000,Glassdoor est.
1,Workiva,4.3,Software Support Engineer,Remote,2d,$61K - $104K (Employer est.),82500.000000,Employer est.
2,"Garmin International, Inc.",3.9,C# Software Engineer,"Cary, NC",2d,$95K - $118K (Glassdoor est.),106500.000000,Glassdoor est.
3,Snapchat,3.5,"Software Engineer, Fullstack, 1+ Years of Expe...","Los Angeles, CA",2d,$97K - $145K (Employer est.),121000.000000,Employer est.
4,Vitesco Technologies Group AG,3.1,Software Engineer,"Seguin, TX",2d,$85K - $108K (Glassdoor est.),96500.000000,Glassdoor est.
...,...,...,...,...,...,...,...,...
863,OpenAI,,Embedded Software Engineer (Entry-Level),"Lake Hopatcong, NJ",2d,,127957.937585,
864,"Akina, Inc.",,Senior Software Engineer - App Orchestration,"San Mateo, CA",30d+,,127957.937585,
865,RXO,,"Software Engineer, Machine Learning Compute","San Francisco, CA",6d,,127957.937585,
866,Infosys,,Software Engineer - 3 (Apache NiFi),"Annapolis Junction, MD",18d,,127957.937585,


In [44]:
# Fill missing values in 'Company Score' column using KNN Imputer
imputer = KNNImputer(n_neighbors=2)
data[['Company Score']] = imputer.fit_transform(data[['Company Score']])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[['Company Score']] = imputer.fit_transform(data[['Company Score']])


In [45]:
# Drop specified columns
data = data.drop(['Location', 'Date', 'Company', 'Estimate Type'], axis=1)

In [46]:
data

Unnamed: 0,Company Score,Job Title,Salary,Average Salary
0,4.800000,Software Engineer,$68K - $94K (Glassdoor est.),81000.000000
1,4.300000,Software Support Engineer,$61K - $104K (Employer est.),82500.000000
2,3.900000,C# Software Engineer,$95K - $118K (Glassdoor est.),106500.000000
3,3.500000,"Software Engineer, Fullstack, 1+ Years of Expe...",$97K - $145K (Employer est.),121000.000000
4,3.100000,Software Engineer,$85K - $108K (Glassdoor est.),96500.000000
...,...,...,...,...
863,3.895311,Embedded Software Engineer (Entry-Level),,127957.937585
864,3.895311,Senior Software Engineer - App Orchestration,,127957.937585
865,3.895311,"Software Engineer, Machine Learning Compute",,127957.937585
866,3.895311,Software Engineer - 3 (Apache NiFi),,127957.937585


In [47]:
# Create OneHotEncoder
encoder = OneHotEncoder(sparse_output=False)

# Convert 'Job Title' column to one-hot encoding
encoded = encoder.fit_transform(data[['Job Title']])

# Get the names of the one-hot encoded columns
encoded_columns = encoder.get_feature_names_out(['Job Title'])

# Convert the encoded data to a new DataFrame
encoded_df = pd.DataFrame(encoded, columns=encoded_columns)

# Drop the original 'Job Title' column and add the one-hot encoded columns
data = data.drop('Job Title', axis=1)
data = pd.concat([data, encoded_df], axis=1)

In [48]:
data

Unnamed: 0,Company Score,Salary,Average Salary,Job Title_2024 Associate Software Engineer - Linthicum MD,Job Title_2025 BNY Summer Internship Program - Engineering (Developer),Job Title_2025 Early Career Program: Software Engineering,Job Title_2128 Software Engineer 1 (Java Focused),Job Title_2172 Software Engineer 1,Job Title_2251 Java & NiFi Software Engineer,Job Title_568 Software Engineer 0,...,Job Title_Test Software Engineer,Job Title_Trainee Automation Engineer,"Job Title_VP, Software Engineering",Job Title_Vehicle Software and Diagnostics Tool Test Engineer,"Job Title_Vice President, Back-End Engineer I",Job Title_Video Software Engineer,Job Title_Web Developers,Job Title_WordPress Full Stack Engineer (React),Job Title_Workday Software Engineer,Job Title_Yardi Enterprise Software Support Engineer - Hybrid Remote
0,4.800000,$68K - $94K (Glassdoor est.),81000.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,4.300000,$61K - $104K (Employer est.),82500.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3.900000,$95K - $118K (Glassdoor est.),106500.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3.500000,$97K - $145K (Employer est.),121000.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,3.100000,$85K - $108K (Glassdoor est.),96500.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
863,3.895311,,127957.937585,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
864,3.895311,,127957.937585,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
865,3.895311,,127957.937585,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
866,3.895311,,127957.937585,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [49]:
# Columns to normalize
columns_to_normalize = ['Company Score', 'Average Salary']

# Create MinMaxScaler
scaler = MinMaxScaler()

# Apply Min-Max normalization to specified columns
data[columns_to_normalize] = scaler.fit_transform(data[columns_to_normalize])

In [50]:
data

Unnamed: 0,Company Score,Salary,Average Salary,Job Title_2024 Associate Software Engineer - Linthicum MD,Job Title_2025 BNY Summer Internship Program - Engineering (Developer),Job Title_2025 Early Career Program: Software Engineering,Job Title_2128 Software Engineer 1 (Java Focused),Job Title_2172 Software Engineer 1,Job Title_2251 Java & NiFi Software Engineer,Job Title_568 Software Engineer 0,...,Job Title_Test Software Engineer,Job Title_Trainee Automation Engineer,"Job Title_VP, Software Engineering",Job Title_Vehicle Software and Diagnostics Tool Test Engineer,"Job Title_Vice President, Back-End Engineer I",Job Title_Video Software Engineer,Job Title_Web Developers,Job Title_WordPress Full Stack Engineer (React),Job Title_Workday Software Engineer,Job Title_Yardi Enterprise Software Support Engineer - Hybrid Remote
0,0.950000,$68K - $94K (Glassdoor est.),0.154085,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.825000,$61K - $104K (Employer est.),0.157187,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.725000,$95K - $118K (Glassdoor est.),0.206825,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.625000,$97K - $145K (Employer est.),0.236815,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.525000,$85K - $108K (Glassdoor est.),0.186143,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
863,0.723828,,0.251206,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
864,0.723828,,0.251206,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
865,0.723828,,0.251206,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
866,0.723828,,0.251206,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [51]:
# Find the number of 'Job Title_' columns
job_columns = [col for col in data.columns if col.startswith('Job Title_')]
num_jobs = len(job_columns)  # Number of job columns

# Split data into training and test sets
data_train, data_test = train_test_split(data, test_size=0.2, random_state=42)

# UCB parameters
def run_ucb(data, N):
    job_selected = []
    num_selections = [0] * num_jobs  # Number of times each job is selected
    sum_rewards = [0] * num_jobs  # Total rewards for each job (company score and salary)

    for t in range(N):
        job = 0
        max_upper_bound = 0
        for i in range(num_jobs):
            if num_selections[i] > 0:
                average_reward = sum_rewards[i] / num_selections[i]
                delta_i = math.sqrt(3/2 * math.log(t+1) / num_selections[i])
                upper_bound = average_reward + delta_i
            else:
                upper_bound = 1e400  # A large number to ensure exploration
            if upper_bound > max_upper_bound:
                max_upper_bound = upper_bound
                job = i
        job_selected.append(job)
        num_selections[job] += 1
        
        # Reward calculation: Company score and normalized salary
        reward = data['Company Score'].iloc[t % len(data)] + data['Average Salary'].iloc[t % len(data)] / 100000
        sum_rewards[job] += reward
    
    return job_selected, num_selections, sum_rewards

# Run UCB algorithm on training set
job_selected_train, num_selections_train, sum_rewards_train = run_ucb(data_train, N=100)

# Run UCB algorithm on test set
job_selected_test, num_selections_test, sum_rewards_test = run_ucb(data_test, N=20)

# Display results in tabular format
train_results = {
    'Job Title': job_columns,
    'Selection Count (Train)': num_selections_train,
    'Total Reward (Train)': sum_rewards_train
}

test_results = {
    'Job Title': job_columns,
    'Selection Count (Test)': num_selections_test,
    'Total Reward (Test)': sum_rewards_test
}

train_results_df = pd.DataFrame(train_results)
test_results_df = pd.DataFrame(test_results)

In [52]:
train_results_df

Unnamed: 0,Job Title,Selection Count (Train),Total Reward (Train)
0,Job Title_2024 Associate Software Engineer - L...,1,0.700002
1,Job Title_2025 BNY Summer Internship Program -...,1,0.675004
2,Job Title_2025 Early Career Program: Software ...,1,0.875002
3,Job Title_2128 Software Engineer 1 (Java Focused),1,0.800002
4,Job Title_2172 Software Engineer 1,1,0.775004
...,...,...,...
536,Job Title_Video Software Engineer,0,0.000000
537,Job Title_Web Developers,0,0.000000
538,Job Title_WordPress Full Stack Engineer (React),0,0.000000
539,Job Title_Workday Software Engineer,0,0.000000


In [53]:
test_results_df

Unnamed: 0,Job Title,Selection Count (Test),Total Reward (Test)
0,Job Title_2024 Associate Software Engineer - L...,1,0.625001
1,Job Title_2025 BNY Summer Internship Program -...,1,0.825002
2,Job Title_2025 Early Career Program: Software ...,1,0.800004
3,Job Title_2128 Software Engineer 1 (Java Focused),1,0.675002
4,Job Title_2172 Software Engineer 1,1,0.723830
...,...,...,...
536,Job Title_Video Software Engineer,0,0.000000
537,Job Title_Web Developers,0,0.000000
538,Job Title_WordPress Full Stack Engineer (React),0,0.000000
539,Job Title_Workday Software Engineer,0,0.000000
