In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LinearRegression

In [2]:
# Read the cleaned salary dataset (CSV in the working directory) into a DataFrame.
ud_df = pd.read_csv(filepath_or_buffer="salary_data_cleaned.csv")

In [3]:
# Preview the first five rows to sanity-check the loaded columns.
ud_df.head(n=5)

Unnamed: 0,Job Title,Salary Estimate,Job Description,Rating,Company Name,Location,Headquarters,Size,Founded,Type of ownership,...,avg_salary,company_txt,job_state,same_state,age,python_yn,R_yn,spark,aws,excel
0,Data Scientist,$53K-$91K (Glassdoor est.),"Data Scientist\nLocation: Albuquerque, NM\nEdu...",3.8,Tecolote Research\n3.8,"Albuquerque, NM","Goleta, CA",501 to 1000 employees,1973,Company - Private,...,72.0,Tecolote Research\n,NM,0,47,1,0,0,0,1
1,Healthcare Data Scientist,$63K-$112K (Glassdoor est.),What You Will Do:\n\nI. General Summary\n\nThe...,3.4,University of Maryland Medical System\n3.4,"Linthicum, MD","Baltimore, MD",10000+ employees,1984,Other Organization,...,87.5,University of Maryland Medical System\n,MD,0,36,1,0,0,0,0
2,Data Scientist,$80K-$90K (Glassdoor est.),"KnowBe4, Inc. is a high growth information sec...",4.8,KnowBe4\n4.8,"Clearwater, FL","Clearwater, FL",501 to 1000 employees,2010,Company - Private,...,85.0,KnowBe4\n,FL,1,10,1,0,1,0,1
3,Data Scientist,$56K-$97K (Glassdoor est.),*Organization and Job ID**\nJob ID: 310709\n\n...,3.8,PNNL\n3.8,"Richland, WA","Richland, WA",1001 to 5000 employees,1965,Government,...,76.5,PNNL\n,WA,1,55,1,0,0,0,0
4,Data Scientist,$86K-$143K (Glassdoor est.),Data Scientist\nAffinity Solutions / Marketing...,2.9,Affinity Solutions\n2.9,"New York, NY","New York, NY",51 to 200 employees,1998,Company - Private,...,114.5,Affinity Solutions\n,NY,1,22,1,0,0,0,1


In [None]:
# Train a logistic-regression classifier on the salary data, score it on the
# held-out rows, and export the score, per-row predictions and class counts.
#
# Outputs of this cell:
#   prediction_score_filtered.txt  - accuracy on the filtered test set
#   results_filtered.csv           - test features + real and predicted target
#   predicted_counts_filtered      - value counts of the predictions (used by the plot cell)

# Separate features and target variable.
# NOTE(review): `avg_salary` looks continuous in the preview (72.0, 87.5, ...), so
# label-encoding turns every distinct salary into its own class. If the intent is
# a low/medium/high classifier, bin the target first - confirm.
# NOTE(review): columns such as 'Salary Estimate' appear to be derived from the
# target and may leak it into the features - confirm before trusting the score.
X = ud_df.drop('avg_salary', axis=1)
y = ud_df['avg_salary']

# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=37)

# Keep only test rows whose *target value* was seen during training; the label
# encoder cannot transform unseen classes. (Filtering on the row index, as done
# previously, always yields an empty set because train and test rows are disjoint.)
seen_in_training = y_test.isin(y_train)
x_test_filtered = x_test[seen_in_training]
y_test_filtered = y_test[seen_in_training]
if x_test_filtered.empty:
    raise ValueError(
        "No test rows share a target value with the training set; nothing to evaluate."
    )

# One-hot encode categorical variables. The test frame is re-aligned to the
# training columns so both matrices have identical features in identical order:
# categories unseen in the test rows become all-zero columns, and categories
# unseen in training are dropped.
x_train_encoded = pd.get_dummies(x_train)
x_test_d_filtered = pd.get_dummies(x_test_filtered).reindex(
    columns=x_train_encoded.columns, fill_value=0
)

# Label encode the target variable; the same fitted encoder is reused for the
# test labels so that truth and predictions live in the same label space.
label_encode = LabelEncoder()
y_train_encoded = label_encode.fit_transform(y_train)
y_test_encoded_filtered = label_encode.transform(y_test_filtered)

# Build and train logistic regression model.
# max_iter is only an upper bound; this value is effectively "no limit".
lgr = LogisticRegression(max_iter=100000000000)
lgr.fit(x_train_encoded, y_train_encoded)

# Make predictions (encoded class ids)
y_pred_filtered = lgr.predict(x_test_d_filtered)

# Compute accuracy score on encoded labels. str() is used because accuracy_score
# may return a plain Python float, which has no .astype method.
test_score_filtered = str(
    accuracy_score(y_test_encoded_filtered, y_pred_filtered, normalize=True)
)

# Write the accuracy score to a text file
with open('prediction_score_filtered.txt', 'w') as f:
    f.write('computing score : ')
    f.write(test_score_filtered)

# Map encoded labels back to the original target values. The mapping is taken
# from the fitted encoder instead of being hard-coded, so it is always
# consistent with the classes actually present in y_train.
ohe_to_category_dic = dict(enumerate(label_encode.classes_))
y_pred_translated_filtered = pd.DataFrame(
    label_encode.inverse_transform(y_pred_filtered), columns=['predicted_salary']
)

# Add real and predicted salary to the filtered test data. Indices are reset so
# the three pieces align positionally (the predictions carry a fresh 0..n-1 index).
results_df_filtered = x_test_filtered.reset_index(drop=True)
results_df_filtered['real_salary'] = y_test_filtered.reset_index(drop=True)
results_df_filtered = pd.concat([results_df_filtered, y_pred_translated_filtered], axis=1)

# Save results to CSV for the filtered test set
results_df_filtered.to_csv('results_filtered.csv', index=False)

# Count the occurrences of each predicted salary value for the filtered test set
predicted_counts_filtered = results_df_filtered['predicted_salary'].value_counts()


In [None]:
# Pie chart of how often each salary value was predicted on the filtered test set.
fig, ax = plt.subplots(figsize=(8, 8))
ax.pie(
    predicted_counts_filtered,
    labels=predicted_counts_filtered.index,
    autopct='%1.1f%%',
    startangle=140,
)
ax.set_title('Distribution of Predicted Salary Categories (Filtered Test Set)')
ax.axis('equal')  # Keep the aspect ratio equal so the pie renders as a circle.
plt.show()