In [112]:
import numpy as np
import matplotlib.pyplot as plt
import math
import pandas as pd
import scipy.stats as stats
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import statsmodels.api as sm

# Cleaning data 
Importing data and removing NaN values

In [113]:
data = pd.read_csv('Salary_Data.csv')
data=data.dropna()

In [114]:
print(list(data.columns))

['Age', 'Gender', 'Education Level', 'Job Title', 'Years of Experience', 'Salary']


In [115]:
Education_counts = data['Education Level'].value_counts()
print(Education_counts)

Education Level
Bachelor's Degree    2265
Master's Degree      1572
PhD                  1368
Bachelor's            756
High School           448
Master's              288
phD                     1
Name: count, dtype: int64


Some education level entries mean the same thing. Thus, we need to merge the entries together

In [116]:
df = data
df['Education Level'].replace("Bachelor's", "Bachelor's Degree", inplace=True)
df['Education Level'].replace("Master's", "Master's Degree", inplace=True)
df['Education Level'].replace("phD", "PhD", inplace=True)

In [117]:
#checking if the values are merged
Education_counts_merged = df['Education Level'].value_counts()
print(Education_counts_merged)

Education Level
Bachelor's Degree    3021
Master's Degree      1860
PhD                  1369
High School           448
Name: count, dtype: int64


Making dummy variables

In [118]:
dummies_gender = pd.get_dummies(df['Gender'])
dummies_education = pd.get_dummies(df["Education Level"])
df = pd.concat([df, dummies_gender,dummies_education], axis=1)
df.drop('Gender', axis=1, inplace=True)
df.drop("Education Level",axis=1,inplace=True)


In [119]:
print(df.head())

    Age          Job Title  Years of Experience    Salary  Female   Male  \
0  32.0  Software Engineer                  5.0   90000.0   False   True   
1  28.0       Data Analyst                  3.0   65000.0    True  False   
2  45.0     Senior Manager                 15.0  150000.0   False   True   
3  36.0    Sales Associate                  7.0   60000.0    True  False   
4  52.0           Director                 20.0  200000.0   False   True   

   Other  Bachelor's Degree  High School  Master's Degree    PhD  
0  False               True        False            False  False  
1  False              False        False             True  False  
2  False              False        False            False   True  
3  False               True        False            False  False  
4  False              False        False             True  False  


# T-testing for Male and Female Salaries
Getting the salaries of male and female

In [120]:
male_salary = np.array(df[df['Male'] == True]['Salary'].values)
female_salary = np.array(df[df['Female'] == True]['Salary'].values)


In [121]:
t_statistic, p_value = stats.ttest_ind(male_salary, female_salary)

# Display the results
print("T-Statistic:", t_statistic)
print("P-Value:", p_value)

# Check if the result is statistically significant (common alpha level is 0.05)
if p_value < 0.05:
    print("The difference between Male and Female salary is statistically significant.")
else:
    print("There is no significant difference Male and Female salary.")

T-Statistic: 10.489305680117718
P-Value: 1.5264743340303035e-25
The difference between Male and Female salary is statistically significant.


Since the difference between Male and Female salary is statistically significant, we need to do Confidence Interval to determine which gender earns more and the range for its difference.

In [122]:
#Confidence interval 
mean_male = np.mean(male_salary)
mean_female = np.mean(female_salary)
std_dev_male = np.std(male_salary, ddof=1)  # ddof=1 for sample standard deviation
std_dev_female = np.std(female_salary, ddof=1)

# Set the confidence level and calculate the confidence interval for the difference
confidence_level = 0.95
degrees_of_freedom = len(male_salary) + len(female_salary) - 2  # Degrees of freedom for two-sample t-test

# Calculate the standard error of the difference between means
std_error_diff = np.sqrt((std_dev_male**2 / len(male_salary)) + (std_dev_female**2 / len(female_salary)))

# Calculate the t-statistic for the given confidence level and degrees of freedom
t_statistic = stats.t.ppf((1 + confidence_level) / 2, degrees_of_freedom)

# Calculate the margin of error
margin_of_error = t_statistic * std_error_diff

# Calculate the confidence interval for the difference between means
lower_bound = (mean_male - mean_female) - margin_of_error
upper_bound = (mean_male - mean_female) + margin_of_error

# Display the results
print(f"Mean Difference: {mean_male - mean_female}")
print(f"Confidence Interval ({confidence_level * 100}%): ({lower_bound}, {upper_bound})")

Mean Difference: 13506.698957654036
Confidence Interval (95.0%): (10979.5017984352, 16033.896116872873)


Conclude with 95% CI that Males earn more than Females

# Linear Regression

In [123]:
X = df[['Male','Female','Other','High School',"Bachelor's Degree","Master's Degree",'PhD', "Years of Experience"]]
Y = df['Salary']

model = LinearRegression().fit(X, Y,[1 for i in range(6698)])
print("Coefficients:", model.coef_)
print("Intercept:", model.intercept_)

Coefficients: [  3301.79357509  -2149.59826484  -1152.19531025 -36217.53727772
    995.78266981  12696.68013248  22525.07447543   5750.26559335]
Intercept: 61783.19535297727


In [124]:
print(X)

       Male  Female  Other  High School  Bachelor's Degree  Master's Degree  \
0      True   False  False        False               True            False   
1     False    True  False        False              False             True   
2      True   False  False        False              False            False   
3     False    True  False        False               True            False   
4      True   False  False        False              False             True   
...     ...     ...    ...          ...                ...              ...   
6699  False    True  False        False              False            False   
6700   True   False  False         True              False            False   
6701  False    True  False        False               True            False   
6702   True   False  False        False              False             True   
6703  False    True  False         True              False            False   

        PhD  Years of Experience  
0     False     

In [125]:
print(Y)

0        90000.0
1        65000.0
2       150000.0
3        60000.0
4       200000.0
          ...   
6699    200000.0
6700     50000.0
6701     55000.0
6702    140000.0
6703     35000.0
Name: Salary, Length: 6698, dtype: float64


Using the model to predict the salary

In [126]:
#Prediction Model
male = 1
female = 0
other = 0

yoe = 5

high_sch=0
bach=1
master=0
phd=0


In [127]:
# New data for prediction
new_data = np.array([[male,female,other,high_sch,bach,master,phd,yoe]])  

# Make predictions using the model
predicted_value = model.predict(new_data)

# Print the predicted value
print("Predicted Value:", predicted_value[0])

Predicted Value: 94832.09956460405




Checking the accuracy of the model with a randomly selected individual and checking how far off the predicted salary is from their actual salary

In [128]:
X=sm.add_constant(X)


In [129]:
x=np.asarray(X,dtype=np.float32)

In [130]:
model_sm = sm.OLS(Y, x).fit()
print(model_sm.summary())

                            OLS Regression Results                            
Dep. Variable:                 Salary   R-squared:                       0.707
Model:                            OLS   Adj. R-squared:                  0.707
Method:                 Least Squares   F-statistic:                     2697.
Date:                Wed, 20 Dec 2023   Prob (F-statistic):               0.00
Time:                        17:04:23   Log-Likelihood:                -78221.
No. Observations:                6698   AIC:                         1.565e+05
Df Residuals:                    6691   BIC:                         1.565e+05
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       3.902e+04   1711.561     22.798      0.0

# Machine Learning using Linear Regression
To conduct machine learning, we must first split the cleaned data into training and testing. Training datasets will consists of 70% of the data and the testing will be the remaining 30%.

In [131]:
y_ml = df['Salary']
x_ml = df.drop(columns=['Salary','Job Title'])

In [132]:
print(x_ml)

       Age  Years of Experience  Female   Male  Other  Bachelor's Degree  \
0     32.0                  5.0   False   True  False               True   
1     28.0                  3.0    True  False  False              False   
2     45.0                 15.0   False   True  False              False   
3     36.0                  7.0    True  False  False               True   
4     52.0                 20.0   False   True  False              False   
...    ...                  ...     ...    ...    ...                ...   
6699  49.0                 20.0    True  False  False              False   
6700  32.0                  3.0   False   True  False              False   
6701  30.0                  4.0    True  False  False               True   
6702  46.0                 14.0   False   True  False              False   
6703  26.0                  1.0    True  False  False              False   

      High School  Master's Degree    PhD  
0           False            False  False  

In [133]:
X_train, X_test, y_train, y_test = train_test_split(x_ml, y_ml, test_size=0.3, random_state=42)

In [134]:
print(X_train)

       Age  Years of Experience  Female   Male  Other  Bachelor's Degree  \
4092  43.0                 16.0   False   True  False              False   
2180  33.0                  6.0    True  False  False               True   
5224  25.0                  1.0   False   True  False               True   
459   30.0                  5.0    True  False  False               True   
6118  40.0                 16.0    True  False  False               True   
...    ...                  ...     ...    ...    ...                ...   
3776  29.0                  4.0   False   True  False               True   
5195  38.0                 14.0    True  False  False              False   
5230  25.0                  1.0   False   True  False               True   
5395  27.0                  1.0   False   True  False               True   
862   25.0                  2.0    True  False  False               True   

      High School  Master's Degree    PhD  
4092        False            False   True  

In [135]:
model_ml = LinearRegression().fit(X_train, y_train)
print("Coefficients:", model_ml.coef_)
print("Intercept:", model_ml.intercept_)

Coefficients: [-1.95138231e+03  8.03647794e+03  1.35681592e+03  7.15718814e+03
 -8.51400407e+03  3.04969635e+01 -3.54932854e+04  1.12960984e+04
  2.41666900e+04]
Intercept: 105716.56501353417


In [139]:
y_pred = model_ml.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r_squared = r2_score(y_test, y_pred)
rmse = mse**0.5

print("Mean Squared Error:", mse)
print("Root Mean Squared Error:", rmse)
print("R-squared:", r_squared)

Mean Squared Error: 826734879.449579
Root Mean Squared Error: 28752.99774718419
R-squared: 0.7087771960045556


# Machine Learning using Lasso Regression

In [137]:
y_lasso = df['Salary']
x_lasso = df.drop(columns=['Salary','Job Title'])
X_train_lasso, X_test_lasso, y_train_lasso, y_test_lasso = train_test_split(x_lasso, y_lasso, test_size=0.3, random_state=100)

# Create Lasso regression model
lasso = Lasso(alpha=0.1)  # Alpha is the regularization strength

# Fit the Lasso model using the training data
lasso.fit(X_train, y_train)

# Make predictions on the test data
y_pred = lasso.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r_squared = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R-squared:", r_squared)

# Get coefficients that are not zero (selected features)
selected_features = X.columns[lasso.coef_ != 0]
print("Selected features:", selected_features)

Mean Squared Error: 826732757.6474081
R-squared: 0.708777943423339
Selected features: Index(['const', 'Male', 'Female', 'Other', 'High School', 'Bachelor's Degree',
       'Master's Degree', 'PhD', 'Years of Experience'],
      dtype='object')


In [140]:
df.to_csv('Salary_Data_Cleaned.csv',index=False)