In [156]:
from pandas import DataFrame,read_csv,concat
from numpy import argmax,cumsum
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA


In [157]:
salary_data = read_csv('content/salary.csv')

null_indices = salary_data[salary_data.isnull().any(axis=1)].index.tolist()
print("Null values indices:", null_indices)

print(salary_data.iloc[172])
print(salary_data.iloc[260])


Null values indices: [172, 260]
Age                    NaN
Gender                 NaN
Education Level        NaN
Job Title              NaN
Years of Experience    NaN
Salary                 NaN
Name: 172, dtype: object
Age                    NaN
Gender                 NaN
Education Level        NaN
Job Title              NaN
Years of Experience    NaN
Salary                 NaN
Name: 260, dtype: object


In [158]:
salary_data = salary_data.drop(index=null_indices)

In [159]:

target_salary_data = salary_data['Salary']
salary_data = salary_data.drop(columns=['Salary'])


In [160]:
train_salary_data, train_target_salary_data,test_salary_data,test_target_salary_data = train_test_split(salary_data, target_salary_data, test_size=0.2, random_state=42)

In [161]:
train_salary_data.shape, train_target_salary_data.shape, test_salary_data.shape, test_target_salary_data.shape

((298, 5), (75, 5), (298,), (75,))

In [162]:
train_salary_data.head(10)

Unnamed: 0,Age,Gender,Education Level,Job Title,Years of Experience
193,34.0,Male,Bachelor's,Senior Product Manager,7.0
75,37.0,Male,Bachelor's,Project Manager,10.0
84,29.0,Female,Bachelor's,Junior Designer,2.0
363,33.0,Male,Bachelor's,Junior Marketing Specialist,5.0
16,33.0,Female,Master's,Marketing Manager,7.0
66,46.0,Male,Bachelor's,IT Manager,19.0
285,44.0,Female,PhD,Senior Product Designer,15.0
7,31.0,Male,Bachelor's,Sales Manager,4.0
113,32.0,Male,Master's,Senior Business Analyst,6.0
116,40.0,Female,Bachelor's,Office Manager,15.0


In [163]:

encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False,dtype='int')

encoded_cols = ["Age", "Education Level", "Job Title","Gender"]
encoded_data = encoder.fit_transform(train_salary_data[encoded_cols])
encoded_df = DataFrame(encoded_data, columns=encoder.get_feature_names_out(encoded_cols),dtype=int)

encoded_df = encoded_df.astype(int)

# Drop and concat
train_salary_data = concat([
    train_salary_data.drop(columns=encoded_cols).reset_index(drop=True),
    encoded_df.reset_index(drop=True)
], axis=1)

train_salary_data.head(10)

Unnamed: 0,Years of Experience,Age_24.0,Age_25.0,Age_26.0,Age_27.0,Age_28.0,Age_29.0,Age_30.0,Age_31.0,Age_32.0,...,Job Title_Supply Chain Manager,Job Title_Technical Recruiter,Job Title_Technical Writer,Job Title_Training Specialist,Job Title_UX Designer,Job Title_VP of Finance,Job Title_VP of Operations,Job Title_Web Developer,Gender_Female,Gender_Male
0,7.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,10.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,2.0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,5.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,7.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
5,19.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
6,15.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
7,4.0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
8,6.0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
9,15.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [166]:
pca = PCA(n_components=185)

pca.fit(train_salary_data)
explained_variance_ratio = pca.explained_variance_ratio_
cumulative_variance = cumsum(explained_variance_ratio)
optimal_k = argmax(cumulative_variance >= 0.95) + 1
print(f"Optimal number of components (k) for 95% variance: {optimal_k}")
pca = PCA(n_components=optimal_k)
pca_train_salary_data = pca.fit_transform(train_salary_data)
pca_train_salary_data.shape


Optimal number of components (k) for 95% variance: 3


(298, 3)

In [None]:
from seaborn import scatterplot
from matplotlib.pyplot import figure, show

fig = figure()
ax = fig.add_subplot(projection='3d')

ax.scatter3D(pca_train_salary_data[:, 0], pca_train_salary_data[:, 1], pca_train_salary_data[:, 2], c=train_target_salary_data, cmap='viridis')
ax.set_xlabel('PCA Component 1')
ax.set_ylabel('PCA Component 2')
ax.set_zlabel('PCA Component 3')
ax.set_title('3D PCA Visualization of Salary Data')
ax.view_init(elev=20, azim=30)
fig.show()



TypeError: 'module' object is not callable