In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import json
import seaborn as sns
# Let pandas render up to 300 rows when a frame or series is displayed.
pd.options.display.max_rows = 300

# Apply matplotlib's 'seaborn-v0_8' style sheet to every figure of the notebook.
plt.style.use('seaborn-v0_8')

In [2]:
## Load the dataset and discard the 'Unnamed: 0' column that comes with the CSV.

df = pd.read_csv('Salary_Data_Based_country_and_race.csv').drop(columns='Unnamed: 0')

In [3]:
# Replace the spaces in the multi-word column names with underscores.
column_renames = {
    'Education Level': 'Education_Level',
    'Job Title': 'Job_Title',
    'Years of Experience': 'Years_of_Experience',
}
df = df.rename(columns=column_renames)
print(df.shape)

(6704, 8)


In [4]:
# Show the last three rows of the renamed frame.
df.tail(3)

Unnamed: 0,Age,Gender,Education_Level,Job_Title,Years_of_Experience,Salary,Country,Race
6701,30.0,Female,Bachelor's Degree,Financial Manager,4.0,55000.0,China,Chinese
6702,46.0,Male,Master's Degree,Marketing Manager,14.0,140000.0,China,Korean
6703,26.0,Female,High School,Sales Executive,1.0,35000.0,Canada,Black


In [5]:
# Work on an independent (deep) copy so the loaded frame `df` stays untouched.
df_preprocess = df.copy(deep=True)

In [6]:
### To make the analysis easier, split the columns into numeric and categorical ones.
### Text values are normalised on the way: lower case, spaces replaced by underscores.

d_col = {'num': [], 'cat': []}
for col in df_preprocess.columns:
    column = df_preprocess[col]
    # Accept pandas' dedicated string dtypes as well as plain 'object': newer pandas
    # versions no longer store text as 'object', and a bare `dtype == 'object'` test
    # would then silently classify every text column as numeric.
    is_text = pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column)
    if is_text:
        # regex=False: the space is a literal character, not a pattern.
        df_preprocess[col] = column.str.lower().str.replace(' ', '_', regex=False)
        d_col['cat'].append(col)
    else:
        d_col['num'].append(col)

In [7]:
# Display the numeric / categorical column split.
d_col

{'num': ['Age', 'Years_of_Experience', 'Salary'],
 'cat': ['Gender', 'Education_Level', 'Job_Title', 'Country', 'Race']}

In [8]:
## Fill missing values of the numeric columns with the column median.
for col in d_col['num']:
    df_preprocess[col] = df_preprocess[col].fillna(value=df_preprocess[col].median())

## Fill missing values of the categorical columns with the column mode.
for col in d_col['cat']:
    modes = df_preprocess[col].mode()
    if modes.empty:
        # Column holds no value at all: there is nothing to impute with.
        continue
    # mode() always returns a Series and holds several entries when values tie.
    # Take the first one explicitly: passing the whole Series to fillna() would
    # fill by index alignment instead of with a single value.
    df_preprocess[col] = df_preprocess[col].fillna(value=modes.iloc[0])

In [9]:
### Harmonise the 'Education_Level' and 'Race' classes, following the data exploration.

education_aliases = {
    "bachelor's_degree": "bachelor's",
    "master's_degree": "master's",
}
race_groups = {
    'australian': 'white',
    'welsh': 'white',
    'korean': 'asian',
    'chinese': 'asian',
    'african_american': 'black',
}

df_preprocess['Education_Level'] = df_preprocess['Education_Level'].replace(education_aliases)
df_preprocess['Race'] = df_preprocess['Race'].replace(race_groups)

In [10]:
### Remove the rows whose gender is 'other', given how few of them there are.
not_other = df_preprocess['Gender'].ne('other')
df_preprocess = df_preprocess[not_other]

In [19]:
### Remove both salary extremes: keep the rows whose salary lies between the
### 0.5 % and 99.5 % quantiles (bounds included).

salary_col = df_preprocess['Salary']
min_ = salary_col.quantile(.005)
max_ = salary_col.quantile(.995)

# Series.between is inclusive on both ends by default, i.e. (>= min_) & (<= max_).
df_preprocess = df_preprocess[salary_col.between(min_, max_)]

print(min_, max_)

25000.0 219000.0


In [11]:
from sklearn.preprocessing import LabelEncoder

# Print the categorical columns, i.e. the candidates for encoding.
encodable_cols = d_col['cat']
print('colunas "encondáveis":', encodable_cols)

colunas "encondáveis": ['Gender', 'Education_Level', 'Job_Title', 'Country', 'Race']


In [12]:
# Output folder for the artefacts written by this notebook (encoder mappings as
# JSON and the preprocessed CSV files). It is created up front so that the later
# open(..., 'w') and to_csv calls do not fail when the folder does not exist yet.
datapath = 'data_preprocess'
os.makedirs(datapath, exist_ok=True)

In [13]:
### Label-encode the categorical columns ('Job_Title' is handled separately because
### of its large number of classes) and save each class -> code mapping as JSON.

le = LabelEncoder()

for col in (name for name in d_col['cat'] if name != 'Job_Title'):
    df_preprocess[col] = le.fit_transform(df_preprocess[col])
    # LabelEncoder encodes le.classes_[i] as i, so enumerating the classes yields
    # the same mapping as transforming them; codes are stored as strings.
    le_json = {label: str(code) for code, label in enumerate(le.classes_)}
    jsonpath = os.path.join(datapath, col + '.json')
    with open(jsonpath, 'w') as f:
        json.dump(le_json, f)

In [14]:
### Encoding Job_Title
### There are 192 Job_Title classes, many of them occurring a single time.
### Some titles carry a level (junior, senior, director, ...), while other classes name the role without any level.
### For example there are "data_scientist", "senior_data_scientist", "junior_data_scientist" and "director_of_data_science".
### Tokenisation and encoding are done so that the classes equivalent to "data_scientist" end up close to each other, as in this illustrative example:
### "data_scientist": [0, 14, 20], "senior_data_scientist": [1, 14, 20], "junior_data_scientist": [3, 14, 20], "director_of_data_science": [6, 14, 20]
### The closeness comes from the shared elements [14, 20], which would not exist if each class were simply replaced by one number, such as
### "data_scientist": 1, "senior_data_scientist": 2, "junior_data_scientist": 3, "director_of_data_science": 4

import nltk

# Count the words over the distinct titles only, so a title that appears on many
# rows still contributes its words once.
unique_titles = df_preprocess['Job_Title'].drop_duplicates().reset_index(drop=True)
title_tokens = unique_titles.str.split('_')
ls_word = np.concatenate(title_tokens)
allWordDist = nltk.FreqDist(ls_word)
allWordDist.most_common(25)

[('senior', 50),
 ('manager', 44),
 ('junior', 36),
 ('marketing', 21),
 ('analyst', 20),
 ('director', 20),
 ('of', 14),
 ('specialist', 14),
 ('operations', 13),
 ('sales', 12),
 ('coordinator', 12),
 ('engineer', 11),
 ('software', 10),
 ('data', 10),
 ('hr', 10),
 ('product', 9),
 ('developer', 9),
 ('business', 9),
 ('designer', 9),
 ('scientist', 8),
 ('financial', 8),
 ('project', 8),
 ('human', 8),
 ('resources', 7),
 ('customer', 6)]

In [15]:
# Build the word -> id vocabulary used to encode the job titles.
# A word's id is its 1-based position when iterating allWordDist. The connector
# words 'of' and 'and' get no entry, so their positions are simply left unused.
n_vocab = len(allWordDist)
ignored_words = ('of', 'and')
d_vocab = {
    word: position
    for position, word in enumerate(allWordDist, start=1)
    if word not in ignored_words
}
# Reverse lookup: id -> word.
d_vocab_r = {position: word for word, position in d_vocab.items()}

# Persist the vocabulary next to the other encoder mappings.
jsonpath = os.path.join(datapath, 'Job_Title_words.json')
with open(jsonpath, 'w') as f:
    json.dump(d_vocab, f)

In [16]:
# Encode every job title as a fixed-length vector of word ids, where 0 is padding.
# Words without an entry in d_vocab are skipped. Titles are left-padded with zeros;
# when a title has more than JOB_TITLE_VEC_LEN known words only the last
# JOB_TITLE_VEC_LEN are kept, so the final words of a title always sit in the
# right-most positions.
JOB_TITLE_VEC_LEN = 5
zero_pad = [0] * JOB_TITLE_VEC_LEN

ls_vec = []
for tokeniz in df_preprocess['Job_Title'].str.split('_'):
    word_ids = [d_vocab[w] for w in tokeniz if w in d_vocab]
    # Pad with JOB_TITLE_VEC_LEN zeros (not one fewer) so that even a title with
    # no known word yields a full-length vector instead of a short one.
    ls_vec.append((zero_pad + word_ids)[-JOB_TITLE_VEC_LEN:])

# Build the Series on the frame's own index so rows stay aligned after the
# earlier row filters.
df_preprocess['Job_Title_Vec'] = pd.Series(ls_vec, index=df_preprocess.index)

# One integer column per vector position: Job_Title_Vec_1 ... Job_Title_Vec_5.
for pos in range(JOB_TITLE_VEC_LEN):
    df_preprocess[f'Job_Title_Vec_{pos + 1}'] = df_preprocess['Job_Title_Vec'].apply(lambda vec: vec[pos])

In [18]:
## Every developer title is identified by Job_Title_Vec_5=17, whatever its specialisation or level.
preview_cols = ['Job_Title'] + [f'Job_Title_Vec_{i}' for i in range(1, 6)]
is_developer = df_preprocess['Job_Title'].str.contains('developer')
df_preprocess.loc[is_developer, preview_cols].drop_duplicates()

Unnamed: 0,Job_Title,Job_Title_Vec_1,Job_Title_Vec_2,Job_Title_Vec_3,Job_Title_Vec_4,Job_Title_Vec_5
10,software_developer,0,0,0,13,17
27,junior_developer,0,0,0,3,17
70,web_developer,0,0,0,43,17
97,junior_software_developer,0,0,3,13,17
103,junior_web_developer,0,0,3,43,17
129,senior_software_developer,0,0,1,13,17
1203,back_end_developer,0,0,88,60,17
1206,front_end_developer,0,0,91,60,17
2011,developer,0,0,0,0,17


In [20]:
# Features of the first dataset variant: the encoded categoricals, the years of
# experience, the five job-title vector positions and the 'Salary' column.
job_title_vec_cols = [f'Job_Title_Vec_{i}' for i in range(1, 6)]
columns_select = [
    'Gender', 'Education_Level', 'Years_of_Experience', 'Country', 'Race',
    *job_title_vec_cols,
    'Salary',
]

df_preprocess.loc[:, columns_select].head(3)

Unnamed: 0,Gender,Education_Level,Years_of_Experience,Country,Race,Job_Title_Vec_1,Job_Title_Vec_2,Job_Title_Vec_3,Job_Title_Vec_4,Job_Title_Vec_5,Salary
0,1,0,5.0,3,4,0,0,0,13,12,90000.0
1,0,2,3.0,4,2,0,0,0,14,5,65000.0
2,1,3,15.0,1,4,0,0,0,1,2,150000.0


In [21]:
# Write the job-title-vector variant to CSV, without the index column.
filename = os.path.join(datapath, 'salary_preprocess_1.csv')
df_preprocess.loc[:, columns_select].to_csv(filename, index=False)

Será testado um dataset em que houve um simples encode do 'Job_Title'.

In [23]:
# Second dataset variant: plain label encoding of 'Job_Title' (one integer per title).
# NOTE: the text column is overwritten with its codes, so this cell is meant to
# run once per kernel session.
col = 'Job_Title'
df_preprocess[col] = le.fit_transform(df_preprocess[col])

# Save the class -> code mapping (codes as strings); class i is encoded as i.
le_json = {label: str(code) for code, label in enumerate(le.classes_)}
jsonpath = os.path.join(datapath, f'{col}.json')
with open(jsonpath, 'w') as f:
    json.dump(le_json, f)

In [24]:
# Features of the second dataset variant: the label-encoded 'Job_Title' replaces
# the five job-title vector columns.
columns_select = [
    'Gender',
    'Education_Level',
    'Years_of_Experience',
    'Country',
    'Race',
    'Job_Title',
    'Salary',
]

df_preprocess.loc[:, columns_select].head(3)

Unnamed: 0,Gender,Education_Level,Years_of_Experience,Country,Race,Job_Title,Salary
0,1,0,5.0,3,4,173,90000.0
1,0,2,3.0,4,2,15,65000.0
2,1,3,15.0,1,4,141,150000.0


In [25]:
# Write the label-encoded variant to CSV, without the index column.
filename = os.path.join(datapath, 'salary_preprocess_2.csv')
df_preprocess.loc[:, columns_select].to_csv(filename, index=False)