In [None]:
import json
import pandas as pd
import numpy as np
import opendatasets as od
import seaborn as sns

import matplotlib.pyplot as plt

from utils.attributes import TARGET_COLUMN, SENSITIVE_ATTRIBUTES, NOT_FEATURES, SALARY_MAPPING

# Kaggle page of the "State of Data 2022" dataset; handed to opendatasets for download.
DATASET_URL = 'https://www.kaggle.com/datasets/datahackers/state-of-data-2022/data'

In [None]:
# Download the dataset referenced by DATASET_URL.
# NOTE(review): the read below presumes the files land in ./state-of-data-2022 — confirm.
od.download(DATASET_URL)

# Load the survey CSV into the working DataFrame `df`.
df = pd.read_csv('state-of-data-2022/State_of_Data_2022.csv')

In [None]:
# Show the first five rows of the dataset.
df.head(5)

In [None]:
# Show the dataset's column labels (still the raw labels at this point).
df.columns

In [None]:
# Mapeando colunas do dataset
def get_column_mapping(columns):
    """Derive short column codes from the raw column labels.

    Each label is stripped of leading/trailing ``(``, ``'`` and ``)``
    characters, has every remaining ``'`` removed, and is split on ``', '``.
    The first piece (whitespace-trimmed) is the short code; the remaining
    pieces, re-joined with ``', '``, are its description.

    Side effect: writes ``column_mapping.json`` (UTF-8, indent 4) in the
    current working directory, containing ``{code: description}`` sorted by
    code.

    Returns a dict mapping each raw label to its short code, suitable for
    ``DataFrame.rename(columns=...)``.
    """
    renames = {}
    descriptions = {}

    for raw_label in columns:
        code, *description_parts = (
            raw_label.strip("(')").replace("'", "").split(', ')
        )
        code = code.strip()

        descriptions[code] = ', '.join(description_parts)
        renames[raw_label] = code

    # Keys are unique, so sorting the items orders the entries by code.
    ordered_descriptions = dict(sorted(descriptions.items()))
    with open('column_mapping.json', 'w', encoding='utf-8') as handle:
        json.dump(ordered_descriptions, handle, ensure_ascii=False, indent=4)

    return renames

# Build the raw-label -> short-code mapping (this also writes column_mapping.json)
# and rename the DataFrame's columns to the short codes.
column_mapping = get_column_mapping(df.columns)
df = df.rename(columns=column_mapping)

In [None]:
# Tratando colunas de atributos sensíveis e faixa salarial
def treat_sensitive_attributes(df):
    """Replace each sensitive-attribute column with its coarse category.

    The columns are looked up in ``SENSITIVE_ATTRIBUTES`` under the keys
    ``'age'``, ``'gender'``, ``'race_color'`` and ``'pwd'``; each is passed
    element-wise through the matching ``categorize_*`` function.

    The frame is modified in place and also returned.
    """
    categorizers = (
        ('age', categorize_age),
        ('gender', categorize_gender),
        ('race_color', categorize_race_color),
        ('pwd', categorize_pwd),
    )

    for attribute, categorize in categorizers:
        column = SENSITIVE_ATTRIBUTES[attribute]
        df[column] = df[column].apply(categorize)

    return df


def categorize_age(age):
    """Bucket a numeric age into ``'18-40'`` or ``'40+'``.

    Ages up to and including 40 map to ``'18-40'``; ages above 40 map to
    ``'40+'``. A value for which both comparisons are false (e.g. NaN) maps
    to ``'Sem Resposta'``.
    """
    if age <= 40:
        return '18-40'
    if age > 40:
        return '40+'
    # Reached only when neither comparison holds, which is the case for NaN.
    return 'Sem Resposta'
    

def categorize_gender(gender):
    """Keep ``'Masculino'`` / ``'Feminino'``; map anything else to ``'Sem Resposta'``."""
    return gender if gender in ('Masculino', 'Feminino') else 'Sem Resposta'
    

def categorize_race_color(race_color):
    """Collapse the race/colour answer into three groups.

    ``'Branca'`` is kept as is; ``'Parda'``, ``'Preta'``, ``'Amarela'``,
    ``'Indígena'`` and ``'Outra'`` become ``'Não Branca'``; every other value
    becomes ``'Sem Resposta'``.
    """
    non_white_answers = ('Parda', 'Preta', 'Amarela', 'Indígena', 'Outra')

    if race_color == 'Branca':
        return race_color
    if race_color in non_white_answers:
        return 'Não Branca'
    return 'Sem Resposta'
    

def categorize_pwd(pwd):
    """Keep ``'Sim'`` / ``'Não'``; map anything else to ``'Sem Resposta'``."""
    return pwd if pwd in ('Sim', 'Não') else 'Sem Resposta'
    
def map_salaries(df, column='P2_h', mapping=None):
    """Drop rows without a salary answer and encode the salary band.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``column``. It is not modified.
    column : str, default 'P2_h'
        Name of the column holding the salary-band answer.
    mapping : dict, optional
        Lookup from raw answer to encoded value. Defaults to the
        ``SALARY_MAPPING`` imported from ``utils.attributes``.

    Returns
    -------
    pandas.DataFrame
        A new frame without the rows where ``column`` is missing and with
        ``column`` replaced by its mapped values. Answers absent from
        ``mapping`` become NaN (``Series.map`` behaviour).
    """
    if mapping is None:
        mapping = SALARY_MAPPING

    # Explicit copy so the assignment below targets an independent frame
    # and cannot trigger chained-assignment warnings.
    answered = df.dropna(subset=[column]).copy()

    # Plain column assignment replaces the column, so its dtype follows the
    # mapped values. `answered.loc[:, column] = ...` would instead write into
    # the existing column on pandas >= 2.0 and keep its original object dtype.
    answered[column] = answered[column].map(mapping)
    return answered

    
# Bucket the sensitive-attribute columns and report the resulting shape.
# NOTE(review): `df` is rebound here, so re-running this cell feeds the already
# categorized labels back into the categorizers (categorize_age compares its
# input against 40) — run it once per fresh load.
df = treat_sensitive_attributes(df)
print(f'Full dataset shape: {df.shape}')

# Drop rows without a salary answer and encode 'P2_h' via SALARY_MAPPING.
df = map_salaries(df)
print(f'Dataset shape after map salaries: {df.shape}')


In [None]:
# Count of respondents per encoded salary band ('P2_h').
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=df, x='P2_h', palette='viridis', ax=ax)
ax.set_title('Distribuição de Salários')
ax.set_xlabel('Faixa Salarial')
ax.set_ylabel('Quantidade')
# One legend entry per SALARY_MAPPING item, written as "<mapped value> = <original answer>".
legend_labels = [f'{code} = {band}' for band, code in SALARY_MAPPING.items()]
ax.legend(legend_labels)
plt.show()

In [None]:
# Count of respondents per encoded salary band, split by column 'P1_b'.
# NOTE(review): 'P1_b' is presumably one of the SENSITIVE_ATTRIBUTES columns — confirm which.
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=df, x='P2_h', hue='P1_b', ax=ax)
ax.set_title('Distribuição de Salários')
ax.set_xlabel('Faixa Salarial')
ax.set_ylabel('Quantidade')
# Legend labels are the distinct 'P1_b' values in order of first appearance.
# NOTE(review): confirm this order matches the hue order seaborn draws.
legend_labels = df['P1_b'].unique().tolist()
ax.legend(legend_labels)
plt.show()

In [None]:
# Count of respondents per encoded salary band, split by column 'P1_a'
# (the legend below is titled 'Idade', i.e. age).
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=df, x='P2_h', hue='P1_a', ax=ax)
ax.set_title('Distribuição de Salários')
ax.set_xlabel('Faixa Salarial')
ax.set_ylabel('Quantidade')
# Legend labels are the distinct 'P1_a' values in order of first appearance.
# NOTE(review): confirm this order matches the hue order seaborn draws.
legend_labels = df['P1_a'].unique().tolist()
ax.legend(legend_labels, title='Idade')
plt.show()

In [None]:
# Drop the columns that are not model features: the sensitive-attribute columns
# (values of SENSITIVE_ATTRIBUTES) and every column listed as a key of NOT_FEATURES.
columns_to_drop = list(SENSITIVE_ATTRIBUTES.values()) + list(NOT_FEATURES.keys())
df = df.drop(columns=columns_to_drop)
# Rows with a missing 'P2_h' were already removed earlier by map_salaries.
print(f'Dataset shape after drop columns and nan target: {df.shape}')

# Persist the cleaned dataset as CSV and Excel, without the index column.
df.to_csv('state-of-data-2022/State_of_Data_2022_cleaned.csv', index=False)
df.to_excel('state-of-data-2022/State_of_Data_2022_cleaned.xlsx', index=False)