In [1]:
# libraries
import jedi
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import pandas_profiling
import pygments
import random 
import re
import seaborn as sns
from scipy import stats as st
import string
from tqdm import tqdm
import this

The Zen of Python, by Tim Peters

Beautiful is better than ugly.
Explicit is better than implicit.
Simple is better than complex.
Complex is better than complicated.
Flat is better than nested.
Sparse is better than dense.
Readability counts.
Special cases aren't special enough to break the rules.
Although practicality beats purity.
Errors should never pass silently.
Unless explicitly silenced.
In the face of ambiguity, refuse the temptation to guess.
There should be one-- and preferably only one --obvious way to do it.
Although that way may not be obvious at first unless you're Dutch.
Now is better than never.
Although never is often better than *right* now.
If the implementation is hard to explain, it's a bad idea.
If the implementation is easy to explain, it may be a good idea.
Namespaces are one honking great idea -- let's do more of those!


In [2]:
# Versions
# Report the installed version of every library that exposes `__version__`,
# one "<label> version: <x.y.z>" line per library.
library_versions = [
    ("jedi", jedi),
    ("numpy", np),
    ("pandas", pd),
    ("pandas profiling", pandas_profiling),
    ("Pygments", pygments),
    ("re", re),
    ("Seaborn", sns),
]
for label, module in library_versions:
    print(f"{label} version:", module.__version__)

jedi version: 0.15.1
numpy version: 1.17.2
pandas version: 0.25.1
pandas profiling version: 2.4.0
Pygments version: 2.4.2
re version: 2.2.1
Seaborn version: 0.9.0


In [3]:
# Directories & Files

# Datasets directory. Kept as a plain string ending in "/" because file names
# are appended to it with `+` when the CSV is loaded.
directory = "./datasets/"

# List the contents of the current working directory. This has to be the last
# expression of the cell: Jupyter only displays the value of the final
# expression, so calling it before the assignment silently discarded the result.
os.listdir()

## Database Documentation

Each of the 15 rating columns generated below corresponds to one of the following dimensions:

* Equal Opportunities for Women and Men: Promotions, leadership roles, salary increases, incentive programs, etc

* Female Representation in Leadership: Women on the executive team, in senior leadership, etc

* Management Opportunities: Your chances of becoming a manager of teams and talent

* Maternity and Adoptive Leave: Paid parental leave policies, job security, support for returning moms, etc

* Family Growth Support: Access to dedicated lactation rooms, child care, expense reimbursement, etc

* Paid Time Off: Sick days, vacation days, and personal days

* Flexible Work Hours: Ability to set your schedule as long as you get your work done

* Ability to Telecommute: Flexibility to work remotely

* Salary Satisfaction: Salary, merit increases, cost of living adjustments, overall comp

* Sponsorship or Mentorship Program: Official mentorship program, women-focused initiatives or affiliate groups

* Learning Opportunities: On and off-site skills training, speaker series, conferences, etc

* Wellness Initiatives: On-site gym, gym discounts, walking desks, healthy food options, etc

* Employer Responsiveness: Effective channels for elevating issues and concerns

* The People You Work With: Respectful, professional, unbiased, all those good things

* Social Activities and Environment: Happy hours, game room, company outings, and other perks

In [4]:
# Pools of categorical values that the synthetic rows are sampled from.
# The order of each list is preserved exactly as originally written.

# locations to populate
locations = [
    'Lisboa', 'Porto', 'Braga', 'Setúbal',
    'Aveiro', 'Faro', 'Leiria', 'Coimbra',
    'Santarém', 'Viseu', 'Viana do Castelo', 'Madeira',
    'Açores', 'Vila Real', 'Castelo Branco', 'Évora',
    'Guarda', 'Beja', 'Bragança', 'Portalegre',
]

# job types to populate
job_type = [
    'Full-time', 'Part-time', 'Contract',
    'Temporary', 'Internship', 'Other',
]

# job levels to populate
job_level = [
    'Early Career', 'Mid-Level',
    'Senior-Level', 'Executive',
]

# job departments to populate
job_department = [
    'Accounting & Finance', 'Admin', 'Business Development', 'Business Intelligence',
    'Communications', 'Customer Service', 'Design', 'Engineering',
    'Human Resources', 'IT', 'Legal', 'Management',
    'Marketing', 'Operations', 'Product', 'Production',
    'Research & Development', 'Sales', 'Tech', 'Other',
]

# creating companies, departments and titles to populate
# The CSV is ';'-separated and cp1252-encoded. Malformed rows are dropped with
# a warning instead of aborting the load.
_source_csv = directory + "data_for_random.csv"
try:
    # pandas >= 1.3: `on_bad_lines="warn"` is the replacement for
    # `error_bad_lines=False` (which was removed in pandas 2.0).
    company_department_jobtitle = pd.read_csv(_source_csv, sep=";",
                                              on_bad_lines="warn", encoding="cp1252")
except TypeError:
    # pandas < 1.3 rejects the unknown `on_bad_lines` keyword; use the legacy flag.
    company_department_jobtitle = pd.read_csv(_source_csv, sep=";",
                                              error_bad_lines=False, encoding="cp1252")

# the two possible employment states of a respondent
employment_status = ["Current Employee", "Former Employee"]

# candidate values for the last year of employment: 2000 through 2020 inclusive
last_year_of_employment = list(range(2000, 2021))

In [5]:
# Split the lookup table into three independent Series, one per column, and
# drop the NaN's of each.
#
# * `.dropna()` returns a new Series instead of mutating a column that was
#   selected from the frame (no `inplace=True`).
# * `.reset_index(drop=True)` renumbers the labels 0..n-1. `random.choice(series)`
#   picks an integer position and then looks it up as `series[position]`, i.e.
#   by label, so any gap left by a dropped row could raise a KeyError.
company = company_department_jobtitle["Company"].dropna().reset_index(drop=True)
department = company_department_jobtitle["Department"].dropna().reset_index(drop=True)
jobtitle = company_department_jobtitle["Job Title"].dropna().reset_index(drop=True)

In [6]:
# setting the size of the DF to populate
size = 1000

# Fixed seed so that "Restart Kernel -> Run All" regenerates the same synthetic data.
SEED = 42
random.seed(SEED)     # drives every random.choice(...) call in this notebook
np.random.seed(SEED)  # scipy's rvs() draws from NumPy's global state when no random_state is given

# setting the distribution to populate ratings: `size` draws from Normal(mean=3, sd=1)
# NOTE(review): the draws are unbounded, so rounded ratings can fall outside
# whatever scale the survey uses — confirm whether they should be clipped.
ratings = st.norm.rvs(loc=3, scale=1, size=size)

In [7]:
# pool of salaries to sample from: `size` draws from Normal(mean=18000, sd=8000)
salary = st.norm.rvs(size=size, loc=18000, scale=8000)

In [8]:
# creating and filling the columns for the DF
#
# Every column holds `size` values. The helpers below replace the comprehension
# that used to be repeated for each column; the columns are still generated in
# the same order, so the sequence of random draws is unchanged.

def _sample_column(pool):
    """Return `size` independent draws (with replacement) from `pool`."""
    return [random.choice(pool) for _ in range(size)]


def _rating_column():
    """Return `size` draws from `ratings`, each passed through `round()`."""
    return [round(random.choice(ratings)) for _ in range(size)]


def _random_comment(length=15):
    """Return a placeholder comment made of `length` random printable characters."""
    return "".join(random.choice(string.printable) for _ in range(length))


dataset = {
    # identifiers
    # NOTE(review): company_id is just the row position, so it is not tied to
    # company_name — confirm this is intended.
    "company_id": list(range(size)),
    "company_name": _sample_column(company),
    "user_id": list(range(1000, 1000 + size)),

    # employment details
    "employment_status": _sample_column(employment_status),
    "last_year_of_employment": _sample_column(last_year_of_employment),
    "location": _sample_column(locations),
    "job_location": _sample_column(locations),
    "job_type": _sample_column(job_type),
    "job_level": _sample_column(job_level),
    "department": _sample_column(job_department),
    "job_title": _sample_column(jobtitle),

    # ratings
    "equal_opportunity": _rating_column(),
    "leadership_female_representation": _rating_column(),
    "women_management_opportunities": _rating_column(),
    "maternity_adoptive_leave": _rating_column(),
    "family_growth_support": _rating_column(),
    "paid_time_off": _rating_column(),
    "flexible_work_hours": _rating_column(),
    "ability_to_telecommute": _rating_column(),
    "salary_satisfaction": _rating_column(),
    "sponsorship_or_mentorship_program": _rating_column(),
    "learning_opportunities": _rating_column(),
    "wellness_initiatives": _rating_column(),
    "employer_responsiveness": _rating_column(),
    "people_you_work_with": _rating_column(),
    "social_activities_environment": _rating_column(),
    "overall_satisfaction": _rating_column(),

    # free text and survey questions
    "public_comment": [_random_comment() for _ in range(size)],
    "asked_pay_raise_last_12_months": _sample_column([0, 1, 2]),
    "think_pt_buiz_doing_enough_to_address_gender_pay_gap": _sample_column([0, 1]),
    "think_gov_doing_enough_to_address_gender_pay_gap": _sample_column([0, 1]),
    "confidence_in_ability_to_negotiate_salary": _rating_column(),

    # demographics
    "sex_orientation": _sample_column(["Heterosexual", "Gay/Lesbian", "Bisexual", "Other"]),
    "gender": _sample_column(["Male", "Female", "Non-Binary", "Other"]),
    "marital_status": _sample_column([0, 1]),
    "children": _sample_column([0, 1]),
    "education": _sample_column([0, 1, 2]),

    # salary, rounded to 2 decimal places
    "salary": [round(amount, 2) for amount in _sample_column(salary)],
}

In [9]:
# build the DataFrame: one column per key of `dataset`, one row per generated record
df = pd.DataFrame(data=dataset)

In [10]:
# eyeball 10 randomly chosen rows of the freshly built DataFrame
df.sample(n=10)

Unnamed: 0,company_id,company_name,user_id,employment_status,last_year_of_employment,location,job_location,job_type,job_level,department,...,asked_pay_raise_last_12_months,think_pt_buiz_doing_enough_to_address_gender_pay_gap,think_gov_doing_enough_to_address_gender_pay_gap,confidence_in_ability_to_negotiate_salary,sex_orientation,gender,marital_status,children,education,salary
487,487,Altri,1487,Current Employee,2014,Viana do Castelo,Beja,Temporary,Executive,Business Development,...,1,0,1,3.0,Heterosexual,Male,1,1,2,30012.51
215,215,Sonae Indústria,1215,Current Employee,2015,Leiria,Guarda,Contract,Mid-Level,Tech,...,0,1,0,3.0,Heterosexual,Other,1,1,1,17370.7
104,104,Chipidea,1104,Former Employee,2002,Viana do Castelo,Guarda,Part-time,Mid-Level,Production,...,2,0,1,3.0,Heterosexual,Female,0,1,1,26161.17
663,663,Renova,1663,Current Employee,2015,Bragança,Coimbra,Internship,Executive,Production,...,2,1,1,5.0,Other,Female,0,0,2,6267.07
23,23,The Navigator Company,1023,Current Employee,2008,Madeira,Viana do Castelo,Temporary,Executive,Operations,...,0,0,0,3.0,Gay/Lesbian,Other,1,0,1,12333.62
922,922,Bial,1922,Current Employee,2020,Guarda,Guarda,Part-time,Mid-Level,Business Development,...,2,0,0,2.0,Gay/Lesbian,Other,1,1,0,16329.6
188,188,Sumol + Compal,1188,Former Employee,2017,Guarda,Aveiro,Part-time,Senior-Level,Other,...,1,1,1,3.0,Bisexual,Male,0,1,2,12165.64
73,73,Critical Software,1073,Former Employee,2005,Portalegre,Bragança,Part-time,Executive,Product,...,1,1,1,3.0,Gay/Lesbian,Male,1,0,1,946.45
685,685,Pingo Doce,1685,Current Employee,2000,Viana do Castelo,Santarém,Other,Mid-Level,Operations,...,1,0,1,4.0,Gay/Lesbian,Male,1,1,1,11286.65
90,90,Sumol + Compal,1090,Current Employee,2017,Viana do Castelo,Santarém,Contract,Mid-Level,Admin,...,1,1,0,2.0,Gay/Lesbian,Male,1,1,2,12677.88


In [11]:
# keep last_year_of_employment coherent with employment_status:
# every row marked "Current Employee" gets 2020 as its last year of employment
year = "last_year_of_employment"
current_employee = df["employment_status"].eq("Current Employee")

df.loc[current_employee, year] = 2020

In [12]:
# eyeball 10 random rows again, after the last_year_of_employment adjustment
df.sample(n=10)

Unnamed: 0,company_id,company_name,user_id,employment_status,last_year_of_employment,location,job_location,job_type,job_level,department,...,asked_pay_raise_last_12_months,think_pt_buiz_doing_enough_to_address_gender_pay_gap,think_gov_doing_enough_to_address_gender_pay_gap,confidence_in_ability_to_negotiate_salary,sex_orientation,gender,marital_status,children,education,salary
267,267,Altri,1267,Current Employee,2020,Setúbal,Viseu,Temporary,Senior-Level,Tech,...,1,0,1,3.0,Heterosexual,Male,0,0,2,13424.72
929,929,Sonae Indústria,1929,Current Employee,2020,Santarém,Madeira,Internship,Mid-Level,Operations,...,2,1,0,3.0,Heterosexual,Female,0,1,1,9158.51
654,654,Unicer Brewery,1654,Former Employee,2020,Coimbra,Madeira,Contract,Senior-Level,Business Intelligence,...,2,0,0,3.0,Bisexual,Female,0,1,1,16548.53
507,507,Sonae Indústria,1507,Current Employee,2020,Vila Real,Viana do Castelo,Part-time,Executive,IT,...,1,1,0,2.0,Other,Non-Binary,1,1,0,15878.4
414,414,Montepio,1414,Current Employee,2020,Beja,Lisboa,Contract,Mid-Level,Engineering,...,2,0,0,3.0,Other,Female,1,0,1,17754.81
366,366,Lactogal,1366,Former Employee,2001,Faro,Vila Real,Temporary,Mid-Level,Production,...,2,0,0,2.0,Heterosexual,Male,1,0,0,16343.02
55,55,Lactogal,1055,Current Employee,2020,Guarda,Guarda,Part-time,Executive,Sales,...,1,1,0,5.0,Other,Other,1,1,0,23715.03
150,150,UMM,1150,Current Employee,2020,Braga,Aveiro,Temporary,Mid-Level,Business Development,...,2,1,1,3.0,Gay/Lesbian,Other,1,0,2,3489.85
189,189,Tupam editores,1189,Former Employee,2015,Coimbra,Vila Real,Internship,Early Career,Other,...,1,0,1,3.0,Bisexual,Female,1,0,2,24524.69
749,749,The Navigator Company,1749,Former Employee,2007,Madeira,Santarém,Other,Senior-Level,Operations,...,0,1,0,3.0,Heterosexual,Female,0,0,2,2896.05


In [13]:
# export the DataFrame as a JSON array with one object per row
output_file = "base_de_dados.json"
df.to_json(output_file, orient="records")