In [1]:
# libraries
import jedi
import json
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import pandas_profiling
import pygments
import random 
import re
import seaborn as sns
from scipy import stats as st
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier 
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import classification_report, accuracy_score
import string
from tqdm import tqdm
import this

The Zen of Python, by Tim Peters

Beautiful is better than ugly.
Explicit is better than implicit.
Simple is better than complex.
Complex is better than complicated.
Flat is better than nested.
Sparse is better than dense.
Readability counts.
Special cases aren't special enough to break the rules.
Although practicality beats purity.
Errors should never pass silently.
Unless explicitly silenced.
In the face of ambiguity, refuse the temptation to guess.
There should be one-- and preferably only one --obvious way to do it.
Although that way may not be obvious at first unless you're Dutch.
Now is better than never.
Although never is often better than *right* now.
If the implementation is hard to explain, it's a bad idea.
If the implementation is easy to explain, it may be a good idea.
Namespaces are one honking great idea -- let's do more of those!


In [2]:
# Versions
print("jedi version:", jedi.__version__)
print("json version:", json.__version__)
print("numpy version:", np.__version__)
print("pandas version:", pd.__version__)
print("pandas profiling version:", pandas_profiling.__version__)
print("Pygments version:", pygments.__version__)
print("re version:", re.__version__)
print("Seaborn version:", sns.__version__)
print("SKLearn version:", sklearn.__version__)

jedi version: 0.15.1
json version: 2.0.9
numpy version: 1.17.2
pandas version: 0.25.1
pandas profiling version: 2.4.0
Pygments version: 2.4.2
re version: 2.2.1
Seaborn version: 0.9.0
SKLearn version: 0.21.3


In [3]:
# Directories & Files
os.listdir()

# Datasets directory
directory = "./datasets/"

## Database Documentation: 

* Equal Opportunities for Women and Men: Promotions, leadership roles, salary increases, incentive programs, etc

* Female Representation in Leadership: Women on the executive team, in senior leadership, etc

* Management Opportunities: Your chances of becoming a manager of teams and talent

* Maternity and Adoptive Leave: Paid parental leave policies, job security, support for returning moms, etc

* Family Growth Support: Access to dedicated lactation rooms, child care, expense reimbursement, etc

* Paid Time Off: Sick days, vacation days, and personal days

* Flexible Work Hours: Ability to set your schedule as long as you get your work done

* Ability to Telecommute: Flexibility to work remotely

* Salary Satisfaction: Salary, merit increases, cost of living adjustments, overall comp

* Sponsorship or Mentorship Program: Official mentorship program, women-focused initiatives or affiliate groups

* Learning Opportunities: On and off-site skills training, speaker series, conferences, etc

* Wellness Initiatives: On-site gym, gym discounts, walking desks, healthy food options, etc

* Employer Responsiveness: Effective channels for elevating issues and concerns

* The People You Work With: Respectful, professional, unbiased, all those good things

* Social Activities and Environment: Happy hours, game room, company outings, and other perks

In [4]:
# creating locations to populate
locations = ['Lisboa', 'Porto', 'Braga', 'Setúbal', 'Aveiro', 'Faro', 'Leiria', 'Coimbra', 
             'Santarém', 'Viseu','Viana do Castelo', 'Madeira', 'Açores', 'Vila Real', 
             'Castelo Branco', 'Évora', 'Guarda', 'Beja', 'Bragança', 'Portalegre']

# creating job_types to populate
job_type = ['Full-time', 'Part-time', 'Contract', 'Temporary', 'Internship', 'Other']

# creating job_levels to populate
job_level = ['Early Career', 'Mid-Level', 'Senior-Level', 'Executive']

# creating job_departments to populate
job_department = ['Accounting & Finance', 'Admin', 'Business Development', 
                  'Business Intelligence', 'Communications', 'Customer Service', 'Design', 
                  'Engineering', 'Human Resources', 'IT', 'Legal', 'Management', 'Marketing', 
                  'Operations', 'Product', 'Production', 'Research & Development', 'Sales', 
                  'Tech', 'Other']

# creating companies, departments and titles to populate
company_department_jobtitle = pd.read_csv(directory + "data_for_random.csv", sep = ";", 
                                     error_bad_lines = False, encoding = 'cp1252')

# creating employment_status
employment_status = ["Current Employee", "Former Employee"]

# creating last_year_of_employment
last_year_of_employment = [i for i in range(2000, 2021)]

In [5]:
# separating into 3 df's
company = company_department_jobtitle["Company"]
department = company_department_jobtitle["Department"]
jobtitle = company_department_jobtitle["Job Title"]

# cleaning the NaN's
company.dropna(inplace = True)
department.dropna(inplace = True)
jobtitle.dropna(inplace = True)

In [6]:
# setting the size of the DF to populate
size = 10000

# setting the distribution to populate ratings
ratings = st.norm.rvs(loc = 3 , scale = 1, size = size)

# setting another distribuition to vary the distribuitions to pop ratings
p_ratings = st.poisson.rvs(loc = 3 , mu = 3, size = size)

In [7]:
# setting the distribution to populate salaries
salary = st.norm.rvs(loc = 18000, scale = 8000, size = size)

In [8]:
# creating and filling the columns for the DF
dataset = {"company_id": [i for i in range(size)], 
          "company_name": [random.choice(company) for i in range(size)], 
          "user_id": [1000 + i for i in range(size)], 
          "employment_status": [random.choice(employment_status) for i in range(size)], 
          "last_year_of_employment": [random.choice(last_year_of_employment) for i in range(size)], 
          "location": [random.choice(locations) for i in range(size)], 
          "job_location": [random.choice(locations) for i in range(size)], 
          "job_type": [random.choice(job_type) for i in range(size)], 
          "job_level": [random.choice(job_level) for i in range(size)], 
          "department": [random.choice(job_department) for i in range(size)], 
          "job_title": [random.choice(jobtitle) for i in range(size)], 
          "equal_opportunity": [round(random.choice(ratings)) for i in range(size)], 
          "leadership_female_representation": [round(random.choice(p_ratings)) for i in range(size)], 
          "women_management_opportunities": [round(random.choice(p_ratings)) for i in range(size)], 
          "maternity_adoptive_leave": [round(random.choice(ratings)) for i in range(size)], 
          "family_growth_support": [round(random.choice(p_ratings)) for i in range(size)], 
          "paid_time_off": [round(random.choice(ratings)) for i in range(size)], 
          "flexible_work_hours": [round(random.choice(ratings)) for i in range(size)], 
          "ability_to_telecommute": [round(random.choice(p_ratings)) for i in range(size)], 
          "salary_satisfaction": [round(random.choice(ratings)) for i in range(size)], 
          "sponsorship_or_mentorship_program": [round(random.choice(p_ratings)) for i in range(size)], 
          "learning_opportunities": [round(random.choice(ratings)) for i in range(size)], 
          "wellness_initiatives": [round(random.choice(ratings)) for i in range(size)], 
          "employer_responsiveness": [round(random.choice(p_ratings)) for i in range(size)], 
          "people_you_work_with": [round(random.choice(ratings)) for i in range(size)], 
          "social_activities_environment": [round(random.choice(p_ratings)) for i in range(size)], 
          "overall_satisfaction": [round(random.choice(ratings)) for i in range(size)], 
          "public_comment": ["".join([random.choice(string.printable) for i in range(15)]) for j in range(size)], 
          "asked_pay_raise_last_12_months": [random.choice([0, 1, 2]) for i in range(size)], 
          "think_pt_buiz_doing_enough_to_address_gender_pay_gap": [random.choice([0, 1]) for i in range(size)],
          "think_gov_doing_enough_to_address_gender_pay_gap": [random.choice([0, 1]) for i in range(size)], 
          "confidence_in_ability_to_negotiate_salary": [round(random.choice(ratings)) for i in range(size)], 
          "sex_orientation": [random.choice(["Heterosexual", "Gay/Lesbian", "Bisexual", "Other"]) for i in range(size)], 
          "gender": [random.choice(["Male", "Female", "Non-Binary", "Other"]) for i in range(size)], 
          "marital_status": [random.choice([0, 1]) for i in range(size)], 
          "children": [random.choice([0, 1]) for i in range(size)], 
          "education": [random.choice([0, 1, 2]) for i in range(size)],
          "salary": [round(random.choice(salary), 2) for i in range(size)]}

In [9]:
# initializing the DF
df = pd.DataFrame(dataset)

In [10]:
# checking the df
df.sample(10)

Unnamed: 0,company_id,company_name,user_id,employment_status,last_year_of_employment,location,job_location,job_type,job_level,department,...,asked_pay_raise_last_12_months,think_pt_buiz_doing_enough_to_address_gender_pay_gap,think_gov_doing_enough_to_address_gender_pay_gap,confidence_in_ability_to_negotiate_salary,sex_orientation,gender,marital_status,children,education,salary
7573,7573,Corticeira Amorim,8573,Current Employee,2014,Évora,Setúbal,Temporary,Early Career,Customer Service,...,1,1,0,2.0,Gay/Lesbian,Female,1,1,1,11667.35
8586,8586,Uniplaces,9586,Current Employee,2006,Vila Real,Vila Real,Internship,Senior-Level,Human Resources,...,1,0,1,3.0,Gay/Lesbian,Female,0,1,1,19732.77
4869,4869,Pingo Doce,5869,Former Employee,2003,Açores,Castelo Branco,Other,Mid-Level,Business Intelligence,...,0,1,0,1.0,Gay/Lesbian,Non-Binary,0,1,0,26131.23
9739,9739,Ironhack,10739,Former Employee,2002,Santarém,Coimbra,Other,Senior-Level,Sales,...,0,1,1,1.0,Bisexual,Other,0,0,0,19726.75
1310,1310,Sonae,2310,Current Employee,2014,Viseu,Lisboa,Contract,Executive,Engineering,...,1,0,1,2.0,Other,Other,0,0,1,17026.82
8425,8425,Caixa Geral de Depósitos,9425,Former Employee,2016,Guarda,Viseu,Internship,Early Career,Other,...,1,1,1,3.0,Other,Non-Binary,0,1,2,25179.34
4181,4181,Critical Software,5181,Current Employee,2006,Faro,Castelo Branco,Internship,Senior-Level,Legal,...,0,1,1,4.0,Bisexual,Male,1,1,0,26971.0
7492,7492,Cofina,8492,Current Employee,2005,Vila Real,Castelo Branco,Part-time,Executive,IT,...,2,1,1,2.0,Bisexual,Male,1,1,1,14446.5
908,908,Montepio,1908,Current Employee,2002,Guarda,Madeira,Internship,Executive,Production,...,2,0,0,3.0,Heterosexual,Male,0,0,2,27098.48
6387,6387,Ciberbit,7387,Current Employee,2006,Porto,Bragança,Other,Executive,Engineering,...,0,0,1,2.0,Bisexual,Non-Binary,0,0,1,16328.59


In [11]:
# changing last year of employment to be coherent w/last_year_of_employment
current_employee = df["employment_status"] == "Current Employee"
year = "last_year_of_employment"

df.loc[current_employee, year] = 2020

In [12]:
# changing ratings to be between 0 and 5 
for col in df.columns:
    if (df[col].dtype == "float64") & (col != "salary"):
        small_mask = (df[col] < 0)
        big_mask = (df[col] > 5)
        df.loc[small_mask, col] = 0
        df.loc[big_mask, col] = 0
        
        # checking
        print((df[col] < 0).sum())

# changing salary to be > 0
less_than_0_salary = (df["salary"] < 0)

df.loc[less_than_0_salary, "salary"] = 0

# checking
(df["salary"] < 0).sum()

0
0
0
0
0
0
0
0
0
0


0

In [13]:
df.sample(10)

Unnamed: 0,company_id,company_name,user_id,employment_status,last_year_of_employment,location,job_location,job_type,job_level,department,...,asked_pay_raise_last_12_months,think_pt_buiz_doing_enough_to_address_gender_pay_gap,think_gov_doing_enough_to_address_gender_pay_gap,confidence_in_ability_to_negotiate_salary,sex_orientation,gender,marital_status,children,education,salary
6621,6621,Media Capital,7621,Current Employee,2020,Açores,Portalegre,Full-time,Mid-Level,Sales,...,1,0,0,3.0,Gay/Lesbian,Female,0,1,0,30892.77
939,939,Biotecnol,1939,Former Employee,2000,Porto,Évora,Part-time,Executive,Business Development,...,2,0,0,5.0,Other,Male,0,1,2,12230.39
5117,5117,Ciberbit,6117,Current Employee,2020,Évora,Lisboa,Part-time,Executive,Business Development,...,1,0,0,2.0,Bisexual,Non-Binary,0,1,2,17485.86
5330,5330,Quidgest,6330,Current Employee,2020,Viseu,Bragança,Internship,Early Career,Legal,...,1,0,1,4.0,Gay/Lesbian,Female,0,1,1,17502.3
7236,7236,Sovena Group,8236,Former Employee,2010,Coimbra,Guarda,Full-time,Early Career,Legal,...,1,1,1,4.0,Heterosexual,Non-Binary,0,0,2,7356.14
6108,6108,Sovena Group,7108,Former Employee,2013,Santarém,Açores,Part-time,Early Career,Human Resources,...,2,1,0,1.0,Other,Non-Binary,1,1,0,10851.38
7617,7617,Quidgest,8617,Former Employee,2011,Setúbal,Viseu,Internship,Senior-Level,Management,...,0,0,1,5.0,Gay/Lesbian,Non-Binary,1,1,1,20239.58
871,871,Move Interactive,1871,Former Employee,2013,Braga,Aveiro,Other,Mid-Level,Sales,...,1,0,0,2.0,Gay/Lesbian,Male,1,1,0,27965.61
2831,2831,The Navigator Company,3831,Former Employee,2012,Lisboa,Évora,Part-time,Senior-Level,Operations,...,1,1,0,5.0,Bisexual,Male,0,0,2,8867.85
4056,4056,Portugal Telecom,5056,Current Employee,2020,Bragança,Bragança,Part-time,Executive,Human Resources,...,1,0,0,4.0,Other,Female,1,1,0,13329.26


In [14]:
# reducing the table
df.drop(["user_id", "employment_status", "job_location", "public_comment"], axis = 1, 
        inplace = True)

In [15]:
# avg's of sections
print(df.columns)
df["opportunity"] = round((df["equal_opportunity"] + df["leadership_female_representation"] + df["women_management_opportunities"]) / 3, 2)
df["family"] = round((df["maternity_adoptive_leave"] + df["family_growth_support"]) / 2, 2)
df["schedule_flex"] = round((df["paid_time_off"] + df["flexible_work_hours"] + df["ability_to_telecommute"]) / 3, 2)
df["enrichment"] = round((df["salary_satisfaction"] + df["sponsorship_or_mentorship_program"] + df["learning_opportunities"] + df["wellness_initiatives"]) / 4, 2)
df["culture"] = round((df["employer_responsiveness"] + df["people_you_work_with"] + df["social_activities_environment"]) / 3, 2)

# changing overall rating
df["overall_satisfaction"] = round((df["opportunity"] + df["family"] + df["schedule_flex"] + df["enrichment"] + df["culture"]) / 5, 2)

# checking
df["opportunity"]
df["family"]
df["schedule_flex"]
df["enrichment"]
df["culture"]
df["overall_satisfaction"]
df

Index(['company_id', 'company_name', 'last_year_of_employment', 'location',
       'job_type', 'job_level', 'department', 'job_title', 'equal_opportunity',
       'leadership_female_representation', 'women_management_opportunities',
       'maternity_adoptive_leave', 'family_growth_support', 'paid_time_off',
       'flexible_work_hours', 'ability_to_telecommute', 'salary_satisfaction',
       'sponsorship_or_mentorship_program', 'learning_opportunities',
       'wellness_initiatives', 'employer_responsiveness',
       'people_you_work_with', 'social_activities_environment',
       'overall_satisfaction', 'asked_pay_raise_last_12_months',
       'think_pt_buiz_doing_enough_to_address_gender_pay_gap',
       'think_gov_doing_enough_to_address_gender_pay_gap',
       'confidence_in_ability_to_negotiate_salary', 'sex_orientation',
       'gender', 'marital_status', 'children', 'education', 'salary'],
      dtype='object')


Unnamed: 0,company_id,company_name,last_year_of_employment,location,job_type,job_level,department,job_title,equal_opportunity,leadership_female_representation,...,gender,marital_status,children,education,salary,opportunity,family,schedule_flex,enrichment,culture
0,0,Tupam editores,2000,Braga,Contract,Executive,Sales,Management Analyst,2.0,8,...,Non-Binary,0,1,0,5918.37,5.67,4.0,3.00,3.50,6.00
1,1,Fábrica Nacional de Munições de Armas Ligeiras,2020,Setúbal,Internship,Early Career,Other,Regional Manager,3.0,5,...,Other,0,0,1,21819.95,5.33,5.5,5.33,4.25,5.00
2,2,SIC,2011,Portalegre,Other,Senior-Level,Marketing,Shipping Clerk,3.0,3,...,Non-Binary,0,1,0,12580.97,4.33,4.5,3.67,4.75,3.00
3,3,NOS,2000,Aveiro,Temporary,Early Career,Research & Development,Supply Chain Analyst,3.0,9,...,Male,1,0,0,3947.42,6.33,5.5,4.00,3.50,4.33
4,4,Caixa Geral de Depósitos,2020,Castelo Branco,Part-time,Early Career,Design,Customer Care Representative,4.0,7,...,Other,1,1,1,18423.79,6.67,5.5,4.33,3.25,4.33
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9995,Tranquilidade,2003,Vila Real,Other,Early Career,Legal,Line Cook,1.0,10,...,Male,1,1,0,19977.52,5.67,3.0,4.67,4.00,6.67
9996,9996,Bial,2020,Braga,Contract,Mid-Level,Research & Development,Production Artist,3.0,6,...,Non-Binary,1,1,1,8818.03,5.00,4.0,4.00,3.50,3.67
9997,9997,CMS-Helmets,2012,Açores,Temporary,Mid-Level,Other,Actor,3.0,7,...,Non-Binary,1,1,0,20894.01,4.33,4.0,4.33,4.50,3.67
9998,9998,Soares da Costa,2013,Braga,Other,Early Career,Admin,Energy Consultant,3.0,5,...,Female,0,0,2,8827.88,4.33,7.0,2.67,2.75,4.33


In [16]:
# creating smaller DF
ratings = df[["company_id", "company_name", "location", "opportunity", "family", "schedule_flex", "enrichment", "culture", 
              "overall_satisfaction"]]

ratings

Unnamed: 0,company_id,company_name,location,opportunity,family,schedule_flex,enrichment,culture,overall_satisfaction
0,0,Tupam editores,Braga,5.67,4.0,3.00,3.50,6.00,4.43
1,1,Fábrica Nacional de Munições de Armas Ligeiras,Setúbal,5.33,5.5,5.33,4.25,5.00,5.08
2,2,SIC,Portalegre,4.33,4.5,3.67,4.75,3.00,4.05
3,3,NOS,Aveiro,6.33,5.5,4.00,3.50,4.33,4.73
4,4,Caixa Geral de Depósitos,Castelo Branco,6.67,5.5,4.33,3.25,4.33,4.82
...,...,...,...,...,...,...,...,...,...
9995,9995,Tranquilidade,Vila Real,5.67,3.0,4.67,4.00,6.67,4.80
9996,9996,Bial,Braga,5.00,4.0,4.00,3.50,3.67,4.03
9997,9997,CMS-Helmets,Açores,4.33,4.0,4.33,4.50,3.67,4.17
9998,9998,Soares da Costa,Braga,4.33,7.0,2.67,2.75,4.33,4.22


In [17]:
# exporting db to json
df.to_json(r"base_de_dados.json", orient = "records")

In [18]:
# exporting ratings db to json
ratings.to_json(r"ratings.json", orient = "records")

In [19]:
# exporting ratings db to csv
ratings.to_csv(directory + r"ratings.csv")

In [20]:
# reading updated json file
updated_data = pd.read_json(r"ratings.json")

In [21]:
updated_data

Unnamed: 0,company_id,company_name,location,opportunity,family,schedule_flex,enrichment,culture,overall_satisfaction
0,0,Tupam editores,Braga,5.67,4.0,3.00,3.50,6.00,4.43
1,1,Fábrica Nacional de Munições de Armas Ligeiras,Setúbal,5.33,5.5,5.33,4.25,5.00,5.08
2,2,SIC,Portalegre,4.33,4.5,3.67,4.75,3.00,4.05
3,3,NOS,Aveiro,6.33,5.5,4.00,3.50,4.33,4.73
4,4,Caixa Geral de Depósitos,Castelo Branco,6.67,5.5,4.33,3.25,4.33,4.82
...,...,...,...,...,...,...,...,...,...
9995,9995,Tranquilidade,Vila Real,5.67,3.0,4.67,4.00,6.67,4.80
9996,9996,Bial,Braga,5.00,4.0,4.00,3.50,3.67,4.03
9997,9997,CMS-Helmets,Açores,4.33,4.0,4.33,4.50,3.67,4.17
9998,9998,Soares da Costa,Braga,4.33,7.0,2.67,2.75,4.33,4.22


In [22]:
# Random Forest Prediction Model
# creating features
X = df.drop(["company_id", "company_name", "last_year_of_employment", "location", "job_type",
            "job_level", "department", "job_title", "sex_orientation", "gender", "salary",
            "opportunity", "family", "schedule_flex", "enrichment", "culture"], axis = 1)

# creating labels
y = df["company_name"]

# Spliting the dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [23]:
# creating a Gaussian Classifier
clf = RandomForestClassifier(n_estimators = 100)

# training the model using the training sets y_pred = clf.predict(X_test)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

In [24]:
# Model Accuracy, how often is the classifier correct?
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.017


In [25]:
# checking the importance of the features to predict
feature_imp = pd.Series(clf.feature_importances_, index = X.columns).sort_values(ascending = False)
feature_imp

overall_satisfaction                                    0.083161
women_management_opportunities                          0.054394
social_activities_environment                           0.054358
sponsorship_or_mentorship_program                       0.054228
family_growth_support                                   0.053605
ability_to_telecommute                                  0.053574
leadership_female_representation                        0.053108
employer_responsiveness                                 0.050869
equal_opportunity                                       0.044994
paid_time_off                                           0.044677
learning_opportunities                                  0.044407
people_you_work_with                                    0.043843
salary_satisfaction                                     0.043493
flexible_work_hours                                     0.043373
confidence_in_ability_to_negotiate_salary               0.043176
maternity_adoptive_leave 

In [26]:
"""
def update_panel_json(input_json, target_key, update_value):
    if type(input_json) is dict and input_json:
        for key in input_json:
            if key == target_key:
                input_json[key] = update_value
            update_panel_json(input_json[key], target_key, update_value)

    elif type(input_json) is list and input_json:
        for entity in input_json:
            update_panel_json(entity, target_key, update_value)
"""

'\ndef update_panel_json(input_json, target_key, update_value):\n    if type(input_json) is dict and input_json:\n        for key in input_json:\n            if key == target_key:\n                input_json[key] = update_value\n            update_panel_json(input_json[key], target_key, update_value)\n\n    elif type(input_json) is list and input_json:\n        for entity in input_json:\n            update_panel_json(entity, target_key, update_value)\n'