In [147]:
# libraries
import jedi
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import pandas_profiling
import pygments
import random 
import re
import seaborn as sns
from scipy import stats as st
import string
from tqdm import tqdm
import this

In [149]:
# Versions: print "<label> version: <x.y.z>" for every dependency that exposes __version__
versioned_modules = [
    ("jedi", jedi),
    ("numpy", np),
    ("pandas", pd),
    ("pandas profiling", pandas_profiling),
    ("Pygments", pygments),
    ("re", re),
    ("Seaborn", sns),
]
for label, module in versioned_modules:
    print(f"{label} version:", module.__version__)

jedi version: 0.15.1
numpy version: 1.17.2
pandas version: 0.25.1
pandas profiling version: 2.4.0
Pygments version: 2.4.2
re version: 2.2.1
Seaborn version: 0.9.0


In [4]:
# Directories & Files
# NOTE(review): this listing is neither assigned, printed, nor the last expression
# of the cell, so its result is discarded.
os.listdir()

# Datasets directory (relative path, used as a prefix when reading the CSV files)
directory = "./datasets/"

## Database Documentation: 

* Equal Opportunities for Women and Men: Promotions, leadership roles, salary increases, incentive programs, etc

* Female Representation in Leadership: Women on the executive team, in senior leadership, etc

* Management Opportunities: Your chances of becoming a manager of teams and talent

* Maternity and Adoptive Leave: Paid parental leave policies, job security, support for returning moms, etc

* Family Growth Support: Access to dedicated lactation rooms, child care, expense reimbursement, etc

* Paid Time Off: Sick days, vacation days, and personal days

* Flexible Work Hours: Ability to set your schedule as long as you get your work done

* Ability to Telecommute: Flexibility to work remotely

* Salary Satisfaction: Salary, merit increases, cost-of-living adjustments, overall compensation

* Sponsorship or Mentorship Program: Official mentorship program, women-focused initiatives or affiliate groups

* Learning Opportunities: On and off-site skills training, speaker series, conferences, etc

* Wellness Initiatives: On-site gym, gym discounts, walking desks, healthy food options, etc

* Employer Responsiveness: Effective channels for elevating issues and concerns

* The People You Work With: Respectful, professional, unbiased, all those good things

* Social Activities and Environment: Happy hours, game room, company outings, and other perks

In [136]:
# Candidate values that the random generator samples from when filling the
# categorical columns of the synthetic dataset.

# creating locations to populate (used for both `location` and `job_location`)
locations = ['Lisboa', 'Porto', 'Braga', 'Setúbal', 'Aveiro', 'Faro', 'Leiria', 'Coimbra', 
             'Santarém', 'Viseu','Viana do Castelo', 'Madeira', 'Açores', 'Vila Real', 
             'Castelo Branco', 'Évora', 'Guarda', 'Beja', 'Bragança', 'Portalegre']

# creating job_types to populate
job_type = ['Full-time', 'Part-time', 'Contract', 'Temporary', 'Internship', 'Other']

# creating job_levels to populate
job_level = ['Early Career', 'Mid-Level', 'Senior-Level', 'Executive']

# creating job_departments to populate (feeds the `department` column)
job_department = ['Accounting & Finance', 'Admin', 'Business Development', 
                  'Business Intelligence', 'Communications', 'Customer Service', 'Design', 
                  'Engineering', 'Human Resources', 'IT', 'Legal', 'Management', 'Marketing', 
                  'Operations', 'Product', 'Production', 'Research & Development', 'Sales', 
                  'Tech', 'Other']

# creating companies, departments and titles to populate
# Reads the semicolon-separated, cp1252-encoded lookup file from `directory`.
# error_bad_lines = False asks pandas to skip malformed rows instead of raising.
# NOTE(review): newer pandas releases replace error_bad_lines with on_bad_lines —
# confirm against the installed pandas version before upgrading.
company_department_jobtitle = pd.read_csv(directory + "data_for_random.csv", sep = ";", 
                                     error_bad_lines = False, encoding = 'cp1252')

# Possible values for the employment_status column
employment_status = ["Current Employee", "Former Employee"]

# Candidate years for last_year_of_employment: 2000 through 2020 inclusive
last_year_of_employment = list(range(2000, 2021))

In [117]:
# Split the lookup table into one Series per column and drop the NaN padding
# (the CSV columns have different lengths, so the shorter ones contain NaN rows).
#
# dropna() returns a new Series instead of mutating a column pulled out of the
# DataFrame in place. reset_index(drop = True) restores a contiguous 0..n-1 index:
# random.choice picks an integer position and looks it up with series[i], which is
# label-based on a Series, so a gap left by a dropped row would raise a KeyError.
company = company_department_jobtitle["Company"].dropna().reset_index(drop = True)
department = company_department_jobtitle["Department"].dropna().reset_index(drop = True)
jobtitle = company_department_jobtitle["Job Title"].dropna().reset_index(drop = True)

In [118]:
# Number of synthetic rows to generate
size = 1000

# Pool of raw rating values: `size` draws from a normal distribution (mean 3, sd 1).
# NOTE(review): a normal draw is unbounded, so rounded values can fall outside
# whatever rating scale is intended — confirm whether clipping is wanted.
rating_distribution = st.norm(loc = 3, scale = 1)
ratings = rating_distribution.rvs(size = size)

In [119]:
# Pool of raw salary values: `size` draws from a normal distribution (mean 18000, sd 8000).
# NOTE(review): nothing bounds the draws below zero — confirm that is acceptable.
salary_distribution = st.norm(loc = 18000, scale = 8000)
salary = salary_distribution.rvs(size = size)

In [151]:
# creating and filling the columns for the DF
dataset = {"company_id": [i for i in range(size)], 
          "company_name": [random.choice(company) for i in range(size)], 
          "user_id": [1000 + i for i in range(size)], 
          "employment_status": [random.choice(employment_status) for i in range(size)], 
          "last_year_of_employment": [random.choice(last_year_of_employment) for i in range(size)], 
          "location": [random.choice(locations) for i in range(size)], 
          "job_location": [random.choice(locations) for i in range(size)], 
          "job_type": [random.choice(job_type) for i in range(size)], 
          "job_level": [random.choice(job_level) for i in range(size)], 
          "department": [random.choice(job_department) for i in range(size)], 
          "job_title": [random.choice(jobtitle) for i in range(size)], 
          "equal_opportunity": [round(random.choice(ratings)) for i in range(size)], 
          "leadership_female_representation": [round(random.choice(ratings)) for i in range(size)], 
          "women_management_opportunities": [round(random.choice(ratings)) for i in range(size)], 
          "maternity_adoptive_leave": [round(random.choice(ratings)) for i in range(size)], 
          "family_growth_support": [round(random.choice(ratings)) for i in range(size)], 
          "paid_time_off": [round(random.choice(ratings)) for i in range(size)], 
          "flexible_work_hours": [round(random.choice(ratings)) for i in range(size)], 
          "ability_to_telecommute": [round(random.choice(ratings)) for i in range(size)], 
          "salary_satisfaction": [round(random.choice(ratings)) for i in range(size)], 
          "sponsorship_or_mentorship_program": [round(random.choice(ratings)) for i in range(size)], 
          "learning_opportunities": [round(random.choice(ratings)) for i in range(size)], 
          "wellness_initiatives": [round(random.choice(ratings)) for i in range(size)], 
          "employer_responsiveness": [round(random.choice(ratings)) for i in range(size)], 
          "people_you_work_with": [round(random.choice(ratings)) for i in range(size)], 
          "social_activities_environment": [round(random.choice(ratings)) for i in range(size)], 
          "overall_satisfaction": [round(random.choice(ratings)) for i in range(size)], 
          "public_comment": ["".join([random.choice(string.printable) for i in range(15)]) for j in range(size)], 
          "asked_pay_raise_last_12_months": [random.choice([0, 1, 2]) for i in range(size)], 
          "think_pt_buiz_doing_enough_to_address_gender_pay_gap": [random.choice([0, 1]) for i in range(size)],
          "think_gov_doing_enough_to_address_gender_pay_gap": [random.choice([0, 1]) for i in range(size)], 
          "confidence_in_ability_to_negotiate_salary": [round(random.choice(ratings)) for i in range(size)], 
          "sex_orientation": [random.choice(["Heterosexual", "Gay/Lesbian", "Bisexual", "Other"]) for i in range(size)], 
          "gender": [random.choice(["Male", "Female", "Non-Binary", "Other"]) for i in range(size)], 
          "marital_status": [random.choice([0, 1]) for i in range(size)], 
          "children": [random.choice([0, 1]) for i in range(size)], 
          "education": [random.choice([0, 1, 2]) for i in range(size)],
          "salary": [round(random.choice(salary), 2) for i in range(size)]}

In [152]:
# initializing the DataFrame from the generated columns (one column per `dataset` key)
df = pd.DataFrame(dataset)

In [160]:
# Print the index range, column dtypes and non-null counts of the generated frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 38 columns):
company_id                                              1000 non-null int64
company_name                                            1000 non-null object
user_id                                                 1000 non-null int64
employment_status                                       1000 non-null object
last_year_of_employment                                 1000 non-null int64
location                                                1000 non-null object
job_location                                            1000 non-null object
job_type                                                1000 non-null object
job_level                                               1000 non-null object
department                                              1000 non-null object
job_title                                               1000 non-null object
equal_opportunity                                 

In [100]:
# Scratch check: draw one value straight from the raw "Company" column.
# The column still contains NaN rows at this point, so the draw can return nan.
random.choice(company_and_department["Company"])

nan

In [76]:
# Scratch cell. Only the last expression is displayed; the results of the first
# two lines are discarded.
# NOTE(review): `database` is not defined in the visible cells. If it is a
# DataFrame, the assignment below writes the SAME single random location to every
# row rather than one draw per row — confirm intent.
random.choice(company_and_department["Company"])
company_and_department["Company"].isna().sum()
database["location"] = random.choice(locations)
random.choice(locations)

'Viseu'

In [75]:
# Display the `database` frame (NOTE(review): its definition is not visible in this notebook)
database

Unnamed: 0,company_id,company_name,user_id,employment_status,last_year_of_employment,location,job_location,job_type,job_level,department,...,asked_pay_raise_last_12_months,think_pt_buiz_doing_enough_to_address_gender_pay_gap,think_gov_doing_enough_to_address_gender_pay_gap,confidence_in_ability_to_negotiate_salary,sex_orientation,gender,ethnicity,marital_status,children,education


In [50]:
# List the column labels of `database`
database.columns

Index(['company_id', 'company_name', 'user_id', 'employment_status',
       'last_year_of_employment', 'location', 'job_location', 'job_type',
       'job_level', 'department', 'job_title', 'equal_opportunity',
       'leadership_female_representation', 'women_management_opportunities',
       'maternity_adoptive_leave', 'family_growth_support', 'paid_time_off',
       'flexible_work_hours', 'ability_to_telecommute', 'salary_satisfaction',
       'sonsorship_or_mentorship_program', 'learning_opportunities',
       'wellness_initiatives', 'employer_responsiveness',
       'people_you_work_with', 'social_activities_environment',
       'overall_satisfaction', 'public_comment',
       'asked_pay_raise_last_12_months',
       'think_pt_buiz_doing_enough_to_address_gender_pay_gap',
       'think_gov_doing_enough_to_address_gender_pay_gap',
       'confidence_in_ability_to_negotiate_salary', 'sex_orientation',
       'gender', 'ethnicity', 'marital_status', 'children', 'education'],
      dt

In [38]:
# Load the company / department / job-title lookup table and display it.
# NOTE(review): this is the same read_csv call that builds
# company_department_jobtitle elsewhere in the notebook, under a second name —
# consider keeping only one of the two.
company_and_department = pd.read_csv(directory + "data_for_random.csv", sep = ";", 
                                     error_bad_lines = False, encoding = 'cp1252')
company_and_department

Unnamed: 0,Company,Department,Job Title
0,Altri,Administrative,ABA Therapist
1,Ambar – Ideas on Paper S.A.,Arts & Design,Academic Advisor
2,Banco Comercial Português,Business,Account Executive
3,Bial,Consulting,Account Officer
4,Biotecnol,Customer Services & Support,Accountant
...,...,...,...
601,,,Visual Designer
602,,,Visual Merchandiser
603,,,Waiter
604,,,Warehouse Worker


In [79]:
# NOTE(review): unfinished scratch loop with no lasting effect.
# - `entity` is never used inside the body.
# - `x` is redrawn (100000 normal samples, mean 3, sd 1) on every iteration and
#   overwritten, so only the last draw survives.
# - the bare `company_id` expression does nothing, and raises NameError if no
#   variable of that name exists.
for entity in database.columns:
    x = st.norm.rvs(loc = 3 , scale = 1, size = 100000)
    company_id

In [80]:
# Display the `database` frame again (NOTE(review): its definition is not visible in this notebook)
database

Unnamed: 0,company_id,company_name,user_id,employment_status,last_year_of_employment,location,job_location,job_type,job_level,department,...,asked_pay_raise_last_12_months,think_pt_buiz_doing_enough_to_address_gender_pay_gap,think_gov_doing_enough_to_address_gender_pay_gap,confidence_in_ability_to_negotiate_salary,sex_orientation,gender,ethnicity,marital_status,children,education


In [None]:
def score(home, away):
    """Simulate one match and return the goals scored as a (home, away) tuple.

    The home side's goals are simulated first, then the away side's.
    """
    return goal(home), goal(away)

def goal(team):
    """Simulate a team's shots in one match and return how many become goals.

    One random number is drawn per shot (``team["Shots"]`` shots in total); the
    shot counts as a goal when the draw is less than or equal to ``team["xG"]``.
    """
    return sum(
        1
        for _ in range(team["Shots"])
        if random.random() <= team["xG"]
    )

def oddsAndProbability(measured, measure):
    """Return the probability (in percent) of an outcome and its respective odds.

    Args:
        measured: number of times the outcome occurred.
        measure: total number of trials.

    Returns:
        A ``(probability, odds)`` tuple, both rounded to 2 decimals. The odds are
        the reciprocal of the observed frequency; an outcome that never occurred
        has a probability of 0.0 and odds of ``float("inf")``.

    Raises:
        ZeroDivisionError: if ``measure`` is 0.
    """
    frequency = measured / measure
    probability = round(frequency * 100, 2)
    # An outcome that never happened has no finite odds; dividing by its zero
    # frequency would otherwise crash the whole simulation report.
    odds = round(1 / frequency, 2) if frequency else float("inf")

    return probability, odds

def simulate(home, away, matches):
    """Simulate `matches` games between two teams and print outcome statistics.

    Every match is scored with `score`. The function then prints:
      * the percentage and odds of a home win, a draw and an away win;
      * for every exact scoreline seen in at least 1% of the matches, its
        percentage and odds, in ascending (home goals, away goals) order.

    Args:
        home: team mapping; ``home["Name"]`` is used for the printed labels and
            the mapping is passed on to `score`.
        away: team mapping, used the same way as `home`.
        matches: number of matches to simulate.

    Returns:
        None. All results are printed. Errors raised by `score` or
        `oddsAndProbability` propagate unchanged.
    """
    # Local import keeps this definition self-contained within its cell.
    from collections import Counter

    homeWin = awayWin = draw = 0
    # Tally each scoreline as it is produced instead of re-scanning the full
    # list of results once per distinct score afterwards.
    scoreCounts = Counter()

    for _ in range(matches):
        homeScore, awayScore = score(home, away)
        scoreCounts[(homeScore, awayScore)] += 1

        if homeScore > awayScore:
            homeWin += 1
        elif homeScore < awayScore:
            awayWin += 1
        else:
            draw += 1

    homePercentage, homeOdds = oddsAndProbability(homeWin, matches)
    drawPercentage, drawOdds = oddsAndProbability(draw, matches)
    awayPercentage, awayOdds = oddsAndProbability(awayWin, matches)

    print(home["Name"], "Win %:", homePercentage, home["Name"], "Odds:", homeOdds)
    print("Draw %:", drawPercentage, "Draw Odds:", drawOdds)
    print(away["Name"], "Win %:", awayPercentage, away["Name"], "Odds:", awayOdds)

    print("\n")
    # Sorting the (scoreline, count) pairs orders them by scoreline, because
    # every scoreline appears exactly once in the counter.
    for result, amount in sorted(scoreCounts.items()):
        scoreProbability, scoreOdds = oddsAndProbability(amount, matches)

        # Only report scorelines that make up at least 1% of the matches.
        if scoreProbability >= 1.0:
            print(f"{result} has a {scoreProbability}% probability, with {scoreOdds} odds.")

    print()
    print("\n")

# Fixtures to simulate, as (home, away) pairs; each one is played 200000 times.
# NOTE(review): none of these team objects are defined in the visible cells.
fixtures = [
    (scb, fcpf),
    (gvfc, vsc),
    (mfc, bel),
    (far, fei),
    (spcov, aca),
    (oli, pen),
    (val, rm),
    (ars, mcity),
    (wolves, tot),
]
for homeTeam, awayTeam in fixtures:
    simulate(homeTeam, awayTeam, 200000)

In [None]:
# creating the columns for the DF
columns = {"company_id": int, 
          "company_name": object, 
          "user_id": int, 
          "employment_status": str, 
          "last_year_of_employment": datetime, 
          "location": object, 
          "job_location": object, 
          "job_type": object, 
          "job_level": , 
          "department", 
          "job_title", 
          "equal_opportunity", 
          "leadership_female_representation", 
          "women_management_opportunities", 
          "maternity_adoptive_leave", 
          "family_growth_support", 
          "paid_time_off", 
          "flexible_work_hours", 
          "ability_to_telecommute", 
          "salary_satisfaction", 
          "sonsorship_or_mentorship_program", 
          "learning_opportunities", 
          "wellness_initiatives", 
          "employer_responsiveness", 
          "people_you_work_with", 
          "social_activities_environment", 
          "overall_satisfaction", 
          "public_comment", 
          "asked_pay_raise_last_12_months", 
          "think_pt_buiz_doing_enough_to_address_gender_pay_gap",
          "think_gov_doing_enough_to_address_gender_pay_gap", 
          "confidence_in_ability_to_negotiate_salary", 
          "sex_orientation", 
          "gender", 
          "ethnicity", 
          "marital_status", 
          "children", 
          "education"}

In [88]:
# Export the lookup table to test.json in the working directory, one JSON object per row
company_and_department.to_json(r'test.json', orient = 'records')