In [8]:
from selenium.common.exceptions import TimeoutException
from GlassdoorBot import GlassdoorBot
from currency_converter import CurrencyConverter
from currency_code import CurrencyCode
from datetime import datetime
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import regex as re
import matplotlib
import warnings
import json
import os

# Select the Qt5Agg backend and turn on matplotlib's interactive mode.
matplotlib.use('Qt5Agg')
plt.ion()

# Hide FutureWarning messages for the rest of the session.
warnings.simplefilter('ignore', FutureWarning)


class UnkownMultiplier(Exception):
    """Raised when a salary string carries a suffix the notebook cannot scale.

    Used both for magnitude suffixes other than 'K'/'M' and for pay periods
    other than 'yr'/'mo'.
    """

In [None]:
def currency_string_to_usd(currency_string) -> float:
    """Convert a compact salary string such as ``'€45K'`` into US dollars.

    The string is read as ``<currency prefix><amount><multiplier>``: every
    character before the first digit is handed to ``CurrencyCode.get_code``,
    the first number (an optional decimal part is honoured, e.g. ``1.5``) is
    the amount, and the final character scales it (``K`` = thousands,
    ``M`` = millions).

    Parameters
    ----------
    currency_string : str
        Salary text without embedded spaces, e.g. ``'$45K'`` or ``'€1.5M'``.

    Returns
    -------
    float
        The amount converted to USD. If the conversion step fails, ``0.0``
        is returned and a ``RuntimeWarning`` describing the failure is issued.

    Raises
    ------
    ValueError
        If ``currency_string`` is empty or contains no digits.
    UnkownMultiplier
        If the last character is neither ``'K'`` nor ``'M'``.
    """
    if not currency_string:
        raise ValueError("Error: empty currency string")

    # Everything before the first digit is the currency symbol / prefix.
    currency = re.search(r'^[^\d]*', currency_string).group()
    currency = CurrencyCode.get_code(currency)

    # First number in the string; keep a decimal part so that '1.5M' is not
    # truncated to 1 million.
    amount_match = re.search(r'\d+(?:\.\d+)?', currency_string)
    if amount_match is None:
        raise ValueError(f"Error: no amount found in {currency_string!r}")
    amount = float(amount_match.group())

    multiplier = currency_string[-1]
    if multiplier == 'K':
        amount *= 1e3
    elif multiplier == 'M':
        amount *= 1e6
    else:
        raise UnkownMultiplier("Error: Unknown multiplier")

    try:
        currency_converter = CurrencyConverter()
        return currency_converter.convert(amount, currency, 'USD')
    except Exception as exc:
        # Best-effort fallback kept from the original behaviour, but no longer
        # silent and no longer swallowing KeyboardInterrupt / SystemExit.
        warnings.warn(
            f"Could not convert {currency_string!r} to USD ({exc!r}); using 0.0",
            RuntimeWarning,
            stacklevel=2,
        )
        return 0.0

In [None]:
job_title = "Data Engineer"

# Columns that receive text values. read_csv infers float64 for a column that
# is still completely empty, and writing a string into a float64 column is
# deprecated since pandas 2.1 (PDEP-6) and rejected by later releases.
TEXT_COLUMNS = (
    'Job title',
    'Min salary (local currency)',
    'Max salary (local currency)',
    'per',
)


def _read_database():
    """Load database.csv, holding the text columns as ``object`` dtype."""
    frame = pd.read_csv('database.csv')
    return frame.astype(
        {column: object for column in TEXT_COLUMNS if column in frame.columns}
    )


# check if there are empty rows in the database
database = _read_database()
has_empty_cells = np.any(pd.isna(database['Job title']))
locations = database['City']


# Each pass opens a fresh browser session and fills rows until either every
# row has a job title or the bot times out, in which case the next pass
# resumes from what was already saved to disk.
while has_empty_cells:

    # update the database after every crash
    database = _read_database()
    with open("user_credentials.json") as json_file:
        user_credential = json.load(json_file)
    bot = GlassdoorBot(
        email=user_credential['email'],
        password=user_credential['password'],
        headless=False
    )

    try:
        bot.open_glassdoor()
        for i, location in enumerate(locations):

            # Skip rows that were filled in by an earlier pass
            if not pd.isna(database.at[i, 'Job title']):
                continue

            # Keep the scraped title separate from the search term: rebinding
            # `job_title` here would change what is searched for in every
            # following city.
            found_title, _city, salary_range = bot.get_salary_details(
                job_title, location)

            # salary_range is split on ' - '; the upper bound ends with a
            # three-character pay-period suffix whose last two characters
            # are 'yr' or 'mo'.
            min_salary, max_salary = salary_range.split(' - ')
            frequency = max_salary[-2:]
            max_salary = max_salary[:-3].replace(' ', '')
            min_salary = min_salary.replace(' ', '')

            # Number of pay periods per year
            if frequency == 'yr':
                mult = 1
            elif frequency == 'mo':
                mult = 12
            else:
                raise UnkownMultiplier("Error: Unknown multiplier")

            min_salary_usdy = currency_string_to_usd(min_salary) * mult
            max_salary_usdy = currency_string_to_usd(max_salary) * mult
            mean_salary_usdy = np.mean([min_salary_usdy, max_salary_usdy])

            # update the database
            database.at[i, 'Job title'] = found_title
            database.at[i, 'Min salary (local currency)'] = min_salary
            database.at[i, 'Max salary (local currency)'] = max_salary
            database.at[i, 'per'] = frequency
            database.at[i, 'Min salary (usd/y)'] = min_salary_usdy
            database.at[i, 'Max salary (usd/y)'] = max_salary_usdy
            database.at[i, 'Average salary (usd/y)'] = mean_salary_usdy

            # save the database after every row so progress survives a crash
            database.to_csv('database.csv', index=False)

            # update the looping condition
            has_empty_cells = np.any(pd.isna(database['Job title']))

    except TimeoutException:
        # Swallow the timeout: the while loop retries with a new session.
        pass
    finally:
        # Close the browser on every exit path (success, timeout, or any
        # other error) so sessions do not pile up.
        bot.driver.close()

In [None]:
# Read five columns (selected by position) and order the rows from the
# highest to the lowest average yearly salary.
database = pd.read_csv('database.csv', usecols=[0, 1, 5, 6, 7]).sort_values(
    'Average salary (usd/y)', ascending=False)

# Title text comes from the row labelled 0 (label lookup, not position).
profession = database['Job title'][0]

# Drop any figure left over from a previous run of this cell.
plt.close('all')

# One bar position per row
num_bar = database.shape[0]
x = range(num_bar)

fig, ax = plt.subplots()
fig.suptitle(f'{profession} salaries around the world')

# The 'max' bars are drawn first and the 'min' bars on top of them.
ax.bar(x, database['Max salary (usd/y)'], color='lightblue', label='max')
ax.bar(x, database['Min salary (usd/y)'], color='dodgerblue', label='min')
ax.plot(x, database['Average salary (usd/y)'], 'ro-', label='average')

# Horizontal reference line at 100,000 on the salary axis
ax.axhline(y=100e3, color='red')
ax.legend()

ax.set_xticks(x)
ax.set_xticklabels(database['City'], rotation=90)
ax.set_ylabel('Salary (USD)')
fig.tight_layout()
plt.show()

In [1]:
from DatabaseHandler import DatabaseHandler
# Job title passed to the database handler
job_title = 'Data Engineer'

# Ask the handler to create the database for the listed cities.
database = DatabaseHandler()
database.create_database(
    job_title=job_title,
    locations=['Paris', 'London', 'Bruxelles'],
)