# Indeed job listing data - Exploratory Data Analysis (EDA) 

In [1]:
# Install necessary packages 
#!pip install missingno
#!pip install wordcloud

In [2]:
# Import necessary packages 
import re
from collections import Counter
from collections import defaultdict

import matplotlib.pyplot as plt
import missingno as msno
import nltk
import numpy as np
import pandas as pd
import scipy.stats as stats
import seaborn as sns
from nltk.corpus import stopwords
from wordcloud import WordCloud

%matplotlib inline

In [3]:
# Constants and configurations

# Ensure NLTK stopwords are downloaded (run this once)
nltk.download('stopwords')

DATA_PATH = 'output/indeed_jobs_'

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/emmafrid/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# Keyword dictionaries and other configurations 

# Dictionary specifying column names and desired data types
dtype_dict = {
    'page': 'int64',  
    'country': 'string', 
    'job_link': 'object', 
    'search_keyword': 'string', 
    'search_location': 'string', 
    'job_title': 'string', 
    'company_name': 'string', 
    'company_location': 'object', 
    'salary': 'object', 
    'job_description': 'string'
}

# Dictionary of data frames and their corresponding currencies
currency_mapping = {
    'SWE': 'SEK',  # Swedish Krona
    'FRA': 'EUR',  # Euro
    'ITA': 'EUR',  # Euro
    'USA': 'USD'   # US Dollar
}

# Dictionary mapping a tool category to the lowercase keywords searched for in job descriptions.
# Notes for maintainers:
# - ' r ' and ' go ' are deliberately padded with spaces — presumably so that the single-word
#   language names are only matched as standalone words; verify against the matching code.
# - Some keywords appear in more than one category ('matlab', 'stata', 'docker', 'databricks'),
#   so totals aggregated per category overlap.
# - 'looker' is a prefix of 'looker studio', and 'oracle' a prefix of 'oracle cloud'.
software_keywords = {
    'Programming Languages': [
        'python', ' r ', 'sql', 'javascript', 'java', 'c++', 'c#', 'ruby', 'swift', 'kotlin', 'scala', 'matlab', 'sas', 'stata', ' go ', 'php', 'typescript', 'rust', 'bash'
    ],
    'Data Analysis and Manipulation': [
        'excel', 'pandas', 'numpy', 'dplyr', 'tidyverse', 'julia', 'matlab', 'stata'
    ],
    'Machine Learning and Statistical Modeling': [
        'scikit-learn', 'tensorflow', 'keras', 'pytorch', 'xgboost', 'catboost', 'lightgbm', 'mlpack', 'caret', 'mlr', 'weka', 'statsmodels'
    ],
    'Data Visualization and Business Intelligence (BI) Tools': [
        'tableau', 'power bi', 'matplotlib', 'seaborn', 'd3.js', 'looker', 'plotly', 'ggplot2', 'qlik', 'sap', 'looker studio', 'superset', 'metabase'
    ],
    'Big Data Technologies': [
        'spark', 'hadoop', 'bigquery', 'redshift', 'snowflake', 'databricks', 'hive', 'kafka', 'hdfs', 'flink', 'storm'
    ],
    'Database Management Systems (DBMS)': [
        'mysql', 'postgresql', 'mongodb', 'cassandra', 'oracle', 'microsoft sql server', 'firebase', 'db2', 'couchbase', 'neo4j', 'redis', 'couchdb', 'mariadb'
    ],
    'Cloud Computing': [
        'aws', 'azure', 'google cloud', 'gcp', 'ibm cloud', 'oracle cloud', 'digitalocean', 'heroku'
    ],
    'Development Tools': [
        'git', 'docker', 'vscode', 'jupyter', 'pycharm', 'rstudio', 'eclipse', 'netbeans', 'intellij idea', 'notepad++', 'sublime text', 'atom'
    ],
    'Version Control and Collaboration': [
        'github', 'gitlab', 'bitbucket', 'jira', 'confluence', 'slack', 'trello', 'microsoft teams', 'asana', 'notion'
    ],
    'Containerization and Orchestration': [
        'docker', 'kubernetes', 'openshift', 'mesos', 'rancher', 'nomad'
    ],
    'Workflow Management': [
        'airflow', 'luigi', 'prefect', 'kubeflow'
    ],
    'Data Science Platforms': [
        'databricks', 'knime', 'h2o.ai', 'rapidminer', 'datarobot', 'mlflow'
    ]
}

NameError: name 'df_SWE' is not defined

In [None]:
# Helper functions 

# Salary conversion function to handle both thousand separators and decimal points
def convert_salary(value):
    """
    Convert a salary number written as text into a float.

    Whitespace (including non-breaking spaces) and commas are treated as
    thousand separators and removed. Periods are treated as thousand
    separators only when every period is followed by exactly three digits
    (European style, e.g. '30.000' -> 30000.0). Otherwise the last period is
    the decimal point (e.g. '25.50' -> 25.5, '1.234.56' -> 1234.56).

    Parameters:
    - value: Salary number as a string, e.g. '100,000', '30 000' or '25.50'.

    Returns:
    - The numeric value as a float.

    Raises:
    - ValueError: if the cleaned string is not a valid number.
    """
    # \s also covers '\xa0' and the narrow no-break space used in French number formatting
    digits = re.sub(r'[\s,]', '', value)

    if re.fullmatch(r'\d{1,3}(?:\.\d{3})+', digits):
        # Every period groups three digits -> they are all thousand separators
        digits = digits.replace('.', '')
    elif digits.count('.') > 1:
        # Several periods but the last group is not three digits: only the last one is a decimal point
        whole, _, fraction = digits.rpartition('.')
        digits = whole.replace('.', '') + '.' + fraction

    return float(digits)

def preprocess_text(text):
    """
    Lowercase a text and strip all punctuation.

    Parameters:
    - text: The text to clean. Missing values (None, NaN, pd.NA or any other
      non-string) are treated as an empty text instead of raising.

    Returns:
    - The lowercased text with every character that is neither a word
      character nor whitespace removed; '' for missing input.
    """
    if not isinstance(text, str):
        # Missing job descriptions reach this function as NaN/pd.NA, which have no .lower()
        return ''
    return re.sub(r'[^\w\s]', '', text.lower())

def tokenize_and_filter(text, stop_words):
    """
    Split a text on whitespace and drop every token found in stop_words.

    Parameters:
    - text: The (already cleaned) text to tokenize.
    - stop_words: Collection of words to exclude.

    Returns:
    - List of the remaining tokens, in their original order.
    """
    kept_tokens = []
    for token in text.split():
        if token in stop_words:
            continue
        kept_tokens.append(token)
    return kept_tokens

In [None]:
# High-level functions 

def merge_US_cities(cities):
    """
    Merges job listings from multiple US cities into a single DataFrame.

    Each city is read from f"{DATA_PATH}USA_{city}.csv". Files that lack a
    'country' column get one filled with 'USA' so that all frames share the
    same format. Any number of cities (one or more) is supported.

    Parameters:
    - cities: List of city names (strings) to merge.

    Returns:
    - A DataFrame containing job listings from all specified US cities, with
      a fresh 0..n-1 index.

    Raises:
    - ValueError: if `cities` is empty or the files do not share the same columns.
    """
    if not cities:
        raise ValueError("cities must contain at least one city name.")

    frames = []
    desired_order = None
    for city in cities:
        city_df = pd.read_csv(f"{DATA_PATH}USA_{city}.csv")
        if 'country' in city_df.columns:
            # The first file that already has the full format defines the column order
            if desired_order is None:
                desired_order = city_df.columns.tolist()
        else:
            city_df['country'] = 'USA'  # Add the 'country' column to match format
        print("Loaded data for", city)
        frames.append(city_df)

    # No file had a native 'country' column: fall back to the first frame's order
    if desired_order is None:
        desired_order = frames[0].columns.tolist()
    print("Column order for consistency:", desired_order)

    # Fail loudly instead of letting concat silently add or drop columns
    for city, city_df in zip(cities, frames):
        if set(city_df.columns) != set(desired_order):
            raise ValueError(f"Columns for {city} do not match the expected columns: {desired_order}")

    # Concatenate the DataFrames in a consistent column order
    df_USA = pd.concat([city_df[desired_order] for city_df in frames], ignore_index=True)

    # Verify column order consistency
    assert df_USA.columns.tolist() == desired_order, "Column order mismatch!"

    return df_USA

# unique() prints the unique values, nunique() prints the number of unique values
def check_duplicates(data):
    """
    Print a quick duplicate report for a DataFrame.

    Prints, in order: the row count, the number of unique values per column
    (to compare against the row count, e.g. for job links), and every row
    that is fully duplicated (all occurrences are shown).

    Parameters:
    - data: DataFrame to inspect.
    """
    row_total = len(data)
    print(f'The DataFrame has {row_total} rows.')

    # Unique values per column
    print(data.nunique())

    # keep=False flags every member of a duplicate group, not just the repeats
    duplicate_mask = data.duplicated(keep=False)
    print(data.loc[duplicate_mask])

def desc_categorical(data):
    """
    Print value counts for every categorical column of a listings DataFrame.

    Columns of dtype 'string' are reported first (except 'job_description'),
    followed by columns of dtype 'object' (except 'job_link').

    Parameters:
    - data: DataFrame that contains a 'string' column named 'job_description'
      and an 'object' column named 'job_link'.
    """
    # Pure text columns; the free-text description is skipped
    text_columns = data.select_dtypes(include='string').drop(columns='job_description')
    # Mixed-content columns (strings and numbers); the URL is skipped
    mixed_columns = data.select_dtypes(include='object').drop(columns='job_link')

    for subset in (text_columns, mixed_columns):
        for name in subset.columns:
            print(f'Value counts for column: {name}\n{subset[name].value_counts()}\n')

In [None]:
# Functions for extracting salary information 

# Format, clean, and fix columns for salary column 
def clean_columns(data):
    """
    Clean the text columns and derive numeric salary columns.

    The frame is modified in place and also returned. Steps:
    - '+' in 'search_keyword' / 'search_location' is replaced by a space.
    - Newlines in 'job_description' are replaced by a space.
    - 'salary_num': list of number strings found in 'salary' (NaN if none).
    - 'salary_num_low' / 'salary_num_high': first and second number as floats;
      a single number is used for both.
    - 'time_period': pay period keyword found in 'salary' (NaN if none).

    The 'salary' column itself is left untouched, so missing salaries stay
    missing instead of becoming the literal string 'nan'.

    Parameters:
    - data: DataFrame with the columns 'search_keyword', 'search_location',
      'job_description' and 'salary'.

    Returns:
    - The same DataFrame with the columns described above added.
    """
    # Remove + signs and replace them with spaces in 'search_keyword' and 'search_location'
    search_columns = ['search_keyword', 'search_location']
    data[search_columns] = data[search_columns].replace({r'\+': ' '}, regex=True)

    # Remove all newline characters from 'job_description'
    data['job_description'] = data['job_description'].replace({r'\n': ' '}, regex=True)

    # String view of the salary; missing values become '' so they never produce matches
    salary_text = data['salary'].fillna('').astype(str)

    # Alternatives are tried in order; the last one keeps unseparated numbers such as
    # '45000' in one piece instead of splitting them after three digits.
    number_pattern = re.compile(
        r'\d{1,3}(?:[,\s]\d{3}(?!\d))+(?:\.\d+)?'  # 100,000 / 30 000 / 1,234.56
        r'|\d{1,3}(?:\.\d{3}(?!\d))+'               # 30.000 / 1.234.567
        r'|\d+(?:\.\d+)?'                           # 45000 / 25.50
    )

    # Lists of number strings; rows without any number get NaN
    data['salary_num'] = salary_text.apply(lambda text: number_pattern.findall(text) or np.nan)

    def _salary_bound(numbers, position):
        # Number at `position`, falling back to the last one available (single value -> low == high)
        if not isinstance(numbers, list):
            return np.nan
        return convert_salary(numbers[min(position, len(numbers) - 1)])

    data['salary_num_low'] = data['salary_num'].apply(_salary_bound, args=(0,))
    data['salary_num_high'] = data['salary_num'].apply(_salary_bound, args=(1,))

    # Extract time period from the salary text using regex
    # 'par an' rather than 'an', since 'an' is an English word
    # Case-insensitive because the period is lower-cased before it is looked up later on
    data['time_period'] = salary_text.str.extract(
        r'(hour|year|month|week|day|ora|anno|mese|settimana|giorno|heure|par an|mois|semaine|jour|månad)',
        flags=re.IGNORECASE,
        expand=False,
    )

    return data

def convert_salary_to_monthly(row, salary_column):
    """
    Convert the salary in row[salary_column] to a monthly amount.

    Parameters:
    - row: Mapping/Series with a 'time_period' entry and the salary column.
    - salary_column: Name of the entry holding the salary to convert.

    Returns:
    - The salary multiplied by the factor for its time period, or NaN when
      the time period is not a string or is not recognised.
    """
    period = row['time_period']

    # Missing periods arrive as NaN (not a string): nothing to convert
    if not isinstance(period, str):
        return np.nan

    # Monthly conversion factor for each group of (multilingual) period labels
    factor_groups = (
        (160, ('hour', 'ora', 'heure')),
        (1/12, ('year', 'anno', 'par an')),
        (4, ('week', 'settimana', 'semaine')),
        (20, ('day', 'giorno', 'jour')),
        (1, ('month', 'mese', 'mois', 'månad')),
    )
    monthly_factor = {
        label: factor
        for factor, labels in factor_groups
        for label in labels
    }

    return row[salary_column] * monthly_factor.get(period.lower(), np.nan)

# Function to apply salary conversion for min and max salary
def apply_salary_conversion(df, currency):
    """
    Add monthly min/max salary columns and a currency column to df (in place).

    Parameters:
    - df: DataFrame with 'salary_num_low', 'salary_num_high' and 'time_period'.
    - currency: Currency code stored in the new 'currency' column.

    Returns:
    - The same DataFrame with 'min_salary_month', 'max_salary_month' and 'currency' added.
    """
    monthly_sources = {
        'min_salary_month': 'salary_num_low',
        'max_salary_month': 'salary_num_high',
    }
    for monthly_column, source_column in monthly_sources.items():
        # Row-wise conversion; the source column name is forwarded as the second argument
        df[monthly_column] = df.apply(convert_salary_to_monthly, axis=1, args=(source_column,))
    df['currency'] = currency  # Add currency column
    return df

# Function to clean DataFrames, add a currency column, and calculate salary per month
def clean_and_add_currency_and_salaries(df, currency):
    """
    Clean a listings DataFrame, tag it with its currency and add monthly salaries.

    Parameters:
    - df: Raw listings DataFrame (passed through clean_columns).
    - currency: Currency code stored in the 'currency' column.

    Returns:
    - The cleaned DataFrame with 'currency', 'min_salary_month' and
      'max_salary_month' added.
    """
    result = clean_columns(df)
    result['currency'] = currency

    # Monthly column -> salary column it is derived from
    conversions = (
        ('min_salary_month', 'salary_num_low'),
        ('max_salary_month', 'salary_num_high'),
    )
    for monthly_column, source_column in conversions:
        result[monthly_column] = result.apply(convert_salary_to_monthly, axis=1, args=(source_column,))

    return result

In [None]:
# Functions for extracting info from job descriptions 
def extract_keywords(df, country, language, top_n=10):
    """
    Finds the most frequent words in the job descriptions of one country.

    Descriptions are lowercased, stripped of punctuation, split on whitespace
    and filtered against English stopwords plus the stopwords of `language`.
    Rows with a missing job description are skipped.

    Parameters:
    - df: DataFrame containing the columns 'country' and 'job_description'.
    - country: String representing the country to filter by.
    - language: Language of the listings ('english', 'french', 'italian' or
      'swedish'), used to pick the additional stopwords to remove.
    - top_n: Number of most common keywords to return (default 10).

    Returns:
    - A tuple (common_keywords, all_tokens): common_keywords is a list of
      (word, count) tuples for the top_n most frequent words, all_tokens is
      the flat list of every remaining token for the country.

    Raises:
    - ValueError: if `language` is not one of the supported languages.
    """
    supported_languages = ('english', 'french', 'italian', 'swedish')
    if language not in supported_languages:
        raise ValueError("Unsupported language.")

    # Always include English stopwords; add the listing language on top of them
    stop_words = set(stopwords.words('english'))
    if language != 'english':
        stop_words.update(stopwords.words(language))

    # Descriptions for the requested country; missing ones cannot be tokenized
    descriptions = df.loc[df['country'] == country, 'job_description'].dropna()

    # Clean, tokenize and flatten in one pass
    all_tokens = []
    for description in descriptions:
        all_tokens.extend(tokenize_and_filter(preprocess_text(description), stop_words))

    common_keywords = Counter(all_tokens).most_common(top_n)
    return (common_keywords, all_tokens)

def plot_common_keywords(common_keywords, country):
    """
    Plots the most common keywords from job descriptions as a bar chart.

    Parameters:
    - common_keywords: List of tuples (keyword, frequency). When the list is
      empty a message is printed and nothing is plotted.
    - country: Name of the country for labeling the plot.
    """
    # zip(*[]) cannot be unpacked into two names, e.g. when no listing matched the country
    if not common_keywords:
        print(f'No keywords to plot for {country}.')
        return

    # Unzip the list of tuples into two sequences: words and counts
    words, counts = zip(*common_keywords)

    # Create a bar plot
    plt.figure(figsize=(10, 6))
    plt.bar(words, counts, color='skyblue')
    plt.xlabel('Keywords', fontsize=14)
    plt.ylabel('Frequency', fontsize=14)
    plt.title(f'Most Common Keywords in Job Descriptions - {country}', fontsize=16)
    plt.xticks(rotation=45)  # Rotate x-axis labels for better readability
    plt.tight_layout()  # Adjust layout to make room for rotated labels
    plt.show()

def count_keywords(df, country, keywords=None):
    """
    Counts the occurrences of keywords in job descriptions by category for a specific country,
    creating separate entries for each keyword and its associated search keyword.

    Keywords are matched case-insensitively as standalone terms: a hit must not
    be directly preceded or followed by a word character or '&'. This keeps
    'java' from being counted inside 'javascript', 'git' inside 'github',
    'excel' inside 'excellent', or 'r' inside 'r&d'. Surrounding spaces in a
    keyword (e.g. ' r ') are ignored for matching but kept in the output label.
    Rows with a missing job description are skipped.

    Parameters:
    - df: DataFrame containing 'country', 'job_description' and 'search_keyword'.
    - country: String representing the country to filter by.
    - keywords: Optional dict mapping category -> list of lowercase keywords.
      Defaults to the notebook-level `software_keywords`.

    Returns:
    - A DataFrame with the columns 'Category', 'Keyword', 'Search Keyword',
      'Country' and 'Count' (summed per combination). Empty, but with these
      columns, when nothing matched.
    """
    if keywords is None:
        keywords = software_keywords

    group_columns = ['Category', 'Keyword', 'Search Keyword', 'Country']

    # Compile one boundary-aware pattern per (category, keyword) once, not once per row
    matchers = [
        (category, keyword, re.compile(r'(?<![\w&])' + re.escape(keyword.strip()) + r'(?![\w&])'))
        for category, keyword_list in keywords.items()
        for keyword in keyword_list
    ]

    # Filter DataFrame by country
    filtered_df = df[df['country'] == country]

    records = []
    for description, search_keyword in zip(filtered_df['job_description'], filtered_df['search_keyword']):
        if not isinstance(description, str):
            continue  # Missing description: nothing to count
        lowered = description.lower()

        for category, keyword, matcher in matchers:
            count = len(matcher.findall(lowered))
            if count > 0:  # Only record non-zero counts
                records.append({
                    'Category': category,
                    'Keyword': keyword,
                    'Count': count,
                    'Search Keyword': search_keyword,
                    'Country': country
                })

    # groupby would raise a KeyError on a frame without columns
    if not records:
        return pd.DataFrame(columns=group_columns + ['Count'])

    # Group by relevant columns and sum the counts
    result_df = pd.DataFrame(records)
    return result_df.groupby(group_columns, as_index=False)['Count'].sum()

In [None]:
# Function to extract info about interview process

def extract_interview_info(df):
    """
    Extracts interview process information from job descriptions.

    Each stage is flagged True when one of its phrases occurs in the job
    description (case-insensitive). Missing descriptions are treated as empty
    text, so all of their flags are False.

    Parameters:
    - df: DataFrame containing job listings, with the columns 'job_id' and
      'job_description'.

    Returns:
    - DataFrame with one row per listing: 'job_id' plus one boolean column per
      interview stage. The columns are present even when df has no rows.
    """
    # Stage name -> phrases that indicate it
    stage_patterns = {
        'phone_screening': r'phone screening|phone interview|screening call',
        'coding_assessment': r'coding test|coding interview|programming assessment|technical assessment|live coding challenge',
        'case_study': r'case study|take-home assignment',
        'on_site_interview': r'on-site interview|final round|in-person interview',
        'presentation': r'presentation|project presentation|technical presentation',
    }
    # Compile once instead of once per row and stage
    compiled_stages = {
        stage: re.compile(pattern, re.IGNORECASE)
        for stage, pattern in stage_patterns.items()
    }

    interview_info = []
    for job_id, job_desc in zip(df['job_id'], df['job_description']):
        # re.search raises a TypeError on NaN/pd.NA descriptions
        text = job_desc if isinstance(job_desc, str) else ''

        interview_details = {'job_id': job_id}
        for stage, pattern in compiled_stages.items():
            interview_details[stage] = bool(pattern.search(text))
        interview_info.append(interview_details)

    # Explicit columns keep the schema stable for empty input
    return pd.DataFrame(interview_info, columns=['job_id', *compiled_stages])

## Load the data
Import the data scraped using scraper-countries.py (Sept 20-21 2024) for the following countries: USA, Sweden, France, and Italy. Each dataset contains the job listings for the three largest cities in the respective country, for the job titles _Data Scientist_, _Data Analyst_, _Product Analyst_, and _BI Analyst_.

In [None]:
# Import csv files with scraped data for resp. country 
# Sweden 
df_SWE = pd.read_csv(f"{DATA_PATH}{'Sweden'}.csv")
# France 
df_FRA = pd.read_csv(f"{DATA_PATH}{'France'}.csv")
# Italy
df_ITA =pd.read_csv(f"{DATA_PATH}{'Italy'}.csv")

In [None]:
# USA 
df_USA = merge_US_cities(['NY', 'LA', 'CHI'])

## Initial inspection of the data

In [None]:
df_SWE.head(5)

In [None]:
df_FRA.head(5)

In [None]:
df_ITA.head(5)

In [None]:
df_USA.head(5)

### Check dimensions and data types

In [None]:
# Check dimensions of dfs 
print(f'SWE \t   columns: {df_SWE.shape[1]} \t  rows: {df_SWE.shape[0]}')
print(f'FRA \t   columns: {df_FRA.shape[1]} \t  rows: {df_FRA.shape[0]}')
print(f'ITA \t   columns: {df_ITA.shape[1]} \t  rows: {df_ITA.shape[0]}')
print(f'USA \t   columns: {df_USA.shape[1]} \t  rows: {df_USA.shape[0]}')

In [None]:
# Check data types
df_SWE.info() 
# Most columns are of type 'Dtype object'
# Convert columns with strings only to string data type to optimize performance
# Mixed columns with both numbers and strings: company_location, salary, job_link (url) 

In [None]:
# Convert objects to strings 
df_SWE = df_SWE.astype(dtype_dict)

# Output new data types
print(df_SWE.dtypes) # Is string[python] not dtype string?

In [None]:
# Do the same for the other datasets 
df_FRA = df_FRA.astype(dtype_dict)
df_ITA = df_ITA.astype(dtype_dict)
df_USA = df_USA.astype(dtype_dict)

## Some descriptive statistics

In [None]:
# Describe (numerical) columns  
df_SWE.describe(include='all')

In [None]:
# Describe categorical columns  
desc_categorical(df_SWE) 

### Preliminary observations, Swedish job listings:
- __Most frequent job titles__: Data Analyst, systemutvecklare, Data Scientist.
- __Most frequent company__: Volvo Group. 
- __Salary ranges__: Few numerical values are provided.

In [None]:
df_FRA.describe(include='all')

In [None]:
desc_categorical(df_FRA) 

### Preliminary observations, French job listings:
- Many listings compared to Sweden. 
- __Most frequent job titles__: Data Analyst H/F (many similar names for this title, e.g. Data Analyst, Data Analyst F/H). 
- __Most frequently mentioned company__: AXA.
- __Company location__: Includes some information about télétravail (especially common in Paris).  
- __Salary ranges__: Need to split column to be able to draw any conclusions.

In [None]:
df_ITA.describe(include='all')

In [None]:
desc_categorical(df_ITA) 

### Preliminary observations, Italian job listings:
- Many listings compared to Sweden, but fewer than for France.
- __Most frequent job titles__: Data Analyst, Data Scientist, Product Analyst. 
- __Most frequently mentioned company__: BIP - Business Integration Partners. 
- __Company location__: Milano. Also remote in Milano is relatively common (as is Rome).
- __Salary ranges__: Very few numerical entries provided. 

In [None]:
df_USA.describe(include='all')

In [None]:
desc_categorical(df_USA) 

### Preliminary observations, American job listings:
- There are fewer listings than for France.
- __Most frequent job titles__: Data Analyst, Data Scientist, BI analyst. 
- __Most frequently mentioned company__: Citi. 
- __Company location__: New York.  
- __Salary ranges__: Need to split column to draw conclusions. 

### Conclusions
- The following columns need to be cleaned: search_keyword, job_title, job_description, company_location.
- The salary column should be split into two columns (separate numeric vs string content).    
- Job titles appear to vary somewhat between countries (the three most frequent titles differ from country to country).


## Data reduction and data cleaning
Handle missing and duplicate data entries. Remove unnecessary columns (if any). 
Clean and preprocess the data to handle anomalies and outliers. 

### Missing values

In [None]:
# Handle missing values
# The scraper labels cells as 'Not available' when there is no value. Change these to NaN.
# np.nan is used because the np.NaN alias no longer exists in NumPy 2.0.
for listings in (df_SWE, df_FRA, df_ITA, df_USA):
    listings.replace('Not available', np.nan, inplace=True)

# Missing entries per column and country (only the last expression of a cell is displayed,
# so the per-country counts are printed explicitly)
# FRA: missing salary and job description entries; SWE, ITA, USA: missing salary entries
print(pd.DataFrame({
    'FRA': df_FRA.isnull().sum(),
    'SWE': df_SWE.isnull().sum(),
    'ITA': df_ITA.isnull().sum(),
    'USA': df_USA.isnull().sum(),
}))

# Calculate percentage of missing values
df_FRA.isnull().mean() * 100

# Salary entries will be examined later when columns are split 
# For now we do not drop these rows (we are not primarily interested in salaries)

In [None]:
# Visualize missing data patterns using missingno
msno.matrix(df_FRA) #.sample(250)
plt.title('Matrix Plot of Missing Values - French listings', fontsize=16)  # Add a title to the plot
plt.show()

In [None]:
# Look into the missing descriptions for French job listings (seems that requests were blocked here?)
print(df_FRA.isnull().sum())
#df_FRA[df_FRA['job_description'].isnull()]

# Manual imports of missing job descriptions from URLs 
df_FRA_missing = pd.read_csv('df_FRA_missing_data.csv')

In [None]:
# Merge the manually collected descriptions into the French listings based on the 'job_link' column.
# A left join keeps every listing, including those without a manually collected description.
merged_df = pd.merge(df_FRA, df_FRA_missing[['job_link', 'job_description_new']], on='job_link', how='left')
# Save the job description as only one column: keep the scraped text, fall back to the manual one when it is NaN
merged_df['job_description'] = merged_df['job_description'].combine_first(merged_df['job_description_new'])
merged_df = merged_df.drop(columns='job_description_new')
# Verify that there are no missing values in job_description now (should be 0)
merged_df['job_description'].isnull().sum()

In [None]:
merged_df.isnull().sum()
df_FRA = merged_df
print(df_FRA.isnull().sum())

In [None]:
# Now the job_description should not have missing values anymore
msno.matrix(df_FRA) #.sample(250)
plt.title('Matrix Plot of Missing Values - French listings', fontsize=16)  
plt.show()

In [None]:
# There are still many missing values for salary, but this is since that information was not always provided in the job listings
df_FRA[df_FRA['salary'].isnull()]

### Check for duplicates

In [None]:
# Check for potential duplicates 
check_duplicates(df_FRA)

In [None]:
df_FRA.isnull().sum()

__Conclusion:__ There seem to be no issues with duplicated entries for any of the countries. 

## Feature engineering 

### Retrieve numeric values for salary

In [None]:
# Dictionary of data frames 
dfs = {'SWE': df_SWE, 'FRA': df_FRA, 'ITA': df_ITA, 'USA': df_USA}

# Loop through the dictionary and apply the function to clean dfs and add currency and salaries
dfs = {key: clean_and_add_currency_and_salaries(df, currency_mapping[key]) for key, df in dfs.items()}

In [None]:

'''
# Unpack the cleaned DataFrames
df_SWE, df_FRA, df_ITA, df_USA = dfs.values()

# Inspect output 
print(df_ITA.isnull().sum())
# Check salary ranges 
df_ITA.describe()
# Filter out data when column 'salary_num' is not NaN
df_filtered = df_ITA.dropna(subset=['salary_num'])
df_filtered
'''

In [None]:
# Merge all dfs in the dictionary into a single df
# ignore_index=True already produces a fresh 0..n-1 index, so no extra reset_index is needed
df_combined = pd.concat(dfs.values(), ignore_index=True)

# Add dates when data was scraped (stored as a proper datetime64 column)
# NOTE(review): the "Load the data" section mentions Sept 20-21 2024 — confirm which dates are correct
scraped_first_day = df_combined['country'].isin(['Sweden', 'USA'])
df_combined['date'] = pd.to_datetime(np.where(scraped_first_day, '2024-09-19', '2024-09-20'))
df_combined.describe()

In [None]:
# Maybe good to assign a job id to a column, instead of checking for unique URLs 
df_combined.insert(0, 'job_id', range(1, len(df_combined) + 1))

In [None]:
df_combined.head()

### Detect keywords 

In [None]:
# Extract keywords 
common_keywords_SWE = extract_keywords(df_combined, 'Sweden', 'swedish')
common_keywords_FRA = extract_keywords(df_combined, 'France', 'french')
common_keywords_ITA = extract_keywords(df_combined, 'Italy', 'italian')
common_keywords_USA = extract_keywords(df_combined, 'USA', 'english')

# Plot common keywords 
plot_common_keywords(common_keywords_SWE[0], 'Sweden')
plot_common_keywords(common_keywords_FRA[0], 'France')
plot_common_keywords(common_keywords_ITA[0], 'Italy')
plot_common_keywords(common_keywords_USA[0], 'USA')

In [None]:
# Calculate software/programming keyword counts for each country 
keyword_counts_SWE = count_keywords(df_combined, 'Sweden')
keyword_counts_FRA = count_keywords(df_combined, 'France')
keyword_counts_ITA = count_keywords(df_combined, 'Italy')
keyword_counts_USA = count_keywords(df_combined, 'USA')

# Combine all dfs into one
keyword_counts_combined = pd.concat([keyword_counts_SWE, keyword_counts_FRA, keyword_counts_ITA, keyword_counts_USA], ignore_index=True)
keyword_counts_combined.sort_values(by=['Count'], ascending=[False])

In [None]:
# Extract interview information
interview_info_df = extract_interview_info(df_combined)

In [None]:
# Count the total occurrences of each interview stage
# job_id is an identifier, not a stage flag, so it is left out of the totals
interview_stage_counts = interview_info_df.drop(columns='job_id').sum()

# Convert the Series to a DataFrame for better readability, most frequent stage first
interview_stage_counts_df = interview_stage_counts.reset_index()
interview_stage_counts_df.columns = ['Interview Stage', 'Count']
interview_stage_counts_df = interview_stage_counts_df.sort_values('Count', ascending=False, ignore_index=True)

print(interview_stage_counts_df)

# Many presentations: could this perhaps be that there are a lot of the jobs that include 'presentations' as a job assignment?

## Univariate analysis 


In [None]:
# histograms for salaries 
# to do 

## Bivariate analysis 

In [None]:
# Word cloud visualizations per country
def plt_wordtree(data, country):
    """
    Draw a word cloud for a collection of words.

    Parameters:
    - data: Iterable of strings (e.g. tokens); they are joined with spaces
      into one text before the cloud is generated.
    - country: Name of the country, used in the plot title.
    """
    # One long text is what WordCloud.generate expects
    joined_text = ' '.join(data)

    cloud = WordCloud(width=800, height=400, background_color='white')
    cloud_image = cloud.generate(joined_text)

    # Show the cloud without axes
    plt.figure(figsize=(10, 5))
    plt.imshow(cloud_image, interpolation='bilinear')
    plt.title(f'Wordcloud of Job Descriptions - {country}')
    plt.axis('off')
    plt.show()

plt_wordtree(common_keywords_SWE[1], 'Sweden')
plt_wordtree(common_keywords_FRA[1], 'France')
plt_wordtree(common_keywords_ITA[1], 'Italy')
plt_wordtree(common_keywords_USA[1], 'USA')

In [None]:
# Add also word cloud visualizations per job title? 

In [None]:
# Box plots help visualize the distribution of salary ranges across different categories (e.g., job titles, countries).
# Look at outliers

# Number of job listings per job title and country 

In [None]:
# Please note: salaries are not in the same currency (SEK, EUR, USD), so comparisons across
# countries can't really be made. 'salary_num_low' is also the raw number from the listing,
# i.e. it is not normalized to a common pay period (see 'min_salary_month' for that).

# Distribution of the lower salary bound per searched job title
sns.boxplot(data=df_combined, x='search_keyword', y='salary_num_low')
plt.xticks(rotation=45)
plt.title('Salary Distribution by Job Title')
plt.show()

# Mean lower salary bound per searched job title
mean_salary = df_combined.groupby('search_keyword')['salary_num_low'].mean().reset_index()
sns.barplot(data=mean_salary, x='search_keyword', y='salary_num_low')
plt.xticks(rotation=45)
plt.title('Average Salary by Job Title')
plt.show()

# NOTE(review): 'salary_num_low' is a continuous number, so hue creates one level per distinct
# value rather than per salary range — consider binning it (e.g. with pd.cut) first.
sns.countplot(data=df_combined, x='search_keyword', hue='salary_num_low')
plt.xticks(rotation=45)
plt.title('Count of Job Titles by Salary Range')
plt.show()

# Number of listings per searched job title, split by country
sns.countplot(data=df_combined, x='search_keyword', hue='country')
plt.xticks(rotation=45)
plt.title('Count of Job Titles by Country')
plt.show()

In [None]:

# Chi-squared test of independence between searched job title and binned lower salary bound
# (scipy.stats is imported as `stats` in the import cell at the top of the notebook)
# NOTE(review): salaries above 100000 fall outside the bins and are dropped from the table
salary_bin_edges = [0, 20000, 40000, 60000, 80000, 100000]
salary_bin_labels = ['<20k', '20-40k', '40-60k', '60-80k', '80-100k']

salary_ranges = pd.cut(df_combined['salary_num_low'], bins=salary_bin_edges, labels=salary_bin_labels)
crosstab = pd.crosstab(df_combined['search_keyword'], salary_ranges)
chi2, p, dof, expected = stats.chi2_contingency(crosstab)
print(f'Chi-squared: {chi2}, p-value: {p}')

In [None]:
pivot_table = df_combined.pivot_table(values='salary_num_low', index='country', columns='search_keyword', aggfunc='mean')
sns.heatmap(pivot_table, annot=True, cmap='YlGnBu')
plt.title('Average Salary Heatmap by Country and Job Title')
plt.show()

'''
g = sns.FacetGrid(df_clean, col='search_location', col_wrap=3)
g.map(sns.boxplot, 'search_keyword', 'salary_num_low')
plt.xticks(rotation=45)
plt.show()
'''

## Conclusions 
TBA.
It would also be interesting to look into the 'Recruitment process' and 'Interview process', since the dataset contains information about them.