In [None]:
import pandas as pd
import datetime
import numpy as np
# Testing

# Linkedin Data from Kaggle

In [None]:
data = pd.read_csv("job postings 2023 24/postings.csv")

In [None]:
data.columns

In [None]:
data.original_listed_time[:2]

In [None]:
def convert_unix_to_ddmmyyyy(unix_time):
    # Convert from milliseconds to seconds
    unix_time_seconds = unix_time / 1000.0
    
    # Convert to datetime and handle errors
    formatted_dates = pd.to_datetime(unix_time_seconds, unit='s', errors='coerce')
    
    # Format as 'dd-mm-yyyy'
    return formatted_dates.dt.strftime('%d-%m-%Y')

In [None]:
raw_data = data.copy()

In [None]:
raw_data['original_listed_time_mod'] = convert_unix_to_ddmmyyyy(raw_data['original_listed_time'])

In [None]:
raw_data['original_listed_time_mod'] =pd.to_datetime(raw_data['original_listed_time_mod'], format='%d-%m-%Y', errors='coerce')

In [None]:
raw_data['original_listed_time_mod'].min()

In [None]:
raw_data['original_listed_time_mod'].max()

# Data Scraped from ScrapingDog API

In [None]:
import requests

def fetch_all_linkedin_jobs(api_key, fields, geoids, pages):
    url = "https://api.scrapingdog.com/linkedinjobs/"
    all_data = []  # Store data from all requests

    # Iterate over each combination of field, geoid, and page
    for field in fields:
        for geoid in geoids:
            for page in pages:
                params = {
                    "api_key": api_key,
                    "field": field,
                    "geoid": geoid,
                    "page": page
                }
                
                response = requests.get(url, params=params)
                
                if response.status_code == 200:
                    # Append data from each request to the list
                    all_data.extend(response.json())
                else:
                    print(f"Request failed for field: {field}, geoid: {geoid}, page: {page} with status code: {response.status_code}")

    return all_data

# Usage example
api_key = "671da7467fddaf7ca053001a"
fields = ["java", "data science", "machine learning"]
geoids = ["100293800","103736294","102277331"]  # NV, Colorado, California
pages = ["1", "2", "3"]

# data = fetch_all_linkedin_jobs(api_key, fields, geoids, pages)



In [None]:
if data:
    new_Data = pd.DataFrame(data)
    new_Data.to_csv("261024_Job_Data.csv")

# Data Preprocessing

## Filtering Data

Filtering Function Overview
The filter_data function dynamically filters a DataFrame based on a specified list of columns and corresponding conditions. This flexibility allows for customized data extraction based on various criteria, such as dates, maximum or minimum values, and frequent items.

Function Parameters
df (pd.DataFrame): The input DataFrame containing the data you want to filter.

columns (list): A list of column names to apply filters on. Each column corresponds to a condition in the conditions list.

conditions (list): A list of conditions, where each element is a tuple specifying the operation and the value for filtering. Each tuple matches one of the columns in the columns list.


In [None]:
import pandas as pd

def filter_data(df, columns, conditions):
    """
    Filters a DataFrame based on a list of columns and their conditions.

    Parameters:
    - df (pd.DataFrame): The input DataFrame to filter.
    - columns (list): List of column names to apply filters on.
    - conditions (list): List of conditions where each element is a tuple in the format:
                         (condition, value), for example ('>', '2024-04-20').
                         Each tuple should correspond to a column in the columns list.
    
    Returns:
    - pd.DataFrame: A filtered DataFrame.
    """
    # Loop over each column and its corresponding condition
    for column, (condition, value) in zip(columns, conditions):
        if condition == '>':
            df = df[df[column] > value]
        elif condition == '<':
            df = df[df[column] < value]
        elif condition == '==':
            df = df[df[column] == value]
        elif condition == '>=':
            df = df[df[column] >= value]
        elif condition == '<=':
            df = df[df[column] <= value]
        elif condition == '!=':
            df = df[df[column] != value]
        elif condition == 'max':
            max_value = df[column].max()
            df = df[df[column] == max_value]
        elif condition == 'min':
            min_value = df[column].min()
            df = df[df[column] == min_value]
        elif condition == 'top_n':
            top_n_counts = df[column].value_counts().nlargest(value)
            df = df[df[column].isin(top_n_counts.index)]
        elif condition == 'top_n':
            top_n_counts = df[column].value_counts()#.nlargest(value)
            df = df[df[column].isin(top_n_counts.index)]
        else:
            print(f"Invalid condition '{condition}' for column '{column}'")
    
    return df


## Filtering Scraped Data by Date

In [None]:
# Load the data
scraped_data = pd.read_csv("C:/Users/DELL/Downloads/261024_Job_Data.csv")

# Convert date column to datetime if filtering by date
scraped_data['job_posting_date'] = pd.to_datetime(scraped_data['job_posting_date'], errors='coerce')

# Specify columns and conditions
columns = ['job_posting_date']
conditions = [('>', '2024-04-20')]

# Apply the filter function
filtered_scraped_data = filter_data(scraped_data, columns, conditions)

# Save the filtered data
filtered_scraped_data.to_csv("filtered_scraped_job_data.csv", index=False)

# Print the filtered data
print(filtered_scraped_data)

## Filtering Current Kaggle Data to check top N job positions

In [None]:
import pandas as pd
kaggledata = pd.read_csv("job postings 2023 24/postings.csv")


In [None]:
def filter_top_n_values(df, column, n):
    # Get the top n most frequent values in the specified column
    top_n_values = df[column].value_counts()#.nlargest(n)
    return top_n_values
columns = ['title']
conditions = [('top_n',50)]
top_15_job_roles = filter_data(kaggledata, columns, conditions)

print(top_15_job_roles)

In [None]:
top_15_job_roles['title'].unique()

## Data cube

In [None]:
data_cube = kaggledata.pivot_table(
    values=[
        'med_salary', 'max_salary', 'min_salary', 'views', 'applies', 'normalized_salary'
    ],
    index=[
        'location', 'company_name', 'title', 'formatted_work_type', 'remote_allowed', 'formatted_experience_level'
    ],
    aggfunc={
        'med_salary': 'mean',           # Average median salary
        'max_salary': 'max',            # Maximum salary
        'min_salary': 'min',            # Minimum salary
        'views': 'sum',                 # Total views
        'applies': 'sum',               # Total applications
        'normalized_salary': 'mean'     # Average normalized salary
    }
).reset_index()

# Display the resulting data cube
print("Data Cube:")
print(data_cube)

# Optional: Save to CSV for further analysis
data_cube.to_csv('job_data_cube.csv', index=False)

## Dynamic data cube

In [None]:
def dynamic_data_cube(df, company_name=None, title=None, location=None, work_type=None):
    filtered_df = df.copy()
    
    # Apply filters based on user input
    if company_name:
        filtered_df = filtered_df[filtered_df['company_name'] == company_name]
    if title:
        filtered_df = filtered_df[filtered_df['title'] == title]
    if location:
        filtered_df = filtered_df[filtered_df['location'] == location]
    if work_type:
        filtered_df = filtered_df[filtered_df['formatted_work_type'] == work_type]
    
    # Check if filtered DataFrame is not empty
    if filtered_df.empty:
        return "No data found for the specified filters."
    
    # Aggregate metrics for the filtered data
    result = {
        'Total Job Listings': len(filtered_df),
        'Max Salary': filtered_df['max_salary'].max(),
        'Min Salary': filtered_df['min_salary'].min(),
        'Average Median Salary': filtered_df['med_salary'].mean(),
        'Total Views': filtered_df['views'].sum(),
        'Total Applications': filtered_df['applies'].sum(),
        'Average Normalized Salary': filtered_df['normalized_salary'].mean()
    }
    
    return result


In [None]:
print(dynamic_data_cube(kaggledata, company_name="ServiceNow"))

In [None]:
print(dynamic_data_cube(kaggledata, title="Full Stack Java Developer"))

#### Mapping Postings with Industries based on Company ID

In [1]:
import pandas as pd

# Load the datasets
jobs_df = data = pd.read_csv("C:/Users/DELL/Linkedin-Job-Market-Analysis-using-ML/LinkedIn Scraper/job postings 2023 24/postings.csv") 
industries_df = pd.read_csv('C:/Users/DELL/Linkedin-Job-Market-Analysis-using-ML/LinkedIn Scraper/job postings 2023 24/companies/company_industries.csv') 
# Merge the datasets on the company_id column
merged_df = jobs_df.merge(industries_df, on='company_id', how='left')  # Perform a left join

# Save the merged dataset to a new CSV file
merged_df.to_csv('merged_jobs_with_industries.csv', index=False)

print("Merge completed. The industries column has been added.")


In [6]:
merged_df.head

<bound method NDFrame.head of             job_id                     company_name  \
0           921716            Corcoran Sawyer Smith   
1          1829192                              NaN   
2         10998357           The National Exemplar    
3         23221523           Abrams Fensterman, LLP   
4         35982263                              NaN   
...            ...                              ...   
123991  3906267117                     Lozano Smith   
123992  3906267126                        Pinterest   
123993  3906267131                     EPS Learning   
123994  3906267195  Trelleborg Applied Technologies   
123995  3906267224                        Solugenix   

                                                    title  \
0                                   Marketing Coordinator   
1                       Mental Health Therapist/Counselor   
2                             Assitant Restaurant Manager   
3       Senior Elder Law / Trusts and Estates Associat...   
4   

#### Dynamic Data Cube: with industries included

In [7]:
def dynamic_data_cube(df, company_name=None, title=None, location=None, work_type=None, industry=None):
    filtered_df = df.copy()
    
    # Apply filters based on user input
    if company_name:
        filtered_df = filtered_df[filtered_df['company_name'] == company_name]
    if title:
        filtered_df = filtered_df[filtered_df['title'] == title]
    if location:
        filtered_df = filtered_df[filtered_df['location'] == location]
    if work_type:
        filtered_df = filtered_df[filtered_df['formatted_work_type'] == work_type]
    if industry:
        filtered_df = filtered_df[filtered_df['industry'].str.contains(industry, case=False, na=False)]  # Case-insensitive
    # Check if filtered DataFrame is not empty
    if filtered_df.empty:
        return "No data found for the specified filters."
    
    # Aggregate metrics for the filtered data
    result = {
        'Total Job Listings': len(filtered_df),
        'Max Salary': filtered_df['max_salary'].max(),
        'Min Salary': filtered_df['min_salary'].min(),
        'Average Median Salary': filtered_df['med_salary'].mean(),
        'Total Views': filtered_df['views'].sum(),
        'Total Applications': filtered_df['applies'].sum(),
        'Average Normalized Salary': filtered_df['normalized_salary'].mean()
    }
    
    return result

In [8]:
print(dynamic_data_cube(merged_df, company_name= "Amazon", industry="Software Development"))

{'Total Job Listings': 343, 'Max Salary': np.float64(307900.0), 'Min Salary': np.float64(15.5), 'Average Median Salary': nan, 'Total Views': np.float64(4887.0), 'Total Applications': np.float64(293.0), 'Average Normalized Salary': np.float64(145290.2548387097)}


In [9]:
print(dynamic_data_cube(merged_df, industry= "Banking"))

{'Total Job Listings': 1304, 'Max Salary': np.float64(330000.0), 'Min Salary': np.float64(15.0), 'Average Median Salary': np.float64(17364.422), 'Total Views': np.float64(10317.0), 'Total Applications': np.float64(688.0), 'Average Normalized Salary': np.float64(88354.88512024048)}


In [10]:
print(dynamic_data_cube(merged_df, industry= "Health care"))

{'Total Job Listings': 16553, 'Max Salary': np.float64(950000.0), 'Min Salary': np.float64(10.0), 'Average Median Salary': np.float64(26060.236622418877), 'Total Views': np.float64(114444.0), 'Total Applications': np.float64(7766.0), 'Average Normalized Salary': np.float64(360982.0397253541)}


#### Aggregate metrics by industries for comparisons

In [12]:
industry_metrics = merged_df.groupby('industry').agg({
    'max_salary': 'mean',
    'min_salary': 'mean',
    'views': 'sum',
    'applies': 'sum'
}).reset_index()
result = {}
result['Industry Insights'] = industry_metrics.to_dict('records')
print(result['Industry Insights'])

[{'industry': 'Accounting', 'max_salary': 125930.66279220779, 'min_salary': 77141.78220779222, 'views': 3817.0, 'applies': 330.0}, {'industry': 'Administration of Justice', 'max_salary': 92458.045, 'min_salary': 63872.7825, 'views': 70.0, 'applies': 1.0}, {'industry': 'Advertising Services', 'max_salary': 91359.05024390244, 'min_salary': 65853.44592682927, 'views': 27125.0, 'applies': 5561.0}, {'industry': 'Airlines and Aviation', 'max_salary': 111163.6608, 'min_salary': 84118.8782, 'views': 4836.0, 'applies': 449.0}, {'industry': 'Alternative Dispute Resolution', 'max_salary': nan, 'min_salary': nan, 'views': 3.0, 'applies': 0.0}, {'industry': 'Alternative Medicine', 'max_salary': 42028.5, 'min_salary': 31557.833333333332, 'views': 1076.0, 'applies': 251.0}, {'industry': 'Animation and Post-production', 'max_salary': nan, 'min_salary': nan, 'views': 74.0, 'applies': 15.0}, {'industry': 'Appliances, Electrical, and Electronics Manufacturing', 'max_salary': 85288.21811023621, 'min_salar

In [20]:
import plotly.express as px
from plotly.offline import plot
# Create a bar plot with Plotly
fig = px.bar(industry_metrics, x='industry', y=['max_salary', 'min_salary'], 
             title="Max and Min Salary by Industry", labels={'max_salary': 'Max Salary', 'min_salary': 'Min Salary'})

# To ensure Plotly works in Jupyter, you might need to explicitly render it
plot(fig)

'temp-plot.html'