## Install and import packages

In [4]:
%pip install pandas numpy psycopg2 ipywidgets

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.0.1 -> 25.0.1
[notice] To update, run: C:\Users\XavierDelGiudice\AppData\Local\Programs\Python\Python310\python.exe -m pip install --upgrade pip


In [1]:
import time
import ipywidgets as widgets
import psycopg2
from IPython.display import display
import re
import pandas as pd

## Connect to DB

In [14]:
DB_NAME = "Linkedin"
DB_USER = "postgres"
DB_PASS = "Sapienza"
DB_HOST = "localhost"
DB_PORT = "5432"


def get_db_connection():
    return psycopg2.connect(
        dbname=DB_NAME, user=DB_USER, password=DB_PASS,
        host=DB_HOST, port=DB_PORT)

In [15]:
conn = get_db_connection()
cur = conn.cursor()
cur.execute('''SELECT DISTINCT "FORMATTED_EXPERIENCE_LEVEL" FROM public."POSTING" WHERE "FORMATTED_EXPERIENCE_LEVEL" IS NOT NULL ORDER BY "FORMATTED_EXPERIENCE_LEVEL" ASC''')
experience_levels = [row[0] for row in cur.fetchall()]
cur.close()

len(experience_levels)

6

Distinct is not very performant then if ww do not have duplicate rows into this column we can also avoid to use the DISTINCT statement

## Insert your personal information

We prefered to use the like concat instead that equals or not equals in order to extract meaningful information also if we do not correctly insert the exact value in the where condition

In [6]:
conn = get_db_connection()
cur = conn.cursor()

cur.execute('''SELECT DISTINCT "FORMATTED_EXPERIENCE_LEVEL" FROM public."POSTING" WHERE "FORMATTED_EXPERIENCE_LEVEL" IS NOT NULL ORDER BY "FORMATTED_EXPERIENCE_LEVEL" ASC''')
experience_levels = [row[0] for row in cur.fetchall()]
cur.close()

cur = conn.cursor()
cur.execute('''SELECT DISTINCT "ID", "SKILL_NAME" FROM public."SKILL" WHERE "SKILL_NAME" IS NOT NULL ORDER BY "SKILL_NAME" ASC''')
skill_data = cur.fetchall()
skill_dict = {row[1]: row[0] for row in skill_data}
skill_names = list(skill_dict.keys())
cur.close()

email_pattern = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$'

# Widget creation
name_input = widgets.Text(placeholder="Insert your name")
surname_input = widgets.Text(placeholder="Insert your surname")
email_input = widgets.Text(placeholder="Insert your email")
experience_input = widgets.Dropdown(options=experience_levels, description="Experience:")
skill_input = widgets.SelectMultiple(
    options=skill_names,
    description="Skills:",
    rows=10
)

button = widgets.Button(description="Insert Data", button_style="success")
output = widgets.Output()

def insert_data(b):
    with output:
        output.clear_output()
        name = name_input.value
        surname = surname_input.value
        email = email_input.value
        experience = experience_input.value
        selected_skills = skill_input.value
        
        # Validazione email
        if not re.match(email_pattern, email):
            print("Warning: Invalid email format. Please enter a valid email address.")
            email_input.value = ""
            return
        
        try:
            cur = conn.cursor()
            # Inserimento utente
            query_user = '''INSERT INTO public."USERS" ("NAME", "SURNAME", "MAIL", "EXPERIENCE") 
                            VALUES (%s, %s, %s, %s) RETURNING "ID"'''
            cur.execute(query_user, (name, surname, email, experience))
            user_id = cur.fetchone()[0]  # Recupero USER_ID
            conn.commit()
            
            # Inserimento skill per l'utente
            if selected_skills:
                query_skill = '''INSERT INTO public."USER_SKILL" ("USER_ID", "SKILL_ID") VALUES (%s, %s)'''
                for skill_name in selected_skills:
                    skill_id = skill_dict[skill_name]
                    cur.execute(query_skill, (user_id, skill_id))
                
                conn.commit()
            
            cur.close()
            print("Data inserted successfully!")
        except Exception as e:
            print(f"Error while connecting to database: {e}")

# Collegamento del bottone all'evento
button.on_click(insert_data)

# Visualizzazione dei widget
display(widgets.VBox([
    widgets.Label("Name:"), name_input,
    widgets.Label("Surname:"), surname_input,
    widgets.Label("Email:"), email_input,
    experience_input,
    skill_input,
    button, output
]))

VBox(children=(Label(value='Name:'), Text(value='', placeholder='Insert your name'), Label(value='Surname:'), …

## Some interesting insight/question that can be extracted from data:


Most In-Demand Job Titles

```SQL 
SELECT "job_title", COUNT(*) AS num_postings
FROM public."postings"
GROUP BY "job_title"
ORDER BY num_postings DESC
LIMIT 10;
```

Most Hiring Companies

```SQL 
SELECT "company_name", COUNT(*) AS num_postings
FROM public."postings"
GROUP BY "company_name"
ORDER BY num_postings DESC
LIMIT 10;
```

Job Locations with Highest Demand

```SQL 
SELECT "job_location", COUNT(*) AS num_postings
FROM public."postings"
GROUP BY "job_location"
ORDER BY num_postings DESC
LIMIT 10;
```

Average Salary for Each Job Title (if salary info is available)

```SQL 
SELECT "job_title", AVG("salary") AS avg_salary
FROM public."postings"
WHERE "salary" IS NOT NULL
GROUP BY "job_title"
ORDER BY avg_salary DESC
LIMIT 10;
```

Remote vs On-Site Jobs

```SQL
SELECT "work_type", COUNT(*) AS num_postings
FROM public."postings"
WHERE "work_type" IN ('Remote', 'On-site')
GROUP BY "work_type";
```

Most Common Skills Required

```SQL
SELECT "skills", COUNT(*) AS num_occurrences
FROM public."postings"
WHERE "skills" IS NOT NULL
GROUP BY "skills"
ORDER BY num_occurrences DESC
LIMIT 10;
```

## 1. Select all the posting job for company name given by input and with requested location and for a specific job location ricorda che devo mettere l upper in entrambi input e match

In [7]:
company_input = widgets.Text(placeholder="Insert the company name")
location_input = widgets.Text(placeholder="Insert the preferred location")
job_title_input = widgets.Text(placeholder="Insert the job title")
search_button = widgets.Button(description="Look for job posting", button_style="primary")
output = widgets.Output()


def search_jobs(b):
    with output:
        output.clear_output()
        company = company_input.value.strip() or None
        location = location_input.value.strip() or None
        job_title = job_title_input.value.strip() or None

        conn = None

        try:
            conn = get_db_connection()
            cur = conn.cursor()

            query = '''
            SELECT "JOB_ID", "COMPANY_NAME", "LOCATION", "TITLE" FROM public."POSTING"
            WHERE (%s IS NULL OR UPPER("COMPANY_NAME") ILIKE UPPER(%s)) 
            AND (%s IS NULL OR UPPER("LOCATION") ILIKE UPPER(%s)) 
            AND (%s IS NULL OR UPPER("TITLE") ILIKE UPPER(%s))
            ORDER BY "JOB_ID" ASC
            LIMIT 10;
            '''

            # Prepare parameters
            params = (
                company, f"%{company}%" if company else None,
                location, f"%{location}%" if location else None,
                job_title, f"%{job_title}%" if job_title else None
            )

            cur.execute(query, params)
            rows = cur.fetchall()

            # Close cursor before processing results
            cur.close()
            conn.close()

            if rows:
                df = pd.DataFrame(rows, columns=["JOB_ID", "COMPANY_NAME", "LOCATION", "TITLE"])
                display(df)
            else:
                print("No job postings found with the selected parameters.")

        except psycopg2.Error as e:
            if conn:
                conn.rollback()
            print(f"Database error: {e}")

        except Exception as e:
            print(f"Unexpected error: {e}")

        finally:
            if conn:
                conn.close()


search_button.on_click(search_jobs)

display(widgets.VBox([
    widgets.Label("Company Name:"), company_input,
    widgets.Label("Location:"), location_input,
    widgets.Label("Job Title:"), job_title_input,
    search_button, output
]))

VBox(children=(Label(value='Company Name:'), Text(value='', placeholder='Insert the company name'), Label(valu…

## 2. Select all the job postings for the TechGiant (more than 500 employees and in the IT industry) DA IMPLEMENTARE SICURAMENTE

In [8]:
# Create input fields
industry_input = widgets.Text(placeholder="Insert the Industry domain")
company_input = widgets.Text(placeholder="Insert the company name")
search_button = widgets.Button(description="Look for job posting", button_style="primary")
output = widgets.Output()


# Function to search jobs in TechGiant companies
def search_jobs(b):
    with output:
        output.clear_output()
        company = company_input.value.strip() or None
        industry = industry_input.value.strip() or None

        conn = None

        try:
            conn = get_db_connection()
            cur = conn.cursor()

            query = '''
            SELECT P."COMPANY_NAME", P."VIEWS", P."PAY_PERIOD", P."APPLIES", P."EXPIRY", CI."INDUSTRY"
            FROM public."POSTING" as P 
            JOIN public."COMPANY_INDUSTRY" as CI ON P."COMPANY_ID" = CI."COMPANY_ID"
            JOIN public."EMPLOYEE_COUNTS" as EC ON P."COMPANY_ID" = EC."COMPANY_ID" 
            WHERE EC."EMPLOYEE_COUNT" > 2000 
            AND "VIEWS" IS NOT NULL
            AND (%s IS NULL OR CI."INDUSTRY" ILIKE %s)
            AND (%s IS NULL OR P."COMPANY_NAME" ILIKE %s) 
            ORDER BY "VIEWS" DESC
            LIMIT 20;
            '''

            # Prepare parameters (matching the placeholders)
            params = (
                industry, f"%{industry}%" if industry else None,
                company, f"%{company}%" if company else None
            )

            cur.execute(query, params)
            rows = cur.fetchall()

            # Close cursor before processing results
            cur.close()
            conn.close()

            if rows:
                df = pd.DataFrame(rows,
                                  columns=["COMPANY_NAME", "VIEWS", "PAY_PERIOD", "APPLIES", "EXPIRY", "INDUSTRY"])
                display(df)
            else:
                print("No job postings found for TechGiant companies.")

        except psycopg2.Error as e:
            if conn:
                conn.rollback()  # Reset transaction state on error
            print(f"Database error: {e}")

        except Exception as e:
            print(f"Unexpected error: {e}")

        finally:
            if conn:
                conn.close()


# Bind function to button click
search_button.on_click(search_jobs)

# Display UI
display(widgets.VBox([
    widgets.Label("Industry:"), industry_input,
    widgets.Label("Company Name:"), company_input,
    search_button, output
]))

VBox(children=(Label(value='Industry:'), Text(value='', placeholder='Insert the Industry domain'), Label(value…

## 3. Select all the companies and job position requested which salary is between two values given by input. Add info such as, all the part-time with USD payment and Monthly pay period, I want also to select the industry and the job should match at least 1 of my skill, order by number of skill matched

Considering today is today but one year early select all the active job postings that allows for remote working, that pay in USD, from companies with name given by input and that respect the characteristics of the logged user DA INTEGRARE CON LA 4

In [18]:
from ipywidgets import widgets, VBox, Label

def fetch_users():
    conn = get_db_connection()
    cur = conn.cursor()
    cur.execute('SELECT "ID", "NAME", "SURNAME", "MAIL" FROM public."USERS" ORDER BY "NAME";')
    users = cur.fetchall()
    cur.close()
    conn.close()
    user_dict = {f"{name} {surname} ({email})": user_id for user_id, name, surname, email in users}
    return user_dict

user_dict = fetch_users()
user_dropdown = widgets.Dropdown(
    options=[("Select a user", None)] + list(user_dict.items()),
    description="User:",
    style={"description_width": "auto"}
)

currency_input = widgets.Text(placeholder="Currency (e.g., USD)", description="Currency:")
job_type_input = widgets.Text(placeholder="Job Type (e.g., Part-time)", description="Job Type:")
pay_period_input = widgets.Text(placeholder="Pay Period (e.g., Monthly)", description="Pay Period:")
industry_input = widgets.Text(placeholder="Industry (e.g., Tech)", description="Industry:")
company_input = widgets.Text(placeholder="Company Name", description="Company:")

salary_slider = widgets.IntRangeSlider(
    value=[0, 100000],
    min=0,
    max=100000,
    step=100,
    description="Salary Range:",
    continuous_update=False
)

search_button = widgets.Button(description="Look for job postings", button_style="primary")
output = widgets.Output()

def search_jobs(b):
    with output:
        output.clear_output()
        currency = currency_input.value.strip() or None
        job_type = job_type_input.value.strip() or None
        pay_period = pay_period_input.value.strip() or None
        industry = industry_input.value.strip() or None
        company = company_input.value.strip() or None
        min_salary, max_salary = salary_slider.value

        selected_user = user_dropdown.value
        if not selected_user:
            print("Please select a user before searching for jobs.")
            return

        conn = None

        try:
            conn = get_db_connection()
            cur = conn.cursor()

            query = '''
            SELECT 
                p."COMPANY_NAME",
                p."TITLE",
                p."PAY_PERIOD",
                p."CURRENCY",
                p."WORK_TYPE",
                s."MIN_SALARY",
                s."MAX_SALARY",
                i."NAME" AS industry
            FROM public."POSTING" p
            JOIN public."SALARIES" s ON p."JOB_ID" = s."JOB_ID"
            JOIN public."JOB_INDUSTRIES" ji ON p."JOB_ID" = ji."JOB_ID"
            JOIN public."INDUSTRIES" i ON ji."INDUSTRY_ID" = i."ID"
            LEFT JOIN (
                SELECT js."JOB_ID", COUNT(*) AS skill_match_count
                FROM public."JOB_SKILLS" js
                JOIN public."USER_SKILL" us ON js."SKILL_ABR" = (
                    SELECT "SKILL_ABR" FROM public."SKILL" WHERE "ID" = us."SKILL_ID"
                )
                WHERE us."USER_ID" = %s
                GROUP BY js."JOB_ID"
            ) jsm ON p."JOB_ID" = jsm."JOB_ID"
            WHERE 
                p."EXPIRY" >= (EXTRACT(EPOCH FROM NOW()) - (365 * 24 * 60 * 60)) * 1000
                AND (%s IS NULL OR p."CURRENCY" ILIKE %s)
                AND (%s IS NULL OR p."WORK_TYPE" ILIKE %s)
                AND (%s IS NULL OR p."PAY_PERIOD" ILIKE %s)
                AND (%s IS NULL OR p."COMPANY_NAME" ILIKE %s)
                AND (%s IS NULL OR i."NAME" ILIKE %s)
                AND (s."MIN_SALARY" >= %s)
                AND (s."MAX_SALARY" <= %s)
            ORDER BY skill_match_count DESC
            LIMIT 20;
            '''

            params = (
                selected_user,
                currency, f"%{currency}%" if currency else None,
                job_type, f"%{job_type}%" if job_type else None,
                pay_period, f"%{pay_period}%" if pay_period else None,
                company, f"%{company}%" if company else None,
                industry, f"%{industry}%" if industry else None,
                min_salary,
                max_salary
            )

            start_time = time.time()
            cur.execute(query, params)
            rows = cur.fetchall()
            end_time = time.time()
            
            execution_time = end_time - start_time

            cur.close()
            conn.close()

            print(f"Query executed in {execution_time:.4f} seconds")
            
            if rows:
                df = pd.DataFrame(rows, columns=["COMPANY_NAME", "JOB_TITLE", "PAY_PERIOD", "CURRENCY", "WORK_TYPE",
                                                 "MIN_SALARY", "MAX_SALARY", "INDUSTRY"])
                display(df)
            else:
                print("No job postings found with the given filters.")

        except psycopg2.Error as e:
            if conn:
                conn.rollback()
            print(f"Database error: {e}")

        except Exception as e:
            print(f"Unexpected error: {e}")

        finally:
            if conn:
                conn.close()

search_button.on_click(search_jobs)

display(VBox([
    Label("Select User:"),
    user_dropdown,
    Label("Enter Job Filters:"),
    currency_input,
    job_type_input,
    pay_period_input,
    industry_input,
    company_input,
    salary_slider,
    search_button,
    output
]))

VBox(children=(Label(value='Select User:'), Dropdown(description='User:', options=(('Select a user', None), ('…

## 4. 

## 5. Select all the how many job postings each company has and its maximum recorded employee count. add also the ratio of growing thanks to the time column computed as the proportion of employee since the first time recorded employees count and the last, replicate also for the follower 

In [13]:
SELECT c."NAME" AS company_name, COUNT(p."JOB_ID") AS job_postings, 
       MAX(e."EMPLOYEE_COUNT") AS max_employees
FROM public."COMPANIES" c
LEFT JOIN public."POSTING" p ON c."ID" = p."COMPANY_ID"
LEFT JOIN public."EMPLOYEE_COUNTS" e ON c."ID" = e."COMPANY_ID"
GROUP BY c."NAME"
ORDER BY job_postings DESC;

SyntaxError: invalid syntax (3866171482.py, line 1)

In [23]:
company_name_input = widgets.Text(placeholder="Enter Company Name", description="Company:")
search_button = widgets.Button(description="Analyze Company Growth", button_style="primary")
output = widgets.Output()

def analyze_company_growth(b):
    with output:
        output.clear_output()
        company_name = company_name_input.value.strip()
        
        conn = None
        try:
            conn = get_db_connection()
            cur = conn.cursor()

            query = '''
                    SELECT 
                    c."NAME" AS company_name,
                    jc.job_postings,
                    eg.last_employee_count,
                    eg.first_employee_count,
                    eg.last_follower_count,
                    eg.first_follower_count,
                    CASE 
                        WHEN eg.first_employee_count = 0 THEN NULL 
                        ELSE (eg.last_employee_count - eg.first_employee_count) * 1.0 / eg.first_employee_count 
                    END AS employee_growth_ratio,
                    CASE 
                        WHEN eg.first_follower_count = 0 THEN NULL 
                        ELSE (eg.last_follower_count - eg.first_follower_count) * 1.0 / eg.first_follower_count 
                    END AS follower_growth_ratio
                FROM public."COMPANIES" c
                JOIN (
                    SELECT 
                        "COMPANY_ID",
                        MIN("EMPLOYEE_COUNT") AS first_employee_count,
                        MAX("EMPLOYEE_COUNT") AS last_employee_count,
                        MIN("FOLLOWER_COUNT") AS first_follower_count,
                        MAX("FOLLOWER_COUNT") AS last_follower_count
                    FROM public."EMPLOYEE_COUNTS"
                    GROUP BY "COMPANY_ID"
                ) AS eg ON c."ID" = eg."COMPANY_ID"
                JOIN (
                    SELECT "COMPANY_ID", COUNT(*) AS job_postings
                    FROM public."POSTING"
                    GROUP BY "COMPANY_ID"
                ) AS jc ON c."ID" = jc."COMPANY_ID"
                WHERE (%s IS NULL OR c."NAME" ILIKE %s)
                ORDER BY employee_growth_ratio DESC;
            '''

            params = (company_name if company_name else None, f"%{company_name}%" if company_name else None)

            start_time = time.time()
            cur.execute(query, params)
            rows = cur.fetchall()
            end_time = time.time()
            
            execution_time = end_time - start_time

            cur.close()
            conn.close()

            print(f"Query executed in {execution_time:.4f} seconds")
            
            if rows:
                df = pd.DataFrame(rows, columns=["Company Name", "Job Postings", "Max Employees", 
                                                 "Min Employees", "Max Followers", "Min Followers", 
                                                 "Employee Growth Ratio", "Follower Growth Ratio"])
                display(df)
            else:
                print("No company data found.")

        except psycopg2.Error as e:
            if conn:
                conn.rollback()
            print(f"Database error: {e}")

        except Exception as e:
            print(f"Unexpected error: {e}")

        finally:
            if conn:
                conn.close()

search_button.on_click(analyze_company_growth)

display(widgets.VBox([
    widgets.Label("Enter a company name to filter results (leave blank for all):"),
    company_name_input,
    search_button,
    output
]))

VBox(children=(Label(value='Enter a company name to filter results (leave blank for all):'), Text(value='', de…

## 6. Select all the VOGLIO SAPERE SE NEL MIO STATO CI SONO TANTE AZIENDE DELLA MIA STESSA TIPOLOGIA(INDUSTRY) al mio stesso livello: nmumero di job posting, numero di employee e numero di follower. E CHE HANNO LE MIE STESSE SPECIALITA

## 7. Select all the Finds the salary ranges for different industries based on job postings.

 I also want to know this info for industry and job title
 
Finds job listings by country and their average salary, but only for countries with more than 10 job postings.

In [None]:
SELECT c."COUNTRY", COUNT(p."JOB_ID") AS total_jobs, 
       AVG(s."MED_SALARY") AS avg_salary
FROM public."POSTING" p
JOIN public."COMPANIES" c ON p."COMPANY_ID" = c."ID"
LEFT JOIN public."SALARIES" s ON p."JOB_ID" = s."JOB_ID"
GROUP BY c."COUNTRY"
HAVING COUNT(p."JOB_ID") > 10
ORDER BY avg_salary DESC;



SELECT i."NAME" AS industry, 
       COUNT(j."JOB_ID") AS job_count, 
       AVG(s."MIN_SALARY") AS avg_min_salary,
       AVG(s."MED_SALARY") AS avg_med_salary,
       AVG(s."MAX_SALARY") AS avg_max_salary
FROM public."JOB_INDUSTRIES" ji
JOIN public."INDUSTRIES" i ON ji."INDUSTRY_ID" = i."ID"
JOIN public."SALARIES" s ON ji."JOB_ID" = s."JOB_ID"
JOIN public."POSTING" j ON ji."JOB_ID" = j."JOB_ID"
GROUP BY i."NAME"
ORDER BY job_count DESC;


## 8. Select all the ...
 calcolare la top 5 dei lavori con in percentuale più proposte di lavoro da remoto

## 9. Select all the ...
- visualizzare i top 3 lavori con minor tempo medio per chiudere la job posting(possiamo usare anche le application per quel lavoro per calcolare quanto sia proficua quella proposta di lavoro) la piu richiesta tipologia di lavoro
aggiungere anche quali sono le piu diffici da prendere anche considerando il rapportto applies views

## 10. Finds the top 10 most in-demand skills by industry, including salary insights.
 
aggiungere poi: non so se a livello di sql si possa fare, ma pensavo di selezionare le skill richieste per un lavoro nella top 3 dei salari retribuiti e con un esperience level = entry level

In [None]:
SELECT sk."SKILL_NAME", ind."NAME" AS industry, 
       COUNT(js."JOB_ID") AS job_count, 
       ROUND(AVG(s."MED_SALARY"), 2) AS avg_salary
FROM public."JOB_SKILLS" js
JOIN public."SKILL" sk ON js."SKILL_ABR" = sk."SKILL_ABR"
JOIN public."JOB_INDUSTRIES" ji ON js."JOB_ID" = ji."JOB_ID"
JOIN public."INDUSTRIES" ind ON ji."INDUSTRY_ID" = ind."ID"
LEFT JOIN public."SALARIES" s ON js."JOB_ID" = s."JOB_ID"
GROUP BY sk."SKILL_NAME", ind."NAME"
HAVING COUNT(js."JOB_ID") > 5
ORDER BY job_count DESC, avg_salary DESC
LIMIT 10;


NOTE.
aggiungere experience_level alla tabella USER al posto di experience
Tramite una select prendere tutti i valori della colonna FORMATTED_EXPERIENCE_LEVEL dalla tabella posting e fare in modo che durante l inserimento da PYTHON puo essere scelto un unico valore tra questi della lista

aggiungere skill_id su tabella USER

prendere tutte le skill possibili (nome) dalla tabella SKILL e dare la possibilita all utente di inserire le proprie skill ognuna di queste sara una riga nella tabella di appoggio che verra creata USER_SKILL
Quando si cerca un job posting verranno visualizzate quelle che matchano almeno un numero da scegliere di skill richieste dal job (JOB_SKILLS) e quelle dell utente loggato o registrato






Views can make the query faster allowing for temporarily queries that store huge info instead to compute it into the query itself, but these are not stored in the memory like usual table and this can cause slower access since the views are not optimized to being accessed like normal table.

TRADE OFF: Faster computation of JOIN for example that can be directly stored into a virtual table, Slower access since it is not optimized for it.