## Install and import packages

In [1]:
%pip install pandas numpy sqlalchemy psycopg2 ipywidgets

Collecting sqlalchemy
  Obtaining dependency information for sqlalchemy from https://files.pythonhosted.org/packages/60/7f/ea1086136bc648cd4713a1e01869f7fc31979d67b3a8f973f5d9ab8de7e1/sqlalchemy-2.0.40-cp310-cp310-win_amd64.whl.metadata
  Downloading sqlalchemy-2.0.40-cp310-cp310-win_amd64.whl.metadata (9.9 kB)
Collecting greenlet>=1 (from sqlalchemy)
  Obtaining dependency information for greenlet>=1 from https://files.pythonhosted.org/packages/96/28/d62835fb33fb5652f2e98d34c44ad1a0feacc8b1d3f1aecab035f51f267d/greenlet-3.1.1-cp310-cp310-win_amd64.whl.metadata
  Downloading greenlet-3.1.1-cp310-cp310-win_amd64.whl.metadata (3.9 kB)
Downloading sqlalchemy-2.0.40-cp310-cp310-win_amd64.whl (2.1 MB)
   ---------------------------------------- 0.0/2.1 MB ? eta -:--:--
   ---- ----------------------------------- 0.2/2.1 MB 7.3 MB/s eta 0:00:01
   ---------------------------------------- 2.1/2.1 MB 26.9 MB/s eta 0:00:00
Downloading greenlet-3.1.1-cp310-cp310-win_amd64.whl (298 kB)
   --------


[notice] A new release of pip is available: 23.2.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import os
import re
import time

import pandas as pd
import psycopg2
from IPython.display import display
from ipywidgets import widgets, VBox, Label
from sqlalchemy import create_engine

## Connect to DB

In [1]:
# Connection settings for the PostgreSQL database used by every query below.
# Each value can be overridden with an environment variable so credentials do
# not have to be edited into the notebook; the fallbacks are the original
# local-development values, so the notebook still runs unchanged without them.
DB_NAME = os.environ.get("LINKEDIN_DB_NAME", "Linkedin")
DB_USER = os.environ.get("LINKEDIN_DB_USER", "postgres")
# NOTE(review): the fallback password is still stored in the notebook; remove
# it once LINKEDIN_DB_PASS is set wherever the notebook is run.
DB_PASS = os.environ.get("LINKEDIN_DB_PASS", "Sapienza")
DB_HOST = os.environ.get("LINKEDIN_DB_HOST", "localhost")
DB_PORT = os.environ.get("LINKEDIN_DB_PORT", "5432")

def get_db_connection():
    """Open and return a new psycopg2 connection built from the ``DB_*`` settings.

    Every call creates a fresh connection; closing it is left to the caller.
    """
    connection_settings = {
        "dbname": DB_NAME,
        "user": DB_USER,
        "password": DB_PASS,
        "host": DB_HOST,
        "port": DB_PORT,
    }
    return psycopg2.connect(**connection_settings)

`DISTINCT` is not very performant, so if a column contains no duplicate rows we can avoid using the `DISTINCT` clause on it.

## SQL

### Insert your personal information

We preferred to use `LIKE` with a concatenated pattern instead of equals / not-equals comparisons, so that meaningful results are still returned when the exact value is not typed correctly in the `WHERE` condition.

In [3]:
# One connection is opened here and kept open: it loads the choices shown in
# the form and is reused by the insert callback below.
conn = get_db_connection()

# Experience levels offered in the dropdown: distinct, non-null, alphabetical.
with conn.cursor() as cur:
    cur.execute('''SELECT DISTINCT "FORMATTED_EXPERIENCE_LEVEL" FROM public."POSTING" WHERE "FORMATTED_EXPERIENCE_LEVEL" IS NOT NULL ORDER BY "FORMATTED_EXPERIENCE_LEVEL" ASC''')
    experience_levels = [row[0] for row in cur.fetchall()]

# Skills offered in the multi-select; skill_dict maps each displayed name to its ID.
with conn.cursor() as cur:
    cur.execute('''SELECT DISTINCT "ID", "SKILL_NAME" FROM public."SKILL" WHERE "SKILL_NAME" IS NOT NULL ORDER BY "SKILL_NAME" ASC''')
    skill_data = cur.fetchall()
skill_dict = {row[1]: row[0] for row in skill_data}
skill_names = list(skill_dict.keys())

# psycopg2 starts a transaction with the first statement; end it here so the
# connection does not sit "idle in transaction" until the first insert.
conn.commit()

email_pattern = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$'

name_input = widgets.Text(placeholder="Insert your name")
surname_input = widgets.Text(placeholder="Insert your surname")
email_input = widgets.Text(placeholder="Insert your email")
experience_input = widgets.Dropdown(options=experience_levels, description="Experience:")
skill_input = widgets.SelectMultiple(
    options=skill_names,
    description="Skills:",
    rows=10
)

button = widgets.Button(description="Insert Data", button_style="success")
output = widgets.Output()

# The connection and widgets are bound as default arguments so the callback
# keeps using THIS cell's objects: later cells reassign the global `output`,
# and a global lookup at click time would print into their output area.
def insert_data(b, conn=conn, output=output, name_input=name_input,
                surname_input=surname_input, email_input=email_input,
                experience_input=experience_input, skill_input=skill_input,
                skill_dict=skill_dict):
    """Button callback: insert the user typed in the form and the selected skills.

    The e-mail is checked against ``email_pattern`` first. The USERS row and all
    USER_SKILL rows are written in a single transaction, so a failure leaves
    neither behind and the shared connection stays usable.

    ``b`` is the clicked button (unused); the other parameters are this cell's
    connection/widgets and are not meant to be passed by callers.
    """
    with output:
        output.clear_output()
        name = name_input.value.strip()
        surname = surname_input.value.strip()
        email = email_input.value.strip()
        experience = experience_input.value
        selected_skills = skill_input.value

        if not re.match(email_pattern, email):
            print("Warning: Invalid email format. Please enter a valid email address.")
            return

        query_user = '''INSERT INTO public."USERS" ("NAME", "SURNAME", "MAIL", "EXPERIENCE")
                        VALUES (%s, %s, %s, %s) RETURNING "ID"'''
        query_skill = '''INSERT INTO public."USER_SKILL" ("USER_ID", "SKILL_ID") VALUES (%s, %s)'''

        try:
            with conn.cursor() as cur:
                cur.execute(query_user, (name, surname, email, experience))
                user_id = cur.fetchone()[0]
                if selected_skills:
                    cur.executemany(
                        query_skill,
                        [(user_id, skill_dict[skill_name]) for skill_name in selected_skills],
                    )
            # Single commit: the user and the skills are stored together or not at all.
            conn.commit()
            print("Data inserted successfully!")
        except Exception as e:
            # Leave the shared connection usable for the next click.
            if not conn.closed:
                conn.rollback()
            print(f"Error while inserting data: {e}")

button.on_click(insert_data)

display(widgets.VBox([
    widgets.Label("Name:"), name_input,
    widgets.Label("Surname:"), surname_input,
    widgets.Label("Email:"), email_input,
    experience_input,
    skill_input,
    button, output
]))

VBox(children=(Label(value='Name:'), Text(value='', placeholder='Insert your name'), Label(value='Surname:'), …

### 1. Select all the job postings for a company name given as input, with the requested location and a specific job title

In [4]:
company_input = widgets.Text(placeholder="Insert the company name")
location_input = widgets.Text(placeholder="Insert the preferred location")
job_title_input = widgets.Text(placeholder="Insert the job title")
search_button = widgets.Button(description="Look for job posting", button_style="primary")
output = widgets.Output()

# The widgets are bound as default arguments so the callback keeps using THIS
# cell's widgets: later cells reassign the globals `output` and `company_input`,
# and a global lookup at click time would read from / print into those instead.
def search_jobs(b, output=output, company_input=company_input,
                location_input=location_input, job_title_input=job_title_input):
    """Button callback: list up to 10 postings matching the optional filters.

    A blank field becomes ``None``, which makes the ``%s IS NULL`` branch of the
    query true and disables that filter; otherwise the text is matched as a
    case-insensitive substring of the company name, location or title.

    ``b`` is the clicked button (unused); the other parameters are this cell's
    widgets and are not meant to be passed by callers.
    """
    with output:
        output.clear_output()
        company = company_input.value.strip() or None
        location = location_input.value.strip() or None
        job_title = job_title_input.value.strip() or None

        query = '''
        SELECT p."JOB_ID", c."NAME", p."LOCATION", p."TITLE" FROM public."POSTING" p
        JOIN public."COMPANIES" c ON c."ID" = p."COMPANY_ID"
        WHERE (%s IS NULL OR UPPER(c."NAME") ILIKE UPPER(%s))
        AND (%s IS NULL OR UPPER(p."LOCATION") ILIKE UPPER(%s))
        AND (%s IS NULL OR UPPER(p."TITLE") ILIKE UPPER(%s))
        ORDER BY "JOB_ID" ASC
        LIMIT 10;
        '''

        # Each filter is passed twice: once for the IS NULL test, once as the pattern.
        params = (
            company, f"%{company}%" if company else None,
            location, f"%{location}%" if location else None,
            job_title, f"%{job_title}%" if job_title else None
        )

        conn = None
        try:
            conn = get_db_connection()
            with conn.cursor() as cur:
                start_time = time.time()
                cur.execute(query, params)
                rows = cur.fetchall()
                execution_time = time.time() - start_time

            print(f"Query executed in {execution_time:.4f} seconds")

            if rows:
                df = pd.DataFrame(rows, columns=["JOB_ID", "COMPANY_NAME", "LOCATION", "TITLE"])
                display(df)
            else:
                print("No job postings found with the selected parameters.")

        except psycopg2.Error as e:
            print(f"Database error: {e}")

        except Exception as e:
            print(f"Unexpected error: {e}")

        finally:
            # Closing discards the read-only transaction; no explicit rollback needed.
            if conn:
                conn.close()

search_button.on_click(search_jobs)

display(widgets.VBox([
    widgets.Label("Company Name:"), company_input,
    widgets.Label("Location:"), location_input,
    widgets.Label("Job Title:"), job_title_input,
    search_button, output
]))

VBox(children=(Label(value='Company Name:'), Text(value='', placeholder='Insert the company name'), Label(valu…

### 2. Select all the job postings of the tech giants (companies with more than 500 employees in the IT industry) — TODO: definitely to be implemented

#### Without linking with the foreign key between COMPANY_ID in the POSTING table and ID in the COMPANY table

In [5]:
industry_input = widgets.Text(placeholder="Insert the Industry domain")
company_input = widgets.Text(placeholder="Insert the company name")
search_button = widgets.Button(description="Look for job posting", button_style="primary")
output = widgets.Output()

# The widgets are bound as default arguments so the callback keeps using THIS
# cell's widgets even after later cells reassign `output`, `industry_input`
# and `company_input` (a global lookup at click time would use those instead).
def search_jobs(b, output=output, industry_input=industry_input,
                company_input=company_input):
    """Button callback: show the 20 most viewed posting rows of large companies.

    Only companies with an EMPLOYEE_COUNTS row above 2000 and postings with a
    non-null VIEWS are kept. The industry and company-name filters are optional:
    a blank field becomes ``None`` and disables the filter, otherwise the text
    is matched as a case-insensitive substring.

    ``b`` is the clicked button (unused); the other parameters are this cell's
    widgets and are not meant to be passed by callers.
    """
    with output:
        output.clear_output()
        company = company_input.value.strip() or None
        industry = industry_input.value.strip() or None

        # NOTE(review): the section heading says "more than 500 employees" but
        # the filter below uses 2000 — confirm which threshold is intended.
        query = '''
        SELECT DISTINCT P."COMPANY_NAME", P."VIEWS", P."PAY_PERIOD", P."APPLIES", P."EXPIRY", CI."INDUSTRY"
        FROM public."POSTING" P
        JOIN public."COMPANY_INDUSTRY" CI ON P."COMPANY_ID" = CI."COMPANY_ID"
        JOIN public."EMPLOYEE_COUNTS" EC ON P."COMPANY_ID" = EC."COMPANY_ID"
        WHERE EC."EMPLOYEE_COUNT" > 2000
        AND "VIEWS" IS NOT NULL
        AND (%s IS NULL OR UPPER(CI."INDUSTRY") ILIKE UPPER(%s))
        AND (%s IS NULL OR UPPER(P."COMPANY_NAME") ILIKE UPPER(%s))
        ORDER BY "VIEWS" DESC
        LIMIT 20;
        '''

        # Each filter is passed twice: once for the IS NULL test, once as the pattern.
        params = (
            industry, f"%{industry}%" if industry else None,
            company, f"%{company}%" if company else None
        )

        conn = None
        try:
            conn = get_db_connection()
            with conn.cursor() as cur:
                start_time = time.time()
                cur.execute(query, params)
                rows = cur.fetchall()
                execution_time = time.time() - start_time

            print(f"Query executed in {execution_time:.4f} seconds")

            if rows:
                df = pd.DataFrame(rows,
                                  columns=["COMPANY_NAME", "VIEWS", "PAY_PERIOD", "APPLIES", "EXPIRY", "INDUSTRY"])
                display(df)
            else:
                print("No job postings found for TechGiant companies.")

        except psycopg2.Error as e:
            print(f"Database error: {e}")

        except Exception as e:
            print(f"Unexpected error: {e}")

        finally:
            # Closing discards the read-only transaction; no explicit rollback needed.
            if conn:
                conn.close()

search_button.on_click(search_jobs)
display(widgets.VBox([
    widgets.Label("Industry:"), industry_input,
    widgets.Label("Company Name:"), company_input,
    search_button, output
]))

VBox(children=(Label(value='Industry:'), Text(value='', placeholder='Insert the Industry domain'), Label(value…

In [24]:
industry_input = widgets.Text(placeholder="Insert the Industry domain")
company_input = widgets.Text(placeholder="Insert the company name")
search_button = widgets.Button(description="Look for job posting", button_style="primary")
output = widgets.Output()

# The widgets are bound as default arguments so the callback keeps using THIS
# cell's widgets even after other cells reassign `output`, `industry_input`
# and `company_input` (a global lookup at click time would use those instead).
def search_jobs(b, output=output, industry_input=industry_input,
                company_input=company_input):
    """Button callback: show the 20 most viewed postings of large companies.

    Same filters as the previous cell, but each row also carries the posting's
    JOB_ID and TITLE. Only companies with an EMPLOYEE_COUNTS row above 2000 and
    postings with a non-null VIEWS are kept; blank industry/company fields
    disable the corresponding filter.

    ``b`` is the clicked button (unused); the other parameters are this cell's
    widgets and are not meant to be passed by callers.
    """
    with output:
        output.clear_output()
        company = company_input.value.strip() or None
        industry = industry_input.value.strip() or None

        # NOTE(review): the section heading says "more than 500 employees" but
        # the filter below uses 2000 — confirm which threshold is intended.
        query = '''
        SELECT DISTINCT P."JOB_ID", P."COMPANY_NAME", P."TITLE", P."VIEWS", P."PAY_PERIOD", P."APPLIES", P."EXPIRY", CI."INDUSTRY"
        FROM public."POSTING" P
        JOIN public."COMPANY_INDUSTRY" CI ON P."COMPANY_ID" = CI."COMPANY_ID"
        JOIN public."EMPLOYEE_COUNTS" EC ON P."COMPANY_ID" = EC."COMPANY_ID"
        WHERE EC."EMPLOYEE_COUNT" > 2000
        AND "VIEWS" IS NOT NULL
        AND (%s IS NULL OR UPPER(CI."INDUSTRY") ILIKE UPPER(%s))
        AND (%s IS NULL OR UPPER(P."COMPANY_NAME") ILIKE UPPER(%s))
        ORDER BY "VIEWS" DESC
        LIMIT 20;
        '''

        # Each filter is passed twice: once for the IS NULL test, once as the pattern.
        params = (
            industry, f"%{industry}%" if industry else None,
            company, f"%{company}%" if company else None
        )

        conn = None
        try:
            conn = get_db_connection()
            with conn.cursor() as cur:
                start_time = time.time()
                cur.execute(query, params)
                rows = cur.fetchall()
                execution_time = time.time() - start_time

            print(f"Query executed in {execution_time:.4f} seconds")

            if rows:
                df = pd.DataFrame(rows,
                                  columns=["JOB_ID", "COMPANY_NAME", "TITLE", "VIEWS", "PAY_PERIOD", "APPLIES", "EXPIRY", "INDUSTRY"])
                display(df)
            else:
                print("No job postings found for TechGiant companies.")

        except psycopg2.Error as e:
            print(f"Database error: {e}")

        except Exception as e:
            print(f"Unexpected error: {e}")

        finally:
            # Closing discards the read-only transaction; no explicit rollback needed.
            if conn:
                conn.close()

search_button.on_click(search_jobs)
display(widgets.VBox([
    widgets.Label("Industry:"), industry_input,
    widgets.Label("Company Name:"), company_input,
    search_button, output
]))

VBox(children=(Label(value='Industry:'), Text(value='', placeholder='Insert the Industry domain'), Label(value…

### 3. Select all the companies and job positions whose salary lies between two values given as input. Additional filters are available (for example: part-time jobs paid in USD with a monthly pay period). The industry is also selected, and the job must match at least one of my skills; results are ordered by the number of matched skills

Treating "today" as the current date one year earlier, select all the active job postings that allow remote working, pay in USD, come from companies whose name is given as input, and match the characteristics of the logged-in user. TODO: to be merged with query 4.

In [7]:
def fetch_users():
    """Return a dict mapping a display label to the user's ID for every user.

    Labels have the form ``"NAME SURNAME (MAIL)"`` and follow the query's
    ordering by NAME. The connection is closed even when the query fails.
    """
    conn = get_db_connection()
    try:
        with conn.cursor() as cur:
            cur.execute('SELECT "ID", "NAME", "SURNAME", "MAIL" FROM public."USERS" ORDER BY "NAME";')
            users = cur.fetchall()
    finally:
        conn.close()
    return {f"{name} {surname} ({email})": user_id for user_id, name, surname, email in users}

user_dict = fetch_users()
user_dropdown = widgets.Dropdown(
    options=[("Select a user", None)] + list(user_dict.items()),
    description="User:",
    style={"description_width": "auto"})
currency_input = widgets.Text(placeholder="Currency (e.g., USD)", description="Currency:")
job_type_input = widgets.Text(placeholder="Job Type (e.g., Part-time)", description="Job Type:")
pay_period_input = widgets.Text(placeholder="Pay Period (e.g., Monthly)", description="Pay Period:")
industry_input = widgets.Text(placeholder="Industry (e.g., Tech)", description="Industry:")
company_input = widgets.Text(placeholder="Company Name", description="Company:")
min_salary_input = widgets.IntText(value=0, description="Min Salary:")
max_salary_input = widgets.IntText(value=100000, description="Max Salary:")
salary_slider = widgets.IntSlider(
    value=50000, min=0, max=100000, step=100, description="Quick Select:", continuous_update=True
)

def update_salary_inputs(change):
    """Slider observer: set the min/max salary boxes to the slider value -/+ 10000."""
    min_salary_input.value, max_salary_input.value = salary_slider.value - 10000, salary_slider.value + 10000

salary_slider.observe(update_salary_inputs, names="value")
search_button = widgets.Button(description="Look for job postings", button_style="primary")
output = widgets.Output()

# The widgets are bound as default arguments so the callback keeps using THIS
# cell's widgets even after other cells reassign `output`, `user_dropdown`,
# `industry_input` or `company_input`.
def search_jobs(b, output=output, user_dropdown=user_dropdown,
                currency_input=currency_input, job_type_input=job_type_input,
                pay_period_input=pay_period_input, industry_input=industry_input,
                company_input=company_input, min_salary_input=min_salary_input,
                max_salary_input=max_salary_input):
    """Button callback: list up to 20 postings in a salary range that share skills with a user.

    A user must be selected. Postings are kept when their EXPIRY is not older
    than one year before now, at least one of their skills belongs to the user,
    MIN_SALARY/MAX_SALARY lie within the requested range, and every non-blank
    text filter matches as a case-insensitive substring. Rows are ordered by
    the number of matched skills, highest first.

    ``b`` is the clicked button (unused); the other parameters are this cell's
    widgets and are not meant to be passed by callers.
    """
    with output:
        output.clear_output()
        currency = currency_input.value.strip() or None
        job_type = job_type_input.value.strip() or None
        pay_period = pay_period_input.value.strip() or None
        industry = industry_input.value.strip() or None
        company = company_input.value.strip() or None
        min_salary = int(min_salary_input.value)
        max_salary = int(max_salary_input.value)
        selected_user = user_dropdown.value
        if selected_user is None:
            print("Please select a user before searching for jobs.")
            return

        # EXPIRY is compared against epoch milliseconds (seconds * 1000).
        # The LEFT JOIN plus "skill_match_count IS NOT NULL" keeps only postings
        # with at least one skill in common with the user.
        query = '''
        SELECT p."COMPANY_NAME", p."TITLE", p."PAY_PERIOD", p."CURRENCY", p."WORK_TYPE",
        s."MIN_SALARY", s."MAX_SALARY", i."NAME" AS industry, skill_match_count
        FROM public."POSTING" p
        JOIN public."SALARIES" s ON p."JOB_ID" = s."JOB_ID"
        JOIN public."JOB_INDUSTRIES" ji ON p."JOB_ID" = ji."JOB_ID"
        JOIN public."INDUSTRIES" i ON ji."INDUSTRY_ID" = i."ID"
        LEFT JOIN (
            SELECT js."JOB_ID", COUNT(*) AS skill_match_count
            FROM public."JOB_SKILLS" js
            JOIN public."SKILL" sk ON js."SKILL_ID" = sk."ID"
            JOIN public."USER_SKILL" us ON sk."ID" = us."SKILL_ID"
            WHERE us."USER_ID" = %s
            GROUP BY js."JOB_ID"
        ) jsm ON p."JOB_ID" = jsm."JOB_ID"
        WHERE
            p."EXPIRY" >= (EXTRACT(EPOCH FROM NOW()) - (365 * 24 * 60 * 60)) * 1000
            AND skill_match_count IS NOT NULL
            AND (%s IS NULL OR UPPER(p."CURRENCY") ILIKE UPPER(%s))
            AND (%s IS NULL OR UPPER(p."WORK_TYPE") ILIKE UPPER(%s))
            AND (%s IS NULL OR UPPER(p."PAY_PERIOD") ILIKE UPPER(%s))
            AND (%s IS NULL OR UPPER(p."COMPANY_NAME") ILIKE UPPER(%s))
            AND (%s IS NULL OR UPPER(i."NAME") ILIKE UPPER(%s))
            AND (s."MIN_SALARY" >= %s)
            AND (s."MAX_SALARY" <= %s)
        ORDER BY skill_match_count DESC
        LIMIT 20;
        '''
        # Each text filter is passed twice: once for the IS NULL test, once as the pattern.
        params = (
            selected_user,
            currency, f"%{currency}%" if currency else None,
            job_type, f"%{job_type}%" if job_type else None,
            pay_period, f"%{pay_period}%" if pay_period else None,
            company, f"%{company}%" if company else None,
            industry, f"%{industry}%" if industry else None,
            min_salary,
            max_salary
        )

        # Initialised before the try so the finally clause can test it even when
        # get_db_connection() itself raises.
        conn = None
        try:
            conn = get_db_connection()
            with conn.cursor() as cur:
                start_time = time.time()
                cur.execute(query, params)
                rows = cur.fetchall()
                execution_time = time.time() - start_time

            print(f"Query executed in {execution_time:.4f} seconds")

            if rows:
                df = pd.DataFrame(rows, columns=["COMPANY_NAME", "JOB_TITLE", "PAY_PERIOD", "CURRENCY", "WORK_TYPE",
                                                 "MIN_SALARY", "MAX_SALARY", "INDUSTRY", "SKILL_MATCHED"])
                display(df)
            else:
                print("No job postings found with the given filters.")

        except psycopg2.Error as e:
            print(f"Database error: {e}")

        except Exception as e:
            print(f"Unexpected error: {e}")

        finally:
            # Closing discards the read-only transaction; no explicit rollback needed.
            if conn:
                conn.close()

search_button.on_click(search_jobs)
display(widgets.VBox([
    widgets.Label("Select User:"), user_dropdown,
    widgets.Label("Enter Job Filters:"), currency_input,
    job_type_input, pay_period_input, industry_input, company_input,
    min_salary_input, max_salary_input, salary_slider,
    search_button, output
]))

VBox(children=(Label(value='Select User:'), Dropdown(description='User:', options=(('Select a user', None), ('…

### 4. Negated subquery in the WHERE condition

Here we could, for example, select the postings that do not require an experience level above entry level. For the optimization step it is also important to first write the subquery with `SELECT *` and then switch to `SELECT 1`, so that the processing time of the subquery's select list is eliminated.


All the most profitable industries in which I could work without an experience level above entry level, given my skills.

In [13]:
def fetch_users():
    """Return a dict mapping a display label to the user's ID for every user.

    Labels have the form ``"NAME SURNAME (MAIL)"`` and follow the query's
    ordering by NAME. The EXPERIENCE column is selected but not used here.
    The connection is closed even when the query fails.
    """
    conn = get_db_connection()
    try:
        with conn.cursor() as cur:
            cur.execute('SELECT "ID", "NAME", "SURNAME", "MAIL", "EXPERIENCE" FROM public."USERS" ORDER BY "NAME";')
            users = cur.fetchall()
    finally:
        conn.close()
    return {f"{name} {surname} ({email})": user_id for user_id, name, surname, email, _experience in users}

user_dict = fetch_users()
# The placeholder entry carries None so that "no user selected" is easy to
# detect; a (None, None) tuple would be truthy and slip past the check below.
user_dropdown = widgets.Dropdown(
    options=[("Select a user", None)] + list(user_dict.items()),
    description="User:",
    style={"description_width": "auto"}
)

search_button = widgets.Button(description="Find industries", button_style="primary")
output = widgets.Output()

# The widgets are bound as default arguments so the callback keeps using THIS
# cell's widgets even after other cells reassign `output` or `user_dropdown`.
def find_profitable_industries(b, output=output, user_dropdown=user_dropdown):
    """Button callback: show the 10 best-paid industries for the selected user.

    Only postings whose FORMATTED_EXPERIENCE_LEVEL equals the user's EXPERIENCE
    and that share a skill with the user are considered. Results are grouped by
    industry and ordered by average minimum, then average maximum salary;
    missing salaries count as 0 in the averages (COALESCE).

    ``b`` is the clicked button (unused); the other parameters are this cell's
    widgets and are not meant to be passed by callers.
    """
    with output:
        output.clear_output()
        selected_user_id = user_dropdown.value

        if selected_user_id is None:
            print("Select a valid user")
            return

        # The direct joins on JOB_SKILLS / USER_SKILL yield one row per skill a
        # posting shares with the user, so such postings weigh more in the
        # COUNT and in the averages.
        query = """
            SELECT ind."NAME" AS industry_name,
                   COUNT(p."JOB_ID") AS job_postings,
                   ROUND(AVG(COALESCE(s."MIN_SALARY", 0)), 3) AS avg_min_salary,
                   ROUND(AVG(COALESCE(s."MAX_SALARY", 0)), 3) AS avg_max_salary
            FROM public."INDUSTRIES" ind
            JOIN public."JOB_INDUSTRIES" ji ON ind."ID" = ji."INDUSTRY_ID"
            JOIN public."POSTING" p ON ji."JOB_ID" = p."JOB_ID"
            LEFT JOIN public."SALARIES" s ON p."JOB_ID" = s."JOB_ID"
            JOIN public."JOB_SKILLS" js ON p."JOB_ID" = js."JOB_ID"
            JOIN public."USER_SKILL" us ON js."SKILL_ID" = us."SKILL_ID"
            WHERE p."FORMATTED_EXPERIENCE_LEVEL" = (SELECT "EXPERIENCE" FROM public."USERS" WHERE "ID" = %s)
            AND us."USER_ID" = %s
            GROUP BY ind."NAME"
            ORDER BY avg_min_salary DESC, avg_max_salary DESC
            LIMIT 10;
        """
        # Alternative kept for reference (takes the same two parameters): keep a
        # posting once when at least one of its skills belongs to the user.
        #
        #   SELECT ind."NAME" AS industry_name,
        #          COUNT(p."JOB_ID") AS job_postings,
        #          ROUND(AVG(COALESCE(s."MIN_SALARY", 0)), 3) AS avg_min_salary,
        #          ROUND(AVG(COALESCE(s."MAX_SALARY", 0)), 3) AS avg_max_salary
        #   FROM public."INDUSTRIES" ind
        #   JOIN public."JOB_INDUSTRIES" ji ON ind."ID" = ji."INDUSTRY_ID"
        #   JOIN public."POSTING" p ON ji."JOB_ID" = p."JOB_ID"
        #   LEFT JOIN public."SALARIES" s ON p."JOB_ID" = s."JOB_ID"
        #   WHERE p."FORMATTED_EXPERIENCE_LEVEL" = (SELECT "EXPERIENCE" FROM public."USERS" WHERE "ID" = %s)
        #   AND EXISTS (
        #       SELECT 1 FROM public."JOB_SKILLS" js
        #       WHERE js."JOB_ID" = p."JOB_ID"
        #       AND EXISTS (
        #           SELECT 1 FROM public."USER_SKILL" us
        #           WHERE us."USER_ID" = %s
        #           AND us."SKILL_ID" = js."SKILL_ID"
        #       )
        #   )
        #   GROUP BY ind."NAME"
        #   ORDER BY avg_min_salary DESC, avg_max_salary DESC
        #   LIMIT 10;

        conn = None
        try:
            conn = get_db_connection()
            with conn.cursor() as cur:
                start_time = time.time()
                cur.execute(query, (selected_user_id, selected_user_id))
                rows = cur.fetchall()
                execution_time = time.time() - start_time

            print(f"Query executed in {execution_time:.4f} seconds")

            if rows:
                df = pd.DataFrame(rows, columns=["Industry", "Job Postings", "Avg Min Salary", "Avg Max Salary"])
                display(df)
            else:
                print("No data founded for the given user")

        except psycopg2.Error as e:
            print(f"Database error: {e}")

        except Exception as e:
            print(f"Unexpected error: {e}")

        finally:
            # Closing discards the read-only transaction; no explicit rollback needed.
            if conn:
                conn.close()

search_button.on_click(find_profitable_industries)
display(widgets.VBox([
    widgets.Label("Select a user to find more payed industries for him/her:"),
    user_dropdown,
    search_button,
    output
]))

VBox(children=(Label(value='Select a user to find more payed industries for him/her:'), Dropdown(description='…

### 5. Select how many job postings each company has and its maximum recorded employee count. Also add the growth ratio, computed using the time column as the relative change between the first and the last recorded employee count; do the same for the followers

TODO: this will be optimized with views on the two joins, via nested queries.

NULLIF to avoid division by zero

In [46]:
company_name_input = widgets.Text(placeholder="Enter Company Name", description="Company:")
search_button = widgets.Button(description="Analyze Company Growth", button_style="primary")
output = widgets.Output()

# The widgets are bound as default arguments so the callback keeps using THIS
# cell's widgets even after other cells reassign the global `output`.
def analyze_company_growth(b, output=output, company_name_input=company_name_input):
    """Button callback: show posting counts and employee/follower growth per company.

    For every company (optionally filtered by a case-insensitive substring of
    its name) the query returns the number of postings, the smallest and
    largest recorded employee and follower counts, and the relative change
    between them. ``NULLIF`` turns a zero starting value into NULL so the
    division yields NULL instead of failing.

    NOTE(review): the "first"/"last" values are the MIN/MAX of the recorded
    counts, not the earliest/latest by time, so for non-negative counts the
    ratios can never be negative — confirm this is intended.

    ``b`` is the clicked button (unused); the other parameters are this cell's
    widgets and are not meant to be passed by callers.
    """
    with output:
        output.clear_output()
        company_name = company_name_input.value.strip()

        query = '''
            SELECT
                c."NAME" AS company_name,
                jc.job_postings,
                eg.first_employee_count,
                eg.last_employee_count,
                eg.first_follower_count,
                eg.last_follower_count,
                ROUND((eg.last_employee_count - eg.first_employee_count) * 1.0 / NULLIF(eg.first_employee_count, 0), 3) AS employee_growth_ratio,
                ROUND((eg.last_follower_count - eg.first_follower_count) * 1.0 / NULLIF(eg.first_follower_count, 0), 3) AS follower_growth_ratio
            FROM public."COMPANIES" c
            JOIN (
                SELECT
                    "COMPANY_ID",
                    MIN("EMPLOYEE_COUNT") AS first_employee_count,
                    MAX("EMPLOYEE_COUNT") AS last_employee_count,
                    MIN("FOLLOWER_COUNT") AS first_follower_count,
                    MAX("FOLLOWER_COUNT") AS last_follower_count
                FROM public."EMPLOYEE_COUNTS"
                GROUP BY "COMPANY_ID"
            ) AS eg ON c."ID" = eg."COMPANY_ID"
            JOIN (
                SELECT "COMPANY_ID", COUNT(*) AS job_postings
                FROM public."POSTING"
                GROUP BY "COMPANY_ID"
            ) AS jc ON c."ID" = jc."COMPANY_ID"
            WHERE (%s IS NULL OR UPPER(c."NAME") ILIKE UPPER(%s))
            ORDER BY employee_growth_ratio DESC, follower_growth_ratio DESC;
        '''
        # A blank name becomes None, which disables the name filter.
        params = (company_name if company_name else None, f"%{company_name}%" if company_name else None)

        conn = None
        try:
            conn = get_db_connection()
            with conn.cursor() as cur:
                start_time = time.time()
                cur.execute(query, params)
                rows = cur.fetchall()
                execution_time = time.time() - start_time

            print(f"Query executed in {execution_time:.4f} seconds")

            if rows:
                df = pd.DataFrame(rows, columns=["Company Name", "Job Postings", "Min Employees",
                                                 "Max Employees", "Min Followers", "Max Followers",
                                                 "Employee Growth Ratio", "Follower Growth Ratio"])
                display(df)
            else:
                print("No company data found.")

        except psycopg2.Error as e:
            print(f"Database error: {e}")

        except Exception as e:
            print(f"Unexpected error: {e}")

        finally:
            # Closing discards the read-only transaction; no explicit rollback needed.
            if conn:
                conn.close()

search_button.on_click(analyze_company_growth)
display(widgets.VBox([
    widgets.Label("Enter a company name to filter results (leave blank for all):"),
    company_name_input,
    search_button,
    output
]))

VBox(children=(Label(value='Enter a company name to filter results (leave blank for all):'), Text(value='', de…

### 6. Find out whether my state has many companies of my same type (industry) and at my same level (number of job postings, number of employees and number of followers) that also share my specialities. The select list includes the average minimum and maximum salary, to understand how to position my own salaries

Use `NOT IN` instead of `<>`.

In [60]:
search_box = widgets.Text(
    description="Company:",
    placeholder="Enter company name"
)
search_button = widgets.Button(description="Analyze Jobs", button_style="primary")
output = widgets.Output()

# The widgets are bound as default arguments so the callback keeps using THIS
# cell's widgets even after other cells reassign the global `output`.
def find_similar_companies(b, output=output, search_box=search_box):
    """Button callback: list up to 10 companies similar to the one typed in.

    "Similar" means: same industry and same state as the reference company,
    excluding the reference company itself, with non-null employee/follower
    counts and salaries. Rows are grouped by company, state, pay period and
    industry, and ordered by the largest employee and follower counts.

    ``b`` is the clicked button (unused); the other parameters are this cell's
    widgets and are not meant to be passed by callers.
    """
    with output:
        output.clear_output()

        company_name = search_box.value.strip()
        if not company_name:
            print("Please enter a company name.")
            return

        # IN / NOT IN replace the scalar "=" / "<>" subqueries: the name pattern
        # may match several companies and a company may have several industries,
        # which makes a scalar subquery fail with "more than one row returned".
        # COUNT(DISTINCT ...) keeps the posting count from being multiplied by
        # the rows joined from EMPLOYEE_COUNTS and SALARIES.
        # NOTE(review): NOT IN assumes COMPANIES."ID" is never NULL — confirm.
        query = """
            SELECT
                c."NAME" AS company_name,
                c."STATE" AS company_state,
                s."PAY_PERIOD" AS pay_period,
                ci."INDUSTRY" AS industry,
                COUNT(DISTINCT p."JOB_ID") AS job_postings,
                MAX(ec."EMPLOYEE_COUNT") AS avg_employee_count,
                MAX(ec."FOLLOWER_COUNT") AS avg_follower_count,
                ROUND(AVG(s."MIN_SALARY"), 3) AS avg_min_salary,
                ROUND(AVG(s."MAX_SALARY"), 3) AS avg_max_salary
            FROM public."COMPANIES" c
            JOIN public."COMPANY_INDUSTRY" ci ON c."ID" = ci."COMPANY_ID"
            LEFT JOIN public."POSTING" p ON c."ID" = p."COMPANY_ID"
            LEFT JOIN public."EMPLOYEE_COUNTS" ec ON c."ID" = ec."COMPANY_ID"
            LEFT JOIN public."SALARIES" s ON p."JOB_ID" = s."JOB_ID"
            WHERE
                ci."INDUSTRY" IN (
                    SELECT "INDUSTRY"
                    FROM public."COMPANY_INDUSTRY"
                    WHERE "COMPANY_ID" IN (
                        SELECT "ID" FROM public."COMPANIES" WHERE "NAME" ILIKE %s
                    )
                )
                AND c."STATE" IN (
                    SELECT "STATE"
                    FROM public."COMPANIES"
                    WHERE "NAME" ILIKE %s
                )
                AND c."ID" NOT IN (
                    SELECT "ID"
                    FROM public."COMPANIES"
                    WHERE "NAME" ILIKE %s
                )
            AND ec."EMPLOYEE_COUNT" IS NOT NULL
            AND ec."FOLLOWER_COUNT" IS NOT NULL
            AND s."MAX_SALARY" IS NOT NULL
            AND s."MIN_SALARY" IS NOT NULL
            GROUP BY c."NAME", c."STATE", s."PAY_PERIOD", ci."INDUSTRY"
            ORDER BY avg_employee_count DESC, avg_follower_count DESC
            LIMIT 10;
        """

        conn = None
        try:
            conn = get_db_connection()
            with conn.cursor() as cur:
                start_time = time.time()
                cur.execute(query, (company_name, company_name, company_name))
                rows = cur.fetchall()
                execution_time = time.time() - start_time

            print(f"Query executed in {execution_time:.4f} seconds")

            if rows:
                df = pd.DataFrame(rows, columns=["Company Name", "State", "Pay Period", "Industry", "Job Postings",
                                                 "Employee Count", "Follower Count", "Avg Min Salary", "Avg Max Salary"])
                display(df)
            else:
                print("No data found for the query.")

        except psycopg2.Error as e:
            print(f"Database error: {e}")

        except Exception as e:
            print(f"Unexpected error: {e}")

        finally:
            # Closing discards the read-only transaction; no explicit rollback needed.
            if conn:
                conn.close()

search_button.on_click(find_similar_companies)
display(widgets.VBox([
    widgets.Label("Enter your company name and click below to analyze similar companies:"),
    search_box,
    search_button,
    output
]))

VBox(children=(Label(value='Enter your company name and click below to analyze similar companies:'), Text(valu…

### 7. Find job listings by country and their salary information, but only for countries with more than 50 job postings

In [10]:
search_button = widgets.Button(description="Analyze Jobs", button_style="primary")
output = widgets.Output()

# `output` is bound as a default argument so the callback keeps printing into
# THIS cell's output area even if another cell reassigns the global name.
def jobs_by_country(b, output=output):
    """Button callback: show posting counts and average salaries per country.

    Postings are grouped by the company's COUNTRY; only countries with more
    than 50 postings are returned, ordered by posting count (highest first).
    Salaries are LEFT JOINed, so postings without a salary still count.

    ``b`` is the clicked button (unused); ``output`` is this cell's output
    widget and is not meant to be passed by callers.
    """
    with output:
        output.clear_output()

        query = """
            SELECT c."COUNTRY", COUNT(p."JOB_ID") AS total_jobs,
            ROUND(AVG(s."MIN_SALARY"),3) AS avg_min_salary,
            ROUND(AVG(s."MED_SALARY"),3) AS avg_med_salary,
            ROUND(AVG(s."MAX_SALARY"),3) AS avg_max_salary
            FROM public."POSTING" p
            JOIN public."COMPANIES" c ON p."COMPANY_ID" = c."ID"
            LEFT JOIN public."SALARIES" s ON p."JOB_ID" = s."JOB_ID"
            GROUP BY c."COUNTRY"
            HAVING COUNT(p."JOB_ID") > 50
            ORDER BY total_jobs DESC;
        """

        conn = None
        try:
            conn = get_db_connection()
            with conn.cursor() as cur:
                start_time = time.time()
                cur.execute(query)
                rows = cur.fetchall()
                execution_time = time.time() - start_time

            print(f"Query executed in {execution_time:.4f} seconds")

            if rows:
                df = pd.DataFrame(rows, columns=["Country", "Number of Job Offers", "Avg Min Salary", "Avg Med Salary", "Avg Max Salary"])
                display(df)
            else:
                print("No data found for the query.")

        except psycopg2.Error as e:
            print(f"Database error: {e}")

        except Exception as e:
            print(f"Unexpected error: {e}")

        finally:
            # Closing discards the read-only transaction; no explicit rollback needed.
            if conn:
                conn.close()
search_button.on_click(jobs_by_country)

display(widgets.VBox([
    widgets.Label("Click below to analyze the job postings for each country:"),
    search_button,
    output
]))

VBox(children=(Label(value='Click below to analyze the job postings for each country:'), Button(button_style='…

### 8. Select the top 10 job titles with the most remote-allowed postings, together with their percentage of remote postings

In [13]:
search_button = widgets.Button(description="Analyze Remote Jobs", button_style="primary")
output = widgets.Output()

def top_5_jobs(b, out=output):
    """Button callback: show the job titles with the most remote-allowed postings.

    Groups POSTING by title, counts all postings and those whose
    REMOTE_ALLOWED equals '1.0', computes the remote percentage, and renders
    the first 10 rows ordered by the remote count (descending) inside ``out``.
    Errors are printed into ``out`` instead of being raised.

    NOTE(review): despite the function name, the query returns 10 rows, and it
    ranks by the absolute remote count rather than by ``percentage_remote`` —
    confirm which ranking is intended.

    Args:
        b: The button instance passed by ``Button.on_click`` (unused).
        out: Output widget to render into. The default is evaluated when this
            function is defined, so the callback stays bound to this cell's
            widget even after a later cell rebinds the global ``output`` name.
    """
    with out:
        out.clear_output()

        conn = None
        try:
            conn = get_db_connection()

            # NOTE(review): HAVING COUNT(*) > 0 is always true for a non-empty group;
            # presumably a placeholder for a minimum-postings threshold.
            query = """
                SELECT 
                p."TITLE", 
                COUNT(*) AS total_offers, 
                SUM(CASE WHEN p."REMOTE_ALLOWED" = '1.0' THEN 1 ELSE 0 END) AS remotes_allowed, 
                ROUND(SUM(CASE WHEN p."REMOTE_ALLOWED" = '1.0' THEN 1 ELSE 0 END) * 100.0 / COUNT(*), 3) AS percentage_remote
                FROM public."POSTING" p
                GROUP BY "TITLE"
                HAVING COUNT(*) > 0
                ORDER BY remotes_allowed DESC
                LIMIT 10;
            """

            # The cursor context manager closes the cursor even if execute() fails.
            with conn.cursor() as cur:
                # perf_counter() is monotonic, so the interval cannot be skewed
                # by wall-clock adjustments.
                start_time = time.perf_counter()
                cur.execute(query)
                rows = cur.fetchall()
                execution_time = time.perf_counter() - start_time

            print(f"Query executed in {execution_time:.4f} seconds")

            if rows:
                df = pd.DataFrame(rows, columns=["Job Title", "Total Job Offers", "Remote Jobs Allowed", "Percentage Remote"])
                display(df)
            else:
                print("No data found for the query.")

        except psycopg2.Error as e:
            if conn:
                conn.rollback()
            print(f"Database error: {e}")

        except Exception as e:
            print(f"Unexpected error: {e}")

        finally:
            # Single close point for both the success and the error paths.
            if conn:
                conn.close()

search_button.on_click(top_5_jobs)
display(widgets.VBox([
    widgets.Label("Click below to analyze the remote job postings:"),
    search_button,
    output
]))

VBox(children=(Label(value='Click below to analyze the remote job postings:'), Button(button_style='primary', …

### 9. Select the job titles whose postings take the least time to close
- Show the top 3 jobs with the lowest average time to close the job posting (we can also use the applications for that job to estimate how successful the posting is), i.e. the most requested type of job.
- Also add which jobs are the hardest to get, also considering the applies/views ratio.

In [14]:
search_button = widgets.Button(description="Analyze Job Postings", button_style="primary")
output = widgets.Output()

def most_requested_jobs(b, out=output):
    """Button callback: show the job titles whose postings close fastest.

    Groups POSTING by title and, for titles with at least 5 postings and an
    average ``CLOSED_TIME - LISTED_TIME`` above 86400, returns the 3 titles
    with the lowest average closing time together with their summed applies,
    summed views and posting count. The rows are rendered as a DataFrame
    inside ``out``; errors are printed into ``out`` instead of being raised.

    NOTE(review): the result column is labelled "(ms)" while the 86400
    threshold looks like one day expressed in seconds — confirm the unit of
    the stored timestamps.

    Args:
        b: The button instance passed by ``Button.on_click`` (unused).
        out: Output widget to render into. The default is evaluated when this
            function is defined, so the callback stays bound to this cell's
            widget even after a later cell rebinds the global ``output`` name.
    """
    with out:
        out.clear_output()

        conn = None
        try:
            conn = get_db_connection()

            query = """
            SELECT p."TITLE", 
            ROUND(AVG(p."CLOSED_TIME" - p."LISTED_TIME"),3) AS avg_closing_time,
                       SUM(CASE 
                               WHEN p."APPLIES" IS NOT NULL AND p."APPLIES" > 0 
                               THEN p."APPLIES" 
                               ELSE 0 
                           END) AS total_applies,
                       SUM(CASE 
                               WHEN p."VIEWS" IS NOT NULL AND p."VIEWS" > 0 
                               THEN p."VIEWS" 
                               ELSE 0 
                           END) AS total_views,
                       COUNT(p."TITLE") AS num_postings
                FROM public."POSTING" p
                GROUP BY p."TITLE"
                HAVING COUNT(p."TITLE") >= 5 AND AVG(p."CLOSED_TIME" - p."LISTED_TIME") > 86400
                ORDER BY avg_closing_time ASC
                LIMIT 3;
            """

            # The cursor context manager closes the cursor even if execute() fails.
            with conn.cursor() as cur:
                # perf_counter() is monotonic, so the interval cannot be skewed
                # by wall-clock adjustments.
                start_time = time.perf_counter()
                cur.execute(query)
                rows = cur.fetchall()
                execution_time = time.perf_counter() - start_time

            print(f"Query executed in {execution_time:.4f} seconds")

            if rows:
                df = pd.DataFrame(rows, columns=["Job Title", "Avg Closing Time (ms)", "Total Applies", "Total Views", "Num Postings"])
                display(df)
            else:
                print("No data found for the query.")

        except psycopg2.Error as e:
            if conn:
                conn.rollback()
            print(f"Database error: {e}")

        except Exception as e:
            print(f"Unexpected error: {e}")

        finally:
            # Single close point for both the success and the error paths.
            if conn:
                conn.close()

search_button.on_click(most_requested_jobs)
display(widgets.VBox([
    widgets.Label("Click below to analyze the job postings with the lowest closing time:"),
    search_button,
    output
]))

VBox(children=(Label(value='Click below to analyze the job postings with the lowest closing time:'), Button(bu…

### 10. Find the top 10 most in-demand and best-paid skills by industry

In [21]:
search_button = widgets.Button(description="Analyze Job Skills and Industries", button_style="primary")
output = widgets.Output()

def most_payed_skills(b, out=output):
    """Button callback: show the most frequent skill/industry pairs and their pay.

    Joins JOB_SKILLS with SKILL, JOB_INDUSTRIES, INDUSTRIES and (optionally)
    SALARIES, groups by skill name and industry name, and renders the first 10
    rows ordered by job count and then by average maximum salary (both
    descending) as a DataFrame inside ``out``. Errors are printed into ``out``
    instead of being raised.

    Args:
        b: The button instance passed by ``Button.on_click`` (unused).
        out: Output widget to render into. The default is evaluated when this
            function is defined, so the callback stays bound to this cell's
            widget even after a later cell rebinds the global ``output`` name.
    """
    with out:
        out.clear_output()
        conn = None
        try:
            conn = get_db_connection()

            query = """
            SELECT sk."SKILL_NAME", ind."NAME" AS industry, 
                   COUNT(js."JOB_ID") AS job_count, 
                   ROUND(AVG(s."MAX_SALARY"), 2) AS avg_max_salary
            FROM public."JOB_SKILLS" js
            JOIN public."SKILL" sk ON js."SKILL_ID" = sk."ID"
            JOIN public."JOB_INDUSTRIES" ji ON js."JOB_ID" = ji."JOB_ID"
            JOIN public."INDUSTRIES" ind ON ji."INDUSTRY_ID" = ind."ID"
            LEFT JOIN public."SALARIES" s ON js."JOB_ID" = s."JOB_ID"
            GROUP BY sk."SKILL_NAME", ind."NAME"
            ORDER BY job_count DESC, avg_max_salary DESC
            LIMIT 10;
        """

            # The cursor context manager closes the cursor even if execute() fails.
            with conn.cursor() as cur:
                # perf_counter() is monotonic, so the interval cannot be skewed
                # by wall-clock adjustments.
                start_time = time.perf_counter()
                cur.execute(query)
                rows = cur.fetchall()
                execution_time = time.perf_counter() - start_time

            print(f"Query executed in {execution_time:.4f} seconds")

            if rows:
                df = pd.DataFrame(rows, columns=["Skill Name", "Industry", "Job Count", "Avg Max Salary"])
                display(df)
            else:
                print("No data found for the query.")

        except psycopg2.Error as e:
            if conn:
                conn.rollback()
            print(f"Database error: {e}")

        except Exception as e:
            print(f"Unexpected error: {e}")

        finally:
            # Single close point for both the success and the error paths.
            if conn:
                conn.close()

search_button.on_click(most_payed_skills)
display(widgets.VBox([
    widgets.Label("Click below to analyze the job skills and industries data:"),
    search_button,
    output
]))

VBox(children=(Label(value='Click below to analyze the job skills and industries data:'), Button(button_style=…

Views can make a query faster by letting a temporary, named query hold a large intermediate result instead of computing it inside the query itself. However, views are not stored like ordinary tables, and this can make access slower, since views are not optimized to be accessed like normal tables.

TRADE-OFF: faster computation (for example, a JOIN can be stored directly in a virtual table) versus slower access, since a view is not optimized for it.

OPTIMIZATION NOTES:

Depending on your query patterns, you might want to add additional indexes. For example, if you frequently query JOB_SKILLS by SKILL_ID, consider adding an index on SKILL_ID in JOB_SKILLS.

Likewise, if certain columns are frequently used in filters (e.g., STATE in COMPANIES, EXPERIENCE in USERS), indexes might improve performance.

TODO: CONVERT THE TIMESTAMP, CURRENTLY IN UNIX MILLISECONDS, DIRECTLY WITH PYTHON IN THE RESPONSE

## SQL evaluation and optimization

### sql1