<a href="https://colab.research.google.com/github/crystalclcm/JobPostings/blob/main/untitled17.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

import pandas as pd

# --- Eurostat extracts: load, tidy, combine, reshape ---

def _tidy_eurostat(csv_path, geo_names=None):
    """Read one CSV extract and return its TIME_PERIOD / geo / OBS_VALUE rows.

    Header names are stripped of surrounding whitespace, `geo` labels are
    optionally recoded through `geo_names`, rows with a missing value in any
    of the three kept columns are dropped, and TIME_PERIOD is cast to int.
    """
    frame = pd.read_csv(csv_path)
    frame.columns = [header.strip() for header in frame.columns]
    if geo_names is not None:
        frame['geo'] = frame['geo'].replace(geo_names)
    frame = frame[['TIME_PERIOD', 'geo', 'OBS_VALUE']].dropna()
    frame['TIME_PERIOD'] = frame['TIME_PERIOD'].astype(int)
    return frame


# Ireland 2015-2019: spelled-out country name is recoded to 'IE'.
old = _tidy_eurostat('/content/IE_DS_2015_to_2019.csv', {'Ireland': 'IE'})
# 2021-2023 extract: geo labels are used as they come.
new = _tidy_eurostat('/content/DS_2021_2023.csv')
# EU aggregate 2015-2019 (this file has to be uploaded before running the cell).
eu_old = _tidy_eurostat(
    '/content/EU_DS_2015_to_2019.csv',
    {'European Union': 'EU27_2020', 'EU': 'EU27_2020'},
)

# Stack the three extracts and keep only Ireland and the EU aggregate.
combined = pd.concat([old, new, eu_old])
combined = combined[combined['geo'].isin(['IE', 'EU27_2020'])]

# One row per year, one column per geo code. pivot_table aggregates with the
# mean by default, so repeated (year, geo) rows are averaged.
pivot = combined.pivot_table(
    index='TIME_PERIOD', columns='geo', values='OBS_VALUE'
).sort_index()



In [None]:

# --- Visualization: Eurostat Trend (Ireland vs EU) ---
import plotly.express as px

# The year lives in the pivot index; expose it as a regular column for the x axis.
trend_frame = pivot.reset_index()

# Each geo column of the pivot becomes one line.
fig = px.line(
    trend_frame,
    x='TIME_PERIOD',
    y=pivot.columns,
    title='Digital Skills Trend: Ireland vs EU (2015–2023)',
    markers=True,
    color_discrete_map={'IE':'blue','EU27_2020':'green'},
    labels={'value':'% Digital Skills','TIME_PERIOD':'Year'},
)
fig.show()





In [None]:

# Stacked-area view of the same Eurostat pivot (one area per geo column).
fig_area = px.area(
    pivot.reset_index(),
    x='TIME_PERIOD',
    y=pivot.columns,
    title='Growth in Digital Skills: Ireland vs EU (2015–2023)',
    labels={'value':'% Digital Skills','TIME_PERIOD':'Year'},
    color_discrete_map={'IE':'blue','EU27_2020':'green'}
)
# The cell previously ended on the assignment, so nothing was rendered;
# display the figure explicitly like the other chart cells do.
fig_area.show()


In [None]:

# Difference between the last and the first row of the year-sorted pivot,
# computed per geo column.
# NOTE(review): if a region has no value in the first or last year its change
# is NaN and its bar will be empty — confirm both endpoints are populated.
first_year_values = pivot.iloc[0]
last_year_values = pivot.iloc[-1]
change = last_year_values - first_year_values

# Two-column frame (Region, Change) for plotting.
change_df = change.rename_axis('Region').reset_index(name='Change')

fig_change = px.bar(
    change_df,
    x='Region',
    y='Change',
    labels={'Change':'Change in % points'},
    title='Percentage Point Change in Digital Skills (2015–2023)',
)
fig_change.show()


In [None]:

from plotly.subplots import make_subplots
import plotly.graph_objects as go

# --- Inputs ---
# `pivot` is the Eurostat table built earlier in the notebook: TIME_PERIOD as
# index, one column per geo code ('IE' and, when available, 'EU27_2020').

# --- Kaggle job posting snapshot ---
# Hard-coded percentages; this cell does not recompute them from postings.csv.
# Note that this (re)binds the notebook-level name `skill_df`.
skill_df = pd.DataFrame({
    'Skill': ['Cloud', 'SQL', 'Python', 'Statistics', 'AI', 'Java', 'Machine Learning', 'Data Science', 'Big Data'],
    'Percentage': [7.4, 4.2, 3.6, 3.1, 2.9, 2.2, 1.8, 1.3, 1.2]
})

# Ascending sort so the largest bar ends up at the top of the horizontal chart.
sorted_skills = skill_df.sort_values('Percentage', ascending=True)

# Create dashboard: trend on the left, skills snapshot on the right.
fig_dashboard = make_subplots(
    rows=1, cols=2,
    subplot_titles=('Eurostat Digital Skills Trend (IE vs EU)', 'Job Posting Skill Demand (2023-2024)'),
    column_widths=[0.55, 0.45]
)

# Left: Eurostat trend (the EU line is only drawn when that column exists).
fig_dashboard.add_trace(go.Scatter(x=pivot.index, y=pivot['IE'], mode='lines+markers', name='Ireland'), row=1, col=1)
if 'EU27_2020' in pivot.columns:
    fig_dashboard.add_trace(go.Scatter(x=pivot.index, y=pivot['EU27_2020'], mode='lines+markers', name='EU'), row=1, col=1)
fig_dashboard.update_xaxes(title_text='Year', row=1, col=1)
fig_dashboard.update_yaxes(title_text='% Digital Skills', row=1, col=1)

# Right: Kaggle snapshot
fig_dashboard.add_trace(go.Bar(x=sorted_skills['Percentage'], y=sorted_skills['Skill'], orientation='h', name='Skills'), row=1, col=2)
fig_dashboard.update_xaxes(title_text='Share of postings (%)', row=1, col=2)

fig_dashboard.update_layout(
    title_text='Digital Skills vs Industry Demand',
    height=600
)
# make_subplots stores the subplot titles as layout annotations, so passing
# `annotations=[...]` to update_layout modifies those title annotations.
# add_annotation appends the footnote and leaves both titles intact.
fig_dashboard.add_annotation(
    text='Note: Kaggle dataset represents 2023-2024 snapshot',
    x=0.75, y=-0.15, xref='paper', yref='paper', showarrow=False
)

fig_dashboard.show()


In [None]:
import pandas as pd
import plotly.express as px


In [None]:
# Re-read the Ireland 2015-2019 and the 2021-2023 extracts from scratch.
# This rebinds `old` and `new`, replacing any tidied frames already held under those names.
old, new = (
    pd.read_csv(csv_path)
    for csv_path in ('/content/IE_DS_2015_to_2019.csv', '/content/DS_2021_2023.csv')
)


In [None]:
# Trim stray whitespace around every header name in both frames.
old.columns = old.columns.map(str.strip)
new.columns = new.columns.map(str.strip)


In [None]:
# Recode the spelled-out country name to the 'IE' geo code used for filtering later.
old['geo'] = old['geo'].replace('Ireland', 'IE')

In [None]:
# Keep only year, geo and value; drop rows where any of the three is missing.
kept_columns = ['TIME_PERIOD', 'geo', 'OBS_VALUE']
old = old.loc[:, kept_columns].dropna()
new = new.loc[:, kept_columns].dropna()

In [None]:
# Cast the year column to int in both frames.
old = old.assign(TIME_PERIOD=old['TIME_PERIOD'].astype(int))
new = new.assign(TIME_PERIOD=new['TIME_PERIOD'].astype(int))


In [None]:
# Candidate geo codes for the EU aggregate, in order of preference.
eu_codes = ['EU27_2020','EU28','EU27','EU']

# First candidate that actually occurs in each frame (None when none does).
# Note: `eu_old` is a code string (or None) from here on; an earlier cell in
# this notebook uses the same name for a DataFrame.
geos_in_old = set(old['geo'].unique())
geos_in_new = set(new['geo'].unique())
eu_old = next((code for code in eu_codes if code in geos_in_old), None)
eu_new = next((code for code in eu_codes if code in geos_in_new), None)

# Keep Ireland plus the detected EU aggregate, if any, in each frame.
wanted_old = ['IE', eu_old] if eu_old else ['IE']
wanted_new = ['IE', eu_new] if eu_new else ['IE']
old_filtered = old[old['geo'].isin(wanted_old)]
new_filtered = new[new['geo'].isin(wanted_new)]


In [None]:
# Stack both filtered periods, then reshape to one row per year and one
# column per geo code (pivot_table averages repeated year/geo rows by default).
combined = pd.concat((old_filtered, new_filtered))
pivot = combined.pivot_table(
    values='OBS_VALUE',
    index='TIME_PERIOD',
    columns='geo',
).sort_index()


In [None]:
# Line chart of the rebuilt pivot: one line per geo column, year on the x axis.
# The colour map only names 'IE' and 'EU27_2020'; any other column falls back
# to plotly's default colours.
trend_labels = {'value':'% Digital Skills','TIME_PERIOD':'Year'}
trend_colours = {'IE':'blue','EU27_2020':'green'}

fig1 = px.line(
    pivot.reset_index(),
    x='TIME_PERIOD',
    y=pivot.columns,
    title='Digital Skills Trend: Ireland vs EU',
    markers=True,
    labels=trend_labels,
    color_discrete_map=trend_colours,
)
fig1.show()


In [None]:
from google.colab import files


In [None]:

import re  # cell-local: needed to escape skill names before building the patterns

# Load job postings dataset
job_df = pd.read_csv('/content/postings.csv')  # Replace with actual filename

# Combine skills_desc and description into one searchable text column
# (missing values become empty strings so the concatenation never yields NaN).
job_df['combined_text'] = job_df['skills_desc'].fillna('') + ' ' + job_df['description'].fillna('')

# Expanded skill list
skills_to_track = ['AI', 'Machine Learning', 'Data Science', 'Statistics', 'Python', 'Java', 'SQL', 'Cloud', 'Big Data']

# Count the postings that mention each skill as a whole word/phrase.
# A plain case-insensitive substring search counts "ai" inside words such as
# "maintain" or "available" (and "java" inside "javascript"), which inflates
# the totals; \b word boundaries restrict matches to the standalone term.
skill_counts = {
    skill: job_df['combined_text']
    .str.contains(rf'\b{re.escape(skill)}\b', case=False, regex=True, na=False)
    .sum()
    for skill in skills_to_track
}

# Convert to DataFrame
skill_df = pd.DataFrame(list(skill_counts.items()), columns=['Skill', 'Frequency'])
print(skill_df)

# Visualize
fig2 = px.bar(skill_df, x='Skill', y='Frequency', color='Skill',
             title='Job Posting Skill Demand (All Data)')
fig2.show()


              Skill  Frequency
0                AI     117270
1  Machine Learning       1726
2      Data Science       1404
3        Statistics       2078
4            Python       4905
5              Java       4102
6               SQL       6028
7             Cloud       7850
8          Big Data        715


In [None]:
# Express each skill's count as a percentage of the summed counts.
# The denominator is the total of all per-skill frequencies, so the values add
# up to 100 and describe a share of counted mentions, not a share of postings.
total = skill_df['Frequency'].sum()
skill_df['Percentage'] = skill_df['Frequency'].div(total).mul(100)

# Bar chart of the normalized values, one colour per skill.
fig_norm = px.bar(
    skill_df,
    x='Skill',
    y='Percentage',
    color='Skill',
    title='Job Posting Skill Demand (Normalized %)',
)
fig_norm.show()


In [None]:

from plotly.subplots import make_subplots
import plotly.graph_objects as go

# Two panels side by side: Eurostat trend (left), raw skill counts (right).
panel_titles = ('Eurostat Digital Skills Trend', 'Job Posting Skill Demand')
fig = make_subplots(rows=1, cols=2, subplot_titles=panel_titles)

# Left panel: one line per geo column of the pivot.
for geo_code, geo_series in pivot.items():
    trend_trace = go.Scatter(x=pivot.index, y=geo_series, mode='lines+markers', name=geo_code)
    fig.add_trace(trend_trace, row=1, col=1)

# Right panel: frequency of each tracked skill.
skills_trace = go.Bar(x=skill_df['Skill'], y=skill_df['Frequency'], name='Skills')
fig.add_trace(skills_trace, row=1, col=2)

fig.update_layout(title_text='Digital Skills vs Industry Demand', showlegend=True)
fig.show()


In [None]:

import pandas as pd
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# --- Combine text fields & normalize ---
# Lower-casing once here lets the patterns below be written in lower case only.
job_df['combined_text'] = (
    job_df['skills_desc'].fillna('') + ' ' + job_df['description'].fillna('')
).str.lower()

# --- Regex patterns with word boundaries & synonyms ---
# Groups are non-capturing, (?:...): str.contains only needs a yes/no match,
# and capturing groups make pandas emit the "This pattern is interpreted as a
# regular expression, and has match groups" UserWarning.
patterns = {
    'AI': r'\b(?:ai|artificial intelligence)\b',
    'Machine Learning': r'\b(?:machine learning|ml)\b',
    'Data Science': r'\b(?:data science|data scientist)\b',
    'Statistics': r'\b(?:statistics?|statistical)\b',
    'Python': r'\bpython\b',
    'Java': r'\bjava\b',
    'SQL': r'\b(?:sql|structured query language)\b',
    'Cloud': r'\b(?:cloud|aws|amazon web services|azure|microsoft azure|gcp|google cloud)\b',
    'Big Data': r'\b(?:big data|hadoop|spark)\b'
}

def mentions(series, pattern):
    """Return a boolean Series marking the entries of `series` that match `pattern`.

    `pattern` is treated as a regular expression; missing entries count as
    non-matches (na=False).
    """
    return series.str.contains(pattern, regex=True, na=False)

# Denominator for the percentages: every posting, whether or not it mentions a skill.
total_postings = len(job_df)
freq = {skill: mentions(job_df['combined_text'], pat).sum() for skill, pat in patterns.items()}

# One row per skill, most frequently mentioned first.
skill_df = (
    pd.DataFrame({'Skill': list(freq.keys()), 'Frequency': list(freq.values())})
      .sort_values('Frequency', ascending=False)
      .reset_index(drop=True)
)
# Share of all postings that mention the skill at least once.
skill_df['Percentage'] = (skill_df['Frequency'] / total_postings) * 100

print("Total postings:", total_postings)
print(skill_df)



This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract.



Total postings: 123849
              Skill  Frequency  Percentage
0             Cloud       9051    7.308093
1               SQL       5194    4.193817
2            Python       4655    3.758609
3        Statistics       3959    3.196635
4                AI       3700    2.987509
5              Java       2672    2.157466
6  Machine Learning       2121    1.712569
7      Data Science       1390    1.122334
8          Big Data       1376    1.111030


In [None]:

# --- Raw counts per skill ---
fig_counts = px.bar(
    skill_df,
    x='Skill',
    y='Frequency',
    color='Skill',
    title='Job Posting Skill Demand (Counts)',
)
# If one skill dwarfs the rest, a log axis separates the smaller bars better:
# fig_counts.update_yaxes(type='log')
fig_counts.show()

# --- Share of postings per skill (the version used on the dashboard) ---
# Bar labels: percentage rounded to two decimals with a trailing percent sign.
pct_labels = skill_df['Percentage'].round(2).astype(str) + '%'
fig_pct = px.bar(
    skill_df,
    x='Skill',
    y='Percentage',
    color='Skill',
    text=pct_labels,
    title='Job Posting Skill Demand (Normalized %)',
)
fig_pct.update_traces(textposition='outside')
fig_pct.update_layout(yaxis_title='Share of postings (%)')
fig_pct.show()


In [None]:

from plotly.subplots import make_subplots
import plotly.graph_objects as go

# --- Dashboard: Eurostat trend (left, wider) + share-of-postings bars (right) ---
dash_titles = ('Eurostat Digital Skills Trend (IE vs EU)', 'Job Posting Skill Demand (Normalized %)')
fig_dash = make_subplots(rows=1, cols=2, subplot_titles=dash_titles, column_widths=[0.55, 0.45])

# Left panel: one line per geo column of the pivot.
for geo_code, geo_series in pivot.items():
    fig_dash.add_trace(
        go.Scatter(x=pivot.index, y=geo_series, mode='lines+markers', name=geo_code),
        row=1, col=1,
    )
fig_dash.update_xaxes(title_text='Year', row=1, col=1)
fig_dash.update_yaxes(title_text='% with basic/above basic digital skills', row=1, col=1)

# Right panel: percentage of postings per skill.
skills_bar = go.Bar(x=skill_df['Skill'], y=skill_df['Percentage'], name='Skills')
fig_dash.add_trace(skills_bar, row=1, col=2)
fig_dash.update_yaxes(title_text='Share of postings (%)', row=1, col=2)

# Kept as the cell's final expression: update_layout returns the figure, which
# is what the notebook renders for this cell (there is no explicit .show()).
fig_dash.update_layout(
    title_text='Digital Skills vs Industry Demand',
    showlegend=True,
    bargap=0.25,
)



In [None]:

from plotly.subplots import make_subplots
import plotly.graph_objects as go

fig = make_subplots(rows=1, cols=2, subplot_titles=('Eurostat Digital Skills Trend', 'Job Posting Skill Demand'))

# Left: Eurostat trend
# Draw whichever geo columns the pivot actually has instead of indexing
# 'IE' / 'EU27_2020' directly: the pivot-building cells accept several EU
# codes, and a hard-coded lookup raises KeyError when the column is missing.
# Ireland is drawn first so the trace order matches the original chart.
region_names = {'IE': 'Ireland', 'EU27_2020': 'EU'}
for geo_code in sorted(pivot.columns, key=lambda code: code != 'IE'):
    fig.add_trace(
        go.Scatter(x=pivot.index, y=pivot[geo_code], mode='lines+markers',
                   name=region_names.get(geo_code, geo_code)),
        row=1, col=1
    )

# Right: Horizontal bars for job postings (ascending sort puts the largest bar on top)
sorted_skills = skill_df.sort_values('Percentage', ascending=True)
fig.add_trace(go.Bar(x=sorted_skills['Percentage'], y=sorted_skills['Skill'], orientation='h', name='Skills'), row=1, col=2)

fig.update_layout(title_text='Digital Skills vs Industry Demand', height=600)
fig.update_xaxes(title_text='Year', row=1, col=1)
fig.update_yaxes(title_text='% Digital Skills', row=1, col=1)
fig.update_xaxes(title_text='Share of postings (%)', row=1, col=2)

fig.show()


In [None]:
#include in meeting

import plotly.express as px

# Stand-alone Eurostat trend chart: one line per geo column, year on the x axis.
eurostat_frame = pivot.reset_index()
eurostat_colours = {'IE':'blue','EU27_2020':'red'}

fig_eurostat = px.line(
    eurostat_frame,
    x='TIME_PERIOD',
    y=pivot.columns,
    title='Eurostat Digital Skills Trend: Ireland vs EU',
    markers=True,
    color_discrete_map=eurostat_colours,
    labels={'value':'% Digital Skills','TIME_PERIOD':'Year'},
)
fig_eurostat.show()


In [None]:

# Smallest share first, so the horizontal chart shows the largest bar at the top.
sorted_skills = skill_df.sort_values(by='Percentage')

fig_skills = px.bar(
    sorted_skills,
    x='Percentage',
    y='Skill',
    orientation='h',
    labels={'Percentage':'Share of postings (%)'},
    title='Job Posting Skill Demand (Normalized %)',
)
fig_skills.show()


In [None]:

from plotly.subplots import make_subplots
import plotly.graph_objects as go

fig_dashboard = make_subplots(rows=1, cols=2, subplot_titles=('Eurostat Digital Skills Trend', 'Job Posting Skill Demand'))

# Left: Eurostat trend
# Draw whichever geo columns the pivot actually has instead of indexing
# 'IE' / 'EU27_2020' directly: the pivot-building cells accept several EU
# codes, and a hard-coded lookup raises KeyError when the column is missing.
# Ireland is drawn first so the trace order matches the original chart.
region_names = {'IE': 'Ireland', 'EU27_2020': 'EU'}
for geo_code in sorted(pivot.columns, key=lambda code: code != 'IE'):
    fig_dashboard.add_trace(
        go.Scatter(x=pivot.index, y=pivot[geo_code], mode='lines+markers',
                   name=region_names.get(geo_code, geo_code)),
        row=1, col=1
    )

# Right: Horizontal bars for job postings (uses `sorted_skills` from the previous cell)
fig_dashboard.add_trace(go.Bar(x=sorted_skills['Percentage'], y=sorted_skills['Skill'], orientation='h', name='Skills'), row=1, col=2)

fig_dashboard.update_layout(title_text='Digital Skills vs Industry Demand', height=600)
fig_dashboard.update_xaxes(title_text='Year', row=1, col=1)
fig_dashboard.update_yaxes(title_text='% Digital Skills', row=1, col=1)
fig_dashboard.update_xaxes(title_text='Share of postings (%)', row=1, col=2)

fig_dashboard.show()
