In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 🌍 Global Unemployment Analysis (2014–2024)

**Author:** Ega Octavina  
**Goal:** Perform data cleaning and basic exploratory analysis of global unemployment trends, both to gain insights useful for economic understanding and to practice data analysis skills.  
**Dataset:** [Kaggle: Global Unemployment Data](https://www.kaggle.com/datasets/sazidthe1/global-unemployment-data)


In [None]:
import pandas as pd

# STEP 1: Load the dataset (wide layout: one column per year, 2014-2024)
df = pd.read_csv("/kaggle/input/global-unemployment-data/global_unemployment_data.csv")

# STEP 2: Reshape the data (wide to long) so each row holds one
# (country, indicator, sex, age group, year) observation
df_long = pd.melt(
    df,
    id_vars=['country_name', 'indicator_name', 'sex', 'age_group', 'age_categories'],
    value_vars=['2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022', '2023', '2024'],
    var_name='Year',
    value_name='Unemployment Rate'
)

# STEP 3: Convert Year to integer
# (nullable Int64 so a value that fails to parse becomes <NA> instead of raising)
df_long['Year'] = pd.to_numeric(df_long['Year'], errors='coerce').astype('Int64')

# STEP 4: Convert Unemployment Rate to float (unparseable entries become NaN)
df_long['Unemployment Rate'] = pd.to_numeric(df_long['Unemployment Rate'], errors='coerce')

# STEP 5: Drop rows with missing unemployment rates (optional but useful)
df_long = df_long.dropna(subset=['Unemployment Rate'])

# STEP 6: Inspect the cleaned data
# info() writes its report to stdout itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
df_long.info()
df_long.head()


In [None]:
# Column dtypes and non-null counts (info() prints its own report).
df_long.info()

# Summary statistics. Only a cell's last expression is rendered automatically,
# so a bare describe() in the middle of the cell would be computed and then
# silently discarded; print it explicitly.
print(df_long.describe())

# First rows of the long-format frame (rendered as the cell output).
df_long.head()


In [None]:
# Count the missing entries in each column of df_long
missing_mask = df_long.isnull()
missing_mask.sum()


In [None]:
# Summarise the categorical columns: number of distinct countries, then the
# distinct values found in the indicator, sex and age-group columns.
n_countries = df_long['country_name'].nunique()
indicator_values = df_long['indicator_name'].unique()
sex_values = df_long['sex'].unique()
age_group_values = df_long['age_group'].unique()

print("Countries:", n_countries)
print("Indicators:", indicator_values)
print("Sex categories:", sex_values)
print("Age groups:", age_group_values)


In [None]:
#  Reshape Wide to Long Format Properly (snake_case column names)
# Columns representing years
year_columns = ['2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022', '2023', '2024']

# Melt the dataset: convert year columns into one 'year' column
df_long = df.melt(
    id_vars=['country_name', 'indicator_name', 'sex', 'age_group', 'age_categories'],
    value_vars=year_columns,
    var_name='year',
    value_name='unemployment_rate'
)

# Convert 'year' to int and 'unemployment_rate' to float
df_long['year'] = df_long['year'].astype(int)
df_long['unemployment_rate'] = pd.to_numeric(df_long['unemployment_rate'], errors='coerce')

# This cell rebuilds df_long from the raw frame, so the missing-rate drop done
# in the earlier reshape would otherwise be lost. Re-apply it here so the frame
# that is analysed and later saved as the cleaned CSV has no empty rates.
df_long = df_long.dropna(subset=['unemployment_rate'])

# Confirm the result
# info() prints its report itself and returns None; calling it directly avoids
# the stray "None" line that print(df_long.info()) would emit.
df_long.info()
df_long.head()


In [None]:
# Average unemployment rate per year, taken as a plain mean over every row of
# df_long that shares the same year.
rates_by_year = df_long.groupby('year')['unemployment_rate']
global_trend = rates_by_year.mean().reset_index()

# Show the yearly averages
print(global_trend)


In [None]:
# ANALYZE Top 10 Countries by Average Unemployment
# Mean rate per country over all of that country's rows in df_long
country_means = df_long.groupby('country_name')['unemployment_rate'].mean()

# Back to a two-column frame, ordered from highest to lowest average
top_countries = country_means.reset_index().sort_values(
    by='unemployment_rate',
    ascending=False,
)

# Show the ten highest averages
print(top_countries.head(10))


In [None]:
# ANALYZE Compare Unemployment Between Sexes
# Mean rate for every (year, sex) combination, flattened back to columns
year_sex_groups = df_long.groupby(['year', 'sex'])
gender_gap = year_sex_groups['unemployment_rate'].mean().reset_index()

# Preview the first rows
print(gender_gap.head())


In [None]:
# ANALYZE Filter for a Country
# Boolean mask selecting the rows whose country_name is exactly 'Indonesia'
is_indonesia = df_long['country_name'] == 'Indonesia'
indonesia_data = df_long[is_indonesia]

# Preview the first few matching rows
indonesia_data.head()


In [None]:
# Write the long-format frame to a CSV in the current working directory,
# leaving out the DataFrame index column.
output_path = 'cleaned_global_unemployment.csv'
df_long.to_csv(output_path, index=False)
