# Statistical Analysis - Tech Industry Salaries

This notebook performs in-depth statistical analysis on the cleaned salary dataset.

## Objectives:
- Analyze salary distributions
- Compare salaries across dimensions (role, location, experience, work setting)
- Identify trends over time
- Perform correlation analysis
- Generate insights for decision-making

In [None]:
# Import required libraries
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
warnings.filterwarnings('ignore')

# Set display options
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', '{:.2f}'.format)

# Set plotting style
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')
%matplotlib inline

## 1. Load Cleaned Data

In [None]:
# Read the cleaned salary dataset from disk
df = pd.read_csv('../data/cleaned/ds_salaries_cleaned.csv')

# Report the size of the frame and the span of years it covers
n_rows, n_cols = df.shape
year_min = df['work_year'].min()
year_max = df['work_year'].max()

print(f"Dataset loaded: {n_rows:,} rows, {n_cols} columns")
print(f"\nDate range: {year_min} - {year_max}")

# Last expression: preview of the first rows
df.head()

## 2. Overall Salary Distribution

In [None]:
# Descriptive statistics for the USD salary column
usd = df['salary_in_usd']
salary_stats = usd.describe()

print("Overall Salary Statistics (USD)")
print("=" * 60)
print(salary_stats)

# Extra measures that describe() does not report
most_common = usd.mode().values[0]  # first mode if there are several
spread = usd.std()
cv_pct = spread / usd.mean() * 100  # standard deviation relative to the mean

print(f"\nMode: ${most_common:,.0f}")
print(f"Standard Deviation: ${spread:,.0f}")
print(f"Coefficient of Variation: {cv_pct:.2f}%")

In [None]:
# Test for normality (D'Agostino-Pearson omnibus test).
# Missing values are dropped first: with NaNs in the input the test returns
# NaN, and `NaN < 0.05` is False, which would be reported as "normal".
statistic, p_value = stats.normaltest(df['salary_in_usd'].dropna())
print(f"\nNormality Test (D'Agostino-Pearson):")
print(f"Test statistic: {statistic:.4f}")
# Scientific notation, matching the other tests in this notebook; a fixed
# 4-decimal format would show very small p-values as 0.0000.
print(f"P-value: {p_value:.4e}")
print(f"Distribution is {'NOT normal' if p_value < 0.05 else 'normal'} (α=0.05)")

In [None]:
# Salary cut-offs at selected percentiles, computed in a single quantile call
percentiles = [10, 25, 50, 75, 90, 95, 99]
cutoffs = df['salary_in_usd'].quantile([pct / 100 for pct in percentiles])

print("\nSalary Percentiles (USD):")
print("=" * 60)
for pct, cutoff in zip(percentiles, cutoffs):
    print(f"{pct:2d}th percentile: ${cutoff:,.0f}")

## 3. Salary by Experience Level

In [None]:
# Summary of USD salary per seniority level, rows in career order
print("Salary Analysis by Seniority Level")
print("=" * 80)

# Rows are reindexed to this order; a level absent from the data becomes NaN
seniority_order = ['JUNIOR', 'MID', 'SENIOR', 'EXECUTIVE']

# (output column name, aggregation) pairs
seniority_summary_spec = [
    ('Count', 'count'),
    ('Mean', 'mean'),
    ('Median', 'median'),
    ('Std Dev', 'std'),
    ('Min', 'min'),
    ('Max', 'max'),
]

seniority_stats = (
    df.groupby('seniority')['salary_in_usd']
    .agg(seniority_summary_spec)
    .round(0)
    .reindex(seniority_order)
)

print(seniority_stats.to_string())

In [None]:
# Step-up in mean salary between each pair of adjacent seniority levels
print("\nSalary Growth Between Seniority Levels:")
print("=" * 60)

for lower, upper in zip(seniority_order, seniority_order[1:]):
    base_mean = seniority_stats.loc[lower, 'Mean']
    step = seniority_stats.loc[upper, 'Mean'] - base_mean
    step_pct = (step / base_mean) * 100

    print(f"{lower} → {upper}:")
    print(f"  Absolute: ${step:,.0f}")
    print(f"  Percentage: {step_pct:.1f}%")
    print()

## 4. Salary by Job Category

In [None]:
# Summary of USD salary per job category, highest mean first
print("Salary Analysis by Job Category")
print("=" * 80)

salary_by_category = df.groupby('job_category')['salary_in_usd']
job_stats = salary_by_category.agg([
    ('Count', 'count'),
    ('Mean', 'mean'),
    ('Median', 'median'),
    ('Std Dev', 'std'),
    ('Min', 'min'),
    ('Max', 'max')
])
job_stats = job_stats.round(0)
job_stats = job_stats.sort_values('Mean', ascending=False)

print(job_stats.to_string())

In [None]:
# Ten job titles with the highest median salary
print("\nTop 10 Highest Paying Job Titles (by median):")
print("=" * 80)

top_jobs = (
    df.groupby('job_title')['salary_in_usd']
    .agg(['count', 'median', 'mean'])
    .round(0)
    .rename(columns={
        'count': 'Count',
        'median': 'Median Salary',
        'mean': 'Mean Salary',
    })
    # Keep only titles with at least 3 entries
    .loc[lambda titles: titles['Count'] >= 3]
    .sort_values('Median Salary', ascending=False)
    .head(10)
)

print(top_jobs.to_string())

## 5. Salary by Location

In [None]:
# Fifteen company locations with the highest mean salary
print("Top 15 Countries by Average Salary")
print("=" * 80)

salary_by_country = df.groupby('company_location')['salary_in_usd']
country_stats = salary_by_country.agg(['count', 'mean', 'median']).round(0)
country_stats.columns = ['Count', 'Mean Salary', 'Median Salary']

# Keep only locations with at least 5 entries, then rank by mean salary
has_enough_rows = country_stats['Count'] >= 5
country_stats = (
    country_stats[has_enough_rows]
    .sort_values('Mean Salary', ascending=False)
    .head(15)
)

print(country_stats.to_string())

In [None]:
# US vs non-US salary comparison
# Adds a boolean helper column to df: True where the company is located in the US
df['is_us'] = df['company_location'] == 'US'

print("\nUS vs Non-US Salary Comparison:")
print("=" * 60)

# Labels are mapped by value rather than assigned by position, so the table
# stays correct (with a NaN row) even if one of the two groups is absent.
us_comparison = (
    df.groupby('is_us')['salary_in_usd']
    .agg([
        ('Count', 'count'),
        ('Mean', 'mean'),
        ('Median', 'median'),
        ('Std Dev', 'std')
    ])
    .round(0)
    .rename(index={False: 'Non-US', True: 'US'})
    .reindex(['Non-US', 'US'])
    .rename_axis(None)
)
print(us_comparison.to_string())

# Welch's t-test for US vs non-US: equal_var=False drops the equal-variance
# assumption of Student's t-test, which the two groups are not shown to meet.
us_salaries = df.loc[df['is_us'], 'salary_in_usd'].dropna()
non_us_salaries = df.loc[~df['is_us'], 'salary_in_usd'].dropna()

t_stat, p_value = stats.ttest_ind(us_salaries, non_us_salaries, equal_var=False)
print(f"\nT-test (US vs Non-US, Welch):")
print(f"T-statistic: {t_stat:.4f}")
print(f"P-value: {p_value:.4e}")
print(f"Difference is {'statistically significant' if p_value < 0.05 else 'not significant'} (α=0.05)")

## 6. Salary by Work Setting (Remote/Hybrid/On-site)

In [None]:
# Summary of USD salary per work setting, highest mean first
print("Salary Analysis by Work Setting")
print("=" * 80)

# (output column name, aggregation) pairs
work_setting_summary_spec = [
    ('Count', 'count'),
    ('Mean', 'mean'),
    ('Median', 'median'),
    ('Std Dev', 'std'),
    ('Min', 'min'),
    ('Max', 'max'),
]

work_setting_stats = (
    df.groupby('work_setting')['salary_in_usd']
    .agg(work_setting_summary_spec)
    .round(0)
    .sort_values('Mean', ascending=False)
)

print(work_setting_stats.to_string())

In [None]:
# One-way ANOVA: does mean salary differ between work settings?
remote = df[df['work_setting'] == 'Remote']['salary_in_usd']
hybrid = df[df['work_setting'] == 'Hybrid']['salary_in_usd']
onsite = df[df['work_setting'] == 'On-site']['salary_in_usd']

# Only non-empty groups are passed to the test. An empty group would make
# f_oneway return NaN, and `NaN < 0.05` is False, so the result would be
# reported as "not significant" without any test having been run.
anova_groups = {
    label: salaries.dropna()
    for label, salaries in (('Remote', remote), ('Hybrid', hybrid), ('On-site', onsite))
}
usable_groups = {label: s for label, s in anova_groups.items() if len(s) > 0}
skipped = [label for label in anova_groups if label not in usable_groups]

print(f"\nANOVA Test (Work Setting):")
if skipped:
    print(f"Note: no salary data for {', '.join(skipped)}; excluded from the test")

if len(usable_groups) < 2:
    # ANOVA compares at least two groups
    f_stat = p_value = float('nan')
    print("ANOVA skipped: fewer than two work settings have salary data")
else:
    f_stat, p_value = stats.f_oneway(*usable_groups.values())
    print(f"F-statistic: {f_stat:.4f}")
    print(f"P-value: {p_value:.4e}")
    print(f"Difference is {'statistically significant' if p_value < 0.05 else 'not significant'} (α=0.05)")

## 7. Salary by Company Size

In [None]:
# Summary of USD salary per company size, rows ordered small to large
print("Salary Analysis by Company Size")
print("=" * 80)

# Rows are reindexed to this order; a size absent from the data becomes NaN
size_order = ['Small', 'Medium', 'Large']

salary_by_size = df.groupby('company_size_label')['salary_in_usd']
size_stats = salary_by_size.agg([
    ('Count', 'count'),
    ('Mean', 'mean'),
    ('Median', 'median'),
    ('Std Dev', 'std'),
    ('Min', 'min'),
    ('Max', 'max')
])
size_stats = size_stats.round(0).reindex(size_order)

print(size_stats.to_string())

## 8. Temporal Trends (Year-over-Year)

In [None]:
# Summary of USD salary per work year
print("Salary Trends Over Years")
print("=" * 80)

salary_by_year = df.groupby('work_year')['salary_in_usd']
yearly_stats = salary_by_year.agg([
    ('Count', 'count'),
    ('Mean', 'mean'),
    ('Median', 'median'),
    ('Std Dev', 'std')
]).round(0)

print(yearly_stats.to_string())

# Relative change of the mean salary between each pair of consecutive years
print("\nYear-over-Year Growth:")
print("=" * 60)
year_labels = yearly_stats.index
for prev_year, next_year in zip(year_labels[:-1], year_labels[1:]):
    prev_mean = yearly_stats.loc[prev_year, 'Mean']
    next_mean = yearly_stats.loc[next_year, 'Mean']
    yoy_pct = ((next_mean - prev_mean) / prev_mean) * 100

    print(f"{prev_year} → {next_year}: {yoy_pct:+.2f}%")

## 9. Multi-dimensional Analysis

In [None]:
# Mean salary for every seniority / work-setting combination
print("Salary by Seniority and Work Setting")
print("=" * 80)

pivot_seniority_work = (
    df.pivot_table(
        values='salary_in_usd',
        index='seniority',
        columns='work_setting',
        aggfunc='mean',
    )
    .round(0)
    # Rows in career order
    .reindex(seniority_order)
)
print(pivot_seniority_work.to_string())

In [None]:
# Mean salary by job category and seniority; top 5 categories by SENIOR mean
print("\nTop 5 Job Categories by Seniority (Mean Salary)")
print("=" * 80)

pivot_job_seniority = (
    pd.pivot_table(
        df,
        values='salary_in_usd',
        index='job_category',
        columns='seniority',
        aggfunc='mean'
    )
    .round(0)
    # reindex (rather than column selection) so a seniority level that is
    # absent from the data yields a NaN column instead of raising KeyError
    .reindex(columns=seniority_order)
    .sort_values('SENIOR', ascending=False)
    .head()
)
print(pivot_job_seniority.to_string())

In [None]:
# Mean salary for every company-size / seniority combination
print("\nSalary by Company Size and Seniority")
print("=" * 80)

pivot_size_seniority = (
    pd.pivot_table(
        df,
        values='salary_in_usd',
        index='company_size_label',
        columns='seniority',
        aggfunc='mean'
    )
    .round(0)
    # Order both axes with reindex so a size or seniority level that is
    # absent from the data yields NaN instead of raising KeyError
    .reindex(index=size_order, columns=seniority_order)
)
print(pivot_size_seniority.to_string())

## 10. Correlation Analysis

In [None]:
# Numeric codes for the categorical columns so they can enter the correlation
seniority_encoding = {'JUNIOR': 1, 'MID': 2, 'SENIOR': 3, 'EXECUTIVE': 4}
work_encoding = {'On-site': 0, 'Hybrid': 50, 'Remote': 100}
size_encoding = {'Small': 1, 'Medium': 2, 'Large': 3}

# assign() returns a new frame, so df itself is left untouched.
# Labels missing from a mapping become NaN.
df_encoded = df.assign(
    seniority_encoded=df['seniority'].map(seniority_encoding),
    work_setting_encoded=df['work_setting'].map(work_encoding),
    size_encoded=df['company_size_label'].map(size_encoding),
)

# Columns that take part in the correlation matrix
corr_columns = [
    'salary_in_usd',
    'work_year',
    'seniority_encoded',
    'work_setting_encoded',
    'size_encoded',
    'remote_ratio',
]
correlation_matrix = df_encoded[corr_columns].corr()

# Correlation of every selected column with salary, strongest positive first
salary_corr = correlation_matrix['salary_in_usd'].sort_values(ascending=False)

print("Correlation with Salary:")
print("=" * 60)
print(salary_corr.to_string())

## 11. Key Insights Summary

In [None]:
# Headline numbers pulled from the tables built in the earlier cells
banner = "=" * 80
all_salaries = df['salary_in_usd']

print(banner)
print("KEY INSIGHTS - TECH INDUSTRY SALARY ANALYSIS")
print(banner)

# 1. Overall statistics
print(f"\n1. OVERALL MARKET:")
print(f"   • Average Salary: ${all_salaries.mean():,.0f}")
print(f"   • Median Salary: ${all_salaries.median():,.0f}")
print(f"   • Salary Range: ${all_salaries.min():,.0f} - ${all_salaries.max():,.0f}")

# 2. Seniority: ratio of executive to junior mean salary
junior_mean = seniority_stats.loc['JUNIOR', 'Mean']
executive_mean = seniority_stats.loc['EXECUTIVE', 'Mean']
exec_to_junior = executive_mean / junior_mean
print(f"\n2. SENIORITY IMPACT:")
print(f"   • Junior average: ${junior_mean:,.0f}")
print(f"   • Executive average: ${executive_mean:,.0f}")
print(f"   • Executives earn {exec_to_junior:.1f}x more than Juniors")

# 3. Job category with the highest mean salary
best_category = job_stats['Mean'].idxmax()
best_category_mean = job_stats.loc[best_category, 'Mean']
print(f"\n3. TOP PAYING CATEGORIES:")
print(f"   • Highest: {best_category} (${best_category_mean:,.0f})")

# 4. Work setting: share of rows and mean salary for each setting present
print(f"\n4. WORK SETTING:")
for setting in ['Remote', 'Hybrid', 'On-site']:
    if setting not in work_setting_stats.index:
        continue
    setting_count = work_setting_stats.loc[setting, 'Count']
    setting_mean = work_setting_stats.loc[setting, 'Mean']
    setting_share = (setting_count / len(df)) * 100
    print(f"   • {setting}: {setting_share:.1f}% of jobs, avg ${setting_mean:,.0f}")

# 5. Geographic: US mean salary relative to the non-US mean
us_mean = us_comparison.loc['US', 'Mean']
non_us_mean = us_comparison.loc['Non-US', 'Mean']
us_premium_pct = ((us_mean - non_us_mean) / non_us_mean) * 100
print(f"\n5. GEOGRAPHIC:")
print(f"   • US average: ${us_mean:,.0f}")
print(f"   • Non-US average: ${non_us_mean:,.0f}")
print(f"   • US premium: {us_premium_pct:.1f}%")

# 6. Temporal: change in mean salary from the first to the last year
start_year, end_year = yearly_stats.index[0], yearly_stats.index[-1]
start_mean = yearly_stats.loc[start_year, 'Mean']
end_mean = yearly_stats.loc[end_year, 'Mean']
overall_growth_pct = ((end_mean - start_mean) / start_mean) * 100
print(f"\n6. TRENDS:")
print(f"   • {start_year} average: ${start_mean:,.0f}")
print(f"   • {end_year} average: ${end_mean:,.0f}")
print(f"   • Total growth: {overall_growth_pct:+.1f}%")

print("\n" + banner)

## 12. Export Analysis Results

In [None]:
# Summary tables to export, keyed by the Excel sheet name they are written to
analysis_results = {
    'seniority_analysis': seniority_stats,
    'job_category_analysis': job_stats,
    'country_analysis': country_stats,
    'work_setting_analysis': work_setting_stats,
    'company_size_analysis': size_stats,
    'yearly_trends': yearly_stats
}

# Save to Excel with multiple sheets
output_path = '../data/results/analysis_summary.xlsx'

# Create the results directory first; ExcelWriter fails if the target
# directory does not exist.
Path(output_path).parent.mkdir(parents=True, exist_ok=True)

with pd.ExcelWriter(output_path, engine='openpyxl') as writer:
    for sheet_name, data in analysis_results.items():
        data.to_excel(writer, sheet_name=sheet_name)

print(f"✓ Analysis results saved to: {output_path}")

## Summary

### Analysis Completed:
1. ✓ Overall salary distribution analysis
2. ✓ Seniority level comparison
3. ✓ Job category analysis
4. ✓ Geographic analysis (countries, US vs non-US)
5. ✓ Work setting comparison (remote/hybrid/on-site)
6. ✓ Company size analysis
7. ✓ Temporal trends (year-over-year)
8. ✓ Multi-dimensional cross-analysis
9. ✓ Statistical testing (normality test, t-test, ANOVA)
10. ✓ Correlation analysis
11. ✓ Key insights generation

### Next Steps:
- Proceed to `03_visualization.ipynb` for visual exploration
- Create interactive charts and dashboards
- Generate presentation-ready visualizations