# 01 - Veri Yükleme ve İstatistiksel Analiz

Bu dosya, 2025 Yazılım Sektörü Maaş Anketi için temel veri yükleme, ön işleme ve istatistiksel analiz işlemlerini ele alır.

## Hedefler:
- Temizlenmiş veri setini yüklemek ve doğrulamak
- Etki büyüklükleri ile istatistiksel testler gerçekleştirmek
- Temel veri özetlerini oluşturmak
- Sonraki analizler için temel metrikleri hesaplamak

## Kütüphaneleri import etme ve Setup

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import ttest_ind, f_oneway, kruskal
from statsmodels.stats.multicomp import pairwise_tukeyhsd

# Where generated figures are written; the folder is created if it is missing
FIG_DIR = '../figures'
os.makedirs(FIG_DIR, exist_ok=True)

# Caveat text describing how a respondent's location is estimated
LOCATION_NOTE = 'Note: Estimated location is inferred from company location and work mode (Office/Hybrid → company location). Not definitive. "Yurtdışı TR hub" responses are excluded from location-based inference.'

# Global plot appearance: seaborn "husl" palette, DejaVu Sans font family
sns.set_palette("husl")
plt.rcParams.update({'font.family': 'DejaVu Sans'})

## Veri Yükleme ve Doğrulama

In [None]:
def load_data(path: str = '../data/2025_cleaned_data.csv') -> pd.DataFrame:
    """Load the cleaned survey dataset and perform basic validation.

    Args:
        path: CSV file to read. Defaults to the cleaned 2025 dataset
            location used by this notebook.

    Returns:
        The loaded DataFrame with its ``timestamp`` column parsed to
        datetime.

    Raises:
        KeyError: If the file has no ``timestamp`` column.
        ValueError: If the file contains a header but no data rows.
    """
    df = pd.read_csv(path)

    # Fail early with a readable message instead of a bare column lookup error.
    if 'timestamp' not in df.columns:
        raise KeyError(f"'timestamp' column not found in {path}")
    if df.empty:
        raise ValueError(f"No data rows found in {path}")

    df['timestamp'] = pd.to_datetime(df['timestamp'])
    return df

# Read the dataset once, then report its dimensions and the time span it covers
df = load_data()
print(
    f'Dataset yuklendi: {df.shape[0]} satir, {df.shape[1]} sutun',
    f'Tarih araligi: {df["timestamp"].min()} to {df["timestamp"].max()}',
    sep='\n',
)

## Veri Genel Bakış ve Özet İstatistikler

In [None]:
# Display basic dataset information
print("Dataset Bilgileri:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
df.info()

print("\nIlk birkaç satir:")
display(df.head())

# Descriptive statistics restricted to the main numeric columns
print("\nTemel sayisal sutunlarin ozet istatistikleri:")
key_cols = ['salary_numeric', 'experience_years', 'seniority_level_ic']
display(df[key_cols].describe())

## Etki Büyüklüğü Hesaplama Fonksiyonu

In [None]:
def calculate_effect_size(group1, group2):
    """Calculate Cohen's d (pooled standard deviation) for two groups.

    Missing values (NaN) are dropped from each group before anything is
    computed, so the sample sizes used in the pooled variance match the
    observations that the means and variances are based on.

    Args:
        group1: Numeric observations of the first group (Series, array or
            list).
        group2: Numeric observations of the second group.

    Returns:
        ``(mean(group1) - mean(group2)) / pooled_std`` as a float, or NaN
        when either group has fewer than two valid observations or the
        pooled standard deviation is zero.
    """
    first = np.asarray(group1, dtype=float).ravel()
    second = np.asarray(group2, dtype=float).ravel()
    first = first[~np.isnan(first)]
    second = second[~np.isnan(second)]

    n1, n2 = first.size, second.size
    # A sample variance (ddof=1) needs at least two observations per group.
    if n1 < 2 or n2 < 2:
        return float('nan')

    pooled_var = (
        (n1 - 1) * first.var(ddof=1) + (n2 - 1) * second.var(ddof=1)
    ) / (n1 + n2 - 2)
    # Zero spread in both groups leaves the standardized difference undefined.
    if pooled_var <= 0:
        return float('nan')

    return float((first.mean() - second.mean()) / np.sqrt(pooled_var))

## İstatistiksel Testler: Temel Karşılaştırmalar

In [None]:
def _compare_salary_groups(first, second, first_label, second_label, min_size=10):
    """Run Welch's t-test and Cohen's d on two salary samples.

    Returns None when either sample has ``min_size`` or fewer observations;
    otherwise a dict with '<label>_mean', 'mean_diff', 'p_value',
    'effect_size', 'significant' and '<label>_count' entries.
    """
    if len(first) <= min_size or len(second) <= min_size:
        return None

    # equal_var=False selects Welch's t-test (no equal-variance assumption).
    _, p_value = ttest_ind(first, second, equal_var=False)
    first_mean, second_mean = first.mean(), second.mean()

    return {
        f'{first_label}_mean': first_mean,
        f'{second_label}_mean': second_mean,
        'mean_diff': first_mean - second_mean,
        'p_value': p_value,
        'effect_size': calculate_effect_size(first, second),
        'significant': p_value < 0.05,
        f'{first_label}_count': len(first),
        f'{second_label}_count': len(second),
    }


def perform_statistical_tests(df: pd.DataFrame):
    """Perform hypothesis tests with effect sizes for key comparisons.

    Each comparison splits ``salary_numeric`` into two groups by an indicator
    column, drops missing salaries, and is skipped when a required column is
    absent or when either group has 10 or fewer observations.

    Args:
        df: Survey data containing ``salary_numeric`` and any of the
            indicator columns used below.

    Returns:
        Dict keyed by comparison name ('react_vs_non_react',
        'remote_vs_office', 'europe_vs_turkey', 'gender_gap'). Each value
        holds the two group means and counts (prefixed with the group label),
        'mean_diff', 'p_value', 'effect_size' and 'significant' (p < 0.05).
    """
    # (result key, (column, value, label) of group 1, same for group 2).
    # NOTE(review): gender is read as 0 = male, 1 = female, as in the
    # original comparison; confirm against the dataset's codebook.
    comparisons = [
        ('react_vs_non_react',
         ('frontend_React', 1, 'react'),
         ('frontend_React', 0, 'non_react')),
        ('remote_vs_office',
         ('work_mode_Remote', 1, 'remote'),
         ('work_mode_Office', 1, 'office')),
        ('europe_vs_turkey',
         ('company_location_Avrupa', 1, 'europe'),
         ('company_location_Turkiye', 1, 'turkey')),
        ('gender_gap',
         ('gender', 0, 'male'),
         ('gender', 1, 'female')),
    ]

    results = {}
    for name, (col1, val1, label1), (col2, val2, label2) in comparisons:
        # Every comparison, including gender, is skipped if its columns are missing.
        if col1 not in df.columns or col2 not in df.columns:
            continue

        # Missing salaries are removed so they neither inflate the group
        # counts nor turn the t-test result into NaN.
        first = df.loc[df[col1] == val1, 'salary_numeric'].dropna()
        second = df.loc[df[col2] == val2, 'salary_numeric'].dropna()

        outcome = _compare_salary_groups(first, second, label1, label2)
        if outcome is not None:
            results[name] = outcome

    return results

# Perform statistical tests
test_results = perform_statistical_tests(df)


def _comparison_row(test_name, result):
    """Flatten one comparison entry into a row for the results table.

    The two group prefixes (e.g. 'react' / 'non_react') are recovered from
    the entry's '<prefix>_count' keys, which are stored with group 1 first.
    The stored means and counts are then read directly rather than being
    rebuilt from the mean difference.
    """
    suffix = '_count'
    first, second = [key[:-len(suffix)] for key in result if key.endswith(suffix)]
    return {
        'Comparison': test_name.replace('_', ' ').title(),
        'Group 1 Mean': result[f'{first}_mean'],
        'Group 2 Mean': result[f'{second}_mean'],
        'Mean Difference': result['mean_diff'],
        'P-value': result['p_value'],
        'Effect Size (Cohen\'s d)': result['effect_size'],
        'Significant': result['significant'],
        'Group 1 Count': result[f'{first}_count'],
        'Group 2 Count': result[f'{second}_count'],
    }


# Display results in a formatted table
results_df = pd.DataFrame([
    _comparison_row(test_name, result)
    for test_name, result in test_results.items()
])

display(results_df)

## Grup Karşılaştırmaları: Kıdem Seviyeleri

In [None]:
# Seniority level group comparison
if 'seniority_level_ic' in df.columns:
    valid = df[['seniority_level_ic', 'salary_numeric']].dropna()

    # One groupby pass; only levels with at least 10 observations are kept.
    kept_levels = {
        level: grp['salary_numeric'].values
        for level, grp in valid.groupby('seniority_level_ic')
        if len(grp) >= 10
    }
    groups = list(kept_levels.values())
    labels = [str(level) for level in kept_levels]

    if len(groups) >= 2:
        # ANOVA test (best effort: a failure is reported as NA, not raised)
        try:
            f_stat, p_anova = f_oneway(*groups)
        except Exception:
            f_stat, p_anova = np.nan, np.nan

        # Kruskal-Wallis test (same best-effort handling)
        try:
            h_stat, p_kruskal = kruskal(*groups)
        except Exception:
            h_stat, p_kruskal = np.nan, np.nan

        print("Seniority level group comparison:")
        print(f"  Groups (n>=10): {labels}")
        print(f"  ANOVA p-value: {p_anova:.4f}" if not np.isnan(p_anova) else "  ANOVA p-value: NA")
        print(f"  Kruskal-Wallis p-value: {p_kruskal:.4f}" if not np.isnan(p_kruskal) else "  Kruskal-Wallis p-value: NA")

        # Tukey HSD post-hoc test, restricted to the same levels that entered
        # the omnibus tests so the reported pairs match "Groups (n>=10)".
        try:
            tested = valid[valid['seniority_level_ic'].isin(list(kept_levels))]
            tukey = pairwise_tukeyhsd(
                endog=tested['salary_numeric'],
                groups=tested['seniority_level_ic'].astype(str),
                alpha=0.05,
            )
            print("  Tukey HSD (significant pairs):")
            # Row 0 of the summary table is the header; the rest are pairs.
            for res in tukey.summary().data[1:]:
                grp1, grp2, meandiff, p_adj, lower, upper, reject = res
                if reject:
                    print(f"    {grp1} vs {grp2}: diff={meandiff:.1f}, p_adj={p_adj:.4f}")
        except Exception:
            print("  Tukey HSD: NA")

## Grup Karşılaştırmaları: Yönetim Düzeyleri

In [None]:
# Management level group comparison
if 'is_manager' in df.columns:
    management_cols = [c for c in df.columns if c.startswith('management_')]
    managers = df[df['is_manager'] == 1].copy()

    if not managers.empty and management_cols:
        def get_management_level(row):
            """Return a readable label for the first management_* flag equal to 1, else 'Unknown'."""
            # management_cols come from df.columns, so every lookup on the row succeeds.
            for col in management_cols:
                if row[col] == 1:
                    return col.replace('management_', '').replace('_', ' ')
            return 'Unknown'

        managers['management_level_label'] = managers.apply(get_management_level, axis=1)
        managers = managers[managers['management_level_label'] != 'Unknown']

        if not managers.empty:
            valid_m = managers[['management_level_label', 'salary_numeric']].dropna()

            # One groupby pass; only levels with at least 5 observations are kept.
            kept_mlevels = {
                level: grp['salary_numeric'].values
                for level, grp in valid_m.groupby('management_level_label')
                if len(grp) >= 5
            }
            mgroups = list(kept_mlevels.values())
            mlabels = list(kept_mlevels)

            if len(mgroups) >= 2:
                # ANOVA test (best effort: a failure is reported as NA, not raised)
                try:
                    f_stat_m, p_anova_m = f_oneway(*mgroups)
                except Exception:
                    f_stat_m, p_anova_m = np.nan, np.nan

                # Kruskal-Wallis test (same best-effort handling)
                try:
                    h_stat_m, p_kruskal_m = kruskal(*mgroups)
                except Exception:
                    h_stat_m, p_kruskal_m = np.nan, np.nan

                print("\nManagement level group comparison:")
                print(f"  Groups (n>=5): {mlabels}")
                print(f"  ANOVA p-value: {p_anova_m:.4f}" if not np.isnan(p_anova_m) else "  ANOVA p-value: NA")
                print(f"  Kruskal-Wallis p-value: {p_kruskal_m:.4f}" if not np.isnan(p_kruskal_m) else "  Kruskal-Wallis p-value: NA")

                # Tukey HSD post-hoc test, restricted to the same levels that
                # entered the omnibus tests so the pairs match "Groups (n>=5)".
                try:
                    tested_m = valid_m[valid_m['management_level_label'].isin(mlabels)]
                    tukey_m = pairwise_tukeyhsd(
                        endog=tested_m['salary_numeric'],
                        groups=tested_m['management_level_label'],
                        alpha=0.05,
                    )
                    print("  Tukey HSD (significant pairs):")
                    # Row 0 of the summary table is the header; the rest are pairs.
                    for res in tukey_m.summary().data[1:]:
                        grp1, grp2, meandiff, p_adj, lower, upper, reject = res
                        if reject:
                            print(f"    {grp1} vs {grp2}: diff={meandiff:.1f}, p_adj={p_adj:.4f}")
                except Exception:
                    print("  Tukey HSD: NA")

## Veri Özet Tabloları

In [None]:
# Create summary tables for key variables

# Salary by career level
career_summary = df.groupby('seniority_level_ic')['salary_numeric'].agg(['count', 'mean', 'std', 'min', 'max']).round(1)
career_summary.columns = ['Count', 'Mean Salary', 'Std Dev', 'Min', 'Max']
# The readable names are applied by position, i.e. they assume exactly these
# seven levels in sorted code order (TODO confirm against the codebook).
# With any other number of levels the raw codes are kept instead of raising.
career_labels = ['Management', 'Junior', 'Mid', 'Senior', 'Staff Engineer', 'Team Lead', 'Architect']
if len(career_summary) == len(career_labels):
    career_summary.index = career_labels
else:
    print(f"Warning: expected {len(career_labels)} seniority levels, found {len(career_summary)}; keeping raw codes.")
print("Salary by Career Level:")
display(career_summary)

# Gender distribution
gender_summary = df.groupby('gender')['salary_numeric'].agg(['count', 'mean', 'std']).round(1)
gender_summary.columns = ['Count', 'Mean Salary', 'Std Dev']
# Map by code (0 = Male, 1 = Female, the coding used by the gender-gap test)
# rather than by position, so an extra or missing code cannot break or
# mislabel the table; unmapped codes are shown as they are.
gender_summary = gender_summary.rename(index={0: 'Male', 1: 'Female'}).rename_axis(None)
print("\nSalary by Gender:")
display(gender_summary)

# Work mode distribution: one row per indicator column that exists and has data
work_mode_data = []
for mode in ['Remote', 'Hybrid', 'Office']:
    col = f'work_mode_{mode}'
    if col not in df.columns:
        continue
    vals = df.loc[df[col] == 1, 'salary_numeric']
    if len(vals) > 0:
        work_mode_data.append({
            'Work Mode': mode,
            'Count': len(vals),
            'Mean Salary': vals.mean(),
            'Std Dev': vals.std()
        })

if work_mode_data:
    work_mode_summary = pd.DataFrame(work_mode_data).round(1)
    print("\nSalary by Work Mode:")
    display(work_mode_summary)

## Save Results for Other Notebooks

In [None]:
# Save the processed dataframe and test results for use in other notebooks
import pickle

# Persist the full dataframe as a pickle for downstream notebooks
df.to_pickle('../data/processed_dataframe.pkl')

# Persist the dictionary of hypothesis-test results next to it
with open('../data/statistical_test_results.pkl', 'wb') as f:
    pickle.dump(test_results, f)

# Confirmation plus a short recap of what was written
print(
    "Data and results saved for use in other notebooks.",
    f"Dataset shape: {df.shape}",
    f"Number of statistical tests performed: {len(test_results)}",
    sep='\n',
)