# World Happiness Analysis

## Overview
This notebook provides a comprehensive analysis of global happiness data using advanced statistical methods and interactive visualizations. The analysis explores relationships between happiness scores and various socio-economic factors across countries and time periods.

## Features
- **Data Cleaning & Preprocessing**: Robust handling of missing values and outliers
- **Exploratory Data Analysis**: Statistical summaries and distributions
- **Advanced Visualizations**: Interactive plots with Plotly
- **Statistical Analysis**: Correlation analysis and regression modeling
- **Time Series Analysis**: Happiness trends over time
- **Output Management**: All plots saved as HTML files for GitHub

## Data Sources
- World Happiness Report data, downloaded from Kaggle and saved in the local working directory as `happiness.csv`
- Variables include Life Ladder, GDP per capita, Social Support, etc.

---

In [15]:
# Import Required Libraries
import numpy as np
import pandas as pd
import os
import warnings
warnings.filterwarnings('ignore')

# Statistical Analysis
import statsmodels.formula.api as stats
from statsmodels.formula.api import ols
from scipy import stats as scipy_stats
from scipy.stats import pearsonr, spearmanr
import sklearn
from sklearn import linear_model, datasets
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

# Visualization Libraries
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.express as px
import plotly.figure_factory as ff
import seaborn as sns
import matplotlib.pyplot as plt

# Date and Time
from datetime import datetime
import json

# Make sure every folder that later cells write into exists up front.
output_dirs = ['outputs'] + [f'outputs/{sub}' for sub in ('plots', 'data', 'models')]
for dir_name in output_dirs:
    # exist_ok=True keeps re-running this cell harmless.
    os.makedirs(dir_name, exist_ok=True)
    print(f"Created directory: {dir_name}")

# Initialize Plotly so figures render inline in the notebook
init_notebook_mode(connected=True)

print(" All libraries imported successfully!")
print(" Output directories created!")
# Timestamp the run so saved outputs can be matched to a session.
print(f"Analysis started at: {datetime.now():%Y-%m-%d %H:%M:%S}")

Created directory: outputs
Created directory: outputs/plots
Created directory: outputs/data
Created directory: outputs/models


 All libraries imported successfully!
 Output directories created!
Analysis started at: 2025-07-18 21:33:50


In [16]:
# Advanced Data Loading and Validation
def load_and_validate_data(file_path):
    """Load a CSV file into a DataFrame, trying several text encodings.

    The file is read with ``pd.read_csv`` using each candidate encoding in
    turn until one decodes without error. Rows and columns that are entirely
    null are then dropped, and a short summary is printed.

    Args:
        file_path: Path (or anything ``pd.read_csv`` accepts) of the CSV file.

    Returns:
        The loaded DataFrame, or ``None`` if the file could not be read for
        any reason (the error is printed rather than raised, so callers must
        check for ``None``).
    """
    try:
        # Order matters: latin-1 maps every possible byte to a character and
        # therefore never raises UnicodeDecodeError, so it must be the last
        # resort. cp1252 is tried first because it rejects a handful of
        # undefined bytes and decodes the common Windows punctuation/currency
        # characters correctly. ('iso-8859-1' is an alias of latin-1.)
        encodings = ('utf-8', 'cp1252', 'latin-1')
        df = None

        for encoding in encodings:
            try:
                df = pd.read_csv(file_path, encoding=encoding)
            except UnicodeDecodeError:
                continue
            print(f" Data loaded successfully with {encoding} encoding")
            break

        if df is None:
            raise ValueError("Could not load data with any encoding")

        # Basic validation
        print(f"Dataset shape: {df.shape}")
        print(f"Data types: {df.dtypes.value_counts().to_dict()}")

        # Count fully-empty rows/columns on the frame as loaded, before
        # anything is dropped, so both reported numbers refer to the raw file.
        empty_rows = df.isnull().all(axis=1).sum()
        empty_cols = df.isnull().all(axis=0).sum()

        if empty_rows > 0:
            print(f"Found {empty_rows} completely empty rows - removing them")
            df = df.dropna(how='all')

        if empty_cols > 0:
            print(f"Found {empty_cols} completely empty columns - removing them")
            df = df.dropna(axis=1, how='all')

        return df

    except Exception as e:
        # Deliberate best-effort boundary for notebook use: report the
        # problem and signal failure with None instead of aborting the cell.
        print(f"Error loading data: {str(e)}")
        return None

# Read the raw happiness dataset; the loader returns None on any failure.
happiness_2024 = load_and_validate_data("happiness.csv")

if happiness_2024 is None:
    print("Failed to load data. Please check the file path and format.")
else:
    # Quick look at the first rows and the available columns.
    print("\nDataset Overview:")
    print(happiness_2024.head())
    print(f"\nColumns: {list(happiness_2024.columns)}")

 Data loaded successfully with utf-8 encoding
Dataset shape: (2363, 11)
Data types: {dtype('float64'): 9, dtype('O'): 1, dtype('int64'): 1}

Dataset Overview:
      Country   Year  Life Ladder  Log GDP per capita  Social Support  \
0  Afghanistan  2008     3.723590            7.350416        0.450662   
1  Afghanistan  2009     4.401778            7.508646        0.552308   
2  Afghanistan  2010     4.758381            7.613900        0.539075   
3  Afghanistan  2011     3.831719            7.581259        0.521104   
4  Afghanistan  2012     3.782938            7.660506        0.520637   

   Healthy Life Expectancy   Freedom   Generosity  Perceptions   \
0                 50.500000  0.718114    0.164055      0.881686   
1                 50.799999  0.678896    0.187297      0.850035   
2                 51.099998  0.600127    0.117861      0.706766   
3                 51.400002  0.495901    0.160098      0.731109   
4                 51.700001  0.530935    0.234157      0.775620   


In [17]:
# Comprehensive Data Exploration
def explore_dataset(df):
    """Print a five-part overview of ``df`` and return its column groups.

    The report covers shape/memory/duplicates, dtypes, missing values,
    summary statistics of numeric columns, and cardinality plus most
    frequent values of object columns.

    Returns:
        A ``(numerical_cols, categorical_cols)`` pair of pandas Index
        objects: the numeric columns and the object-dtype columns.
    """
    banner = "=" * 60
    rule = "-" * 40

    print(banner)
    print("COMPREHENSIVE DATA EXPLORATION")
    print(banner)

    # --- 1: size of the frame ---
    print("\n1. BASIC DATASET INFORMATION")
    print(rule)
    print(f"Shape: {df.shape}")
    total_bytes = df.memory_usage(deep=True).sum()
    print(f"Memory Usage: {total_bytes / 1024**2:.2f} MB")
    print(f"Number of duplicate rows: {df.duplicated().sum()}")

    # --- 2: dtypes ---
    print("\n2. DATA TYPES")
    print(rule)
    print(df.dtypes)

    # --- 3: missing values, worst columns first, only those with gaps ---
    print("\n3. MISSING VALUES ANALYSIS")
    print(rule)
    null_counts = df.isnull().sum()
    missing_df = pd.DataFrame({
        'Missing Count': null_counts,
        'Percentage': (null_counts / len(df)) * 100,
    })
    missing_df = missing_df.sort_values('Missing Count', ascending=False)
    has_gaps = missing_df['Missing Count'] > 0
    print(missing_df[has_gaps])

    # --- 4: numeric summary ---
    print("\n4. NUMERICAL COLUMNS STATISTICS")
    print(rule)
    numerical_cols = df.select_dtypes(include=[np.number]).columns
    print(f"Numerical columns: {list(numerical_cols)}")
    print(df[numerical_cols].describe())

    # --- 5: object columns ---
    print("\n5. CATEGORICAL COLUMNS ANALYSIS")
    print(rule)
    categorical_cols = df.select_dtypes(include=['object']).columns
    print(f"Categorical columns: {list(categorical_cols)}")
    for name in categorical_cols:
        series = df[name]
        print(f"\n{name}:")
        print(f"  Unique values: {series.nunique()}")
        print(f"  Top 5 values: {series.value_counts().head().to_dict()}")

    return numerical_cols, categorical_cols

# Run the exploration report and persist a machine-readable summary of it.
if happiness_2024 is not None:
    numerical_cols, categorical_cols = explore_dataset(happiness_2024)

    # Structural snapshot of the raw dataset (column groups, gaps, size).
    exploration_results = dict(
        timestamp=datetime.now().isoformat(),
        shape=happiness_2024.shape,
        columns=list(happiness_2024.columns),
        numerical_columns=list(numerical_cols),
        categorical_columns=list(categorical_cols),
        missing_values=happiness_2024.isnull().sum().to_dict(),
    )

    with open('outputs/data/exploration_results.json', 'w') as f:
        json.dump(exploration_results, f, indent=2)

    print("\nExploration results saved to outputs/data/exploration_results.json")

COMPREHENSIVE DATA EXPLORATION

1. BASIC DATASET INFORMATION
----------------------------------------
Shape: (2363, 11)
Memory Usage: 0.31 MB
Number of duplicate rows: 0

2. DATA TYPES
----------------------------------------
Country                      object
Year                          int64
Life Ladder                 float64
Log GDP per capita          float64
Social Support              float64
Healthy Life Expectancy     float64
Freedom                     float64
Generosity                  float64
Perceptions                 float64
Positive Effect             float64
Negative Effect             float64
dtype: object

3. MISSING VALUES ANALYSIS
----------------------------------------
                          Missing Count  Percentage
Perceptions                         125    5.289886
Generosity                           81    3.427846
Healthy Life Expectancy              63    2.666102
Freedom                              36    1.523487
Log GDP per capita                 

In [18]:
# Advanced Data Cleaning and Preprocessing
def clean_and_preprocess_data(df):
    """Return a cleaned copy of ``df`` and a summary of the capped outliers.

    The input frame is never modified. Steps, in order:

    1. Normalise column names (strip, spaces -> underscores, lower-case).
    2. Impute missing values. Numeric columns are filled with the median
       when ``abs(skew) > 1`` and with the mean otherwise; numeric columns
       that are more than 50% missing are only reported, not filled.
       Object columns are filled with their mode, or ``'Unknown'`` when no
       mode exists.
    3. Drop exact duplicate rows.
    4. Cap numeric values that fall outside the 1.5 * IQR fences.
    5. Downcast numeric columns to the smallest float/integer type.
    6. Add ``data_quality_score``: the percentage of non-null cells per row.

    Args:
        df: Raw DataFrame.

    Returns:
        ``(df_clean, outlier_summary)`` where ``outlier_summary`` maps each
        capped column to its ``count``, ``percentage`` and ``bounds``.
    """
    print("ADVANCED DATA CLEANING")
    print("=" * 50)

    # Create a copy to avoid modifying original
    df_clean = df.copy()

    # 1. Clean column names
    print("\n1. Cleaning column names...")
    df_clean.columns = df_clean.columns.str.strip().str.replace(' ', '_').str.lower()
    print(f"   Cleaned columns: {list(df_clean.columns)}")

    # 2. Handle missing values
    print("\n2. Handling missing values...")

    # Filled values are assigned back to the frame. Calling
    # ``df_clean[col].fillna(..., inplace=True)`` operates on a temporary
    # Series and leaves df_clean untouched under pandas Copy-on-Write.
    numerical_cols = df_clean.select_dtypes(include=[np.number]).columns
    for col in numerical_cols:
        missing_count = df_clean[col].isnull().sum()
        if missing_count == 0:
            continue
        if missing_count / len(df_clean) > 0.5:  # More than 50% missing
            print(f"     {col}: {missing_count} missing values (>50%) - considering removal")
        elif abs(df_clean[col].skew()) > 1:
            # The median is robust to the long tail of a skewed column.
            df_clean[col] = df_clean[col].fillna(df_clean[col].median())
            print(f"    {col}: Filled {missing_count} missing values with median")
        else:
            df_clean[col] = df_clean[col].fillna(df_clean[col].mean())
            print(f"    {col}: Filled {missing_count} missing values with mean")

    categorical_cols = df_clean.select_dtypes(include=['object']).columns
    for col in categorical_cols:
        missing_count = df_clean[col].isnull().sum()
        if missing_count == 0:
            continue
        modes = df_clean[col].mode()
        # An all-null column has no mode; fall back to a placeholder label.
        fill_value = 'Unknown' if modes.empty else modes[0]
        df_clean[col] = df_clean[col].fillna(fill_value)
        print(f"    {col}: Filled {missing_count} missing values with mode/Unknown")

    # 3. Remove duplicates
    initial_rows = len(df_clean)
    df_clean = df_clean.drop_duplicates()
    removed_duplicates = initial_rows - len(df_clean)
    if removed_duplicates > 0:
        print(f"\n3. Removed {removed_duplicates} duplicate rows")

    # 4. Detect and handle outliers using IQR method
    print("\n4. Detecting and handling outliers...")
    outlier_summary = {}

    for col in numerical_cols:
        q1 = df_clean[col].quantile(0.25)
        q3 = df_clean[col].quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr

        is_outlier = (df_clean[col] < lower_bound) | (df_clean[col] > upper_bound)
        outlier_count = int(is_outlier.sum())
        if outlier_count == 0:
            continue

        outlier_summary[col] = {
            'count': outlier_count,
            'percentage': (outlier_count / len(df_clean)) * 100,
            'bounds': (lower_bound, upper_bound)
        }

        # Cap outliers instead of removing them so no rows are lost.
        df_clean[col] = df_clean[col].clip(lower=lower_bound, upper=upper_bound)
        print(f"     {col}: Capped {outlier_count} outliers ({outlier_count/len(df_clean)*100:.1f}%)")

    # 5. Data type optimization
    print("\n5. Optimizing data types...")
    for col in numerical_cols:
        if df_clean[col].dtype == 'float64':
            df_clean[col] = pd.to_numeric(df_clean[col], downcast='float')
        elif df_clean[col].dtype == 'int64':
            df_clean[col] = pd.to_numeric(df_clean[col], downcast='integer')

    # 6. Create derived features
    print("\n6. Creating derived features...")

    # Share of non-null cells per row, as a percentage of the column count
    # before this score column is added.
    df_clean['data_quality_score'] = (df_clean.notnull().sum(axis=1) / len(df_clean.columns)) * 100

    print(f"\n Data cleaning completed!")
    print(f"   Original shape: {df.shape}")
    print(f"   Cleaned shape: {df_clean.shape}")
    print(f"   Rows removed: {df.shape[0] - df_clean.shape[0]}")

    return df_clean, outlier_summary

# Clean the loaded dataset and persist both the result and the outlier log.
if happiness_2024 is None:
    print(" No data to clean")
else:
    happiness_clean, outlier_info = clean_and_preprocess_data(happiness_2024)

    # Cleaned table, written without the index column.
    happiness_clean.to_csv('outputs/data/happiness_cleaned.csv', index=False)
    print("\n Cleaned data saved to outputs/data/happiness_cleaned.csv")

    # default=str stringifies any value json cannot encode natively.
    with open('outputs/data/outlier_summary.json', 'w') as f:
        json.dump(outlier_info, f, indent=2, default=str)

    print(" Outlier summary saved to outputs/data/outlier_summary.json")

ADVANCED DATA CLEANING

1. Cleaning column names...
   Cleaned columns: ['country', 'year', 'life_ladder', 'log_gdp_per_capita', 'social_support', 'healthy_life_expectancy', 'freedom', 'generosity', 'perceptions', 'positive_effect', 'negative_effect']

2. Handling missing values...
    log_gdp_per_capita: Filled 28 missing values with mean
    social_support: Filled 13 missing values with median
    healthy_life_expectancy: Filled 63 missing values with median
    freedom: Filled 36 missing values with mean
    generosity: Filled 81 missing values with mean
    perceptions: Filled 125 missing values with median
    positive_effect: Filled 24 missing values with mean
    negative_effect: Filled 16 missing values with mean

4. Detecting and handling outliers...
     life_ladder: Capped 2 outliers (0.1%)
     log_gdp_per_capita: Capped 1 outliers (0.0%)
     social_support: Capped 50 outliers (2.1%)
     healthy_life_expectancy: Capped 27 outliers (1.1%)
     freedom: Capped 16 outliers (

In [None]:
# Advanced Visualization Functions
def create_correlation_heatmap(df, save_path=None):
    """Plot the pairwise correlation matrix of the numeric columns of ``df``.

    Each cell is annotated with its coefficient rounded to two decimals.
    The figure is displayed, written to ``save_path`` as HTML when a path is
    given, and returned.
    """
    print(" Creating correlation heatmap...")

    # Correlations are only defined for the numeric columns.
    corr = df.select_dtypes(include=[np.number]).corr()
    labels = corr.columns

    heatmap = go.Heatmap(
        z=corr.values,
        x=labels,
        y=labels,
        colorscale='RdBu',
        zmid=0,  # centre the diverging colour scale on zero correlation
        text=np.round(corr.values, 2),
        texttemplate="%{text}",
        textfont={"size": 10},
        colorbar=dict(title="Correlation Coefficient"),
    )
    fig = go.Figure(data=heatmap)

    layout = dict(
        title="Correlation Matrix Heatmap",
        xaxis_title="Features",
        yaxis_title="Features",
        width=800,
        height=600,
    )
    fig.update_layout(**layout)

    fig.show()

    if save_path:
        fig.write_html(save_path)
        print(f" Correlation heatmap saved to {save_path}")

    return fig

def create_distribution_plots(df, target_col, save_path=None):
    """Plot a histogram of ``df[target_col]`` with a normal curve on top.

    The curve uses the column's own mean and standard deviation and is
    scaled to counts assuming 30 equal-width bins across the data range.
    The figure is displayed, optionally saved as HTML, and returned.
    """
    print(f" Creating distribution plots for {target_col}...")

    values = df[target_col]

    histogram = go.Histogram(
        x=values,
        nbinsx=30,
        name='Distribution',
        opacity=0.7,
        marker_color='skyblue',
    )

    # Normal density evaluated on a 100-point grid over the observed range,
    # then scaled from density to expected counts per bin.
    mean_val = values.mean()
    std_val = values.std()
    x_range = np.linspace(values.min(), values.max(), 100)
    density = scipy_stats.norm.pdf(x_range, mean_val, std_val)
    y_normal = density * len(df) * (values.max() - values.min()) / 30

    normal_curve = go.Scatter(
        x=x_range,
        y=y_normal,
        mode='lines',
        name='Normal Distribution',
        line=dict(color='red', width=2),
    )

    fig = go.Figure()
    for trace in (histogram, normal_curve):
        fig.add_trace(trace)

    fig.update_layout(
        title=f'Distribution of {target_col}',
        xaxis_title=target_col,
        yaxis_title='Frequency',
        showlegend=True,
        width=800,
        height=500,
    )

    fig.show()

    if save_path:
        fig.write_html(save_path)
        print(f" Distribution plot saved to {save_path}")

    return fig

def create_top_countries_chart(df, metric_col, n_countries=15, save_path=None):
    """Create an interactive bar chart of the top countries by ``metric_col``.

    ``df`` may hold several rows per country (e.g. one per year). Each
    country is represented by its single highest ``metric_col`` row, so a
    country appears at most once. When ``df`` already has one row per
    country the selection equals ``df.nlargest(n_countries, metric_col)``.

    Args:
        df: DataFrame with a ``'country'`` column and ``metric_col``.
        metric_col: Name of the numeric column to rank by.
        n_countries: Maximum number of countries to show.
        save_path: Optional path; when given the figure is written as HTML.

    Returns:
        The Plotly figure.
    """
    print(f" Creating top countries chart for {metric_col}...")

    # Rank rows by the metric, then keep only each country's best row.
    # Without the de-duplication, repeated countries share an x position
    # and px.bar stacks them into one misleadingly tall bar.
    top_countries = (
        df.dropna(subset=[metric_col])
        .sort_values(metric_col, ascending=False, kind='stable')
        .drop_duplicates(subset='country', keep='first')
        .head(n_countries)
    )

    fig = px.bar(
        top_countries,
        x='country',
        y=metric_col,
        title=f'Top {n_countries} Countries by {metric_col}',
        color=metric_col,
        color_continuous_scale='Viridis',
        text=metric_col
    )

    fig.update_traces(texttemplate='%{text:.2f}', textposition='outside')
    fig.update_layout(
        xaxis_title='Country',
        yaxis_title=metric_col,
        xaxis_tickangle=-45,
        width=1000,
        height=600
    )

    fig.show()

    if save_path:
        fig.write_html(save_path)
        print(f" Top countries chart saved to {save_path}")

    return fig

def create_world_map(df, metric_col, save_path=None):
    """Draw a choropleth world map coloured by ``metric_col``.

    Countries are matched by the names in the ``'country'`` column. The
    figure is displayed, optionally saved as HTML, and returned.
    """
    print(f" Creating world map for {metric_col}...")

    choropleth_options = dict(
        locations='country',
        locationmode='country names',
        color=metric_col,
        hover_name='country',
        hover_data={metric_col: ':.2f'},  # two decimals in the tooltip
        color_continuous_scale='Viridis',
        title=f'World Map: {metric_col}',
    )
    fig = px.choropleth(df, **choropleth_options)

    geo_settings = {
        'showframe': False,
        'showcoastlines': True,
        'projection_type': 'natural earth',
    }
    fig.update_layout(geo=geo_settings, width=1000, height=600)

    fig.show()

    if save_path:
        fig.write_html(save_path)
        print(f" World map saved to {save_path}")

    return fig

def create_scatter_matrix(df, columns, save_path=None):
    """Draw pairwise scatter plots for the given ``columns`` of ``df``.

    The figure is displayed, optionally saved as HTML, and returned.
    """
    print(" Creating scatter matrix...")

    matrix_options = {
        'dimensions': columns,
        'title': "Scatter Matrix of Key Variables",
        'width': 1000,
        'height': 800,
    }
    fig = px.scatter_matrix(df, **matrix_options)

    fig.show()

    if save_path:
        fig.write_html(save_path)
        print(f" Scatter matrix saved to {save_path}")

    return fig

# Build and save every figure, provided the cleaning cell has been run.
if 'happiness_clean' in locals() and happiness_clean is not None:
    print(" CREATING ADVANCED VISUALIZATIONS")
    print("=" * 50)

    # The happiness metric is the first column whose name mentions one of
    # these keywords.
    # NOTE(review): 'life' also matches 'healthy_life_expectancy'; this works
    # only because 'life_ladder' comes earlier in the column order - confirm.
    happiness_col = next(
        (
            name for name in happiness_clean.columns
            if any(keyword in name.lower() for keyword in ('life', 'ladder', 'happiness'))
        ),
        None,
    )

    if happiness_col:
        print(f" Main happiness metric: {happiness_col}")

        # Maps a short label to the figure object returned by each helper.
        viz_files = {}

        # 1. Correlation heatmap
        viz_files['correlation'] = create_correlation_heatmap(
            happiness_clean,
            save_path='outputs/plots/correlation_heatmap.html',
        )

        # 2. Distribution of the happiness metric
        viz_files['distribution'] = create_distribution_plots(
            happiness_clean,
            happiness_col,
            save_path='outputs/plots/happiness_distribution.html',
        )

        # 3. Top countries chart
        viz_files['top_countries'] = create_top_countries_chart(
            happiness_clean,
            happiness_col,
            save_path='outputs/plots/top_countries.html',
        )

        # 4. World map
        viz_files['world_map'] = create_world_map(
            happiness_clean,
            happiness_col,
            save_path='outputs/plots/world_happiness_map.html',
        )

        # 5. Scatter matrix over the first five numeric columns, leaving out
        #    the derived quality score.
        numeric_names = happiness_clean.select_dtypes(include=[np.number]).columns
        key_vars = [name for name in numeric_names if name != 'data_quality_score'][:5]

        viz_files['scatter_matrix'] = create_scatter_matrix(
            happiness_clean,
            key_vars,
            save_path='outputs/plots/scatter_matrix.html',
        )

        print("\n All visualizations created and saved!")

    else:
        print(" Could not identify main happiness column")
else:
    print(" No cleaned data available for visualization")

 CREATING ADVANCED VISUALIZATIONS
 Main happiness metric: life_ladder
📊 Creating correlation heatmap...


 Correlation heatmap saved to outputs/plots/correlation_heatmap.html
 Creating distribution plots for life_ladder...


 Distribution plot saved to outputs/plots/happiness_distribution.html
 Creating top countries chart for life_ladder...


 Top countries chart saved to outputs/plots/top_countries.html
 Creating world map for life_ladder...


 World map saved to outputs/plots/world_happiness_map.html
 Creating scatter matrix...


 Scatter matrix saved to outputs/plots/scatter_matrix.html

 All visualizations created and saved!


In [20]:
# Advanced Statistical Analysis
def perform_statistical_analysis(df, target_col):
    """Run descriptive, correlation, regression and normality analyses.

    Args:
        df: DataFrame whose numeric columns are analysed.
        target_col: Name of the numeric column treated as the target.

    Returns:
        A dict with keys ``'descriptive_stats'`` and ``'correlations'``, plus
        ``'regression'`` (linear-regression and random-forest R²/MSE on a
        20% hold-out split, and random-forest feature importances) when at
        least one feature column is available. Normality test results are
        printed only.
    """
    print(" ADVANCED STATISTICAL ANALYSIS")
    print("=" * 50)

    results = {}

    # 1. Descriptive Statistics
    print("\n1. DESCRIPTIVE STATISTICS")
    print("-" * 30)
    numerical_cols = df.select_dtypes(include=[np.number]).columns
    desc_stats = df[numerical_cols].describe()
    print(desc_stats)
    results['descriptive_stats'] = desc_stats.to_dict()

    # 2. Correlation Analysis
    print("\n2. CORRELATION ANALYSIS")
    print("-" * 30)
    correlation_matrix = df[numerical_cols].corr()

    # Correlation of every numeric column with the target, strongest first.
    target_correlations = correlation_matrix[target_col].sort_values(ascending=False)
    print(f"Top correlations with {target_col}:")
    for var, corr in target_correlations.head(10).items():
        if var != target_col:  # the target's self-correlation is not informative
            print(f"  {var}: {corr:.3f}")

    results['correlations'] = target_correlations.to_dict()

    # 3. Regression Analysis
    print("\n3. REGRESSION ANALYSIS")
    print("-" * 30)

    # Features: every numeric column except the target and the derived
    # quality score.
    feature_cols = [col for col in numerical_cols
                    if col != target_col and 'quality' not in col.lower()]

    if feature_cols:
        X = df[feature_cols].fillna(df[feature_cols].mean())
        y = df[target_col].fillna(df[target_col].mean())

        # Fixed seed so the split, and therefore the scores, are repeatable.
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        # Linear Regression
        lr_model = linear_model.LinearRegression()
        lr_model.fit(X_train, y_train)
        y_pred_lr = lr_model.predict(X_test)

        lr_r2 = r2_score(y_test, y_pred_lr)
        lr_mse = mean_squared_error(y_test, y_pred_lr)

        print(f"Linear Regression - R²: {lr_r2:.3f}, MSE: {lr_mse:.3f}")

        # Random Forest
        rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
        rf_model.fit(X_train, y_train)
        y_pred_rf = rf_model.predict(X_test)

        rf_r2 = r2_score(y_test, y_pred_rf)
        rf_mse = mean_squared_error(y_test, y_pred_rf)

        print(f"Random Forest - R²: {rf_r2:.3f}, MSE: {rf_mse:.3f}")

        # Feature importance
        feature_importance = pd.DataFrame({
            'feature': feature_cols,
            'importance': rf_model.feature_importances_
        }).sort_values('importance', ascending=False)

        print("\nTop 5 Most Important Features:")
        print(feature_importance.head())

        results['regression'] = {
            'linear_regression': {'r2': lr_r2, 'mse': lr_mse},
            'random_forest': {'r2': rf_r2, 'mse': rf_mse},
            'feature_importance': feature_importance.to_dict('records')
        }

    # 4. Normality Tests
    print("\n4. NORMALITY TESTS")
    print("-" * 30)

    for col in numerical_cols[:5]:  # Test first 5 numerical columns
        observed = df[col].dropna()
        if len(observed) < 3:
            # Shapiro-Wilk needs at least three observations.
            print(f"{col}: Skipped (fewer than 3 non-null values)")
            continue
        # Cap the sample at 5000 values; the draw is seeded like the rest of
        # this function so repeated runs report the same p-values.
        sample = observed.sample(min(5000, len(observed)), random_state=42)
        _, p_value = scipy_stats.shapiro(sample)
        is_normal = p_value > 0.05
        print(f"{col}: {'Normal' if is_normal else 'Not Normal'} (p-value: {p_value:.4f})")

    return results

# Perform statistical analysis.
# happiness_col is set by the visualization cell; checking that it exists
# avoids a NameError when that cell was skipped or cells ran out of order.
if (
    'happiness_clean' in locals() and happiness_clean is not None
    and 'happiness_col' in locals() and happiness_col
):
    stats_results = perform_statistical_analysis(happiness_clean, happiness_col)

    # Save results; default=str stringifies values json cannot encode.
    with open('outputs/data/statistical_analysis.json', 'w') as f:
        json.dump(stats_results, f, indent=2, default=str)

    print("\n Statistical analysis completed and saved!")
else:
    print(" No data available for statistical analysis")

 ADVANCED STATISTICAL ANALYSIS

1. DESCRIPTIVE STATISTICS
------------------------------
              year  life_ladder  log_gdp_per_capita  social_support  \
count  2363.000000  2363.000000         2363.000000     2363.000000   
mean   2014.763860     5.484212            9.399757        0.811017   
std       5.059436     1.123360            1.144933        0.116118   
min    2005.000000     2.131486            5.727654        0.504812   
25%    2011.000000     4.646750            8.520145        0.744106   
50%    2015.000000     5.448725            9.491772        0.834395   
75%    2019.000000     6.323592           10.381806        0.903636   
max    2023.000000     8.018934           11.675588        0.987343   

       healthy_life_expectancy      freedom   generosity  perceptions  \
count              2363.000000  2363.000000  2363.000000  2363.000000   
mean                 63.516346     0.750671    -0.002081     0.760446   
std                   6.474825     0.137121     0.15

In [None]:
# Time Series Analysis
def perform_time_series_analysis(df, target_col, time_col='year'):
    """Analyse how ``target_col`` evolves over time, overall and per country.

    The yearly mean is plotted with a +/- one standard deviation band and
    written to ``outputs/plots/happiness_time_series.html``. A linear trend
    is fitted to the yearly means. When a ``'country'`` column exists, a
    trend is also fitted per country (three or more observations required)
    and the five most improving / most declining countries are printed.

    Args:
        df: DataFrame containing ``target_col`` and a time column.
        target_col: Name of the numeric column to analyse.
        time_col: Name of the time column. If ``df`` has no such column, the
            first column whose name contains ``'year'`` is used instead.

    Returns:
        ``None`` when no time column can be found; otherwise a dict with
        ``'yearly_data'`` (list of per-year records) and ``'trend'`` (the
        slope, R², p-value and direction of the *global* yearly trend).
    """
    from scipy import stats as sp_stats

    print(" TIME SERIES ANALYSIS")
    print("=" * 50)

    # Honour the requested column; only fall back to name-based detection
    # when it is absent.
    if time_col not in df.columns:
        year_columns = [col for col in df.columns if 'year' in col.lower()]
        if not year_columns:
            print(" No year column found for time series analysis")
            return None
        time_col = year_columns[0]

    print(f"Using time column: {time_col}")

    # Aggregate by year
    yearly_data = df.groupby(time_col)[target_col].agg(['mean', 'std', 'count']).reset_index()
    yearly_data.columns = [time_col, 'mean_happiness', 'std_happiness', 'country_count']

    # Create time series plot
    fig = go.Figure()

    # Mean line
    fig.add_trace(go.Scatter(
        x=yearly_data[time_col],
        y=yearly_data['mean_happiness'],
        mode='lines+markers',
        name='Mean Happiness',
        line=dict(color='blue', width=3),
        marker=dict(size=8)
    ))

    # Variability band: mean +/- one standard deviation across rows of that
    # year (a spread band, not a confidence interval). The upper edge is
    # drawn first so the lower edge can fill up to it with 'tonexty'.
    fig.add_trace(go.Scatter(
        x=yearly_data[time_col],
        y=yearly_data['mean_happiness'] + yearly_data['std_happiness'],
        mode='lines',
        name='Upper Bound',
        line=dict(color='lightblue', width=0),
        showlegend=False
    ))

    fig.add_trace(go.Scatter(
        x=yearly_data[time_col],
        y=yearly_data['mean_happiness'] - yearly_data['std_happiness'],
        mode='lines',
        name='Lower Bound',
        line=dict(color='lightblue', width=0),
        fill='tonexty',
        fillcolor='rgba(173, 216, 230, 0.3)',
        showlegend=False
    ))

    fig.update_layout(
        title='Global Happiness Trends Over Time',
        xaxis_title='Year',
        yaxis_title='Average Happiness Score',
        width=1000,
        height=600,
        hovermode='x unified'
    )

    fig.show()
    fig.write_html('outputs/plots/happiness_time_series.html')

    # Global trend: least-squares line through the yearly means. These
    # names are deliberately distinct from the per-country variables below
    # so the returned trend can never be overwritten by a country's slope.
    global_fit = sp_stats.linregress(yearly_data[time_col], yearly_data['mean_happiness'])
    global_slope, r_value, p_value = global_fit[0], global_fit[2], global_fit[3]

    trend_direction = "increasing" if global_slope > 0 else "decreasing"
    print(f"\n Trend Analysis:")
    print(f"   Slope: {global_slope:.4f} points per year")
    print(f"   Direction: {trend_direction}")
    print(f"   R-squared: {r_value**2:.3f}")
    print(f"   P-value: {p_value:.4f}")

    # Top and bottom countries over time
    if 'country' in df.columns:
        country_trends = df.groupby(['country', time_col])[target_col].mean().reset_index()

        # Calculate country-wise trends
        country_slopes = {}
        for country, country_data in country_trends.groupby('country', sort=False):
            if len(country_data) > 2:  # Need at least 3 points for trend
                country_fit = sp_stats.linregress(country_data[time_col], country_data[target_col])
                country_slopes[country] = country_fit[0]

        # Top improving and declining countries
        if country_slopes:
            sorted_slopes = sorted(country_slopes.items(), key=lambda item: item[1], reverse=True)

            print(f"\n Top 5 Most Improving Countries:")
            for country, country_slope in sorted_slopes[:5]:
                # '+' format flag prints the sign for either direction.
                print(f"   {country}: {country_slope:+.4f} points/year")

            print(f"\n Top 5 Most Declining Countries:")
            # Steepest decline first, mirroring the ranking above.
            for country, country_slope in reversed(sorted_slopes[-5:]):
                print(f"   {country}: {country_slope:.4f} points/year")

    return {
        'yearly_data': yearly_data.to_dict('records'),
        'trend': {
            'slope': global_slope,
            'r_squared': r_value**2,
            'p_value': p_value,
            'direction': trend_direction
        }
    }

# Perform time series analysis.
# happiness_col is set by the visualization cell; checking that it exists
# avoids a NameError when that cell was skipped or cells ran out of order.
if (
    'happiness_clean' in locals() and happiness_clean is not None
    and 'happiness_col' in locals() and happiness_col
):
    time_series_results = perform_time_series_analysis(happiness_clean, happiness_col)

    if time_series_results:
        # Save results; default=str stringifies values json cannot encode.
        with open('outputs/data/time_series_analysis.json', 'w') as f:
            json.dump(time_series_results, f, indent=2, default=str)

        print("\n Time series analysis completed and saved!")
    else:
        print(" Time series analysis could not be performed")
else:
    print(" No data available for time series analysis")

 TIME SERIES ANALYSIS
Using time column: year



 Trend Analysis:
   Slope: -0.0017 points per year
   Direction: decreasing
   R-squared: 0.001
   P-value: 0.8806

 Top 5 Most Improving Countries:
   Guinea: +0.1431 points/year
   Serbia: +0.1374 points/year
   Bulgaria: +0.1314 points/year
   Ivory Coast: +0.1165 points/year
   Congo (Brazzaville): +0.1132 points/year

 Top 5 Most Declining Countries:
   Bhutan: -0.2435 points/year
   Syria: -0.3449 points/year
   South Sudan: -0.4229 points/year
   Somalia: -0.4302 points/year
   Angola: -0.5806 points/year

 Time series analysis completed and saved!


In [24]:
# Generate Comprehensive Report
def generate_analysis_report(df, target_col, stats_results, time_series_results=None):
    """Build the final analysis report, save it as JSON and HTML, and return it.

    Args:
        df: DataFrame holding the data to summarize. ``target_col`` must be
            one of its numeric columns; the ``country`` and ``year`` columns
            are used when present.
        target_col: Name of the happiness score column.
        stats_results: Earlier statistical results, stored unchanged under
            ``'statistical_summary'``.
        time_series_results: Optional dict. When truthy, its
            ``['trend']['slope']`` decides whether a recommendation about a
            declining trend is appended.

    Returns:
        dict: The report, with the keys ``metadata``, ``key_findings``,
        ``data_quality``, ``statistical_summary``, ``time_series_analysis``
        and ``recommendations``.

    Side effects:
        Writes ``outputs/data/comprehensive_report.json`` and
        ``outputs/analysis_report.html`` (creating the directories if they
        are missing) and prints progress messages.
    """
    import html  # local import: the notebook's import cell does not provide it

    print(" GENERATING COMPREHENSIVE REPORT")
    print("=" * 50)

    # One timestamp so the JSON and HTML reports agree.
    generated_at = datetime.now()
    has_country = 'country' in df.columns
    target = df[target_col]

    # Cast NumPy scalars to builtins; otherwise json.dump falls back to
    # default=str and writes these counts as strings.
    missing_values = int(df.isnull().sum().sum())
    duplicate_rows = int(df.duplicated().sum())
    if df.size:
        completeness = (df.size - missing_values) / df.size * 100
    else:
        completeness = float('nan')  # nothing to measure on an empty frame

    score_stats = {
        'mean': float(target.mean()),
        'median': float(target.median()),
        'std': float(target.std()),
        'min': float(target.min()),
        'max': float(target.max())
    }

    report = {
        'metadata': {
            'analysis_date': generated_at.isoformat(),
            'dataset_shape': df.shape,
            'target_variable': target_col,
            'total_countries': df['country'].nunique() if has_country else 'N/A',
            'time_range': f"{df['year'].min()} - {df['year'].max()}" if 'year' in df.columns else 'N/A'
        },
        'key_findings': {},
        'data_quality': {
            'missing_values': missing_values,
            'duplicate_rows': duplicate_rows,
            'data_completeness': completeness
        },
        'statistical_summary': stats_results,
        'time_series_analysis': time_series_results,
        'recommendations': []
    }

    # Rank countries by their mean score. Ranking raw rows would list the
    # same country once per year it appears in the data.
    top_countries = []
    if has_country:
        country_means = df.groupby('country')[target_col].mean()
        top_countries = country_means.nlargest(10).index.tolist()
        bottom_countries = country_means.nsmallest(10).index.tolist()

        report['key_findings']['top_countries'] = top_countries
        report['key_findings']['bottom_countries'] = bottom_countries

    # Correlation of every other numeric column with the target. The target
    # is dropped because its self-correlation of 1.0 is not a finding.
    numerical_cols = df.select_dtypes(include=[np.number]).columns
    correlations = (
        df[numerical_cols].corr()[target_col]
        .drop(target_col)
        .sort_values(ascending=False)
    )
    report['key_findings']['strong_positive_correlations'] = (
        correlations[correlations > 0.7].index.tolist()
    )
    report['key_findings']['strong_negative_correlations'] = (
        correlations[correlations < -0.7].index.tolist()
    )

    report['key_findings']['happiness_statistics'] = score_stats

    recommendations = [
        "Focus on improving variables with strongest positive correlations to happiness",
        "Investigate countries with declining happiness trends for policy insights",
        "Consider regional analysis for more targeted interventions",
        "Monitor data quality and completeness for future analyses"
    ]

    if time_series_results and time_series_results['trend']['slope'] < 0:
        recommendations.append("Address declining global happiness trend with targeted policies")

    report['recommendations'] = recommendations

    os.makedirs('outputs/data', exist_ok=True)

    with open('outputs/data/comprehensive_report.json', 'w', encoding='utf-8') as f:
        json.dump(report, f, indent=2, default=str)

    # Country names come from the data file, so escape them before embedding.
    if top_countries:
        country_items = "<br>".join(
            f"{rank}. {html.escape(str(name))}"
            for rank, name in enumerate(top_countries, start=1)
        )
    else:
        country_items = "Not available"
    recommendation_items = "".join(
        f'<div class="recommendation">{rec}</div>' for rec in recommendations
    )

    html_report = f"""
    <!DOCTYPE html>
    <html>
    <head>
        <meta charset="utf-8">
        <title>World Happiness Analysis Report</title>
        <style>
            body {{ font-family: Arial, sans-serif; margin: 40px; }}
            .header {{ background-color: #f0f0f0; padding: 20px; border-radius: 5px; }}
            .section {{ margin: 20px 0; }}
            .key-stat {{ background-color: #e8f4f8; padding: 10px; margin: 10px 0; border-radius: 3px; }}
            .country-list {{ columns: 2; }}
            .recommendation {{ background-color: #f9f9f9; padding: 10px; margin: 5px 0; border-left: 4px solid #007acc; }}
        </style>
    </head>
    <body>
        <div class="header">
            <h1>World Happiness Analysis Report</h1>
            <p><strong>Generated:</strong> {generated_at.strftime('%Y-%m-%d %H:%M:%S')}</p>
            <p><strong>Dataset:</strong> {df.shape[0]} rows, {df.shape[1]} columns</p>
        </div>
        
        <div class="section">
            <h2>Executive Summary</h2>
            <div class="key-stat">
                <strong>Global Average Happiness:</strong> {score_stats['mean']:.2f}
            </div>
            <div class="key-stat">
                <strong>Data Quality:</strong> {completeness:.1f}% complete
            </div>
        </div>
        
        <div class="section">
            <h2>Key Findings</h2>
            <h3>Top 10 Happiest Countries</h3>
            <div class="country-list">
                {country_items}
            </div>
            
            <h3>Statistical Insights</h3>
            <ul>
                <li>Happiness scores range from {score_stats['min']:.2f} to {score_stats['max']:.2f}</li>
                <li>Standard deviation: {score_stats['std']:.2f}</li>
                <li>Median happiness: {score_stats['median']:.2f}</li>
            </ul>
        </div>
        
        <div class="section">
            <h2>Recommendations</h2>
            {recommendation_items}
        </div>
        
        <div class="section">
            <h2>Generated Files</h2>
            <ul>
                <li>correlation_heatmap.html - Interactive correlation matrix</li>
                <li>happiness_distribution.html - Distribution analysis</li>
                <li>top_countries.html - Top performing countries</li>
                <li>world_happiness_map.html - Global happiness map</li>
                <li>happiness_time_series.html - Time series analysis</li>
                <li>comprehensive_report.json - Detailed analysis results</li>
            </ul>
        </div>
    </body>
    </html>
    """

    # Explicit UTF-8 matches the charset declared in the document head.
    with open('outputs/analysis_report.html', 'w', encoding='utf-8') as f:
        f.write(html_report)

    print(" Comprehensive report generated!")
    print(" HTML report saved to outputs/analysis_report.html")
    print(" JSON report saved to outputs/data/comprehensive_report.json")

    return report

# Produce the final report when cleaned data and a target column exist;
# earlier results are passed along only if their cells were executed.
if 'happiness_clean' not in locals() or happiness_clean is None or not happiness_col:
    print(" Cannot generate report - no data available")
else:
    final_report = generate_analysis_report(
        happiness_clean,
        happiness_col,
        locals().get('stats_results', {}),
        locals().get('time_series_results'),
    )

    # Closing banner listing where the artefacts were written.
    print(
        "\n" + "="*60,
        " ANALYSIS COMPLETE!",
        "="*60,
        " All files saved to outputs/ directory:",
        "    plots/ - Interactive HTML visualizations",
        "    data/ - Cleaned data and analysis results",
        "    analysis_report.html - Comprehensive report",
        "="*60,
        sep="\n",
    )

 GENERATING COMPREHENSIVE REPORT
 Comprehensive report generated!
 HTML report saved to outputs/analysis_report.html
 JSON report saved to outputs/data/comprehensive_report.json

 ANALYSIS COMPLETE!
 All files saved to outputs/ directory:
    plots/ - Interactive HTML visualizations
    data/ - Cleaned data and analysis results
    analysis_report.html - Comprehensive report
