# 📊 SPSS Data Analysis: DBA 710 Multiple Stores

## Overview
This notebook provides a comprehensive analysis of the **DBA 710 Multiple Stores.sav** SPSS dataset, including:
- 🔍 **Schema Exploration**: Data structure, variable types, and metadata
- 📈 **Descriptive Statistics**: Central tendency, dispersion, and distribution analysis
- 🔗 **Correlation Analysis**: Relationships between variables and multivariate patterns
- 📋 **Data Quality Assessment**: Missing values, outliers, and data integrity

---

In [None]:
# Essential Libraries for SPSS Analysis
import pandas as pd
import numpy as np
import pyreadstat
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
from scipy import stats
from scipy.stats import pearsonr, spearmanr
import itertools

# Pandas display configuration: show every column, cap printed rows at 100,
# widen the console output and print floats with four decimals.
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.max_rows', 100),
    ('display.width', 1000),
    ('display.float_format', '{:.4f}'.format),
):
    pd.set_option(option_name, option_value)

# Plotting defaults. Note that every warning category is silenced here.
warnings.filterwarnings('ignore')
sns.set_palette("husl")
plt.style.use('default')

# Named colors reused by the Plotly charts later in the notebook.
enterprise_colors = dict(
    primary='#1f77b4',
    secondary='#ff7f0e',
    success='#2ca02c',
    warning='#d62728',
    info='#9467bd',
    accent='#8c564b',
)

# Startup banner with the versions of the core libraries.
for banner_line in (
    "📚 Libraries loaded successfully!",
    f"🐼 Pandas version: {pd.__version__}",
    f"📊 NumPy version: {np.__version__}",
    f"📈 Matplotlib version: {plt.matplotlib.__version__}",
    "✅ Ready for SPSS data analysis!",
):
    print(banner_line)

## 📂 Data Loading & Initial Inspection

Let's load the SPSS file and examine its basic structure and metadata.

In [None]:
# Load the SPSS file together with its metadata.
# Produces two notebook globals used by every later cell:
#   df   - the data as a pandas DataFrame (value labels applied)
#   meta - the pyreadstat metadata container
# Any loading failure is reported and then re-raised so the notebook stops.
import os

print("🔄 Loading SPSS file: DBA 710 Multiple Stores.sav")
print("=" * 50)

sav_path = 'notebooks/DBA 710 Multiple Stores.sav'

try:
    # NOTE(review): pyreadstat appears to report a missing path with its own
    # error type rather than FileNotFoundError - confirm against the installed
    # version. Checking explicitly keeps the handler below reachable either way.
    if not os.path.isfile(sav_path):
        raise FileNotFoundError(sav_path)

    # Read SPSS file with pyreadstat to preserve metadata
    df, meta = pyreadstat.read_sav(
        sav_path,
        apply_value_formats=True,  # Apply SPSS value labels
        formats_as_ordered_category=True  # Preserve ordered categories
    )

    print(f"✅ Successfully loaded SPSS file!")
    print(f"📊 Dataset shape: {df.shape[0]:,} rows × {df.shape[1]} columns")
    print(f"💾 Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

except FileNotFoundError:
    print("❌ File not found. Please ensure 'DBA 710 Multiple Stores.sav' is in the notebooks/ directory")
    sav_dir = os.path.dirname(sav_path)
    # Only list the directory when it exists; otherwise os.listdir would raise
    # a second FileNotFoundError and hide the original problem.
    if os.path.isdir(sav_dir):
        print(f"📁 .sav files found in {sav_dir}/:")
        sav_files = sorted(item for item in os.listdir(sav_dir) if item.endswith('.sav'))
        for item in sav_files:
            print(f"   📄 {item}")
        if not sav_files:
            print("   (none)")
    else:
        print(f"📁 Directory '{sav_dir}/' does not exist (current working directory: {os.getcwd()})")
    raise
except Exception as e:
    print(f"❌ Error loading file: {e}")
    raise

## 🔍 Comprehensive Schema Analysis

Detailed exploration of the dataset structure, variable types, and SPSS metadata.

In [None]:
# Schema report: dataset overview, one summary row per variable, dtype
# distribution, SPSS file information and SPSS value labels.
# Reads the notebook globals `df` and `meta`; defines `schema_df`.
print("🔍 COMPREHENSIVE SCHEMA ANALYSIS")
print("=" * 50)

n_rows, n_cols = df.shape
total_cells = n_rows * n_cols
total_missing = int(df.isnull().sum().sum())
complete_cases = len(df.dropna())

# Percentages fall back to 0 when the dataset has no rows or no columns.
missing_share = total_missing / total_cells * 100 if total_cells else 0.0
complete_share = complete_cases / n_rows * 100 if n_rows else 0.0

# Basic dataset info
print(f"📊 Dataset Overview:")
print(f"   • Total observations: {n_rows:,}")
print(f"   • Total variables: {n_cols}")
print(f"   • Missing values: {total_missing:,} ({missing_share:.2f}%)")
print(f"   • Complete cases: {complete_cases:,} ({complete_share:.2f}%)")

print(f"\n📋 Variable Information:")
print("-" * 30)

# Variable labels from the SPSS metadata (empty mapping when unavailable).
column_labels = getattr(meta, 'column_names_to_labels', None) or {}

# Create comprehensive variable summary
variable_summary = []

for position, col in enumerate(df.columns, 1):
    series = df[col]
    null_count = int(series.isnull().sum())
    unique_count = series.nunique()
    null_share = null_count / n_rows * 100 if n_rows else 0.0

    col_info = {
        'Position': position,
        'Variable': col,
        'Type': str(series.dtype),
        'Non_Null': series.count(),
        'Null_Count': null_count,
        'Null_Percent': f"{null_share:.1f}%",
        'Unique_Values': unique_count,
        'Memory_MB': f"{series.memory_usage(deep=True) / 1024**2:.3f}",
        # A label that is absent or empty/None is shown as 'No label'.
        'SPSS_Label': column_labels.get(col) or 'No label',
    }

    if series.dtype == 'object' or series.dtype.name == 'category':
        # Text-like variables: show the first 5 distinct values.
        unique_vals = series.dropna().unique()[:5]
        sample = ', '.join(str(val) for val in unique_vals)
        if len(unique_vals) == 5 and unique_count > 5:
            sample += ', ...'
    elif series.count() == 0:
        sample = "All missing"
    elif pd.api.types.is_numeric_dtype(series) and not pd.api.types.is_bool_dtype(series):
        sample = f"Range: {series.min():.3f} to {series.max():.3f}"
    else:
        # Other dtypes (e.g. datetime or timedelta) cannot be formatted with
        # ':.3f', so their range is printed using the default representation.
        sample = f"Range: {series.min()} to {series.max()}"
    col_info['Sample_Values'] = sample

    variable_summary.append(col_info)

# Convert to DataFrame for better display
schema_df = pd.DataFrame(variable_summary)

# Display the schema table
print(schema_df.to_string(index=False))

# Data type summary
print(f"\n📊 Data Type Distribution:")
print("-" * 30)
for dtype, count in df.dtypes.value_counts().items():
    print(f"   • {dtype}: {count} variables ({count / n_cols * 100:.1f}%)")

# SPSS Metadata Information
if hasattr(meta, 'file_encoding'):
    print(f"\n🔤 SPSS File Information:")
    print("-" * 30)
    print(f"   • File encoding: {meta.file_encoding}")
    if hasattr(meta, 'creation_time'):
        print(f"   • Creation time: {meta.creation_time}")
    if hasattr(meta, 'modification_time'):
        print(f"   • Last modified: {meta.modification_time}")

# Value labels (SPSS factor levels).
# `variable_value_labels` is keyed by variable name, whereas `value_labels`
# is keyed by label-set name, so the former is preferred when matching
# against df.columns. NOTE(review): confirm with the pyreadstat version used.
labels_by_variable = (
    getattr(meta, 'variable_value_labels', None)
    or getattr(meta, 'value_labels', None)
    or {}
)
labelled_variables = {
    var: labels for var, labels in labels_by_variable.items() if var in df.columns
}

if labelled_variables:
    print(f"\n🏷️ SPSS Value Labels (Categorical Variables):")
    print("-" * 30)
    for var, labels in labelled_variables.items():
        print(f"   • {var}:")
        for value, label in itertools.islice(labels.items(), 5):  # Show first 5 labels
            print(f"     {value}: {label}")
        if len(labels) > 5:
            print(f"     ... and {len(labels) - 5} more labels")
        print()

## 📈 Comprehensive Descriptive Statistics

Detailed statistical analysis for all variables, including measures of central tendency, dispersion, and distribution shape.

In [None]:
# Descriptive statistics for every variable in `df`.
# Defines the notebook globals `numeric_vars` and `categorical_vars` (used by
# later cells) plus `comprehensive_stats` and `cat_summary_df`.
print("📈 COMPREHENSIVE DESCRIPTIVE STATISTICS")
print("=" * 50)

# Separate numeric and categorical variables
numeric_vars = df.select_dtypes(include=[np.number]).columns.tolist()
categorical_vars = df.select_dtypes(include=['object', 'category']).columns.tolist()

print(f"🔢 Numeric Variables: {len(numeric_vars)}")
print(f"🏷️ Categorical Variables: {len(categorical_vars)}")
print()

# Enhanced descriptive statistics for numeric variables
if numeric_vars:
    print("🔢 NUMERIC VARIABLES - DETAILED STATISTICS")
    print("=" * 50)

    # Basic descriptive statistics
    basic_stats = df[numeric_vars].describe()

    # Declare the extra columns up front so they exist even when a variable
    # (or every variable) has no non-missing observations.
    extra_columns = [
        'median', 'mode', 'variance', 'cv', 'range', 'iqr',
        'skewness', 'kurtosis', 'outliers_count', 'outliers_percent',
    ]
    additional_stats = pd.DataFrame(index=numeric_vars, columns=extra_columns, dtype=float)

    for var in numeric_vars:
        data = df[var].dropna()
        if data.empty:
            continue

        q1 = data.quantile(0.25)
        q3 = data.quantile(0.75)
        iqr = q3 - q1
        modes = data.mode()
        mean_value = data.mean()

        # Outlier detection (IQR method): beyond 1.5 * IQR from the quartiles.
        outliers = int(((data < q1 - 1.5 * iqr) | (data > q3 + 1.5 * iqr)).sum())

        row_values = {
            # Central tendency
            'median': data.median(),
            'mode': modes.iloc[0] if len(modes) > 0 else np.nan,
            # Dispersion
            'variance': data.var(),
            # Coefficient of variation relative to |mean|; undefined for mean 0.
            'cv': data.std() / abs(mean_value) * 100 if mean_value != 0 else np.nan,
            'range': data.max() - data.min(),
            'iqr': iqr,
            # Distribution shape
            'skewness': stats.skew(data),
            'kurtosis': stats.kurtosis(data),
            'outliers_count': outliers,
            'outliers_percent': outliers / len(data) * 100,
        }
        for stat_name, stat_value in row_values.items():
            additional_stats.loc[var, stat_name] = stat_value

    # Combine basic and additional statistics
    comprehensive_stats = pd.concat([basic_stats.T, additional_stats], axis=1)

    # Display comprehensive statistics
    print(comprehensive_stats.round(4))

    # Statistical interpretation
    print("\n📊 Statistical Interpretation:")
    print("-" * 30)

    for var in numeric_vars[:5]:  # Show interpretation for first 5 variables
        if var not in comprehensive_stats.index:
            continue
        stats_row = comprehensive_stats.loc[var]
        print(f"\n🔍 {var}:")

        mean_val = stats_row['mean']
        median_val = stats_row['median']
        if pd.isna(mean_val) or pd.isna(median_val):
            print(f"   • No non-missing observations")
            continue

        # Central tendency: compare the mean-median gap with 10% of |median|.
        # Using |median| keeps the test meaningful for negative medians, and
        # the explicit zero-gap check covers a median of exactly 0.
        gap = abs(mean_val - median_val)
        if gap == 0 or gap < 0.1 * abs(median_val):
            print(f"   • Distribution: Approximately symmetric (mean ≈ median)")
        elif mean_val > median_val:
            print(f"   • Distribution: Right-skewed (mean > median)")
        else:
            print(f"   • Distribution: Left-skewed (mean < median)")

        # Variability interpretation
        cv = stats_row['cv']
        if pd.isna(cv):
            print(f"   • Variability: Undefined (zero mean or too few observations)")
        elif cv < 15:
            print(f"   • Variability: Low (CV = {cv:.1f}%)")
        elif cv < 35:
            print(f"   • Variability: Moderate (CV = {cv:.1f}%)")
        else:
            print(f"   • Variability: High (CV = {cv:.1f}%)")

        # Outlier interpretation
        outlier_pct = stats_row['outliers_percent']
        if outlier_pct == 0:
            print(f"   • Outliers: None detected")
        elif outlier_pct < 5:
            print(f"   • Outliers: Few detected ({outlier_pct:.1f}%)")
        else:
            print(f"   • Outliers: Many detected ({outlier_pct:.1f}%) - investigate further")

# Categorical variables analysis
if categorical_vars:
    print("\n\n🏷️ CATEGORICAL VARIABLES - FREQUENCY ANALYSIS")
    print("=" * 50)

    categorical_summary = []
    total_rows = len(df)

    for position, var in enumerate(categorical_vars, 1):
        series = df[var]
        missing_count = int(series.isnull().sum())
        valid_total = int(series.count())

        # Two views of the frequencies: including missing values (for the
        # frequency table) and excluding them (for the most frequent category).
        value_counts = series.value_counts(dropna=False)
        valid_counts = series.value_counts()
        has_valid = valid_total > 0 and len(valid_counts) > 0

        categorical_summary.append({
            'Variable': var,
            'Categories': series.nunique(),
            'Missing': missing_count,
            'Missing_Pct': f"{missing_count / total_rows * 100:.1f}%" if total_rows else '0.0%',
            'Most_Frequent': valid_counts.index[0] if has_valid else 'N/A',
            'Most_Freq_Count': valid_counts.iloc[0] if has_valid else 0,
            # Share of the non-missing observations.
            'Most_Freq_Pct': f"{valid_counts.iloc[0] / valid_total * 100:.1f}%" if has_valid else '0%',
        })

        # Show the frequency table for the first 3 categorical variables only.
        if position <= 3:
            print(f"\n📊 {var} - Frequency Distribution:")
            print("-" * 40)
            # Counts include missing values, so percentages are taken over all
            # rows; this keeps the column summing to 100%.
            freq_table = pd.DataFrame({
                'Category': value_counts.index,
                'Frequency': value_counts.values,
                'Percentage': (value_counts.values / max(total_rows, 1) * 100).round(2)
            })
            print(freq_table.head(10).to_string(index=False))  # Show top 10 categories
            if len(value_counts) > 10:
                print(f"... and {len(value_counts) - 10} more categories")

    # Summary table for all categorical variables
    print(f"\n📋 Categorical Variables Summary:")
    print("-" * 40)
    cat_summary_df = pd.DataFrame(categorical_summary)
    print(cat_summary_df.to_string(index=False))

## 🔗 Comprehensive Correlation Analysis

Multi-faceted correlation analysis including Pearson, Spearman, and visualization techniques.

In [None]:
# Correlation analysis over the numeric variables.
# Reads `df` and `numeric_vars`; when at least two numeric variables exist it
# defines `pearson_corr`, `spearman_corr`, `strong_correlations` and
# `moderate_correlations`, which the visualization and summary cells reuse.


def _significance_stars(p_value):
    """Return the star notation for a p-value ('ns' for p >= 0.05 or NaN)."""
    if p_value < 0.001:
        return "***"
    if p_value < 0.01:
        return "**"
    if p_value < 0.05:
        return "*"
    return "ns"


print("🔗 COMPREHENSIVE CORRELATION ANALYSIS")
print("=" * 50)

if len(numeric_vars) < 2:
    print("⚠️ Warning: Less than 2 numeric variables found. Correlation analysis requires at least 2 numeric variables.")
else:
    # Calculate correlation matrices
    pearson_corr = df[numeric_vars].corr(method='pearson')
    spearman_corr = df[numeric_vars].corr(method='spearman')

    strong_correlations = []
    moderate_correlations = []
    correlation_comparison = []
    significance_counts = {'***': 0, '**': 0, '*': 0, 'ns': 0}
    total_tests = 0

    # Single pass over every unordered variable pair; the p-value is computed
    # once per pair and shared by the strong-correlation table and the
    # significance summary.
    for var1, var2 in itertools.combinations(numeric_vars, 2):
        r_value = pearson_corr.loc[var1, var2]
        spearman_r = spearman_corr.loc[var1, var2]

        # Rows where both variables are observed.
        pair_data = df[[var1, var2]].dropna()
        n_common = len(pair_data)

        # A significance test needs more than 2 paired observations.
        tested = n_common > 2
        if tested:
            _, p_value = pearsonr(pair_data[var1], pair_data[var2])
            significance = _significance_stars(p_value)
            significance_counts[significance] += 1
            total_tests += 1

        if abs(r_value) > 0.7:
            if tested:
                strong_correlations.append({
                    'Variable_1': var1,
                    'Variable_2': var2,
                    'Correlation': f"{r_value:.3f}",
                    'Strength': "Very Strong" if abs(r_value) > 0.9 else "Strong",
                    'Direction': "Positive" if r_value > 0 else "Negative",
                    'P_Value': f"{p_value:.6f}",
                    'Significance': significance,
                    'N': n_common
                })
        elif abs(r_value) > 0.3:
            moderate_correlations.append({
                'Variable_1': var1,
                'Variable_2': var2,
                'Correlation': f"{r_value:.3f}",
                'Interpretation': 'Moderate Positive' if r_value > 0 else 'Moderate Negative'
            })

        # A notable Pearson/Spearman gap suggests a non-linear relationship.
        difference = abs(spearman_r - r_value)
        if difference > 0.1:
            correlation_comparison.append({
                'Variable_1': var1,
                'Variable_2': var2,
                'Pearson_r': f"{r_value:.3f}",
                'Spearman_r': f"{spearman_r:.3f}",
                'Difference': f"{difference:.3f}",
                'Interpretation': 'Non-linear relationship likely' if difference > 0.2 else 'Some non-linearity'
            })

    print(f"🔢 Analyzing correlations between {len(numeric_vars)} numeric variables")
    print(f"📊 Total correlation pairs: {len(numeric_vars) * (len(numeric_vars) - 1) // 2}")

    # Pearson correlation analysis
    print("\n📈 PEARSON CORRELATION MATRIX (Linear Relationships)")
    print("=" * 55)
    print(pearson_corr.round(3))

    # Strong correlations
    print("\n🔍 STRONG CORRELATIONS (|r| > 0.7):")
    print("-" * 40)

    if strong_correlations:
        strong_corr_df = pd.DataFrame(strong_correlations)
        print(strong_corr_df.to_string(index=False))
    else:
        print("No strong correlations (|r| > 0.7) found.")

    # Moderate correlations
    print("\n📊 MODERATE CORRELATIONS (0.3 < |r| ≤ 0.7):")
    print("-" * 45)

    if moderate_correlations:
        moderate_corr_df = pd.DataFrame(moderate_correlations)
        print(moderate_corr_df.head(10).to_string(index=False))  # Show top 10
        if len(moderate_correlations) > 10:
            print(f"... and {len(moderate_correlations) - 10} more moderate correlations")
    else:
        print("No moderate correlations found.")

    # Spearman correlation comparison
    print("\n📊 SPEARMAN vs PEARSON CORRELATION COMPARISON")
    print("=" * 50)
    print("(Shows differences in linear vs monotonic relationships)")
    print()

    if correlation_comparison:
        comparison_df = pd.DataFrame(correlation_comparison)
        print("Relationships with notable Pearson vs Spearman differences:")
        print(comparison_df.to_string(index=False))
    else:
        print("No significant differences between Pearson and Spearman correlations detected.")

    # Statistical significance summary
    print("\n📋 CORRELATION SIGNIFICANCE SUMMARY")
    print("=" * 40)

    print(f"Total correlation tests performed: {total_tests}")
    if total_tests:
        summary_rows = (
            ("Highly significant (p < 0.001)", '***'),
            ("Very significant (p < 0.01)", '**'),
            ("Significant (p < 0.05)", '*'),
            ("Not significant (p ≥ 0.05)", 'ns'),
        )
        for description, key in summary_rows:
            count = significance_counts[key]
            print(f"{description}: {count} ({count / total_tests * 100:.1f}%)")
    else:
        # Avoid dividing by zero when no pair had enough paired observations.
        print("No variable pair had more than 2 paired observations; no significance tests were run.")

    print("\nLegend: *** p<0.001, ** p<0.01, * p<0.05, ns = not significant")

## 📊 Advanced Correlation Visualizations

Interactive and static visualizations to explore correlation patterns and relationships.

In [None]:
# Correlation visualizations: interactive heatmap, scatter plots of the
# strongest pairs, a clustering dendrogram and a network graph.
# Reads `numeric_vars`, `pearson_corr`, `strong_correlations`, `df` and
# `enterprise_colors` defined by earlier cells.
print("📊 ADVANCED CORRELATION VISUALIZATIONS")
print("=" * 45)

if len(numeric_vars) >= 2:
    # 1. Interactive correlation heatmap
    print("Creating interactive correlation heatmap...")

    fig_heatmap = go.Figure(data=go.Heatmap(
        z=pearson_corr.values,
        x=pearson_corr.columns,
        y=pearson_corr.columns,
        colorscale='RdBu',
        zmid=0,
        text=pearson_corr.round(3).values,
        texttemplate="%{text}",
        textfont={"size": 10},
        hovertemplate="Variable 1: %{y}<br>Variable 2: %{x}<br>Correlation: %{z:.3f}<extra></extra>"
    ))

    fig_heatmap.update_layout(
        title="🔗 Interactive Pearson Correlation Matrix",
        width=800,
        height=700,
        font=dict(size=12)
    )

    fig_heatmap.show()

    # 2. Scatter plots for the first (up to 4) strongly correlated pairs
    if strong_correlations:
        print("Creating scatter plots for strongly correlated variable pairs...")

        top_pairs = strong_correlations[:4]

        fig_scatter = make_subplots(
            rows=2, cols=2,
            subplot_titles=[f"{pair['Variable_1']} vs {pair['Variable_2']} (r={pair['Correlation']})"
                            for pair in top_pairs],
            vertical_spacing=0.15,
            horizontal_spacing=0.1
        )

        for i, pair in enumerate(top_pairs):
            # Fill the 2x2 grid row by row (subplot indices are 1-based).
            grid_row, grid_col = divmod(i, 2)
            row, col = grid_row + 1, grid_col + 1

            var1, var2 = pair['Variable_1'], pair['Variable_2']

            # Rows where both variables are observed
            clean_data = df[[var1, var2]].dropna()

            fig_scatter.add_trace(
                go.Scatter(
                    x=clean_data[var1],
                    y=clean_data[var2],
                    mode='markers',
                    marker=dict(size=6, opacity=0.6, color=enterprise_colors['primary']),
                    name=f"{var1} vs {var2}",
                    showlegend=False,
                    hovertemplate=f"{var1}: %{{x:.2f}}<br>{var2}: %{{y:.2f}}<extra></extra>"
                ),
                row=row, col=col
            )

            # Least-squares straight line as the trend
            trend = np.poly1d(np.polyfit(clean_data[var1], clean_data[var2], 1))
            x_trend = np.linspace(clean_data[var1].min(), clean_data[var1].max(), 100)

            fig_scatter.add_trace(
                go.Scatter(
                    x=x_trend,
                    y=trend(x_trend),
                    mode='lines',
                    line=dict(color=enterprise_colors['secondary'], width=2),
                    name='Trend',
                    showlegend=False,
                    hoverinfo='skip'
                ),
                row=row, col=col
            )

        fig_scatter.update_layout(
            title="📈 Scatter Plots for Strongest Correlations",
            height=700,
            width=1000,
            font=dict(size=12)
        )

        fig_scatter.show()

    # 3. Hierarchical clustering of correlations
    if len(numeric_vars) >= 3:
        from scipy.cluster.hierarchy import dendrogram, linkage
        from scipy.spatial.distance import squareform

        # Keep variables that have a defined correlation with at least one
        # other variable (a column whose only non-NaN entry is its diagonal,
        # or that is entirely NaN, cannot be clustered).
        clusterable = pearson_corr.columns[pearson_corr.notna().sum() > 1]

        if len(clusterable) >= 3:
            print("Creating correlation dendrogram for variable clustering...")

            # Remaining undefined correlations are treated as 0 (max distance).
            cluster_corr = pearson_corr.loc[clusterable, clusterable].fillna(0)

            # Convert correlation to distance, then remove floating-point
            # noise: enforce symmetry, a zero diagonal and no negative values
            # so the matrix is a valid input for squareform/linkage.
            distance_matrix = 1 - cluster_corr.abs().to_numpy()
            distance_matrix = (distance_matrix + distance_matrix.T) / 2
            distance_matrix = np.clip(distance_matrix, 0.0, None)
            np.fill_diagonal(distance_matrix, 0.0)

            # Perform hierarchical clustering
            condensed_distances = squareform(distance_matrix, checks=False)
            linkage_matrix = linkage(condensed_distances, method='average')

            # Create dendrogram
            plt.figure(figsize=(12, 8))
            dendrogram(linkage_matrix, labels=list(clusterable), orientation='top', leaf_rotation=45)
            plt.title('🌳 Variable Clustering Based on Correlations', fontsize=16, pad=20)
            plt.xlabel('Variables', fontsize=12)
            plt.ylabel('Distance (1 - |Correlation|)', fontsize=12)
            plt.tight_layout()
            plt.show()
        else:
            print("⚠️ Skipping dendrogram: fewer than 3 variables have defined correlations.")

    # 4. Correlation network visualization
    if strong_correlations:
        print("Creating correlation network for strong relationships...")

        import networkx as nx

        # Nodes are the numeric variables; edges are the strong correlations.
        G = nx.Graph()
        G.add_nodes_from(numeric_vars)
        for corr in strong_correlations:
            G.add_edge(corr['Variable_1'], corr['Variable_2'],
                       weight=abs(float(corr['Correlation'])))

        # Fixed seed so the layout is reproducible between runs.
        pos = nx.spring_layout(G, k=1, iterations=50, seed=42)

        # Edge trace: one line segment per edge, separated by None.
        edge_x = []
        edge_y = []
        for start, end in G.edges():
            x0, y0 = pos[start]
            x1, y1 = pos[end]
            edge_x.extend([x0, x1, None])
            edge_y.extend([y0, y1, None])

        edge_trace = go.Scatter(x=edge_x, y=edge_y,
                                line=dict(width=2, color='#888'),
                                hoverinfo='none',
                                mode='lines')

        # Node trace: position, label, hover text and connection count.
        node_x = []
        node_y = []
        node_text = []
        node_info = []
        node_adjacencies = []

        for node in G.nodes():
            x, y = pos[node]
            node_x.append(x)
            node_y.append(y)
            node_text.append(node)

            connections = len(list(G.neighbors(node)))
            node_adjacencies.append(connections)
            node_info.append(f'{node}<br>Connections: {connections}')

        # `colorbar.title.side` replaces the deprecated `titleside` attribute.
        node_trace = go.Scatter(x=node_x, y=node_y,
                                mode='markers+text',
                                hoverinfo='text',
                                text=node_text,
                                textposition="middle center",
                                hovertext=node_info,
                                marker=dict(showscale=True,
                                            colorscale='Viridis',
                                            size=20,
                                            # Color nodes by number of connections
                                            color=node_adjacencies,
                                            colorbar=dict(thickness=15,
                                                          xanchor="left",
                                                          title=dict(text='Connections',
                                                                     side="right"))))

        # `title.font` replaces the deprecated `titlefont_size` attribute.
        fig_network = go.Figure(data=[edge_trace, node_trace],
                                layout=go.Layout(
                                    title=dict(
                                        text='🕸️ Correlation Network (Strong Relationships Only)',
                                        font=dict(size=16)),
                                    showlegend=False,
                                    hovermode='closest',
                                    margin=dict(b=20, l=5, r=5, t=40),
                                    annotations=[dict(
                                        text="Variables connected by strong correlations (|r| > 0.7)",
                                        showarrow=False,
                                        xref="paper", yref="paper",
                                        x=0.005, y=-0.002)],
                                    xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                                    yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                                    width=800,
                                    height=600))

        fig_network.show()

else:
    print("⚠️ Insufficient numeric variables for correlation visualization.")

print("\n✅ Correlation analysis and visualizations complete!")

## 📋 Data Quality Assessment & Summary

Comprehensive evaluation of data integrity, missing patterns, and analysis summary.

In [None]:
# Data quality assessment and final summary.
# Reads `df`, `numeric_vars`, `categorical_vars` and, when present, the
# `strong_correlations` / `moderate_correlations` lists from the correlation
# cell. Reports missing data, dtypes, duplicates, uniqueness, a weighted
# quality score and a list of recommendations.


def _missing_quality_label(missing_pct):
    """Grade a variable by its percentage of missing values."""
    if missing_pct < 5:
        return 'Excellent'
    if missing_pct < 15:
        return 'Good'
    if missing_pct < 30:
        return 'Fair'
    return 'Poor'


print("📋 COMPREHENSIVE DATA QUALITY ASSESSMENT")
print("=" * 50)

n_rows = len(df)
n_cols = len(df.columns)
total_cells = n_rows * n_cols

# Missing data analysis
print("🔍 Missing Data Analysis:")
print("-" * 30)

missing_summary = df.isnull().sum().sort_values(ascending=False)
if n_rows:
    missing_percentage = (missing_summary / n_rows * 100).round(2)
else:
    # No rows means no missing values; avoid dividing by zero.
    missing_percentage = missing_summary.astype(float)

missing_analysis = pd.DataFrame({
    'Variable': missing_summary.index,
    'Missing_Count': missing_summary.values,
    'Missing_Percentage': missing_percentage.values,
    'Complete_Count': n_rows - missing_summary.values,
    'Data_Quality': [_missing_quality_label(pct) for pct in missing_percentage.values]
})

total_missing = int(missing_analysis['Missing_Count'].sum())
# Share of non-missing cells; an empty dataset counts as fully complete.
completeness_score = (1 - total_missing / total_cells) * 100 if total_cells else 100.0

if total_missing == 0:
    print("🎉 Excellent! No missing values detected in the dataset.")
else:
    # Only list the variables that actually have missing values.
    print(missing_analysis[missing_analysis['Missing_Count'] > 0].to_string(index=False))
    variables_with_missing = int((missing_analysis['Missing_Count'] > 0).sum())
    print(f"\n📊 Missing Data Summary:")
    print(f"   • Variables with missing data: {variables_with_missing} of {n_cols}")
    print(f"   • Total missing values: {total_missing:,}")
    print(f"   • Overall completeness: {completeness_score:.1f}%")

# Data type consistency
print(f"\n🔤 Data Type Analysis:")
print("-" * 25)

type_analysis = df.dtypes.value_counts()
print(f"Data type distribution:")
for dtype, count in type_analysis.items():
    print(f"   • {dtype}: {count} variables ({count / n_cols * 100:.1f}%)")

# Duplicate analysis
print(f"\n🔄 Duplicate Records Analysis:")
print("-" * 35)

duplicate_count = int(df.duplicated().sum())
# duplicate_count > 0 implies n_rows > 0, so the division below is safe.
duplicate_share = duplicate_count / n_rows * 100 if n_rows else 0.0
if duplicate_count > 0:
    print(f"⚠️ Found {duplicate_count} duplicate records ({duplicate_share:.2f}%)")
    print(f"   • Unique records: {n_rows - duplicate_count:,}")
    print(f"   • Total records: {n_rows:,}")
else:
    print("✅ No duplicate records found.")

# Variable uniqueness analysis
print(f"\n🎯 Variable Uniqueness Analysis:")
print("-" * 35)

uniqueness_analysis = []
for col in df.columns:
    unique_count = df[col].nunique()
    unique_percentage = unique_count / n_rows * 100 if n_rows else 0.0

    if unique_percentage == 100:
        uniqueness_type = "Identifier (100% unique)"
    elif unique_percentage > 95:
        uniqueness_type = "Near-identifier (>95% unique)"
    elif unique_percentage > 50:
        uniqueness_type = "High variability"
    elif unique_percentage > 10:
        uniqueness_type = "Moderate variability"
    else:
        uniqueness_type = "Low variability"

    uniqueness_analysis.append({
        'Variable': col,
        'Unique_Values': unique_count,
        'Unique_Percentage': f"{unique_percentage:.1f}%",
        'Classification': uniqueness_type
    })

uniqueness_df = pd.DataFrame(uniqueness_analysis)
print(uniqueness_df.to_string(index=False))

# Overall data quality score
print(f"\n🏆 OVERALL DATA QUALITY SCORE")
print("=" * 35)

# Quality score (0-100): weighted mix of completeness, uniqueness and a fixed
# consistency component.
uniqueness_score = max(0, 100 - duplicate_share)
consistency_score = 90  # Base score, could be enhanced with more sophisticated checks

overall_score = (completeness_score * 0.4 + uniqueness_score * 0.3 + consistency_score * 0.3)

print(f"📊 Data Quality Components:")
print(f"   • Completeness: {completeness_score:.1f}/100 (40% weight)")
print(f"   • Uniqueness: {uniqueness_score:.1f}/100 (30% weight)")
print(f"   • Consistency: {consistency_score:.1f}/100 (30% weight)")
print(f"\n🎯 Overall Data Quality Score: {overall_score:.1f}/100")

if overall_score >= 90:
    quality_grade = "Excellent (A)"
elif overall_score >= 80:
    quality_grade = "Good (B)"
elif overall_score >= 70:
    quality_grade = "Fair (C)"
else:
    quality_grade = "Needs Improvement (D)"

print(f"📈 Data Quality Grade: {quality_grade}")

# Analysis summary and recommendations
print(f"\n📝 ANALYSIS SUMMARY & RECOMMENDATIONS")
print("=" * 45)

print(f"🔍 Dataset Characteristics:")
print(f"   • Sample size: {n_rows:,} observations")
print(f"   • Variable count: {n_cols} features")
print(f"   • Numeric variables: {len(numeric_vars)}")
print(f"   • Categorical variables: {len(categorical_vars)}")
print(f"   • Data quality: {quality_grade}")

# Default to 0 so the recommendation logic below never hits an undefined name
# when fewer than two numeric variables exist.
strong_corr_count = 0
moderate_corr_count = 0

if len(numeric_vars) >= 2:
    # The lists only exist if the correlation cell has been executed.
    strong_corr_count = len(globals().get('strong_correlations', []))
    moderate_corr_count = len(globals().get('moderate_correlations', []))

    print(f"\n🔗 Correlation Insights:")
    print(f"   • Strong correlations (|r| > 0.7): {strong_corr_count}")
    print(f"   • Moderate correlations (0.3-0.7): {moderate_corr_count}")

    if strong_corr_count > 0:
        print(f"   • Consider multicollinearity in modeling")
        print(f"   • Potential for dimensionality reduction")

print(f"\n💡 Recommendations for Further Analysis:")
recommendations = []

if total_missing > 0:
    recommendations.append("Handle missing values through imputation or deletion")

if duplicate_count > 0:
    recommendations.append("Investigate and remove duplicate records")

if len(numeric_vars) >= 3:
    recommendations.append("Consider principal component analysis (PCA) for dimensionality reduction")

if strong_corr_count > 2:
    recommendations.append("Examine multicollinearity before regression modeling")

if len(categorical_vars) > 0:
    recommendations.append("Perform chi-square tests for categorical associations")

recommendations.extend([
    "Validate findings with domain expertise",
    "Consider advanced statistical modeling based on research questions",
    "Explore temporal patterns if time variables are present"
])

for i, rec in enumerate(recommendations, 1):
    print(f"   {i}. {rec}")

print(f"\n🎯 Analysis Complete! The dataset has been thoroughly examined.")
print(f"📊 Ready for advanced statistical modeling and hypothesis testing.")