# Capstone 3 - Exploratory Data Analysis

### Table of contents
* [Introduction](#intro)

## Introduction <a name="intro"></a>

### Import relevant packages

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

### Retrieve variables
Let's retrieve the variables we saved in our data cleaning & wrangling phase.

In [17]:
# Retrieve original noise_data dataframe
%store -r noise_data

# Retrieve GeoPandas dataframes
%store -r districts_gdf 
%store -r schoolpoints_gdf

# Retrieve schools covered by sensor range
%store -r coverage_matrix

# Retrieve summary and achievement dataframes
%store -r summary_dfs
%store -r lg_achievement_dfs
%store -r hs_achievement_dfs
%store -r combined_summary_df
%store -r combined_achievement_df
%store -r merged_coverage_df

# Retrieve school lists
%store -r elem_middle_schools
%store -r high_schools

In [2]:
final_df = merged_coverage_df

In [4]:
# Perform comprehensive data quality analysis
def analyze_data_quality(df, dataset_name="Dataset"):
    """
    Analyze data quality including missing values, distributions, and potential issues.
"""
    print(f"\n{'='*20} {dataset_name} Analysis {'='*20}")
    
    # Basic dataset information
    print("\n1. Basic Information:")
    print(f"Number of rows: {len(df)}")
    print(f"Number of columns: {len(df.columns)}")
    print(f"Memory usage: {df.memory_usage().sum() / 1024**2:.2f} MB")
    
    # Missing values analysis
    print("\n2. Missing Values Analysis:")
    missing = df.isnull().sum()
    missing_pct = (df.isnull().sum() / len(df)) * 100
    missing_df = pd.DataFrame({
        'Missing Values': missing,
        'Percentage': missing_pct
    }).query('`Missing Values` > 0').sort_values('Percentage', ascending=False)
    
    if len(missing_df) > 0:
        print("\nColumns with missing values:")
        print(missing_df)
    else:
        print("No missing values found!")
    
    # Data types
    print("\n3. Data Types:")
    print(df.dtypes.value_counts())
    print("\nDetailed dtypes:")
    print(df.dtypes)
    
    # Numerical columns analysis
    numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns
    if len(numerical_cols) > 0:
        print("\n4. Numerical Columns Summary:")
        print(df[numerical_cols].describe())
        
        # Check for potential outliers
        print("\n5. Potential Outliers (values > 3 std devs from mean):")
        for col in numerical_cols:
            mean = df[col].mean()
            std = df[col].std()
            outliers = df[col][(df[col] > mean + 3*std) | (df[col] < mean - 3*std)]
            if len(outliers) > 0:
                print(f"\n{col}:")
                print(f"Number of outliers: {len(outliers)}")
                print(f"Outlier values: {outliers.value_counts().head()}")
    
    # Categorical columns analysis
    categorical_cols = df.select_dtypes(include=['object', 'category']).columns
    if len(categorical_cols) > 0:
        print("\n6. Categorical Columns Summary:")
        for col in categorical_cols:
            unique_values = df[col].nunique()
            print(f"\n{col}:")
            print(f"Number of unique values: {unique_values}")
            if unique_values < 10:  # Only show value counts for columns with few unique values
                print(df[col].value_counts())
    
    # Date columns analysis
    date_cols = df.select_dtypes(include=['datetime64', 'period']).columns
    if len(date_cols) > 0:
        print("\n7. Date Range Analysis:")
        for col in date_cols:
            print(f"\n{col}:")
            print(f"Min date: {df[col].min()}")
            print(f"Max date: {df[col].max()}")
    
    return missing_df

# Run analysis on final merged dataset
missing_data = analyze_data_quality(final_df, "Final Merged Dataset")



1. Basic Information:
Number of rows: 207
Number of columns: 102
Memory usage: 0.16 MB

2. Missing Values Analysis:

Columns with missing values:
                               Missing Values  Percentage
regents_global                            135   65.217391
regents_living_env                        129   62.318841
regents_us_history                        129   62.318841
regents_algebra                           126   60.869565
college_ready_6yr                         121   58.454106
grad_rate_6yr                             121   58.454106
postsec_enroll_18mo                       121   58.454106
postsec_enroll_6mo                        115   55.555556
math_proficient_pct                       114   55.072464
math_lowest_third_proficiency             114   55.072464
math_avg_proficiency                      114   55.072464
ela_avg_proficiency                       112   54.106280
ela_lowest_third_proficiency              112   54.106280
ela_proficient_pct                      

NotImplementedError: 

In [5]:
# Optional: Create visualizations for data quality metrics

def plot_data_quality_visuals(df, missing_df):
    plt.figure(figsize=(15, 10))
    
    # Missing values heatmap
    plt.subplot(2, 1, 1)
    sns.heatmap(df.isnull(), yticklabels=False, cbar=False, cmap='viridis')
    plt.title('Missing Values Heatmap')
    
    # Distribution plots for key metrics
    metrics = ['student_attendance_rate', 'achievement_score']  # Adjust based on your actual column names
    plt.subplot(2, 1, 2)
    for metric in metrics:
        if metric in df.columns:
            sns.kdeplot(data=df[metric].dropna(), label=metric)
    plt.title('Distribution of Key Metrics')
    plt.legend()
    
    plt.tight_layout()
    plt.show()

# Create visualizations
plot_data_quality_visuals(final_df, missing_data)

NameError: name 'missing_data' is not defined