# Estadística descriptiva
Descripción de los datos de entrenamiento utilizando una serie de métricas que permiten describir las columnas del Dataset.

In [1]:
import os
# Vemos en que directorio estamos. Seguramente estaremos en:
# ............../logistic_regression/notebooks
print(os.getcwd())

/home/apa/Documentos/github/logistic_regression/notebooks


In [2]:
import sys
sys.path.append('..')  # Añade el directorio padre (logistic_regression) al path
from src.ft_functions import *

import pandas as pd
from tabulate import tabulate

In [3]:
def calculate_metrics(df):
    """Calculate metrics for float columns."""
    # Select numeric columns (float64)
    numeric_columns = df.select_dtypes(include=['float64']).columns
    metrics = {}
    
    for col in numeric_columns:
        values = df[col].dropna().tolist()
        metrics[col] = {
            "Count": ft_count(values),
            "Mean": ft_mean(values),
            "Std": ft_std(values),
            "Min": ft_min(values),
            "25%": ft_percentile(values, 0.25),
            "50%": ft_median(values),
            "75%": ft_percentile(values, 0.75),
            "Max": ft_max(values),
            "IQR": ft_iqr(values),
            "Skewness": ft_skewness(values),
            "Kurtosis": ft_kurtosis(values),
            "CV": ft_cv(values)
        }
    
    return metrics

In [4]:
def print_metrics_table(metrics):
    """Print calculated metrics in a formatted table with transposed orientation."""
    table_data = []
    metrics_to_display = [
        "Count", "Mean", "Std", "Min", "25%", "50%", "75%", "Max",
        "IQR", "Skewness", "Kurtosis", "CV"
    ]
    
    # Create header row (now column names)
    headers = ["COURSES"]
    
    # Create data rows (each column becomes a row)
    for col in metrics:
        row = [col]  # First cell contains column name
        for metric in metrics_to_display:
            value = metrics[col][metric]
            row.append(f"{value:.6f}" if isinstance(value, float) else f"{value}")
        table_data.append(row)
    
    # Add metric names as first column header
    headers.extend(metrics_to_display)
    
    print(tabulate(table_data, headers=headers, tablefmt="fancy_grid", stralign="center", numalign="center"))

In [5]:
def analyze_dataset(file_path='../datasets/dataset_train.csv'):
    """Analyze dataset by loading and calculating metrics."""
    try:
        df = pd.read_csv(file_path)
        metrics = calculate_metrics(df)
        print_metrics_table(metrics)
    except FileNotFoundError:
        print(f"Error: The file {file_path} was not found")
    except Exception as e:
        print(f"Error reading the file: {str(e)}")

if __name__ == "__main__":
    analyze_dataset()

╒═══════════════════════════════╤═════════╤═══════════╤══════════╤══════════╤═══════════╤═══════════╤══════════╤══════════╤═════════╤════════════╤════════════╤══════════╕
│            COURSES            │  Count  │   Mean    │   Std    │   Min    │    25%    │    50%    │   75%    │   Max    │   IQR   │  Skewness  │  Kurtosis  │    CV    │
╞═══════════════════════════════╪═════════╪═══════════╪══════════╪══════════╪═══════════╪═══════════╪══════════╪══════════╪═════════╪════════════╪════════════╪══════════╡
│          Arithmancy           │  1566   │  49634.6  │ 16679.8  │  -24370  │  38511.5  │  49013.5  │ 60811.2  │  104956  │ 22299.8 │ -0.041879  │  0.257625  │ 0.336052 │
├───────────────────────────────┼─────────┼───────────┼──────────┼──────────┼───────────┼───────────┼──────────┼──────────┼─────────┼────────────┼────────────┼──────────┤
│           Astronomy           │  1568   │  39.7971  │ 520.298  │ -966.741 │ -489.551  │  260.289  │ 524.772  │ 1016.21  │ 1014.32 │ -0.094544  