# Main Notebook

In [None]:
# Import the necessary libraries
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
import matplotlib.pyplot as plt

from logger import Logger
from helpers import get_dataset_dimensions, count_unique_values, replace_values, summarize_missing, drop_columns_missing, drop_rows_missing, impute_mode
from plotly.subplots import make_subplots


In [None]:
# Create the module-level logger instance.
# Logger is the project's own class (imported from logger.py); it is given
# this module's name (__name__) as its identifier.
logger = Logger(__name__)

In [None]:
# Load the congressional voting dataset (CSV) into a pandas DataFrame.
# NOTE: the path is relative, so it is resolved against the current working
# directory of the kernel running this notebook.
df_votings = pd.read_csv('../data/raw/kaggle/congress/CongressionalVotingID.shuf.lrn.csv')

# 2. Data Analysis & Preparation 

## 2.1. Congressional Voting Data Set

### 2.1.1. General Information

In [None]:
# Step 2.1.1.1. - Display the dataset information
# (pandas prints the column names, non-null counts, dtypes and memory usage)
df_votings.info()

In [None]:
# Step 2.1.1.2. - Display the first 5 rows of the dataset
# (head() defaults to 5 rows; as the cell's last expression it is rendered by the notebook)
df_votings.head()

In [None]:
# Step 2.1.1.3. - Display the dataset dimensions
# Delegates to the project helper from helpers.py
# (presumably reports the number of rows and columns - helper not shown here).
get_dataset_dimensions(df_votings)

In [None]:
# Step 2.1.1.4. - Display column names & data types
# One print call emits the header and the dtype listing on separate lines.
print("Column Names and Data Types:", df_votings.dtypes, sep="\n")

In [None]:
# Step 2.1.1.5. - Display the missing values per column
# summarize_missing is a project helper (helpers.py); its result is printed as-is.
print("Missing Values per Column:")
print(summarize_missing(df_votings))

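The missing-value counts are all 0 because missing answers are not technically missing (`NaN`); instead they are recorded as the text values `unknown` or `?`. Therefore we should replace them with `NaN` (`np.nan`) so that they are recognised as missing.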

In [None]:
# Step 2.1.1.6. - Check for Unique Values and their amount
# Specify the columns to check for unique values
# (every column except the 'ID' identifier)
unique_value_columns = [col for col in df_votings.columns if col != 'ID']

# Call the function to count unique values in the specified columns
# (count_unique_values is a project helper from helpers.py)
unique_values = count_unique_values(df_votings, unique_value_columns)

# Display the unique values
# NOTE: display is not imported in this notebook; it is the rich-output
# function that IPython/Jupyter makes available in notebook cells.
display(unique_values)

In [None]:
# Step 2.1.1.7. - Display a summary of statistics (numerical columns)
# One print call emits the header and the describe() table on separate lines.
print("Summary Statistics:", df_votings.describe(), sep="\n")

--> The `describe` method is not useful at this point, as all columns are text-based and nothing except `ID` is numeric.

## 2.2. Data Pre-Processing

### 2.2.1 Replace Values

In [None]:
# Every column except the 'ID' identifier gets its values recoded
replace_value_columns = [name for name in df_votings.columns if name != 'ID']

# Recoding table: yes -> 1, no -> 0, and both "no answer" markers -> NaN
replacement_mapping = {'y': 1, 'n': 0}
replacement_mapping.update(dict.fromkeys(('unknown', '?'), np.nan))

# Apply the recoding through the project helper
df_votings_replaced = replace_values(
    df_votings,
    replace_value_columns,
    replacement_mapping,
)

# Report the missing values of the recoded dataset
print(summarize_missing(df_votings_replaced))

### 2.2.2 Handle Missing Values

In [None]:
# Step 1 - Drop columns that have more than 20% missing values
df1 = drop_columns_missing(df_votings_replaced, threshold=0.20)

# Step 2 - Drop rows that have more than 30% missing values
df2 = drop_rows_missing(df1, threshold=0.30)

# Step 3 - Fill the remaining gaps in the voting columns with the mode
# (most frequent value); the identifier and the party label are left untouched
vote_cols = [name for name in df2.columns if name != 'ID' and name != 'class']
df_votings_cleaned = impute_mode(df2, columns=vote_cols)

# Step 4 - Summarize the missing values of the cleaned dataset
print(summarize_missing(df_votings_cleaned))

## 2.3 Data Analysis

### 2.3.1 Distribution of Votes by Party

In [None]:

def plot_vote_distribution(df) -> None:
    """
    Plot, for every vote column, the share of each answer within each party.

    One subplot is drawn per column of ``df`` other than "ID" and "class".
    Each subplot holds two bar traces: the normalized value counts of that
    column among rows whose "class" is "democrat" (blue) and among rows whose
    "class" is "republican" (red). Missing values are not counted, because
    ``value_counts`` ignores NaN by default.

    The subplot grid is 4 columns wide and has as many rows as are needed for
    the vote columns that are present, so the function also works after
    columns have been dropped or added. The figure height scales with the
    number of rows (300 px per row, i.e. 1200 px for 16 vote columns).

    Parameters:
    df (DataFrame): The DataFrame containing the voting data. It must have a
                    "class" column holding the party label.

    Returns:
    None: The figure is displayed with ``fig.show()``.
    """
    # Selecting all columns except "ID" and "class" for voting-related columns
    vote_cols = [c for c in df.columns if c not in ["ID", "class"]]

    # Grid size: fixed width, height derived from the number of vote columns
    # (ceiling division; at least one row so an empty frame still renders)
    cols = 4
    rows = max(1, -(-len(vote_cols) // cols))

    # Create a subplot figure with one titled cell per vote column
    fig = make_subplots(
        rows=rows,
        cols=cols,
        subplot_titles=vote_cols,
        # 0.1 for up to 4 rows; shrinks for taller grids because plotly
        # rejects a spacing larger than 1 / (rows - 1)
        vertical_spacing=min(0.1, 0.4 / rows),
        horizontal_spacing=0.05,
    )

    # Filter the rows of each party once instead of once per vote column.
    # Order matters: the Democrat trace is added before the Republican one.
    parties = [
        ("Democrat", df[df["class"] == "democrat"], "blue"),
        ("Republican", df[df["class"] == "republican"], "red"),
    ]

    for idx, col in enumerate(vote_cols):
        # Zero-based grid position; plotly expects one-based row/col below
        row, col_pos = divmod(idx, cols)

        for label, party_df, color in parties:
            # Share of each answer within this party for the current column
            shares = party_df[col].value_counts(normalize=True)

            fig.add_trace(
                go.Bar(
                    x=shares.index,             # Answer categories
                    y=shares.values,            # Proportion of each category
                    name=label,                 # Party label in the legend
                    marker_color=color,
                    showlegend=(idx == 0),      # Legend entry only once
                ),
                row=row + 1,
                col=col_pos + 1,
            )

    # Update the layout of the figure to include a title and adjust the overall look
    fig.update_layout(
        height=300 * rows,
        width=1200,
        title_text="Distribution of Votes by Party in %",
        bargap=0.2,
        legend=dict(orientation="h", yanchor="bottom", y=1.02),  # Horizontal legend above the subplots
    )

    # Without row/col arguments these calls apply to every subplot axis:
    # no x-axis title, "Proportion" as y-axis title
    fig.update_xaxes(title_text="")
    fig.update_yaxes(title_text="Proportion")

    # Display the figure
    fig.show()

In [None]:
# Vote distribution for df_votings, the DataFrame loaded from the CSV
# (answers in their original form, assuming replace_values did not modify it in place)
plot_vote_distribution(df_votings)

In [None]:
# Vote distribution for df_votings_cleaned, i.e. after value replacement,
# dropping of sparse columns/rows and mode imputation
plot_vote_distribution(df_votings_cleaned)

### 2.3.2 Correlation Matrix of Vote Features

In [None]:
def plot_vote_correlation(df) -> None:
    """
    Generates and displays a heatmap of the correlation matrix of the vote features.

    The "ID" column (a row identifier, not a vote feature) is left out if it
    is present. All remaining values are converted to numeric, coercing any
    non-numeric value to NaN. Columns that end up entirely NaN (for example a
    text-only label column) are dropped, because they would only add blank
    rows and columns to the matrix. The correlation matrix of the remaining
    columns is then shown as a Plotly heatmap with a color scale ranging from
    -1 (negative correlation) to +1 (positive correlation).

    Note that text answers such as 'y' / 'n' are coerced to NaN as well, so
    the function is only informative for a DataFrame whose votes are already
    encoded as numbers.

    Parameters:
    df (pandas.DataFrame): A pandas DataFrame containing vote-related features,
                            where columns represent different features and rows
                            represent individual observations.

    Returns:
    None: This function displays the heatmap directly but does not return any value.

    Example:
    plot_vote_correlation(vote_df)
    """
    # The identifier carries no voting information; errors="ignore" keeps the
    # function usable for frames that have no "ID" column
    features = df.drop(columns=["ID"], errors="ignore")

    # Convert all values to numeric, coercing non-numeric values to NaN
    df_encoded = features.apply(pd.to_numeric, errors='coerce')

    # Remove columns that contain no numeric value at all after coercion
    df_encoded = df_encoded.dropna(axis="columns", how="all")

    # Calculate the correlation matrix of the encoded DataFrame
    corr_test = df_encoded.corr()

    # Create a heatmap figure to visualize the correlation matrix
    fig_corr = go.Figure(data=go.Heatmap(
        z=corr_test.values,
        x=corr_test.columns,
        y=corr_test.index,
        colorscale="RdBu",
        zmin=-1,
        zmax=1,
        colorbar=dict(title="Correlation")
    ))

    # Update the layout of the heatmap
    fig_corr.update_layout(
        title="Correlation Matrix of Vote Features",  # Main title for the heatmap
        width=1000,  # Width of the heatmap figure (in pixels)
        height=800,  # Height of the heatmap figure (in pixels)
        title_x=0.5  # Center the title horizontally on the plot
    )

    # Display the heatmap
    fig_corr.show()

In [None]:
import plotly.express as px

def plot_votes_class_balance(df_votings):
    """
    Displays a histogram showing the class balance in congressional voting
    along with annotations giving the absolute count and percentage for
    each class (e.g., Democrat and Republican).

    Parameters:
    df_votings (pandas.DataFrame): The dataframe containing the voting data
                                    with a column named 'class' indicating
                                    the political party ('democrat' or 'republican').
                                    The parameter shadows the notebook-level
                                    variable of the same name inside this function.

    Returns:
    None: The figure is displayed with ``show()``; nothing is returned.

    Raises:
    ValueError: If the 'class' column contains no (non-missing) values, since
                there is nothing to plot or annotate.
    """
    # Calculate class counts and proportions
    class_counts = df_votings["class"].value_counts()  # Count the occurrences of each class

    # Fail early with an explicit message instead of the opaque
    # "max() arg is an empty sequence" raised further down otherwise
    if class_counts.empty:
        raise ValueError("Column 'class' contains no values; cannot plot class balance.")

    class_props = class_counts / class_counts.sum()  # Calculate the proportions for each class

    # Create histogram without text_auto
    fig_class = px.histogram(
        df_votings,
        x="class",
        color="class",
        color_discrete_map={"democrat": "#0015BC", "republican": "#E9141D"},  # Assign colors for each class
        title="<b>Class Balance in Congressional Voting</b>",  # Title of the chart
        category_orders={"class": ["democrat", "republican"]},  # Order of categories
        labels={"class": "Political Party"},  # Label for the 'class' axis
        width=800,
        height=500
    )

    # Manually add annotations: absolute count + percentage.
    # Iterating by label keeps each annotation tied to its own bar,
    # whatever order value_counts returned the classes in.
    for party, count in class_counts.items():
        prop = class_props[party]
        fig_class.add_annotation(
            x=party,  # Position annotation at the class
            y=count / 2,  # Center the annotation in the bar
            text=f"<b>{count} ({prop:.1%})</b>",  # Display the count and percentage
            showarrow=False,  # Do not display an arrow for annotation
            font=dict(size=14, color="white", family="Arial Black"),  # Font style for annotations
            align="center"  # Center align the text
        )

    # Layout adjustments
    fig_class.update_layout(
        uniformtext_minsize=12,  # Minimum size for uniform text
        uniformtext_mode="hide",  # Hide text when there is not enough space
        yaxis=dict(range=[0, class_counts.max() * 1.2])  # Adjust the y-axis range to give extra space
    )

    # Bar text (not needed – disabled)
    fig_class.update_traces(
        text=None,  # Disable bar text
        marker_line_width=0  # Disable the line around bars
    )

    fig_class.show()  # Display the histogram figure


In [None]:
# Class balance for df_votings, the DataFrame loaded from the CSV
plot_votes_class_balance(df_votings)

In [None]:
# Class balance for df_votings_cleaned, i.e. after sparse rows were dropped
# and missing votes imputed (shows whether the cleaning shifted the balance)
plot_votes_class_balance(df_votings_cleaned)