# 1. Notebook Preparation

## 1.1 Import necessary libraries

In [None]:
# Import the necessary libraries
import os

import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
import matplotlib.pyplot as plt
from plotly.subplots import make_subplots

from utils.logger import Logger
from utils.helpers import get_dataset_dimensions, count_unique_values, replace_values, summarize_missing, drop_columns_missing, drop_rows_missing, impute_mode


## 1.2. Create Utils & Helpers

In [72]:
# Create the notebook-wide logger instance.
# `Logger` is the project wrapper imported from `utils.logger`; it is named via `__name__`.
logger = Logger(__name__)

## 1.3. Load The Data Set Into A Pandas DataFrame

In [73]:
# Load the learning split (`.lrn` file, which carries the `class` label) into a pandas DataFrame.
# The path is relative to the notebook's working directory.
df_votings = pd.read_csv('../data/raw/kaggle/congress/CongressionalVotingID.shuf.lrn.csv')

In [None]:
# Load the inference split (`.tes` file) into a pandas DataFrame.
# The path is relative to the notebook's working directory.
df_votings_infer = pd.read_csv('../data/raw/kaggle/congress/CongressionalVotingID.shuf.tes.csv')

# 2. Data Analysis & Preparation 

## 2.1. Congressional Voting Data Set

### 2.1.1. General Information

In [75]:
# Step 2.1.1.1. - Display the dataset information
# (index range, column names, non-null counts and dtypes)
df_votings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 218 entries, 0 to 217
Data columns (total 18 columns):
 #   Column                                  Non-Null Count  Dtype 
---  ------                                  --------------  ----- 
 0   ID                                      218 non-null    int64 
 1   class                                   218 non-null    object
 2   handicapped-infants                     218 non-null    object
 3   water-project-cost-sharing              218 non-null    object
 4   adoption-of-the-budget-resolution       218 non-null    object
 5   physician-fee-freeze                    218 non-null    object
 6   el-salvador-aid                         218 non-null    object
 7   religious-groups-in-schools             218 non-null    object
 8   anti-satellite-test-ban                 218 non-null    object
 9   aid-to-nicaraguan-contras               218 non-null    object
 10  mx-missile                              218 non-null    object
 11  immigr

In [76]:
# Step 2.1.1.2. - Display the first 5 rows of the dataset
# (a quick look at how the votes are encoded in the raw file)
df_votings.head()

Unnamed: 0,ID,class,handicapped-infants,water-project-cost-sharing,adoption-of-the-budget-resolution,physician-fee-freeze,el-salvador-aid,religious-groups-in-schools,anti-satellite-test-ban,aid-to-nicaraguan-contras,mx-missile,immigration,synfuels-crporation-cutback,education-spending,superfund-right-to-sue,crime,duty-free-exports,export-administration-act-south-africa
0,362,democrat,y,n,y,n,y,y,y,n,y,y,n,n,y,y,n,unknown
1,86,democrat,n,n,y,n,y,y,n,n,n,y,y,y,y,y,n,y
2,264,democrat,y,n,y,n,n,n,y,y,y,n,n,n,n,n,y,unknown
3,258,republican,n,n,n,y,y,n,n,n,n,n,n,y,n,y,unknown,y
4,381,democrat,y,y,y,n,n,y,unknown,y,y,n,y,n,y,n,y,y


In [77]:
# Step 2.1.1.3. - Display the dataset dimensions
# The project helper logs the shape and also returns it, so it is shown as the cell result.
get_dataset_dimensions(df_votings)

2025-04-27 16:30:50,543 - helpers - INFO - Dataset dimensions: (218, 18)


(218, 18)

In [78]:
# Step 2.1.1.4. - Display column names & data types
# Header and dtype listing are emitted by a single print, one per line.
print("Column Names and Data Types:", df_votings.dtypes, sep="\n")

Column Names and Data Types:
ID                                         int64
class                                     object
handicapped-infants                       object
water-project-cost-sharing                object
adoption-of-the-budget-resolution         object
physician-fee-freeze                      object
el-salvador-aid                           object
religious-groups-in-schools               object
anti-satellite-test-ban                   object
aid-to-nicaraguan-contras                 object
mx-missile                                object
immigration                               object
synfuels-crporation-cutback               object
education-spending                        object
superfund-right-to-sue                    object
crime                                     object
duty-free-exports                         object
export-administration-act-south-africa    object
dtype: object


In [79]:
# Step 2.1.1.5. - Display the missing values per column
# `summarize_missing` is a project helper; its printed result has one row per column
# with the columns `missing_count` and `missing_pct`.
print("Missing Values per Column:")
print(summarize_missing(df_votings))

Missing Values per Column:
                                        missing_count  missing_pct
ID                                                  0          0.0
class                                               0          0.0
handicapped-infants                                 0          0.0
water-project-cost-sharing                          0          0.0
adoption-of-the-budget-resolution                   0          0.0
physician-fee-freeze                                0          0.0
el-salvador-aid                                     0          0.0
religious-groups-in-schools                         0          0.0
anti-satellite-test-ban                             0          0.0
aid-to-nicaraguan-contras                           0          0.0
mx-missile                                          0          0.0
immigration                                         0          0.0
synfuels-crporation-cutback                         0          0.0
education-spending                 

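The missing-value counts are all 0 because missing votes are not stored as technical nulls; they are captured as the placeholder strings `unknown` or `?`. Therefore we should replace these placeholders with a real null value (`None`/`NaN`) so that they are counted as missing.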

In [80]:
# Step 2.1.1.6. - Check for Unique Values and their amount
# Specify the columns to check for unique values:
# every column except the row identifier `ID`
unique_value_columns = [col for col in df_votings.columns if col != 'ID']

# Call the project helper to count unique values in the specified columns
unique_values = count_unique_values(df_votings, unique_value_columns)

# Display the unique values (rich notebook rendering instead of plain print)
display(unique_values)

Unnamed: 0_level_0,democrat,n,republican,unknown,y
feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
class,127,0,91,0,0
handicapped-infants,0,109,0,4,105
water-project-cost-sharing,0,90,0,28,100
adoption-of-the-budget-resolution,0,87,0,8,123
physician-fee-freeze,0,119,0,6,93
el-salvador-aid,0,104,0,7,107
religious-groups-in-schools,0,76,0,6,136
anti-satellite-test-ban,0,93,0,7,118
aid-to-nicaraguan-contras,0,89,0,12,117
mx-missile,0,107,0,8,103


In [81]:
# Step 2.1.1.7. - Display a summary of statistics (numerical columns)
# Header and statistics table are emitted by a single print, one per line.
print("Summary Statistics:", df_votings.describe(), sep="\n")

Summary Statistics:
               ID
count  218.000000
mean   209.279817
std    125.726093
min      6.000000
25%    102.250000
50%    209.500000
75%    315.500000
max    433.000000


--> The `describe` method is not useful at this point, as all columns are text-based and nothing except `ID` is numeric.

## 2.2. Data Pre-Processing

### 2.2.1 Replace Values

In [82]:
# Specify the columns to replace values: everything except the row identifier `ID`.
# For the learning frame this list also contains `class`; its values are not keys of
# the mapping below.
replace_value_columns = [col for col in df_votings.columns if col != 'ID']
replace_value_columns_infer = [col for col in df_votings_infer.columns if col != 'ID']
# Define the value mapping for replacement:
# yes/no votes become 1/0, both "missing" placeholders become NaN
replacement_mapping = {
    'y': 1,
    'n': 0,
    'unknown': np.nan,
    '?': np.nan
}

# Call the project helper to replace values in the specified columns.
# The results are bound to new names; the raw frames keep their original names.
# NOTE(review): this cell's output shows a pandas FutureWarning about silent
# downcasting in `replace` - presumably raised inside `replace_values`; confirm and
# handle it there (e.g. via `infer_objects`), as the warning message suggests.
df_votings_replaced = replace_values(df_votings, replace_value_columns, replacement_mapping)
df_votings_infer_replaced = replace_values(df_votings_infer, replace_value_columns_infer, replacement_mapping)

# After the replacement the former placeholders should show up as missing values
print(summarize_missing(df_votings_replaced))

                                        missing_count  missing_pct
ID                                                  0     0.000000
class                                               0     0.000000
handicapped-infants                                 4     1.834862
water-project-cost-sharing                         28    12.844037
adoption-of-the-budget-resolution                   8     3.669725
physician-fee-freeze                                6     2.752294
el-salvador-aid                                     7     3.211009
religious-groups-in-schools                         6     2.752294
anti-satellite-test-ban                             7     3.211009
aid-to-nicaraguan-contras                          12     5.504587
mx-missile                                          8     3.669725
immigration                                         3     1.376147
synfuels-crporation-cutback                        10     4.587156
education-spending                                 18     8.25


Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`



### 2.2.2 Handle Missing Values

In [83]:
# Step 1 - Drop columns that have more than 20% missing values
df1 = drop_columns_missing(df_votings_replaced, threshold=0.20)

# Step 2 - Drop rows that have more than 30% missing values
df2 = drop_rows_missing(df1, threshold=0.30)

# Step 3 - Impute missing values in the voting-related columns using the mode (most frequent value)
vote_cols = [c for c in df2.columns if c not in ['ID', 'class']]  # Impute every column except the identifier and the label
df_votings_cleaned = impute_mode(df2, columns=vote_cols)

# Step 4 - Summarize the missing values for the cleaned dataset (expected: none left)
print(summarize_missing(df_votings_cleaned))

                                   missing_count  missing_pct
ID                                             0          0.0
class                                          0          0.0
handicapped-infants                            0          0.0
water-project-cost-sharing                     0          0.0
adoption-of-the-budget-resolution              0          0.0
physician-fee-freeze                           0          0.0
el-salvador-aid                                0          0.0
religious-groups-in-schools                    0          0.0
anti-satellite-test-ban                        0          0.0
aid-to-nicaraguan-contras                      0          0.0
mx-missile                                     0          0.0
immigration                                    0          0.0
synfuels-crporation-cutback                    0          0.0
education-spending                             0          0.0
superfund-right-to-sue                         0          0.0
crime   

In [84]:
# Step 1 - Drop columns of the inference frame that have more than 20% missing values
# NOTE(review): the decision is based on the inference frame's own missing rates -
# confirm that the resulting column set always matches the cleaned learning frame.
df1_infer = drop_columns_missing(df_votings_infer_replaced, threshold=0.20)

# NOTE(review): unlike the learning frame, no rows are dropped and no mode imputation
# is applied here, so the "cleaned" inference frame still contains missing values -
# presumably they are handled later in the modelling step; confirm.
df_votings_cleaned_infer = df1_infer

# Step 2 - Summarize the missing values for the inference dataset
print(summarize_missing(df_votings_cleaned_infer))

                                   missing_count  missing_pct
ID                                             0     0.000000
handicapped-infants                            8     3.686636
water-project-cost-sharing                    20     9.216590
adoption-of-the-budget-resolution              3     1.382488
physician-fee-freeze                           5     2.304147
el-salvador-aid                                8     3.686636
religious-groups-in-schools                    5     2.304147
anti-satellite-test-ban                        7     3.225806
aid-to-nicaraguan-contras                      3     1.382488
mx-missile                                    14     6.451613
immigration                                    4     1.843318
synfuels-crporation-cutback                   11     5.069124
education-spending                            13     5.990783
superfund-right-to-sue                         9     4.147465
crime                                          6     2.764977
duty-fre

## 2.3 Data Analysis

### 2.3.1 Distribution of Votes by Party

In [85]:

def plot_vote_distribution(df, cols: int = 4) -> None:
    """
    Plot the per-party distribution of votes for every vote column.

    Every column of ``df`` other than "ID" and "class" is treated as a vote
    column and gets its own subplot. Each subplot holds two bar traces: the
    normalized value counts of that column for the rows whose "class" is
    "democrat" (blue) and for the rows whose "class" is "republican" (red).

    The subplot grid has ``cols`` columns and as many rows as are needed to
    fit all vote columns, so frames with more or fewer than 16 vote columns
    are handled too. With the default of 4 columns and 16 vote columns the
    result is a 4 x 4 grid of 1200 x 1200 pixels.

    Parameters:
    df (DataFrame): The DataFrame containing voting data; it must have a
                    "class" column.
    cols (int): Number of subplot columns in the grid (default: 4).

    Returns:
    None: The figure is displayed with ``fig.show()``.

    Raises:
    ValueError: If ``cols`` is smaller than 1.
    """
    if cols < 1:
        raise ValueError("cols must be at least 1")

    # Selecting all columns except "ID" and "class" for voting-related columns
    vote_cols = [c for c in df.columns if c not in ["ID", "class"]]

    # Ceiling division: just enough rows for all vote columns (at least one row,
    # so that an input without vote columns still yields a valid, empty grid)
    rows = max(1, (len(vote_cols) + cols - 1) // cols)

    # Create the subplot grid; titles are the names of the voting columns.
    # The spacing values equal 0.1 / 0.05 for small grids and shrink for large
    # ones, because plotly rejects spacings above 1 / (rows - 1) or 1 / (cols - 1).
    fig = make_subplots(
        rows=rows,
        cols=cols,
        subplot_titles=vote_cols,
        vertical_spacing=min(0.1, 0.5 / max(rows - 1, 1)),
        horizontal_spacing=min(0.05, 0.5 / max(cols - 1, 1)),
    )

    # (value in the "class" column, legend label, bar color) for each party
    party_styles = (
        ("democrat", "Democrat", "blue"),
        ("republican", "Republican", "red"),
    )

    # One subplot per vote column, filled row by row
    for idx, col in enumerate(vote_cols):
        grid_row, grid_col = divmod(idx, cols)

        for party, label, color in party_styles:
            # Share of each vote value within the party; value_counts leaves out
            # missing values by default, so shares refer to the non-missing votes
            shares = df.loc[df["class"] == party, col].value_counts(normalize=True)

            fig.add_trace(
                go.Bar(
                    x=shares.index,             # Categories of votes (e.g., Yes, No, etc.)
                    y=shares.values,            # Proportion of votes for each category
                    name=label,                 # Label for the party in the legend
                    marker_color=color,         # Party color
                    showlegend=(idx == 0)       # Show the legend only for the first subplot
                ),
                row=grid_row + 1,   # plotly grid positions are 1-based
                col=grid_col + 1
            )

    # Update the layout of the figure to include a title and adjust the overall look
    fig.update_layout(
        height=300 * rows,   # 300 px per grid row (1200 px for the 4 x 4 default)
        width=300 * cols,    # 300 px per grid column (1200 px for the 4 x 4 default)
        title_text="Distribution of Votes by Party in %",
        bargap=0.2,
        legend=dict(orientation="h", yanchor="bottom", y=1.02)  # Position the legend horizontally above the subplots
    )

    # Without row/col arguments these calls apply to every subplot:
    # no x-axis titles, "Proportion" as y-axis title
    fig.update_xaxes(title_text="")
    fig.update_yaxes(title_text="Proportion")

    # Display the figure
    fig.show()

In [86]:
# Vote distribution on the raw frame (before placeholder replacement and imputation)
plot_vote_distribution(df_votings)

In [87]:
# Vote distribution on the cleaned frame (after value replacement and mode imputation)
plot_vote_distribution(df_votings_cleaned)

### 2.3.2 Correlation Matrix of Vote Features

In [88]:
def plot_vote_correlation(df) -> None:
    """
    Generates and displays a heatmap of the correlation matrix of the vote features.

    The row identifier column "ID" is left out if it is present, because it is
    not a vote feature. All remaining values are converted to numeric values,
    coercing anything non-numeric to NaN. Columns that consist only of NaN
    after the conversion (for example a text column such as "class") are
    dropped, as they would only add an empty row and column to the heatmap.
    The correlation matrix of the remaining columns is then shown as a Plotly
    heatmap with a color scale ranging from -1 (negative correlation) to
    +1 (positive correlation).

    Parameters:
    df (pandas.DataFrame): A pandas DataFrame containing vote-related features,
                            where columns represent different features and rows
                            represent individual observations.

    Returns:
    None: This function displays the heatmap directly but does not return any value.

    Example:
    plot_vote_correlation(vote_df)
    """
    df_encoded = (
        # "ID" is an identifier, not a feature; ignore silently if it is absent
        df.drop(columns=["ID"], errors="ignore")
        # Convert column by column; non-numeric entries become NaN
        .apply(pd.to_numeric, errors="coerce")
        # Remove columns without a single numeric value
        .dropna(axis=1, how="all")
    )

    # Pairwise correlation of the remaining numeric columns
    corr_matrix = df_encoded.corr()

    # Create a heatmap figure to visualize the correlation matrix
    fig_corr = go.Figure(data=go.Heatmap(
        z=corr_matrix.values,
        x=corr_matrix.columns,
        y=corr_matrix.index,
        colorscale="RdBu",
        zmin=-1,
        zmax=1,
        colorbar=dict(title="Correlation")
    ))

    # Update the layout of the heatmap
    fig_corr.update_layout(
        title="Correlation Matrix of Vote Features",  # Main title for the heatmap
        width=1000,  # Width of the heatmap figure (in pixels)
        height=800,  # Height of the heatmap figure (in pixels)
        title_x=0.5  # Center the title horizontally on the plot
    )

    # Display the heatmap
    fig_corr.show()

In [89]:
import plotly.express as px

def plot_votes_class_balance(df_votings) -> None:
    """
    Displays a histogram showing the class balance in congressional voting
    along with annotations displaying the absolute count and percentage for
    each class (e.g., Democrat and Republican).

    Parameters:
    df_votings (pandas.DataFrame): The dataframe containing the voting data
                                    with a column named 'class' indicating
                                    the political party ('democrat' or 'republican').

    Returns:
    None: The figure is displayed with ``fig_class.show()``; nothing is returned,
          so the figure is not rendered a second time as the cell result.
    """
    # Calculate class counts and proportions
    class_counts = df_votings["class"].value_counts()  # Count the occurrences of each class
    class_props = class_counts / class_counts.sum()  # Calculate the proportions for each class

    # Create histogram without text_auto
    fig_class = px.histogram(
        df_votings,
        x="class",
        color="class",
        color_discrete_map={"democrat": "#0015BC", "republican": "#E9141D"},  # Assign colors for each class
        title="<b>Class Balance in Congressional Voting</b>",  # Title of the chart
        category_orders={"class": ["democrat", "republican"]},  # Order of categories
        labels={"class": "Political Party"},  # Label for the 'class' axis
        width=800,
        height=500
    )

    # Manually add annotations: absolute count + percentage
    for party, count, prop in zip(class_counts.index, class_counts, class_props):
        fig_class.add_annotation(
            x=party,  # Position annotation at the class
            y=count / 2,  # Center the annotation in the bar
            text=f"<b>{count} ({prop:.1%})</b>",  # Display the count and percentage
            showarrow=False,  # Do not display an arrow for annotation
            font=dict(size=14, color="white", family="Arial Black"),  # Font style for annotations
            align="center"  # Center align the text
        )

    # 20% headroom above the tallest bar; fall back to a unit range when the
    # 'class' column has no values, since there is no maximum to scale from
    y_upper = class_counts.max() * 1.2 if len(class_counts) else 1

    # Layout adjustments
    fig_class.update_layout(
        uniformtext_minsize=12,  # Minimum size for uniform text
        uniformtext_mode="hide",  # Hide text when there is not enough space
        yaxis=dict(range=[0, y_upper])  # Adjust the y-axis range to give extra space
    )

    # Bar text (not needed – disabled)
    fig_class.update_traces(
        text=None,  # Disable bar text
        marker_line_width=0  # Disable the line around bars
    )

    fig_class.show()  # Display the histogram figure


In [90]:
# Class balance of the raw learning frame
plot_votes_class_balance(df_votings)

In [91]:
# Class balance after cleaning (rows with too many missing values may have been dropped)
plot_votes_class_balance(df_votings_cleaned)

In [92]:
# Make sure the output folder exists; `to_csv` does not create missing directories,
# so the export would otherwise fail on a fresh checkout.
os.makedirs("../data/cleaned", exist_ok=True)

# Persist the cleaned learning and inference frames for the following notebooks.
# NOTE(review): `to_csv` writes the DataFrame index as an unnamed first column by
# default - confirm that the consumers of these files expect that column.
df_votings_cleaned.to_csv("../data/cleaned/votings_cleaned.csv")
df_votings_cleaned_infer.to_csv("../data/cleaned/votings_cleaned_infer.csv")