In [None]:
import pandas as pd
import os
from google.colab import files

In [None]:
# Upload the input file
uploaded = files.upload()

# Fail fast when nothing was uploaded: otherwise the loop below never runs and
# `new_filename` / `file_path` are left undefined (or stale from a previous run
# of this cell), which makes a later cell fail in a confusing way.
if not uploaded:
    raise ValueError("No file was uploaded. Re-run this cell and select a CSV or XLSX file.")

# Every uploaded file is renamed to 'input<ext>', so only one input is meaningful.
if len(uploaded) > 1:
    print(f"Warning: {len(uploaded)} files were uploaded; file_path will point to the last one processed.")

# Rename the uploaded file to 'input' with the correct extension
for original_filename in uploaded.keys():
    # Get the file extension
    _, file_extension = os.path.splitext(original_filename)
    # Define the new filename
    new_filename = 'input' + file_extension
    # Rename the file; os.replace overwrites an 'input<ext>' left over from an
    # earlier run on every platform (os.rename fails on Windows if it exists)
    os.replace(original_filename, new_filename)
    # Set the file_path to the new filename
    file_path = new_filename
    print(f"File uploaded and renamed to: {new_filename}")

In [None]:
def load_data(file_path):
    """
    Load data from a CSV or Excel (XLS/XLSX) file into a DataFrame.

    The reader is chosen from the file name's extension. The comparison is
    case-insensitive, so names such as 'input.CSV' or 'input.XLSX' are accepted.

    Args:
        file_path (str or os.PathLike): The path to the input file.

    Returns:
        pd.DataFrame: The loaded DataFrame.

    Raises:
        ValueError: If the file name does not end in .csv, .xls or .xlsx.
        Exception: Any other error raised while reading the file. Every error
            is printed as "Error loading data: ..." and then re-raised unchanged.
    """
    try:
        # Normalise once so the extension check ignores letter case
        normalized_name = os.fspath(file_path).lower()
        if normalized_name.endswith('.csv'):
            df = pd.read_csv(file_path)
        elif normalized_name.endswith(('.xls', '.xlsx')):
            df = pd.read_excel(file_path)
        else:
            raise ValueError("Unsupported file format. Please provide a CSV or XLSX file.")
        return df
    except Exception as e:
        print(f"Error loading data: {e}")
        raise

In [None]:
def _format_report_date(value):
    """Render a date cell for the validation report; missing values print as 'NaT'."""
    if pd.isnull(value):
        # NaT.date() raises, so missing dates must be handled before formatting
        return "NaT"
    return str(value.date()) if hasattr(value, 'date') else str(value)


def validate_data(df):
    """
    Perform basic data quality checks on the input DataFrame.
    Prints warnings for potential issues but preserves data, this serves as a warning/insight.
    The input DataFrame is not modified; all work happens on a copy.

    Checks reported (each as a small 'loan_id | ...' table):
        - values in the date/numeric columns that could not be parsed and were
          therefore treated as missing
        - missing 'boarding_date' or 'original_amount_financed'
        - 'charge_off_date' earlier than 'boarding_date'
        - duplicate 'loan_id' entries
        - negative 'original_amount_financed' or 'net_call_off'

    Fixes applied to the returned copy:
        - 'boarding_date' and 'charge_off_date' are converted to datetimes
          (unparseable values become NaT)
        - 'net_call_off' and 'original_amount_financed' are converted to numbers
          (unparseable values become NaN)
        - missing 'net_call_off' values are replaced with 0.0

    Args:
        df (pd.DataFrame): DataFrame containing loan servicing data

    Returns:
        df (pd.DataFrame): Converted copy of the loan servicing data

    Raises:
        ValueError: If any required column is missing.
    """
    df_copy = df.copy()

    # Check for required columns
    required_columns = ['loan_id', 'boarding_date', 'charge_off_date',
                       'net_call_off', 'original_amount_financed']
    missing_columns = set(required_columns) - set(df_copy.columns)
    if missing_columns:
        raise ValueError(f"Missing required columns: {missing_columns}")

    issues_found = False

    # Convert date and numeric columns. errors='coerce' turns unparseable values
    # into NaT/NaN without raising, so the values lost that way are detected by
    # comparing the missing-value mask before and after the conversion.
    conversions = [
        ('boarding_date', pd.to_datetime, 'dates'),
        ('charge_off_date', pd.to_datetime, 'dates'),
        ('net_call_off', pd.to_numeric, 'numbers'),
        ('original_amount_financed', pd.to_numeric, 'numbers'),
    ]
    for col, convert, kind in conversions:
        was_missing = df_copy[col].isnull()
        try:
            converted = convert(df_copy[col], errors='coerce')
        except Exception:
            # Best effort: keep the original column if the conversion itself fails
            print(f"Warning: Some {col} values could not be converted to {kind}")
            continue

        coerced = converted.isnull() & ~was_missing
        if coerced.any():
            issues_found = True
            print(f"\nUnparseable {col} values (treated as missing):")
            print(f"loan_id | {col}")
            print("-" * 30)
            for loan_id, raw_value in zip(df_copy.loc[coerced, 'loan_id'], df_copy.loc[coerced, col]):
                print(f"{loan_id} | {raw_value}")
        df_copy[col] = converted

    # Check only for critical missing values (boarding_date and original_amount_financed)
    critical_columns = ['boarding_date', 'original_amount_financed']
    critical_missing = df_copy[critical_columns].isnull()
    if critical_missing.any().any():
        issues_found = True
        print("\nMissing required values:")
        missing_rows = df_copy[critical_missing.any(axis=1)]
        print("loan_id | missing values")
        print("-" * 30)
        for _, row in missing_rows.iterrows():
            missing_cols = [col for col in critical_columns if pd.isnull(row[col])]
            print(f"{row['loan_id']} | {', '.join(missing_cols)}")

    # Check for charge-off dates before boarding dates
    invalid_dates = df_copy[df_copy['charge_off_date'] < df_copy['boarding_date']]
    if not invalid_dates.empty:
        issues_found = True
        print("\nLoans with charge-off date before boarding date:")
        print("loan_id | boarding_date | charge_off_date")
        print("-" * 50)
        for _, row in invalid_dates.iterrows():
            print(f"{row['loan_id']} | {_format_report_date(row['boarding_date'])} | "
                  f"{_format_report_date(row['charge_off_date'])}")

    # Check for duplicate loan_ids
    duplicates = df_copy[df_copy['loan_id'].duplicated(keep=False)]
    if not duplicates.empty:
        issues_found = True
        print("\nDuplicate loan_id entries:")
        print("loan_id | boarding_date")
        print("-" * 30)
        for _, row in duplicates.iterrows():
            # A duplicated loan may also lack a boarding date; format it safely
            print(f"{row['loan_id']} | {_format_report_date(row['boarding_date'])}")

    # Check for negative values
    for col in ['original_amount_financed', 'net_call_off']:
        negative_values = df_copy[df_copy[col] < 0]
        if not negative_values.empty:
            issues_found = True
            print(f"\nNegative values found in {col}:")
            print(f"loan_id | {col}")
            print("-" * 30)
            for _, row in negative_values.iterrows():
                print(f"{row['loan_id']} | {row[col]}")

    # Basic data fixes
    # Replace NaN or None to NaT to be handled as a blank time in pandas
    df_copy['charge_off_date'] = df_copy['charge_off_date'].fillna(pd.NaT)
    # Replace NaN or None with 0.0
    df_copy['net_call_off'] = df_copy['net_call_off'].fillna(0.0)

    if not issues_found:
        print("No major data quality issues found")

    return df_copy

In [None]:
def calculate_vintage_matrix(df, period_type='quarterly', calculation_type='sum', output_file_path=None):
    """
    Calculate the vintage matrix for cumulative net call-offs with flexible time period options.

    Loans are grouped into cohorts by the period of their boarding_date (matrix
    columns). Rows are the number of periods between boarding and charge-off;
    loans without a charge_off_date are placed at the current period, so the
    result depends on the date the function is run. Each cell holds the
    cumulative net_call_off of the cohort up to that row. The first row,
    labelled 'originations', holds the cohort's total original_amount_financed.

    The input DataFrame is not modified. Rows with a missing boarding_date
    cannot be assigned to a cohort and are excluded with a printed warning.

    Args:
        df (pd.DataFrame): DataFrame containing loan servicing data with the columns
            'boarding_date', 'charge_off_date', 'net_call_off' and 'original_amount_financed'
        period_type (str): Time period for analysis - 'monthly', 'quarterly', or 'yearly' (case-insensitive)
        calculation_type (str): 'sum' for cumulative net call-off sum, 'percent' for percentage of
            cumulative amount financed for period (case-insensitive)
        output_file_path (str): Path to save the output Excel file. If None, will generate default name
            'vintage_analysis_<period_type>_<calculation_type>.xlsx'

    Returns:
        vintage_matrix (pd.DataFrame): The calculated vintage matrix (also written to output_file_path)

    Raises:
        ValueError: If period_type or calculation_type is not one of the supported values.
    """
    # Validate and set period parameters
    period_settings = {
        'monthly': {'freq': 'M', 'name': 'month'},
        'quarterly': {'freq': 'Q', 'name': 'quarter'},
        'yearly': {'freq': 'Y', 'name': 'year'}
    }

    period_key = period_type.lower()
    if period_key not in period_settings:
        raise ValueError("period_type must be 'monthly', 'quarterly', or 'yearly'")

    # Validate like period_type: an unknown or differently-cased value used to
    # fall through silently to the 'sum' calculation.
    calculation_key = calculation_type.lower()
    if calculation_key not in ('sum', 'percent'):
        raise ValueError("calculation_type must be 'sum' or 'percent'")

    period_freq = period_settings[period_key]['freq']
    period_name = period_settings[period_key]['name']

    # Set default output file path if none provided
    if output_file_path is None:
        output_file_path = f'vintage_analysis_{period_key}_{calculation_key}.xlsx'

    # Work on a private copy so the caller's DataFrame is left untouched
    loans = df.copy()

    # Ensure datetime format for dates
    loans['boarding_date'] = pd.to_datetime(loans['boarding_date'])
    loans['charge_off_date'] = pd.to_datetime(loans['charge_off_date'])

    # A loan without a boarding date has no cohort and no period offset
    has_boarding_date = loans['boarding_date'].notna()
    if not has_boarding_date.all():
        excluded = int((~has_boarding_date).sum())
        print(f"Warning: excluding {excluded} loan(s) with a missing boarding_date from the vintage matrix")
        loans = loans[has_boarding_date].copy()

    # Add the period column based on boarding_date
    period_col = f'year{period_name}'
    boarding_periods = loans['boarding_date'].dt.to_period(period_freq)
    loans[period_col] = boarding_periods.astype(str)

    # Calculate the number of periods since vintage started. Loans that have
    # not charged off are measured up to the current period (computed once).
    current_period = pd.Timestamp.now().to_period(period_freq)
    charge_off_periods = loans['charge_off_date'].dt.to_period(period_freq)
    vintage_col = f'vintage_{period_name}'
    loans[vintage_col] = [
        ((current_period if pd.isna(charge_off) else charge_off) - boarding).n
        for boarding, charge_off in zip(boarding_periods, charge_off_periods)
    ]

    # Replace any missing net_call_off values with 0
    loans['net_call_off'] = loans['net_call_off'].fillna(0)

    # Calculate the total original amount financed for each vintage
    origination_sum = loans.groupby(period_col)['original_amount_financed'].sum()

    # Group by period and vintage to calculate cumulative net_call_off
    cumulative_data = (
        loans.groupby([vintage_col, period_col])['net_call_off']
        .sum()
        .groupby(level=1)
        .cumsum()
        .reset_index()
    )

    if calculation_key == 'percent':
        cumulative_data['origination_sum'] = cumulative_data[period_col].map(origination_sum)
        cumulative_data['net_call_off'] = (
            cumulative_data['net_call_off'] / cumulative_data['origination_sum']
        ) * 100

    # Pivot to create the matrix
    vintage_matrix = cumulative_data.pivot(
        index=vintage_col, columns=period_col, values='net_call_off'
    )

    # Create a separate one-row DataFrame for originations, labelled 'originations'
    originations_df = origination_sum.rename('originations').to_frame().T

    # Get the maximum number of vintage periods
    # NOTE(review): this is the count of distinct offsets present, and the
    # diagonal cutoff below treats row/column positions as consecutive periods;
    # gaps in the observed offsets or cohorts are not filled in - confirm that
    # this is acceptable for the data at hand.
    max_vintage_periods = len(vintage_matrix.index)

    # Process each column to handle empty cells and value forwarding
    for col_idx in range(len(vintage_matrix.columns)):
        limit = max_vintage_periods - col_idx  # Diagonal cutoff limit
        if limit <= 0:
            continue

        column = vintage_matrix.iloc[:, col_idx]
        filled = column.iloc[:limit].copy()
        first_valid_label = column.first_valid_index()

        if first_valid_label is None:
            # If no valid values, fill everything up to limit with 0
            filled.iloc[:] = 0
        else:
            # Cells before the first observed value are 0 (nothing charged off
            # yet); later gaps carry the last cumulative value forward.
            first_valid_pos = vintage_matrix.index.get_loc(first_valid_label)
            filled.iloc[:first_valid_pos] = 0
            filled = filled.ffill()

        # Write back with a single positional assignment; chained assignment
        # (frame[col].iloc[...] = ...) does not update the frame under Copy-on-Write
        vintage_matrix.iloc[:limit, col_idx] = filled.to_numpy()

    # Combine originations with vintage matrix
    vintage_matrix = pd.concat([originations_df, vintage_matrix])

    # Save the final vintage matrix to an Excel file
    vintage_matrix.to_excel(output_file_path)
    print(f"{period_type.capitalize()} vintage matrix saved to {output_file_path}")

    return vintage_matrix

In [None]:
# Load the uploaded file; `new_filename` is set by the upload cell above,
# so that cell must have been run first.
df = load_data(new_filename)
# Run the data-quality checks; `df` is rebound to the converted copy they return.
df = validate_data(df)
# Build the quarterly matrix as a percentage of originations. No output path is
# given, so the result is also saved to 'vintage_analysis_quarterly_percent.xlsx'.
quarterly_matrix = calculate_vintage_matrix(df, period_type='quarterly', calculation_type='percent')