<a href="https://colab.research.google.com/github/fletchdr7/cadet_data_solution/blob/main/cadet_data_notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [30]:
##########################################
# 1) INSTALL & IMPORT LIBRARIES
##########################################

!pip install striprtf thefuzz pandas numpy

import pandas as pd
import numpy as np
import sqlite3
from thefuzz import process
from striprtf.striprtf import rtf_to_text
from io import StringIO
from pathlib import Path
import os

print("Libraries installed and imported.")

##########################################
# 2) UPLOAD & PARSE THE RTF FILE
##########################################

from google.colab import files

print("Please upload your .rtf file (e.g., W_DFAS_257.rtf).")
uploaded_rtf = files.upload()
rtf_filename = list(uploaded_rtf.keys())[0]
print(f"RTF file uploaded as: {rtf_filename}")

# Decode raw bytes and convert RTF to plain text
rtf_bytes = uploaded_rtf[rtf_filename]
rtf_text = rtf_bytes.decode('utf-8', errors='ignore')
plain_text = rtf_to_text(rtf_text)

print("\nPreview of RTF plain text (first 500 characters):")
print(plain_text[:500])

##########################################
# 3) PARSE THE RTF TABLE (PIPE-DELIMITED)
##########################################

lines = plain_text.splitlines()
table_lines = []
found_header = False

for line in lines:
    if line.startswith("SSN|"):
        found_header = True
    if found_header:
        if line.strip() == "":
            break
        table_lines.append(line)

table_str = "\n".join(table_lines)

df_rtf = pd.read_csv(StringIO(table_str), sep="|")

print("\nInitial df_rtf from RTF table:")
print(df_rtf.columns.tolist())  # Debugging - Show column names
display(df_rtf.head())

# Remove any 'Unnamed' columns
df_rtf.columns = df_rtf.columns.str.strip()  # Remove extra spaces in column names
df_rtf.rename(columns={"Name": "NAME"}, inplace=True)  # Standardize column name
unnamed_cols = [c for c in df_rtf.columns if "Unnamed" in c]
if unnamed_cols:
    df_rtf.drop(columns=unnamed_cols, inplace=True)
    print(f"Dropped unnamed columns: {unnamed_cols}")

##########################################
# 4) CLEAN/CONVERT RTF DATA
##########################################

# Convert numeric columns
numeric_cols = [
    "SUBSIS    PAID",
    "BOOKS & FEES",
    "TRAINING PAID",
    "BONUS PAID",
    "COMM PAID",
    "TRN DAY",
    "COLLECTION",
    "NET PAY",
    "CUM SUBSIS",
    "CUM SUB DAY",
]
for col in numeric_cols:
    if col in df_rtf.columns:
        df_rtf[col] = pd.to_numeric(df_rtf[col], errors="coerce")

# Convert ROTC GRAD DATE to datetime
if "ROTC GRAD DATE" in df_rtf.columns:
    df_rtf["ROTC GRAD DATE"] = pd.to_datetime(df_rtf["ROTC GRAD DATE"], format="%Y%m%d", errors="coerce")

print("\nAfter numeric/date conversion:")
df_rtf.info()

##########################################
# 5) UPLOAD & READ THE CSV
##########################################

print("\nUpload your SMR_AS_FAVORITE.csv file:")
uploaded_csv = files.upload()
csv_filename = list(uploaded_csv.keys())[0]
print(f"CSV file uploaded as: {csv_filename}")

df_csv = pd.read_csv(csv_filename)

# Debugging - Show column names before renaming
print("\nOriginal df_csv columns:")
print(df_csv.columns.tolist())

# Standardize column names
df_csv.columns = df_csv.columns.str.strip()  # Remove extra spaces
df_csv.rename(columns={"Name": "NAME", "Schlr Activ Dt": "Schlr_Act_Dt", "AS Year": "AS_Year"}, inplace=True)

# Convert scholarship activation date to datetime
df_csv["Schlr_Act_Dt"] = pd.to_datetime(df_csv["Schlr_Act_Dt"], errors="coerce")

print("\nUpdated df_csv columns:")
print(df_csv.columns.tolist())

##########################################
# 6) MERGE RTF & CSV DATA
##########################################

print("\nChecking column match before merging:")
print("RTF Columns:", df_rtf.columns.tolist())
print("CSV Columns:", df_csv.columns.tolist())

df_merged = pd.merge(df_rtf, df_csv, on="NAME", how="left")

print("\nMerged DataFrame shape:", df_merged.shape)
print("\nMerged DataFrame columns:")
print(df_merged.columns.tolist())

##########################################
# 7) CALCULATE DAYS PAID
##########################################

from datetime import datetime

def calculate_days_paid(df):
    today = datetime.today()
    end_date = datetime(today.year, today.month - 1, 30)

    # Ensure `Schlr_Act_Dt` is datetime type
    df["Schlr_Act_Dt"] = pd.to_datetime(df["Schlr_Act_Dt"], errors="coerce")

    # Function to determine start date based on AS Year
    def get_start_date(row):
        if pd.isna(row["Schlr_Act_Dt"]):
            return None  # Handle missing values
        if row["AS_Year"] == "AS300":
            return max(pd.Timestamp("2024-08-19"), row["Schlr_Act_Dt"])
        elif row["AS_Year"] == "AS400":
            return max(pd.Timestamp("2023-08-19"), row["Schlr_Act_Dt"])
        elif row["AS_Year"] == "AS200":
            return max(pd.Timestamp("2024-08-19"), row["Schlr_Act_Dt"])
        elif row["AS_Year"] == "AS100":
            return max(pd.Timestamp("2023-08-21"), row["Schlr_Act_Dt"])
        return None

    df["StartDate"] = df.apply(get_start_date, axis=1)

    # Ensure StartDate is not later than EndDate
    df["StartDateValid"] = df["StartDate"].where(df["StartDate"] <= end_date)

    # Ensure calculations only apply where valid dates exist
    df["DaysInStartMonth"] = df["StartDateValid"].apply(lambda x: (30 - x.day + 1) if pd.notna(x) else 0)
    df["DaysInEndMonth"] = end_date.day if pd.notna(end_date) else 0

    # Calculate full months using integer conversion of timedelta
    df["FullMonths"] = df.apply(
        lambda x: ((end_date - x["StartDateValid"]).days // 30) - 1 if pd.notna(x["StartDateValid"]) else 0,
        axis=1
    )

    # Calculate total days including full months and start/end months
    df["TotalDays"] = df["DaysInStartMonth"] + (df["FullMonths"] * 30) + df["DaysInEndMonth"]

    # Apply summer exclusion logic
    df["IsSummerExcluded"] = (
        df["AS_Year"].isin(["AS100", "AS200"]) & (df["Schlr_Act_Dt"].dt.year < 2024)
    )

    df["DaysPaid"] = np.where(
        df["IsSummerExcluded"],
        df["TotalDays"] - 0,  # Modify if necessary
        df["TotalDays"]
    )

    return df

# Apply function to the merged DataFrame
df_merged = calculate_days_paid(df_merged)

##########################################
# 8) DISPLAY RESULTS
##########################################

from IPython.display import display

# Show the final DataFrame in Colab
print("\nFinal Days Paid Calculation DataFrame:")
display(df_merged.head(20))  # Display the first 20 rows

print("\n✅ Calculation complete and displayed in Colab.")

df_merged.to_csv("DaysPaid_Calculation.csv", index=False)
print("✅ File saved as DaysPaid_Calculation.csv")

from google.colab import files
files.download("DaysPaid_Calculation.csv")



Libraries installed and imported.
Please upload your .rtf file (e.g., W_DFAS_257.rtf).


Saving W_DFAS_257.rtf to W_DFAS_257 (17).rtf
RTF file uploaded as: W_DFAS_257 (17).rtf

Preview of RTF plain text (first 500 characters):
SSN|NAME|PGM CD|SEX CD|PAY ST|SUBSIS    PAID|BOOKS & FEES|TRAINING PAID|BONUS PAID|COMM PAID|TRN DAY|COLLECTION|NET PAY|ROTC GRAD DATE|CUM SUBSIS|CUM SUB DAY|
***-**-3239|BURNO|NS04|M|X|500.00|0.00|0.00|0.00|0.00|0|0.00|500.00|20250516|2183.33|131|
***-**-0811|DANIE|NS04|F|X|500.00|0.00|0.00|0.00|0.00|0|0.00|500.00|20250804|6580.00|422|
***-**-9548|MEJIA|NS04|F|A|0.00|0.00|0.00|0.00|0.00|0|0.00|0.00|20250509|0.00|0|
***-**-0376|JORDA|NS03|F|X|450.00|0.00|0.00|0.00|0.00|0|0.00|450.00|20260515|232

Initial df_rtf from RTF table:
['SSN', 'NAME', 'PGM CD', 'SEX CD', 'PAY ST', 'SUBSIS    PAID', 'BOOKS & FEES', 'TRAINING PAID', 'BONUS PAID', 'COMM PAID', 'TRN DAY', 'COLLECTION', 'NET PAY', 'ROTC GRAD DATE', 'CUM SUBSIS', 'CUM SUB DAY', 'Unnamed: 16']


Unnamed: 0,SSN,NAME,PGM CD,SEX CD,PAY ST,SUBSIS PAID,BOOKS & FEES,TRAINING PAID,BONUS PAID,COMM PAID,TRN DAY,COLLECTION,NET PAY,ROTC GRAD DATE,CUM SUBSIS,CUM SUB DAY,Unnamed: 16
0,***-**-3239,BURNO,NS04,M,X,500.0,0.0,0.0,0.0,0.0,0,0.0,500.0,20250516,2183.33,131,
1,***-**-0811,DANIE,NS04,F,X,500.0,0.0,0.0,0.0,0.0,0,0.0,500.0,20250804,6580.0,422,
2,***-**-9548,MEJIA,NS04,F,A,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,20250509,0.0,0,
3,***-**-0376,JORDA,NS03,F,X,450.0,0.0,0.0,0.0,0.0,0,0.0,450.0,20260515,2325.0,155,
4,***-**-3503,TILLE,NS03,M,X,450.0,0.0,0.0,0.0,0.0,0,0.0,450.0,20260515,2430.0,162,


Dropped unnamed columns: ['Unnamed: 16']

After numeric/date conversion:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14 entries, 0 to 13
Data columns (total 16 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   SSN             14 non-null     object        
 1   NAME            14 non-null     object        
 2   PGM CD          14 non-null     object        
 3   SEX CD          14 non-null     object        
 4   PAY ST          14 non-null     object        
 5   SUBSIS    PAID  14 non-null     float64       
 6   BOOKS & FEES    14 non-null     float64       
 7   TRAINING PAID   14 non-null     float64       
 8   BONUS PAID      14 non-null     float64       
 9   COMM PAID       14 non-null     float64       
 10  TRN DAY         14 non-null     int64         
 11  COLLECTION      14 non-null     float64       
 12  NET PAY         14 non-null     float64       
 13  ROTC GRAD DATE  14 non-null     datetim

Saving SMR_AS_FAVORITE.csv to SMR_AS_FAVORITE (17).csv
CSV file uploaded as: SMR_AS_FAVORITE (17).csv

Original df_csv columns:
['EmplID', 'Detachment', 'AS Year', 'Name', 'Date of Birth', 'Type Program', 'Major Level', 'Stu Status', 'Stud Prog', 'Citizen', 'FICE Code', 'Major', 'Comm Dt', 'Last DP', 'Term', 'Term GPA', 'Cum GPA', 'FT-Status', 'FT-Fiscal Year', 'FT Session', 'FT Ranking', 'FT Class Size', 'Schlr Type', 'Schlr Length', 'Schlr Activ Dt', 'Schlr Terms Ent', 'Schlr  Prog', 'Schlr Status', 'Schlr Stat Dt', 'Enlist Date', 'POC Entry Dt', 'Security Level', 'Date Completed', 'Grade', 'Cat Sel', 'Height', 'Weight', 'Ht/Wt Dt', 'Phys Type', 'Phys Exp', 'MRS', 'AFPFT', 'AFPFT Dt', 'AFPFT Res', 'AFOQT-Pilot', 'AFOQT-Nav', 'AFOQT-Apt', 'AFOQT-Verb', 'AFOQT-Quan', 'ACT-Score', 'Conditionals', 'Priv Pilot', 'As Of Date', 'Ranking', 'PCSM', 'Scroll Approved', 'Pilot', 'Navigator', 'RPA', 'ABM', 'Org ID', 'SAT_COMP', 'School Name', 'ABM.1', 'GMC PDT Date', 'POC PDT Date']

Updated df_c

Unnamed: 0,SSN,NAME,PGM CD,SEX CD,PAY ST,SUBSIS PAID,BOOKS & FEES,TRAINING PAID,BONUS PAID,COMM PAID,...,GMC PDT Date,POC PDT Date,StartDate,StartDateValid,DaysInStartMonth,DaysInEndMonth,FullMonths,TotalDays,IsSummerExcluded,DaysPaid
0,***-**-3239,BURNO,NS04,M,X,500.0,0.0,0.0,0.0,0.0,...,,,,,0,30,0,30,False,30
1,***-**-0811,DANIE,NS04,F,X,500.0,0.0,0.0,0.0,0.0,...,,,,,0,30,0,30,False,30
2,***-**-9548,MEJIA,NS04,F,A,0.0,0.0,0.0,0.0,0.0,...,,,,,0,30,0,30,False,30
3,***-**-0376,JORDA,NS03,F,X,450.0,0.0,0.0,0.0,0.0,...,,,,,0,30,0,30,False,30
4,***-**-3503,TILLE,NS03,M,X,450.0,0.0,0.0,0.0,0.0,...,,,,,0,30,0,30,False,30
5,***-**-1387,LANGF,NS03,F,X,450.0,0.0,0.0,0.0,0.0,...,,,,,0,30,0,30,False,30
6,***-**-6737,AGLER,SC02,M,X,350.0,0.0,0.0,0.0,0.0,...,,,,,0,30,0,30,False,30
7,***-**-6002,FINNE,SC02,M,X,350.0,0.0,0.0,0.0,0.0,...,,,,,0,30,0,30,False,30
8,***-**-4457,GARRI,SC02,F,X,350.0,0.0,0.0,0.0,0.0,...,,,,,0,30,0,30,False,30
9,***-**-1528,HUNSI,SC02,M,X,350.0,0.0,0.0,0.0,0.0,...,,,,,0,30,0,30,False,30



✅ Calculation complete and displayed in Colab.
✅ File saved as DaysPaid_Calculation.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>