In [1]:
import re
import torch
import numpy as np
import pandas as pd
import nltk
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix, classification_report
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch.nn as nn
import torch.nn.functional as F
import nlpaug.augmenter.word as naw
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.figure_factory as ff
import os, re, joblib

#import rms_app_packagages
from cryptography.fernet import Fernet
import base64
import configparser

In [None]:
# Load the incoming claims workbook from the network share and display it for a quick visual check.
incoming_workbook = '\\\\vi240c060002.woc.prod\\e$\\Machine Learning\\File To Be Predicted\\AI - WDW WC Incoming.xlsx'
NewData = pd.read_excel(incoming_workbook)
NewData

In [None]:
# Build the Fernet cipher used to decrypt stored credentials.
# The raw seed is read from the RK_SEED environment variable so that the key is
# never stored in the notebook next to the ciphertext it protects.
seed = os.environ.get("RK_SEED")
if not seed:
    raise RuntimeError(
        "Environment variable RK_SEED is not set; it must contain the 32-byte "
        "seed from which the Fernet key is derived."
    )
# Fernet expects the url-safe base64 encoding of a 32-byte key.
seed = base64.urlsafe_b64encode(seed.encode())
fernet = Fernet(seed)

encrypted_input = 'gAAAAABnBX4EOfvh5QKfctr7-15igLjeQptAJ3nprx7JdPv9TphVSnnwzCyCzqwRJG_fV4KUs_ZrajmlhTjwTalYrE1pslyC5iYnma2qgKQWsD6yF4m5zEk='
decrypted_string = fernet.decrypt(encrypted_input.encode()).decode()
# Do not echo the plaintext: cell output is saved with the notebook and would leak the secret.
print(f"Decrypted string: <hidden, {len(decrypted_string)} characters>")

In [None]:
#Decryption HERE
#seed = util.getconfig(RK_SEED)
#rksftp = util.getconfig(RK_SFTP)

# Decrypt the stored token with the Fernet cipher created in the previous cell.
token_bytes = encrypted_input.encode()
plaintext_bytes = fernet.decrypt(token_bytes)
decrypted_string = plaintext_bytes.decode()

In [None]:
#Connection HERE (Bernie/Nick)



In [4]:
#predict_claims_pipeline.py
# ------------------------------------------------------------------
# Batch-inference pipeline. Every input row is scored by one classifier per
# target field:
#   • Event of Incident
#   • Source of Incident
#   • Event of Injury
#   • Source of Injury
#   • EDI Cause  (only if Claim Number prefix is WDWW / DLRW / DCLW)
# Produces three Excel files in OUTPUT_DIR:
#   1) all_predictions.xlsx
#   2) high_confidence.xlsx
#   3) low_confidence.xlsx
# ------------------------------------------------------------------

# WordNet corpora for the WordNetLemmatizer used during text cleaning.
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)

###############################################################################
# 1.CONFIGURATION
###############################################################################

# ---- model checkpoints ------------------------------------------------------
# One fine-tuned sequence-classification checkpoint directory per target field.
# The keys are reused for the encoder lookup and for the Pred_/Conf_ column names.
MODEL_PATHS = {
    "EventOfIncident": r'\\vi240c060002.woc.prod\e$\Machine Learning\3_10_event_of_incident',
    "SourceOfIncident": r'\\vi240c060002.woc.prod\e$\Machine Learning\3_3_source_of_incident',
    "EventOfInjury":   r'\\vi240c060002.woc.prod\e$\Machine Learning\3_18_event_of_injury',
    "SourceOfInjury":  r'\\vi240c060002.woc.prod\e$\Machine Learning\3_11_source_of_injury',
    "EDICause":        r'\\vi240c060002.woc.prod\e$\Machine Learning\baseline_models\baseline_edi_cause'
}

# ---- label-encoder pkl files -------------------------------------------------
# Pickled encoders that map a predicted class index back to its label text.
# The keys must match MODEL_PATHS.
ENCODER_DIR = r'\\vi240c060002.woc.prod\e$\Model_Label_Encoders'
ENCODER_FILES = {
    "EventOfIncident": "Event_of_Incident_Desc_encoder.pkl",
    "SourceOfIncident": "Source_of_Incident_Desc_encoder.pkl",
    "EventOfInjury": "Event_of_Injury_Desc_encoder.pkl",
    "SourceOfInjury": "Source_of_Injury_Desc_encoder.pkl",
    "EDICause": "EDI_Cause_Desc_encoder.pkl"
}

# ---- textual columns used to build Combined_Text ----------------------------
# Cleaned and joined, in this order, into the single string fed to every model.
TEXT_COLS = [
    "Incident Description",
    "Activity Engaged in During Accident",
    "General HS Comments",
    "Injury Description"
]

# ---- which claim prefixes require EDI cause ---------------------------------
# The EDI Cause model only runs for rows whose "Claim Number" starts with one of these.
EDI_PREFIXES = ("WDWW", "DLRW", "DCLW")

# ---- confidence threshold (max-probability) ---------------------------------
# A row is flagged LowConfidence when any model's top softmax probability is below this.
CONFIDENCE_THRESHOLD = 0.50

# ---- paths ------------------------------------------------------------------
# NOTE(review): OUTPUT_DIR embeds a fixed run date (2025_04_19); later runs write
# to the same folder unless this is edited by hand.
INPUT_EXCEL  = r'\\vi240c060002.woc.prod\e$\Machine Learning\File To Be Predicted\AI - WDW WC Incoming.xlsx'
OUTPUT_DIR   = r'\\vi240c060002.woc.prod\e$\prediction_outputs\2025_04_19'
###############################################################################

# Create the output folder up front and choose the GPU when one is available.
os.makedirs(OUTPUT_DIR, exist_ok=True)
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f" Using device: {DEVICE}")

###############################################################################
# 2.Load models & encoders
###############################################################################
# One shared tokenizer (stock "bert-base-uncased" vocabulary) serves every classifier.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Load each checkpoint, move it to the chosen device and switch it to eval mode.
print(" Loading models …")
models = {}
for task_name in MODEL_PATHS:
    checkpoint_dir = MODEL_PATHS[task_name]
    classifier = BertForSequenceClassification.from_pretrained(checkpoint_dir)
    classifier = classifier.to(DEVICE)
    models[task_name] = classifier.eval()
    print(f"   {task_name:17s} ← {checkpoint_dir}")

# Load the pickled label encoder that belongs to each model.
# NOTE(review): joblib.load unpickles the file, so these .pkl files must come from a trusted location.
print("\n Loading label encoders …")
label_encoders = {}
for task_name in ENCODER_FILES:
    encoder_file = os.path.join(ENCODER_DIR, ENCODER_FILES[task_name])
    label_encoders[task_name] = joblib.load(encoder_file)
    print(f"   {task_name:17s} ← {encoder_file}")

###############################################################################
# 3.Helper functions
###############################################################################
# Single WordNet lemmatizer instance, created once and reused by clean_text for every token.
lemmatizer = WordNetLemmatizer()
def clean_text(txt: str) -> str:
    """Normalise one free-text field before it is fed to the models.

    The text is lower-cased, every character other than a-z, 0-9, whitespace
    and the punctuation . , ! ? ' - is removed, runs of whitespace are
    collapsed to a single space, and each remaining token is lemmatized.

    Args:
        txt: Raw cell value. Anything that is not a ``str`` (for example
            ``None`` or a float NaN) is treated as missing.

    Returns:
        The cleaned text, or ``""`` when the input is not a string.
    """
    if isinstance(txt, str):
        lowered = txt.lower()
        # Drop everything outside the allowed character set.
        filtered = re.sub(r"[^a-z0-9.,!?'\s-]", "", lowered)
        # Collapse whitespace runs and trim both ends.
        collapsed = re.sub(r"\s+", " ", filtered).strip()
        lemmas = [lemmatizer.lemmatize(word) for word in collapsed.split()]
        return " ".join(lemmas)
    return ""

def build_combined_text(row: pd.Series) -> str:
    """Build the single model input string for one input row.

    Each column listed in TEXT_COLS is cleaned with clean_text (a missing
    column counts as an empty string) and the cleaned pieces are joined with
    single spaces, in TEXT_COLS order.

    Args:
        row: One row of the input frame.

    Returns:
        The joined, cleaned text with leading and trailing whitespace removed.
    """
    cleaned_parts = []
    for column in TEXT_COLS:
        raw_value = row.get(column, "")
        cleaned_parts.append(clean_text(raw_value))
    combined = " ".join(cleaned_parts)
    return combined.strip()

@torch.no_grad()
def predict(text: str, model) -> tuple[int, float]:
    """Classify one text with ``model`` and report the winning class.

    The text is tokenized with the module-level ``tokenizer`` (truncated to at
    most 512 tokens), moved to ``DEVICE`` and passed through the model with
    gradient tracking disabled.

    Args:
        text: The combined, cleaned text to classify.
        model: A sequence-classification model whose output exposes ``.logits``.

    Returns:
        ``(class_index, confidence)``, where ``confidence`` is the softmax
        probability of the highest-scoring class.
    """
    encoded = tokenizer(text, truncation=True, padding=True, max_length=512, return_tensors="pt")
    device_inputs = {name: tensor.to(DEVICE) for name, tensor in encoded.items()}
    outputs = model(**device_inputs)
    probabilities = torch.softmax(outputs.logits, dim=-1)
    # max over the class dimension yields (highest probability, its index).
    top_probability, top_index = probabilities.max(dim=1)
    return top_index.item(), top_probability.item()

###############################################################################
# 4.Read data & run inference 
###############################################################################
print(f"\n Reading input file: {INPUT_EXCEL}")
df = pd.read_excel(INPUT_EXCEL)
print(f"   {len(df):,} rows loaded.")

# Create the output columns up front so every row has a slot for each model.
for task_name in models:
    df[f"Pred_{task_name}"] = None
    df[f"Conf_{task_name}"] = None
df["LowConfidence"] = None  # filled in per row below

# Models that run for every row; EDI Cause is handled separately.
base_tasks = ("EventOfIncident", "SourceOfIncident", "EventOfInjury", "SourceOfInjury")

for row_label, record in df.iterrows():
    combined_text = build_combined_text(record)
    row_confidences = []  # confidences of every model that ran for this row

    # The four base models always run.
    for task_name in base_tasks:
        class_index, score = predict(combined_text, models[task_name])
        decoded = label_encoders[task_name].inverse_transform([class_index])[0]
        df.at[row_label, f"Pred_{task_name}"] = decoded
        df.at[row_label, f"Conf_{task_name}"] = score
        row_confidences.append(score)

    # EDI Cause only runs for claim numbers with one of the configured prefixes.
    claim_number = str(record.get("Claim Number", ""))
    if not claim_number.startswith(EDI_PREFIXES):
        df.at[row_label, "Pred_EDICause"] = None
        df.at[row_label, "Conf_EDICause"] = None
    else:
        class_index, score = predict(combined_text, models["EDICause"])
        decoded = label_encoders["EDICause"].inverse_transform([class_index])[0]
        df.at[row_label, "Pred_EDICause"] = decoded
        df.at[row_label, "Conf_EDICause"] = score
        # A NaN score is left out of the low-confidence check, as it was before.
        if pd.notnull(score):
            row_confidences.append(score)

    # A row is low-confidence when any model that ran is below the threshold.
    df.at[row_label, "LowConfidence"] = any(
        score < CONFIDENCE_THRESHOLD for score in row_confidences
    )

###############################################################################
# 5.Save outputs
###############################################################################
all_path  = os.path.join(OUTPUT_DIR, "all_predictions.xlsx")
low_path  = os.path.join(OUTPUT_DIR, "low_confidence.xlsx")
high_path = os.path.join(OUTPUT_DIR, "high_confidence.xlsx")

# The flag was filled row by row into an object column; cast it to real
# booleans so it can be used (and negated) as a mask.
df["LowConfidence"] = df["LowConfidence"].astype(bool)

all_df  = df.copy()
low_df  = df[df["LowConfidence"]]        # True  rows
high_df = df[~df["LowConfidence"]]       # False rows

# Export the frames prepared above instead of recomputing the same filters.
for frame, target in ((all_df, all_path), (low_df, low_path), (high_df, high_path)):
    try:
        frame.to_excel(target, index=False)
    except PermissionError:
        # Say which file is blocked before re-raising the original error.
        print(f" Cannot write {target} - close the workbook if it is open elsewhere "
              "and check write access to the output folder.")
        raise

print("\n Export complete")
print(f"   • All predictions → {all_path}")
print(f"   • Low confidence  → {low_path}")
print(f"   • High confidence → {high_path}")


 Using device: cpu
 Loading models …
   EventOfIncident   ← \\vi240c060002.woc.prod\e$\Machine Learning\3_10_event_of_incident
   SourceOfIncident  ← \\vi240c060002.woc.prod\e$\Machine Learning\3_3_source_of_incident
   EventOfInjury     ← \\vi240c060002.woc.prod\e$\Machine Learning\3_18_event_of_injury
   SourceOfInjury    ← \\vi240c060002.woc.prod\e$\Machine Learning\3_11_source_of_injury
   EDICause          ← \\vi240c060002.woc.prod\e$\Machine Learning\baseline_models\baseline_edi_cause

 Loading label encoders …
   EventOfIncident   ← \\vi240c060002.woc.prod\e$\Model_Label_Encoders\Event_of_Incident_Desc_encoder.pkl
   SourceOfIncident  ← \\vi240c060002.woc.prod\e$\Model_Label_Encoders\Source_of_Incident_Desc_encoder.pkl
   EventOfInjury     ← \\vi240c060002.woc.prod\e$\Model_Label_Encoders\Event_of_Injury_Desc_encoder.pkl
   SourceOfInjury    ← \\vi240c060002.woc.prod\e$\Model_Label_Encoders\Source_of_Injury_Desc_encoder.pkl
   EDICause          ← \\vi240c060002.woc.prod\e$\Model

  warn("Workbook contains no default style, apply openpyxl's default")


   326 rows loaded.


PermissionError: [Errno 13] Permission denied: '\\\\vi240c060002.woc.prod\\e$\\prediction_outputs\\2025_04_19\\high_confidence.xlsx'

In [None]:
#Hooking into SFTP 
    #Encrypted Pass
    #Corrected Folder Structure
    

In [None]:
# Draft cell: copy the SFTP and seed config values into local names.
# NOTE(review): RK_SFTP and RK_SEED are not defined anywhere in this notebook - confirm where they should come from.
rksftp = RK_SFTP
rkseed = RK_SEED

In [None]:
#Set Up Environment Variables

In [None]:
# Look up the SFTP setting and the encryption seed through the project config helper.
# Names corrected to match the rest of the notebook (was `rkfsftp` / `SK_SEED`).
# NOTE(review): `util`, RK_SFTP and RK_SEED are not defined in this notebook; they
# presumably come from the project package whose import is commented out in the
# first cell - confirm before running.
rksftp = util.getconfig(RK_SFTP)
rkseed = util.getconfig(RK_SEED)

In [None]:
#Data Push
    #SFTP Drop
    #Cadence
    #Outside app, we need data monitor job

In [None]:
# Show the kernel's current working directory.
# NOTE(review): `os` is already imported in the first cell, so this repeated import is redundant.
import os
os.getcwd()

In [None]:
from datetime import date, timedelta

# Count the Thursdays and Fridays in the inclusive window [curr, end].
# date.weekday() numbers days Monday == 0 ... Sunday == 6, so Thursday is 3 and
# Friday is 4. The previous check used 2, which is Wednesday.
THURSDAY, FRIDAY = 3, 4

curr = date(2025, 3, 7)
end = date(2025, 9, 12)
step = timedelta(days=1)
num_thur_fri = 0
while curr <= end:
    if curr.weekday() in (THURSDAY, FRIDAY):
        num_thur_fri += 1
    curr += step
print(num_thur_fri)