In [28]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import time
from tqdm import tqdm

### Pre-process data and clustering

In [39]:

# Initialize tqdm for pandas
# This allows us to use .progress_apply() for a progress bar
tqdm.pandas(desc="Cleaning Text")

# Download NLTK data (only need to run this once) 
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

# Pre-processing (Load & Clean)

print("Loading & Cleaning...")
start_load = time.time()

# Load your CSV
file_path = 'final_data/final_combined_classified_data.csv'  
try:
    df = pd.read_csv(file_path, sep='|', on_bad_lines='skip')
    end_load = time.time()
    print(f"Loaded {len(df)} rows in {end_load - start_load:.2f} seconds.")
except FileNotFoundError:
    print(f"Error: File not found at {file_path}")

# Initialize cleaning tools
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Cleans and tokenizes text."""
    if not isinstance(text, str):
        return ""
    # Lowercase
    text = text.lower()
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Tokenize
    tokens = nltk.word_tokenize(text)
    # Remove stop words and lemmatize
    clean_tokens = [
        lemmatizer.lemmatize(word) 
        for word in tokens 
        if word.isalpha() and word not in stop_words
    ]
    return ' '.join(clean_tokens)

print("Starting text preprocessing (this may take a while)...")
start_clean = time.time()
# Use .progress_apply() to see a progress bar
df['cleaned_paragraph'] = df['Paragraph'].progress_apply(preprocess_text)
end_clean = time.time()
print(f"Preprocessing complete in {end_clean - start_clean:.2f} seconds.")
print(df[['Paragraph', 'cleaned_paragraph']].head())

# Vectorization (TF-IDF) 
print("\nVectorization (TF-IDF)")
start_vec = time.time()

# Tuned these parameters. 100 features was too low.
vectorizer = TfidfVectorizer(max_df=0.90, min_df=5, stop_words='english', max_features=1000)

# Filter out empty paragraphs that might result from cleaning
original_rows = len(df)
non_empty_df = df[df['cleaned_paragraph'].str.strip().astype(bool)].copy()
rows_dropped = original_rows - len(non_empty_df)

if rows_dropped > 0:
    print(f"Filtered {rows_dropped} empty/unusable rows.")

if non_empty_df.empty:
    print("Error: No data left after cleaning or all paragraphs were shorter than min_df.")
else:
    print(f"Vectorizing {len(non_empty_df)} paragraphs...")
    tfidf_matrix = vectorizer.fit_transform(non_empty_df['cleaned_paragraph'])
    end_vec = time.time()
    print(f"Vectorization complete in {end_vec - start_vec:.2f} seconds.")
    print(f"TF-IDF Matrix Shape: (paragraphs, features) = {tfidf_matrix.shape}")

    # Exploratory Clustering (K-Means)
    print("\n--- Step 3: Exploratory Clustering (K-Means) ---")
    start_cluster = time.time()
    
    # We pick a high-ish 'k' to find junk topics
    k_exploratory = 25 
    
    # Ensure k is not larger than the number of samples
    if k_exploratory > tfidf_matrix.shape[0]:
        k_exploratory = tfidf_matrix.shape[0]
        print(f"Warning: k was larger than sample size. Setting k to {k_exploratory}.")

    print(f"Starting K-Means clustering with k={k_exploratory}...")
    kmeans = KMeans(n_clusters=k_exploratory, random_state=42, n_init=10)
    kmeans.fit(tfidf_matrix)
    end_cluster = time.time()
    
    # Add cluster labels back to our non-empty DataFrame
    non_empty_df['cluster_id'] = kmeans.labels_
    print(f"Clustering complete in {end_cluster - start_cluster:.2f} seconds.")
    print(f"Model Inertia (lower is better): {kmeans.inertia_}")


    # Manual Inspection (Review Keywords)
    print("\nStep 4: Manual Inspection")
    print("\nReview Top Words per Cluster")
    
    terms = vectorizer.get_feature_names_out()
    order_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]
    
    for i in range(k_exploratory):
        top_words = [terms[ind] for ind in order_centroids[i, :10]]
        print(f"Cluster {i}: {', '.join(top_words)}")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\danit\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\danit\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\danit\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Loading & Cleaning...
Error: File not found at final_data/final_combined_classified_data.csv
Starting text preprocessing (this may take a while)...


Cleaning Text: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 66227/66227 [00:41<00:00, 1596.70it/s]


Preprocessing complete in 41.49 seconds.
                                           Paragraph  \
0  1) mortgage credit bank operations mean activi...   
1  11) cover pool means a set of collaterals for ...   
2  12) Resolution Authority means the Finnish Fin...   
3  In addition to what is provided in subsection ...   
4  In addition to what is provided in the Act on ...   

                                   cleaned_paragraph  
0  mortgage credit bank operation mean activity c...  
1  cover pool mean set collateral one specified c...  
2  resolution authority mean finnish financial st...  
3  addition provided subsection mortgage credit b...  
4  addition provided act credit institution suita...  

Vectorization (TF-IDF)
Vectorizing 66227 paragraphs...
Vectorization complete in 3.93 seconds.
TF-IDF Matrix Shape: (paragraphs, features) = (66227, 1000)

--- Step 3: Exploratory Clustering (K-Means) ---
Starting K-Means clustering with k=25...
Clustering complete in 26.58 seconds.
Model I

### Remove paragraphs containing certain keywords from clustering

In [None]:

# Filtering 
print("\nFiltering ")

# Indices of clusters to remove
clusters_to_remove = []

if not clusters_to_remove:
    print("NOTE: `clusters_to_remove` is empty. No filtering will be applied.")

# Filter the DataFrame
clean_df = non_empty_df[~non_empty_df['cluster_id'].isin(clusters_to_remove)].copy()

print("Saving filtered data to CSV...")
start_save = time.time()
save_path = '../filtered_data/filtered_paragraphs_3.csv'
clean_df.to_csv(save_path, index=False, sep='|')
end_save = time.time()
print(f"Filtered data saved to {save_path} in {end_save - start_save:.2f} seconds.")

print("\nFiltering complete.")
print(f"Original non-empty paragraphs: {len(non_empty_df)}")
print(f"Paragraphs removed: {len(non_empty_df) - len(clean_df)}")
print(f"Clean paragraphs remaining: {len(clean_df)}")

### Classify each paragraph into financial risk classes based on keywords 

In [None]:
# Ensure tqdm is set up for pandas
tqdm.pandas(desc="Classifying Risks")

# Define all Term Lists (Expanded Version) 

# This dictionary holds all keyword lists
term_lists = {
    "RiskTriggerTerms": ["risk", "risks", "exposure", "exposures", "vulnerability", "vulnerabilities", "hazard", "threat", "peril"],
    
    "DefaultTerms": ["default", "defaults", "non-performing", "non performing", "past due", "delinquency", "delinquent", 
                     "arrears", "insolvency", "bankruptcy", "foreclosure", "write-off", "charge-off", "impaired loan", "credit loss"],
                     
    "CreditTerms": ["credit", "loan", "loans", "lending", "obligor", "borrower", "debtor", "mortgage", "creditor", 
                    "debenture", "bond", "overdraft", "facility", "credit line", "loan portfolio", "credit institution"],
                    
    "CounterpartyTerms": ["counterparty", "counterparties", "trading partner", "derivative counterparty", "clearing member", 
                          "CCP", "central counterparty", "clearing house", "bilateral agreement", "ISDA"],
                          
    "CollateralTerms": ["collateral", "security", "securities", "pledge", "guarantee", "guarantor", "margin", 
                        "encumbrance", "lien", "charge", "hypothecation", "repo", "reverse repo", "security interest", "property"],
                        
    "CountryTerms": ["country risk", "transfer risk", "sovereign risk", "cross-border", "cross border", "geopolitical risk", "sovereign default"],
    
    "MarketTerms": ["market risk", "market volatility", "price volatility", "spread risk", "trading book", "equities", "bonds", 
                    "commodities", "derivatives", "spreads", "VaR", "value at risk", "expected shortfall", "ES", "mark-to-market", "MTM"],
                    
    "InterestRateTerms": ["interest rate risk", "IRRBB", "yield curve", "repricing gap", "net interest income", "NII", 
                          "economic value of equity", "EVE", "basis risk", "yield curve risk", "repricing risk"],
                          
    "LiquidityTerms": ["liquidity risk", "funding liquidity", "market liquidity", "liquidity coverage ratio", "LCR", "NSFR", 
                       "net stable funding ratio", "liquidity buffer", "cash flow", "funding mismatch", "run-off", "deposit outflow", 
                       "HQLA", "high-quality liquid assets", "ILAAP", "contingency funding plan", "CFP"],
                       
    "FXTerms": ["foreign exchange", "FX risk", "currency mismatch", "exchange rate", "currency risk", "FX exposure", 
                "translation risk", "transaction risk", "foreign currency risk"],
                
    "CommodityTerms": ["commodity price", "commodity risk", "oil price", "gas price", "energy price", "metal price", "agricultural price"],
    
    "OperationalTerms": ["operational risk", "operational failure", "internal process", "process failure", "error", "manual error", 
                         "oprisk", "RCSA", "risk and control self-assessment", "loss event", "failed process", "people risk", 
                         "business disruption", "BCP", "business continuity", "internal fraud", "external fraud"],
                         
    "ITTerms": ["IT system", "system outage", "system failure", "downtime", "IT disruption", "legacy system", "core banking system", 
                "infrastructure failure", "network outage", "disaster recovery", "DR", "technology failure", "platform instability"],
                
    "CyberTerms": ["cyber risk", "cybersecurity", "cyber attack", "malware", "ransomware", "data breach", "infosec", 
                   "information security", "phishing", "vishing", "smishing", "denial of service", "DoS", "DDoS", 
                   "zero-day", "exploit", "unauthorized access", "cyber threat", "cyber incident"],
                   
    "ModelTerms": ["risk model", "scoring model", "PD model", "LGD model", "EAD model", "model validation", "model risk", 
                   "model uncertainty", "model error", "model overlay", "back-testing", "backtesting", "model governance", 
                   "model implementation", "model misuse", "pd", "probability of default", "lgd", "loss given default", "estimation", "downturn"],
                   
    "DataQualityTerms": ["data quality", "data integrity", "data accuracy", "data completeness", "data inconsistency", 
                         "data governance", "data lineage", "data validation", "reconciliation", "inaccurate data", 
                         "incomplete data", "data stewardship", "data reliability"],
                         
    "OutsourcingTerms": ["outsourcing", "third-party", "third party", "external service provider", "cloud service provider", 
                         "fourth-party", "sub-contractor", "supply chain risk", "vendor risk", "service level agreement", "SLA", 
                         "vendor management", "supplier risk"],
                         
    "LegalTerms": ["legal risk", "enforceability", "legal enforceability", "litigation", "legal dispute", "non-enforceable", 
                   "unenforceable", "contract risk", "legal uncertainty", "regulatory action", "lawsuit", "arbitration"],
                   
    "ComplianceTerms": ["compliance risk", "regulatory requirements", "supervisory expectations", "prudential requirements", 
                        "regulatory reporting", "non-compliance", "breach", "regulatory breach", "supervisory action", 
                        "regulatory fine", "regulatory scrutiny", "supervisory review"],
                        
    "AMLTerms": ["money laundering", "terrorist financing", "AML", "CFT", "sanctions", "sanctioned entity", "KYC", 
                 "know your customer", "PEP", "politically exposed person", "SAR", "suspicious activity report", 
                 "OFAC", "financial crime", "fatf", "financial action task force", "mltf"],
                 
    "ConductTerms": ["conduct risk", "mis-selling", "mis selling", "consumer protection", "treating customers fairly", "TCF", 
                     "product suitability", "client detriment", "conflict of interest", "insider trading", "market abuse", "market manipulation"],
                     
    "ReputationTerms": ["reputational risk", "reputation risk", "adverse publicity", "public perception", "media coverage", 
                        "brand damage", "public trust", "scandal", "negative press", "stakeholder perception", "loss of trust"],
                        
    "GovernanceTerms": ["governance", "internal control", "risk committee", "board oversight", "risk culture", "conflict of interest", 
                        "risk appetite", "three lines of defense", "3LOD", "audit committee", "remuneration", "corporate governance", 
                        "board of directors", "management body", "accountability"],
                        
    "StrategicTerms": ["strategic risk", "business model", "strategy", "business strategy", "competitive risk", "business environment", 
                       "merger", "acquisition", "M&A", "business model viability", "competitive landscape"],
                       
    "SystemicTerms": ["systemic risk", "macroprudential", "contagion", "spillover", "too big to fail", "interconnectedness", 
                      "financial stability", "domino effect", "G-SIB", "G-SII", "globally systemically important"],
                      
    "ClimateTerms": ["climate risk", "environmental risk", "physical risk", "transition risk", "ESG risk", "greenhouse gas", 
                     "sustainability", "carbon footprint", "stranded assets", "TCFD", "greenwashing", "environmental social governance"],
                     
    "MacroTerms": ["macroeconomic", "business cycle", "economic downturn", "recession", "inflation shock", "geopolitical", 
                   "interest rate shock", "stagflation", "unemployment", "GDP growth", "economic shock", "inflationary pressure"],
                   
    "RequirementTerms": ["requirement", "requirements", "obligation", "obligations", "shall", "must", "binding", "mandatory"],
    
    "CapitalTerms": ["capital requirement", "capital buffer", "Pillar 2", "own funds", "capital adequacy", "CET1", "Common Equity Tier 1", 
                     "Tier 1 capital", "Tier 2 capital", "T1", "T2", "Basel III", "Basel 3", "Basel IV", "Basel 4", "ICAAP", 
                     "leverage ratio", "RWA", "risk-weighted assets", "capital ratio", "Pillar 1", "irb", "internal ratings based", "standardized approach", "securisation", "asset", "liability", "equity", "share", "tier"]
}

# Helper Functions for Rule Logic

def build_regex_pattern(terms):
    """Builds a regex pattern from a list of terms."""
    # We sort by length, longest first, to match "non performing" before "non"
    sorted_terms = sorted(terms, key=len, reverse=True)
    # Create pattern: \b(term1|term2|...)\b
    return r'\b(' + '|'.join(re.escape(term) for term in sorted_terms) + r')\b'

def check_proximity(text, terms1_list, terms2_list, distance):
    """
    Checks if a term from terms1 is within 'distance' words of a term from terms2.
    """
    # Create regex patterns for both lists
    pattern1 = build_regex_pattern(terms1_list)
    pattern2 = build_regex_pattern(terms2_list)
    
    # Create forward and backward proximity patterns
    # (A w/N B) -> A ... B
    forward_pattern = pattern1 + (r'(?:\s+\w+){0,' + str(distance) + r'}\s+') + pattern2
    # (B w/N A) -> B ... A
    backward_pattern = pattern2 + (r'(?:\s+\w+){0,' + str(distance) + r'}\s+') + pattern1

    # Check if either pattern exists
    if re.search(forward_pattern, text, re.IGNORECASE) or \
       re.search(backward_pattern, text, re.IGNORECASE):
        return True
    return False

def check_list(text, terms_list):
    """Checks if any term from the list exists in the text."""
    pattern = build_regex_pattern(terms_list)
    if re.search(pattern, text, re.IGNORECASE):
        return True
    return False

# The Main Classifier Function 

def classify_risk_concept(text):
    """
    Classifies text based on the 30-rule concept model.
    Uses a 'first match wins' logic.
    """
    if not isinstance(text, str):
        return "Unclassified"
    
    # 1. Risk_CreditDefault
    if check_proximity(text, term_lists["CreditTerms"], term_lists["DefaultTerms"], 3) or \
       check_proximity(text, term_lists["DefaultTerms"], ["exposure"], 3) or \
       check_proximity(text, term_lists["RiskTriggerTerms"], term_lists["DefaultTerms"], 3) or \
       re.search(r'credit default risk|risk of default', text, re.IGNORECASE):
        return "Risk_CreditDefault"

    # 2. Risk_CounterpartyCredit
    if check_proximity(text, term_lists["CounterpartyTerms"], term_lists["DefaultTerms"], 5) or \
       check_proximity(text, term_lists["CounterpartyTerms"], ["credit risk"], 5) or \
       check_proximity(text, ["settlement"], term_lists["CounterpartyTerms"], 5) or \
       re.search(r'counterparty credit risk', text, re.IGNORECASE):
        return "Risk_CounterpartyCredit"
    
    # 3. Risk_Concentration
    if re.search(r'concentration risk|large exposure|large exposures', text, re.IGNORECASE) or \
       check_proximity(text, ["sectoral"], ["concentration"], 3) or \
       check_proximity(text, ["portfolio"], ["concentration"], 3):
        return "Risk_Concentration"

    # 4. Risk_CollateralQuality
    if check_proximity(text, term_lists["CollateralTerms"], ["quality"], 3) or \
       check_proximity(text, term_lists["CollateralTerms"], ["valuation"], 5) or \
       check_proximity(text, ["haircut"], term_lists["CollateralTerms"], 3) or \
       check_proximity(text, ["insufficient"], term_lists["CollateralTerms"], 3):
        return "Risk_CollateralQuality"
        
    # 5. Risk_ResidualCredit
    if re.search(r'residual risk|risk mitigation techniques', text, re.IGNORECASE) or \
       check_proximity(text, ["residual"], ["credit risk"], 3) or \
       check_proximity(text, ["mitigant", "mitigants"], ["ineffective"], 5):
        return "Risk_ResidualCredit"

    # 6. Risk_CountryTransfer
    if check_list(text, term_lists["CountryTerms"]) or \
       check_proximity(text, ["restrictions"], ["capital transfer"], 5) or \
       check_proximity(text, ["sovereign"], ["default"], 3) or \
       check_proximity(text, ["country"], ["default risk"], 3):
        return "Risk_CountryTransfer"

    # 7. Risk_MarketValue
    if check_list(text, term_lists["MarketTerms"]) or \
       check_proximity(text, ["market"], ["risk"], 3) or \
       check_proximity(text, ["price"], ["volatility"], 3) or \
       check_proximity(text, ["trading book"], ["risk"], 5):
        return "Risk_MarketValue"

    # 8. Risk_InterestRate_BankingBook
    if check_list(text, term_lists["InterestRateTerms"]) or \
       check_proximity(text, ["interest rate"], ["banking book"], 3) or \
       check_proximity(text, ["repricing"], ["gap"], 3) or \
       check_proximity(text, ["earnings"], ["interest rate risk"], 5):
        return "Risk_InterestRate_BankingBook"

    # 9. Risk_FXExposure
    if check_list(text, term_lists["FXTerms"]) or \
       check_proximity(text, ["currency"], ["mismatch", "exposure"], 3) or \
       check_proximity(text, ["open FX"], ["position"], 3) or \
       check_proximity(text, ["foreign currency"], ["exposure"], 3):
        return "Risk_FXExposure"

    # 10. Risk_CommodityPrice
    if check_list(text, term_lists["CommodityTerms"]) or \
       check_proximity(text, ["commodity"], ["price risk"], 3) or \
       check_proximity(text, ["exposure"], term_lists["CommodityTerms"], 5):
        return "Risk_CommodityPrice"

    # 11. Risk_FundingLiquidity
    if check_list(text, term_lists["LiquidityTerms"]) or \
       check_proximity(text, ["funding"], ["liquidity"], 3) or \
       check_proximity(text, ["funding"], ["stress"], 5) or \
       check_proximity(text, ["inability"], ["refinance"], 5):
        return "Risk_FundingLiquidity"

    # 12. Risk_MarketLiquidity
    if re.search(r'wide bid-ask spread|wide bid ask spread', text, re.IGNORECASE) or \
       check_proximity(text, ["market liquidity"], ["risk"], 3) or \
       check_proximity(text, ["liquidity"], ["market depth"], 5) or \
       check_proximity(text, ["illiquid"], ["asset", "market"], 3):
        return "Risk_MarketLiquidity"

    # 13. Risk_MaturityMismatch
    if re.search(r'maturity mismatch|maturity transformation', text, re.IGNORECASE) or \
       check_proximity(text, ["short-term funding", "short term funding"], ["long-term loan", "long term loan"], 5):
        return "Risk_MaturityMismatch"

    # 14. Risk_DepositRunOff
    if re.search(r'deposit run|bank run', text, re.IGNORECASE) or \
       check_proximity(text, ["run"], ["deposits"], 3) or \
       check_proximity(text, ["run"], ["on the bank"], 3) or \
       check_proximity(text, ["outflow"], ["deposits"], 3) or \
       check_proximity(text, ["run-off", "run off"], ["rates"], 3):
        return "Risk_DepositRunOff"

    # 15. Risk_OperationalProcess
    if check_list(text, term_lists["OperationalTerms"]) or \
       check_proximity(text, ["process"], ["failure", "breakdown", "deficiency"], 5) or \
       check_proximity(text, ["internal control"], ["deficiency"], 3) or \
       check_proximity(text, ["manual process"], ["error"], 5):
        return "Risk_OperationalProcess"

    # 16. Risk_ITSystemFailure
    if check_list(text, term_lists["ITTerms"]) or \
       check_proximity(text, ["system"], ["outage", "failure", "crash"], 3) or \
       check_proximity(text, ["IT"], ["incident", "disruption"], 3) or \
       check_proximity(text, ["core banking system"], ["failure"], 3):
        return "Risk_ITSystemFailure"

    # 17. Risk_CyberSecurity
    if check_list(text, term_lists["CyberTerms"]) or \
       check_proximity(text, ["information security"], ["breach"], 3) or \
       check_proximity(text, ["unauthorized access"], ["systems"], 3) or \
       check_proximity(text, ["IT"], ["cyber attack"], 5):
        return "Risk_CyberSecurity"

    # 18. Risk_ModelPerformance
    if check_list(text, term_lists["ModelTerms"]) or \
       re.search(r'model risk', text, re.IGNORECASE) or \
       check_proximity(text, ["model"], ["mis-specification", "mis specification", "overfitting"], 5) or \
       check_proximity(text, ["model"], ["validation", "backtesting", "back testing"], 5):
        return "Risk_ModelPerformance"

    # 19. Risk_DataQuality
    if check_list(text, term_lists["DataQualityTerms"]) or \
       check_proximity(text, ["data"], ["error", "errors", "inaccuracy", "inaccuracies"], 3) or \
       check_proximity(text, ["incorrect"], ["data"], 3) or \
       check_proximity(text, ["missing"], ["data"], 3):
        return "Risk_DataQuality"

    # 20. Risk_ThirdPartyOutsourcing
    if check_list(text, term_lists["OutsourcingTerms"]) or \
       check_proximity(text, ["critical function"], term_lists["OutsourcingTerms"], 5) or \
       check_proximity(text, ["outsourced"], ["service", "activity"], 3) or \
       check_proximity(text, ["third-party", "third party"], ["dependency"], 5):
        return "Risk_ThirdPartyOutsourcing"

    # 21. Risk_LegalEnforceability
    if check_list(text, term_lists["LegalTerms"]) or \
       check_proximity(text, ["legal"], ["enforceability"], 3) or \
       check_proximity(text, ["contract"], ["invalid", "unenforceable"], 3) or \
       check_proximity(text, ["legal"], ["uncertainty"], 5):
        return "Risk_LegalEnforceability"

    # 22. Risk_RegulatoryCompliance
    if check_list(text, term_lists["ComplianceTerms"]) or \
       re.search(r'non-compliance|non compliance', text, re.IGNORECASE) or \
       check_proximity(text, ["breach"], ["regulation"], 5) or \
       check_proximity(text, ["failure"], ["comply", "requirements"], 5): # Simplified "failure w/5 comply w/5 requirements"
        return "Risk_RegulatoryCompliance"

    # 23. Risk_AML_CFT_Sanctions
    if check_list(text, term_lists["AMLTerms"]) or \
       check_proximity(text, ["risk"], ["money laundering"], 5) or \
       check_proximity(text, ["risk"], ["terrorist financing"], 5) or \
       check_proximity(text, ["breach"], ["sanctions"], 5):
        return "Risk_AML_CFT_Sanctions"

    # 24. Risk_ConsumerConduct
    if check_list(text, term_lists["ConductTerms"]) or \
       re.search(r'treating customers fairly', text, re.IGNORECASE) or \
       check_proximity(text, ["consumer"], ["protection", "detriment"], 3) or \
       check_proximity(text, ["product"], ["unsuitable"], 5):
        return "Risk_ConsumerConduct"

    # 25. Risk_ReputationImpact
    if check_list(text, term_lists["ReputationTerms"]) or \
       check_proximity(text, ["reputation"], ["damage"], 3) or \
       check_proximity(text, ["adverse"], ["publicity"], 3) or \
       check_proximity(text, ["media"], ["criticism", "coverage", "scandal"], 5):
        return "Risk_ReputationImpact"

    # 26. Risk_GovernanceControl
    if check_list(text, term_lists["GovernanceTerms"]) or \
       re.search(r'conflict of interest', text, re.IGNORECASE) or \
       check_proximity(text, ["weak"], ["governance"], 3) or \
       check_proximity(text, ["inadequate"], ["internal control"], 3) or \
       check_proximity(text, ["lack of"], ["oversight"], 5):
        return "Risk_GovernanceControl"

    # 27. Risk_StrategicBusiness
    if check_list(text, term_lists["StrategicTerms"]) or \
       check_proximity(text, ["risk"], ["business model"], 5) or \
       check_proximity(text, ["strategy"], ["risk"], 5) or \
       check_proximity(text, ["strategic"], ["decision"], 3):
        return "Risk_StrategicBusiness"

    # 28. Risk_SystemicInterconnected
    if check_list(text, term_lists["SystemicTerms"]) or \
       re.search(r'domino effect', text, re.IGNORECASE) or \
       check_proximity(text, ["interconnectedness"], ["risk"], 5) or \
       check_proximity(text, ["failure"], ["financial system"], 5):
        return "Risk_SystemicInterconnected"

    # 29. Risk_ClimateEnvironmental
    if check_list(text, term_lists["ClimateTerms"]) or \
       check_proximity(text, ["physical"], ["climate risk"], 3) or \
       check_proximity(text, ["transition"], ["risk"], 3) or \
       check_proximity(text, ["climate-related"], ["risk"], 3) or \
       check_proximity(text, ["environmental"], ["risk"], 3):
        return "Risk_ClimateEnvironmental"

    # 30. Risk_MacroeconomicCycle
    if check_list(text, term_lists["MacroTerms"]) or \
       re.search(r'economic downturn', text, re.IGNORECASE) or \
       check_proximity(text, ["recession"], ["risk"], 3) or \
       check_proximity(text, ["macro"], ["shock"], 3) or \
       check_proximity(text, ["business cycle"], ["downturn"], 3):
        return "Risk_MacroeconomicCycle"


    return "Unclassified"

# Apply the Classifier to your DataFrame

# THIS ASSUMES you have the 'clean_df' from the previous steps.
# We will use the *original* 'Paragraph' text, as the rules need
# full sentences, not the 'cleaned_paragraph'.

# Make sure you have the 'clean_df' from Phase 1 first

if 'clean_df' in locals():
    print("\n--- Phase 2: Starting Rules-Based Classification ---")
    start_classify = time.time()

    # Apply the function to the *original* paragraph text
    clean_df['classified_class'] = clean_df['Paragraph'].progress_apply(classify_risk_concept)

    end_classify = time.time()
    print(f"Classification complete in {end_classify - start_classify:.2f} seconds.")

    # See Results
    print("\n--- Classification Results (Top 30) ---")
    print(clean_df['classified_class'].value_counts().head(30))

    print("\n--- Example of Classified Paragraphs ---")
    print(clean_df[clean_df['classified_class'] != 'Unclassified']
          [['Paragraph', 'classified_class']].head())
          
    # Save the final file
    print("\nSaving final classified data...")
    final_save_path = '../classified_data/final_classified_paragraphs_3.csv'
    # Make sure directory exists, etc.
    clean_df.to_csv(final_save_path, index=False, sep='|')
    print(f"Final data saved to {final_save_path}")

else:
    print("\nError: 'clean_df' not found. Please run the Phase 1 filtering code first.")

### Remove unclassified paragraphs and save new file

In [None]:
# Removing Unclassified rows and short paragraphs
new_df = clean_df[clean_df['classified_class'] != 'Unclassified']
new_df = new_df[new_df["Paragraph"].str.split().str.len() >= 20]
new_df["Paragraph"] = new_df["Paragraph"].str.replace(r'^\s*(?:\(\d+\)|\d+\.)\s*', '', regex=True)
new_df.to_csv('classified_data_no_unclassified_3.csv', index=False, sep='|')

### Second round of clustering to analyze missed keywords

In [None]:
# Gap Analysis on "Unclassified" Data 

print("\nAnalyzing Gaps in Classification")

# First, check if the previous steps ran and 'clean_df' exists
if 'clean_df' not in locals():
    print("Error: 'clean_df' not found. Please run the full script from Phase 1 and 2.")
else:
    # 1. Isolate Unclassified Data
    unclassified_df = clean_df[clean_df['classified_class'] == 'Unclassified'].copy()

    unclassified_count = len(unclassified_df)
    
    if unclassified_count == 0:
        print("ðŸŽ‰ Congratulations! No 'Unclassified' paragraphs were found.")
    else:
        print(f"Found {unclassified_count} 'Unclassified' paragraphs. Running clustering to analyze them...")

    # Set a K value for discovery.
    k_discovery = 25 

    # Check if we have enough data to cluster
    if unclassified_count < k_discovery:
        print(f"Only {unclassified_count} unclassified items. Not enough to cluster into {k_discovery} groups.")
        if unclassified_count > 0:
            print("Here are the unclassified paragraphs:")
            print(unclassified_df[['Paragraph_ID', 'Paragraph']].head())
    
    else:
        # Vectorize *only* the unclassified text
        print(f"Vectorizing {unclassified_count} paragraphs...")
        start_vec_uc = time.time()
        
        # We use 'cleaned_paragraph' for better topic clustering
        uc_vectorizer = TfidfVectorizer(max_df=0.90, 
                                        min_df=5,    # Ignore words that appear in < 5 docs
                                        stop_words='english', 
                                        max_features=1000)
        
        uc_tfidf_matrix = uc_vectorizer.fit_transform(unclassified_df['cleaned_paragraph'])
        
        # Check if vectorization returned anything (min_df might filter everything)
        if uc_tfidf_matrix.shape[0] == 0:
            print("Error: No data remains after vectorization. Your 'min_df=5' might be too high for this small dataset.")
        
        else:
            end_vec_uc = time.time()
            print(f"Vectorization complete in {end_vec_uc - start_vec_uc:.2f}s. Matrix shape: {uc_tfidf_matrix.shape}")

            # 4. Run K-Means Clustering
            print(f"Running K-Means with k={k_discovery}...")
            start_kmeans_uc = time.time()

            uc_kmeans = KMeans(n_clusters=k_discovery, random_state=42, n_init=10)
            uc_kmeans.fit(uc_tfidf_matrix)
            
            # Add the new cluster IDs to the unclassified_df
            unclassified_df['discovery_cluster'] = uc_kmeans.labels_
            end_kmeans_uc = time.time()
            print(f"Clustering complete in {end_kmeans_uc - start_kmeans_uc:.2f}s.")
            print(f"Inertia (lower is better): {uc_kmeans.inertia_}")

            # Analyze and Report the New Clusters
            print("\n--- ðŸ”Ž Top Words for Unclassified Topics ---")
            print("Review these topics to find gaps in your classification rules.")
            
            uc_terms = uc_vectorizer.get_feature_names_out()
            uc_order_centroids = uc_kmeans.cluster_centers_.argsort()[:, ::-1]

            for i in range(k_discovery):
                # Find the top words for this cluster
                top_words = [uc_terms[ind] for ind in uc_order_centroids[i, :10]]
                
                # Get the count of paragraphs in this cluster
                cluster_size = (unclassified_df['discovery_cluster'] == i).sum()
                
                print(f"\nDiscovery Cluster {i} (Size: {cluster_size}):")
                print(f"  > {', '.join(top_words)}")

            print("\nAnalysis of unclassified data is complete.")

### BERT to analyze overlaps within classes

In [None]:
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel

# WARNING: You have to have torch and transformers installed

# ---------- CONFIG ----------
csv_path = ""
text_column = "cleaned_paragraph"   # use only cleaned text
class_column = "classified_class"   # filter within this class

model_name = "nlpaueb/legal-bert-base-uncased"
batch_size = 64
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print(torch.cuda.is_available())
print(f"Using device: {device}")

# ---------- LOAD DATA ----------
df = pd.read_csv(csv_path, delimiter="|")
df[text_column] = df[text_column].astype(str)

# Drop empty cleaned paragraphs
df = df[df[text_column].str.strip() != ""].reset_index(drop=True)

print(f"Loaded {len(df)} rows with cleaned paragraphs.")

# ---------- LOAD MODEL ----------
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name).to(device)
model.eval()

# ---------- EMBEDDING FUNCTION ----------
def embed_texts(text_list, batch_size=16):
    all_embs = []

    for i in tqdm(range(0, len(text_list), batch_size), desc="Embedding"):
        batch_texts = text_list[i:i+batch_size]
        enc = tokenizer(
            batch_texts,
            padding=True,
            truncation=True,
            return_tensors="pt"
        ).to(device)

        with torch.no_grad():
            outputs = model(**enc)
            batch_embs = outputs.last_hidden_state.mean(dim=1)

        all_embs.append(batch_embs.cpu())

    return torch.cat(all_embs, dim=0)

# ---------- PROCESS EACH CLASS SEPARATELY ----------
classes = df[class_column].unique()
print("Classes found:", classes)

for cls in classes:
    print(f"\n=== Processing class: {cls} ===")
    
    subset = df[df[class_column] == cls].reset_index(drop=True)
    texts = subset[text_column].astype(str).tolist()
    
    print(f"{len(texts)} items in this class.")

    # ---- Create embeddings for this class only ----
    embeddings = embed_texts(texts, batch_size=batch_size)
    print(f"Embeddings shape for class {cls}: {embeddings.shape}")

    # ---- Cosine similarity matrix ----
    embeddings_norm = F.normalize(embeddings, p=2, dim=1)
    similarity_matrix = embeddings_norm @ embeddings_norm.T

    print(f"Similarity matrix shape for class {cls}: {similarity_matrix.shape}")

    # ---- Save ----
    filename = f"similarity_{cls}.npy"
    np.save(filename, similarity_matrix.numpy())
    print(f"Saved:", filename)


  from .autonotebook import tqdm as notebook_tqdm


False
Using device: cpu


FileNotFoundError: [Errno 2] No such file or directory: 'final_combined_classified_data.csv'

In [40]:
df = pd.read_csv("C:/Users/danit/Junction/final_data/final_combined_classified_data.csv", delimiter="|")
df["classified_class"].value_counts()

classified_class
Risk_GovernanceControl           9681
Risk_StrategicBusiness           7853
Risk_MarketValue                 6673
Risk_RegulatoryCompliance        5105
Risk_CountryTransfer             4334
Risk_AML_CFT_Sanctions           4048
Risk_ThirdPartyOutsourcing       3368
Risk_ClimateEnvironmental        3332
Risk_FundingLiquidity            3278
Risk_ModelPerformance            2498
Risk_OperationalProcess          2043
Risk_MacroeconomicCycle          1854
Risk_SystemicInterconnected      1762
Risk_CyberSecurity               1585
Risk_CreditDefault               1557
Risk_ConsumerConduct             1281
Risk_LegalEnforceability         1268
Risk_FXExposure                  1128
Risk_DataQuality                  811
Risk_CounterpartyCredit           564
Risk_InterestRate_BankingBook     490
Risk_ITSystemFailure              410
Risk_Concentration                382
Risk_CollateralQuality            310
Risk_CommodityPrice               219
Risk_ResidualCredit              