### Automated Fraud Risk Assessment Using News Analytics and NLP

 ### Setup and Library Installation

In [1]:
# Install the third-party packages this notebook depends on, then download the
# small English spaCy model ("en_core_web_sm") that is loaded during initialization.
# NOTE(review): versions are unpinned, so a fresh install may resolve to different
# releases than the ones this notebook was developed against - consider pinning.
!pip install newsapi-python newspaper3k spacy fpdf lxml_html_clean
!python -m spacy download en_core_web_sm

from newsapi import NewsApiClient
from newspaper import Article
import spacy
import pandas as pd
from datetime import datetime, timedelta
from fpdf import FPDF
import matplotlib.pyplot as plt
import os
from math import ceil

# Initialization
# The NewsAPI key is read from the environment so the credential never lives in
# the notebook source or its saved outputs. Set it before starting the kernel,
# e.g. `export NEWSAPI_KEY=<your key>`.
# NOTE(review): any key that was previously committed in this cell should be
# treated as compromised and revoked.
NEWSAPI_KEY = os.environ.get("NEWSAPI_KEY")
if not NEWSAPI_KEY:
    raise RuntimeError(
        "NEWSAPI_KEY environment variable is not set; "
        "export your NewsAPI key before running this notebook."
    )

# Client used for all article searches below.
newsapi = NewsApiClient(api_key=NEWSAPI_KEY)
# spaCy pipeline used to split article text into sentences.
nlp = spacy.load("en_core_web_sm")


Collecting newsapi-python
  Downloading newsapi_python-0.2.7-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting newspaper3k
  Downloading newspaper3k-0.2.8-py3-none-any.whl.metadata (11 kB)
Collecting fpdf
  Downloading fpdf-1.7.2.tar.gz (39 kB)
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting lxml_html_clean
  Downloading lxml_html_clean-0.4.2-py3-none-any.whl.metadata (2.4 kB)
Collecting cssselect>=0.9.2 (from newspaper3k)
  Downloading cssselect-1.3.0-py3-none-any.whl.metadata (2.6 kB)
Collecting lxml>=3.6.0 (from newspaper3k)
  Downloading lxml-5.4.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (3.5 kB)
Collecting feedparser>=5.2.1 (from newspaper3k)
  Downloading feedparser-6.0.11-py3-none-any.whl.metadata (2.4 kB)
Collecting tldextract>=2.0.1 (from newspaper3k)
  Downloading tldextract-5.3.0-py3-none-any.whl.metadata (11 kB)
Collecting feedfinder2>

### Load Dataset

In [2]:
# Read the fraud-case dataset; its "Name" column supplies the people to screen.
fraud_df = pd.read_csv("recent_fraud_cases_2023_2025.csv")

# Build the screening list: drop missing names, trim surrounding whitespace,
# discard blanks and de-duplicate (first occurrence wins, order preserved) so
# that each person is queried - and reported on - exactly once.
individuals = (
    fraud_df["Name"]
    .dropna()
    .astype(str)
    .str.strip()
    .loc[lambda names: names != ""]
    .drop_duplicates()
    .tolist()
)


### Keyword Definition and Risk Scoring Functions

In [3]:
# Lowercase keyword/phrase -> weight. A keyword found in an article adds its
# weight to that article's score, so higher weights push the score up faster.
# Insertion order is significant: it determines how the keywords are split into
# search-query groups and the order in which matched keywords are reported.
keyword_weights = {
    # weight 3
    "fraud": 3,
    "scam": 3,
    "embezzlement": 3,
    "money laundering": 3,
    "insider trading": 3,
    "ponzi scheme": 3,
    "wire fraud": 3,
    "securities fraud": 3,
    "investment fraud": 3,
    "accounting fraud": 3,
    "crypto fraud": 3,
    "financial misconduct": 3,
    "asset misappropriation": 3,
    "misuse of funds": 3,
    "bank fraud": 3,
    "regulatory fraud": 3,
    # weight 2
    "indicted": 2,
    "charged": 2,
    "convicted": 2,
    "trial": 2,
    "criminal complaint": 2,
    "prosecutors": 2,
    "federal charges": 2,
    "regulatory violation": 2,
    "sec probe": 2,
    "doj": 2,
    "market manipulation": 2,
    "false reporting": 2,
    "misappropriation": 2,
    # weight 1
    "whistleblower": 1,
    "audit failure": 1,
    "investigation": 1,
    "court": 1,
    "lawsuit": 1,
    "breach": 1,
    # weight 2 (kept last to preserve the original ordering)
    "kickbacks": 2,
    "forgery": 2,
}

# Flat list of every keyword, in the same order as the mapping above.
risk_verbs = [*keyword_weights]

def calculate_score(text, weights=None):
    """Return the summed weight of every keyword that occurs in ``text``.

    Matching is a case-insensitive substring test and each keyword contributes
    its weight at most once, no matter how often it appears.

    Args:
        text: Article text to score. ``None`` or an empty string scores 0.
        weights: Optional mapping of lowercase keyword -> weight. Defaults to
            the module-level ``keyword_weights``.

    Returns:
        The sum of the weights of all matched keywords (0 if none match).
    """
    if not text:
        return 0
    table = keyword_weights if weights is None else weights
    haystack = str(text).lower()
    # NOTE(review): because this is a substring test, overlapping keywords both
    # count (e.g. "wire fraud" also matches "fraud") and short keywords can hit
    # inside longer words (e.g. "court" in "courtesy"). Confirm this is intended.
    return sum(weight for keyword, weight in table.items() if keyword in haystack)

def get_risk_level(score):
    """Translate a numeric score into a risk band.

    Scores of 6 or more are "High", scores from 3 up to (but not including) 6
    are "Medium", and anything lower is "Low".
    """
    if score >= 6:
        return "High"
    if score >= 3:
        return "Medium"
    return "Low"

def get_flag(score):
    """Translate a numeric score into the follow-up action for an article.

    Scores of 6 or more yield "Escalate", scores from 3 up to (but not
    including) 6 yield "Review", and anything lower yields "Monitor".
    """
    if score >= 6:
        return "Escalate"
    if score >= 3:
        return "Review"
    return "Monitor"

def get_matched_keywords(text, weights=None):
    """Return the keywords found in ``text`` as one comma-separated string.

    Matching is a case-insensitive substring test; keywords are listed in the
    iteration order of the keyword collection.

    Args:
        text: Article text to inspect. ``None`` or an empty string yields "".
        weights: Optional mapping (or other iterable) of lowercase keywords.
            Defaults to the module-level ``keyword_weights``.

    Returns:
        The matched keywords joined with ", " ("" when nothing matches).
    """
    if not text:
        return ""
    candidates = keyword_weights if weights is None else weights
    haystack = str(text).lower()
    return ", ".join(keyword for keyword in candidates if keyword in haystack)

# Typographic characters that are common in headlines but have no Latin-1
# encoding, mapped to plain equivalents so they survive instead of vanishing.
_LATIN1_FALLBACKS = str.maketrans({
    "\u2018": "'",    # left single quote
    "\u2019": "'",    # right single quote / apostrophe
    "\u201c": '"',    # left double quote
    "\u201d": '"',    # right double quote
    "\u2013": "-",    # en dash
    "\u2014": "-",    # em dash
    "\u2026": "...",  # horizontal ellipsis
    "\u2022": "*",    # bullet
})


def clean(text):
    """Return ``text`` as a string containing only Latin-1 characters.

    Presumably needed because the PDF is written with a core "Arial" font that
    only handles Latin-1 text - TODO confirm against the FPDF version in use.

    Args:
        text: Any value; it is converted with ``str()``. ``None`` becomes ""
            so missing fields do not show up as the literal text "None".

    Returns:
        The text with common typographic punctuation replaced by plain
        equivalents and every remaining non-Latin-1 character removed.
    """
    if text is None:
        return ""
    simplified = str(text).translate(_LATIN1_FALLBACKS)
    return simplified.encode("latin-1", "ignore").decode("latin-1")


### NLP-Based Article Filtering Functions

In [4]:
def fetch_full_text(url):
    """Download and parse the article at ``url`` and return its body text.

    This is best-effort: any failure while downloading or parsing is reported
    with a printed message and results in ``None`` rather than an exception.

    Args:
        url: Address of the article. An empty or missing URL returns ``None``
            without attempting a download.

    Returns:
        The parsed article text, or ``None`` if it could not be retrieved.
    """
    if not url:
        return None
    try:
        article = Article(url)
        article.download()
        article.parse()
        return article.text
    except Exception as exc:
        # Catch Exception rather than using a bare except so that
        # KeyboardInterrupt/SystemExit still stop a long-running loop.
        print(f"Could not fetch article text from {url}: {exc}")
        return None

def is_negative_about_person(article_text, person_name):
    """Report whether any sentence mentions the person together with a risk keyword.

    The article is split into sentences with the module-level spaCy pipeline
    ``nlp``. A sentence qualifies when it contains ``person_name`` and at least
    one entry of ``risk_verbs``; both checks are case-insensitive substring tests.

    Args:
        article_text: Full text of the article. Empty or ``None`` returns False.
        person_name: Name to look for. A blank or non-string value (for example
            a NaN read from a spreadsheet) returns False, because an empty name
            would otherwise match every sentence.

    Returns:
        True if at least one qualifying sentence exists, otherwise False.
    """
    if not article_text or not isinstance(person_name, str):
        return False
    needle = person_name.strip().lower()
    if not needle:
        return False

    doc = nlp(article_text)
    for sent in doc.sents:
        # Lowercase each sentence once and reuse it for both checks.
        sentence = sent.text.lower()
        if needle in sentence and any(kw in sentence for kw in risk_verbs):
            return True
    return False


### Bar Chart Generation Function

In [5]:
def generate_bar_chart(articles, person_name):
    """Save a bar chart of per-article scores for one person and return its path.

    Bars are labelled "Article 1", "Article 2", ... in list order and coloured
    by risk level: red for "High", amber for "Medium", green for anything else.

    Args:
        articles: List of dicts, each providing "Negative News Score" and
            "Risk Level".
        person_name: Name used to build the output file name (spaces become
            underscores).

    Returns:
        Path of the PNG written under ``charts/``.
    """
    labels = [f"Article {number}" for number in range(1, len(articles) + 1)]
    scores = [a["Negative News Score"] for a in articles]
    palette = {"High": '#dc3545', "Medium": '#ffc107'}
    colors = [palette.get(a["Risk Level"], '#28a745') for a in articles]

    fig, ax = plt.subplots(figsize=(7, 4))
    try:
        ax.bar(labels, scores, color=colors)
        ax.set_xlabel("Articles")
        ax.set_ylabel("Negative News Score")
        ax.set_title("Risk Score per Article")
        # Keep the familiar 0-10 axis, but grow it when a score exceeds 10 so
        # that tall bars are not cut off at the top of the plot.
        ax.set_ylim(0, max(10, max(scores, default=0) + 1))
        ax.grid(axis='y', linestyle='--', alpha=0.7)

        chart_path = f"charts/{person_name.replace(' ', '_')}_bar_chart.png"
        # Do not rely on the caller having created the output directory.
        os.makedirs(os.path.dirname(chart_path), exist_ok=True)
        fig.tight_layout()
        fig.savefig(chart_path)
    finally:
        # Always release the figure, even if saving fails.
        plt.close(fig)
    return chart_path


### PDF Report Generation Class

In [6]:
class FraudReportPDF(FPDF):
    """PDF document that presents the flagged articles for one person."""

    def header(self):
        """Page-header hook: write the centred, navy report title."""
        self.set_font("Arial", "B", 14)
        self.set_text_color(0, 0, 128)
        self.cell(0, 10, "Financial Fraud Risk Report", ln=True, align="C")
        self.ln(4)

    def person_section(self, name, articles, chart_path):
        """Write the person's name, the chart (if its file exists) and every article.

        Args:
            name: Person the report is about.
            articles: List of dicts with the keys "Title", "Published At",
                "Negative News Score", "Risk Level", "Flag",
                "Matched Keywords" and "URL".
            chart_path: Path of the bar-chart image; skipped when the file
                does not exist.
        """
        self.set_font("Arial", "B", 12)
        self.set_text_color(0, 0, 0)
        self.cell(0, 10, f"Person: {clean(name)}", ln=True)
        self.ln(3)

        chart_available = os.path.exists(chart_path)
        if chart_available:
            self.image(chart_path, w=180)
            self.ln(5)

        for number, entry in enumerate(articles, start=1):
            self._article_entry(number, entry)

    def _article_entry(self, number, entry):
        """Write one numbered article: title, detail lines, keywords and URL."""
        # Bold, numbered title (wrapped across lines if needed).
        self.set_font("Arial", "B", 11)
        self.multi_cell(0, 8, f"{number}. {clean(entry['Title'])}")

        # Single-line details in regular type; clean() stringifies the score.
        self.set_font("Arial", "", 10)
        single_line_fields = (
            ("Date", "Published At"),
            ("Negative News Score", "Negative News Score"),
            ("Risk Level", "Risk Level"),
            ("Flag", "Flag"),
        )
        for label, key in single_line_fields:
            self.cell(0, 6, f"{label}: {clean(entry[key])}", ln=True)
        self.multi_cell(0, 6, f"Matched Keywords: {clean(entry['Matched Keywords'])}")

        # URL in blue italics, then restore black text for what follows.
        self.set_text_color(0, 0, 255)
        self.set_font("Arial", "I", 10)
        self.multi_cell(0, 6, f"URL: {clean(entry['URL'])}")
        self.set_text_color(0, 0, 0)
        self.ln(4)


### Article Extraction and Report Generation

In [7]:
# Only consider news published during the last 30 days.
from_date = (datetime.now() - timedelta(days=30)).strftime('%Y-%m-%d')
os.makedirs("fraud_news_pdfs", exist_ok=True)
os.makedirs("charts", exist_ok=True)

# The keyword groups are the same for every person, so build them once.
# The keywords are split into (up to) three groups so that each search query
# stays reasonably short.
keywords = list(keyword_weights.keys())
chunk_size = max(1, ceil(len(keywords) / 3))
keyword_chunks = [keywords[i:i + chunk_size] for i in range(0, len(keywords), chunk_size)]

# Multi-word keywords are wrapped in double quotes so the search engine treats
# them as exact phrases instead of independent words.
keyword_clauses = [
    " OR ".join(f'"{kw}"' if " " in kw else kw for kw in chunk)
    for chunk in keyword_chunks
]

for name in individuals:
    # Skip blank or non-string entries (e.g. NaN from an empty spreadsheet cell).
    if not isinstance(name, str) or not name.strip():
        continue

    all_articles = []
    seen_urls = set()  # the same article can be returned by several keyword groups
    quoted_name = name.strip().replace('"', '')

    for clause in keyword_clauses:
        # The name is quoted as an exact phrase; the sentence-level filter below
        # requires the full name to appear verbatim anyway.
        query = f'"{quoted_name}" AND ({clause})'
        try:
            results = newsapi.get_everything(
                q=query,
                language='en',
                sort_by='relevancy',
                from_param=from_date,
                page_size=10
            )
        except Exception as e:
            print(f"Error for {name}: {e}")
            continue

        for article in results.get('articles', []):
            url = article.get('url') or ''
            if not url or url in seen_urls:
                continue
            seen_urls.add(url)

            # Handle failures per article so one bad article does not discard
            # the remaining results of this keyword group.
            try:
                full_text = fetch_full_text(url)
                if not (full_text and is_negative_about_person(full_text, name)):
                    continue
                score = calculate_score(full_text)
                all_articles.append({
                    "Title": article.get("title") or "",
                    # Keep only the YYYY-MM-DD part; tolerate a missing timestamp.
                    "Published At": (article.get("publishedAt") or "")[:10],
                    "URL": url,
                    "Negative News Score": score,
                    "Risk Level": get_risk_level(score),
                    "Flag": get_flag(score),
                    "Matched Keywords": get_matched_keywords(full_text)
                })
            except Exception as e:
                print(f"Error for {name}: {e}")

    # A report is only produced for people with at least one qualifying article.
    if all_articles:
        chart_path = generate_bar_chart(all_articles, name)
        pdf = FraudReportPDF()
        pdf.add_page()
        pdf.person_section(name, all_articles, chart_path)
        pdf.output(f"fraud_news_pdfs/{name.replace(' ', '_')}_report.pdf")

print("PDF reports generated in 'fraud_news_pdfs/' folder.")


PDF reports generated in 'fraud_news_pdfs/' folder.
