# Fraud Detection

In [2]:
import pandas as pd
import os

In [3]:
# Path to the transaction log CSV, relative to the notebook's working directory.
data = os.path.join('data', 'PS_20174392719_1491204439457_log.csv')
df = pd.read_csv(data)
# Preview the first five rows.
df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [4]:
# Display the frame (the notebook renders a truncated head/tail view).
df

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.00,160296.36,M1979787155,0.00,0.00,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.00,19384.72,M2044282225,0.00,0.00,0,0
2,1,TRANSFER,181.00,C1305486145,181.00,0.00,C553264065,0.00,0.00,1,0
3,1,CASH_OUT,181.00,C840083671,181.00,0.00,C38997010,21182.00,0.00,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.00,29885.86,M1230701703,0.00,0.00,0,0
...,...,...,...,...,...,...,...,...,...,...,...
6362615,743,CASH_OUT,339682.13,C786484425,339682.13,0.00,C776919290,0.00,339682.13,1,0
6362616,743,TRANSFER,6311409.28,C1529008245,6311409.28,0.00,C1881841831,0.00,0.00,1,0
6362617,743,CASH_OUT,6311409.28,C1162922333,6311409.28,0.00,C1365125890,68488.84,6379898.11,1,0
6362618,743,TRANSFER,850002.52,C1685995037,850002.52,0.00,C2080388513,0.00,0.00,1,0


In [5]:
# Column dtypes and memory footprint of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 11 columns):
 #   Column          Dtype  
---  ------          -----  
 0   step            int64  
 1   type            object 
 2   amount          float64
 3   nameOrig        object 
 4   oldbalanceOrg   float64
 5   newbalanceOrig  float64
 6   nameDest        object 
 7   oldbalanceDest  float64
 8   newbalanceDest  float64
 9   isFraud         int64  
 10  isFlaggedFraud  int64  
dtypes: float64(5), int64(3), object(3)
memory usage: 534.0+ MB


In [6]:
# Number of missing values in each column.
print(df.isna().sum())

step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64


In [7]:
# Single yes/no check: does the frame contain any missing value at all?
print(df.isna().values.any())

False


In [8]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df.describe()

Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
count,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0
mean,243.3972,179861.9,833883.1,855113.7,1100702.0,1224996.0,0.00129082,2.514687e-06
std,142.332,603858.2,2888243.0,2924049.0,3399180.0,3674129.0,0.0359048,0.001585775
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,156.0,13389.57,0.0,0.0,0.0,0.0,0.0,0.0
50%,239.0,74871.94,14208.0,0.0,132705.7,214661.4,0.0,0.0
75%,335.0,208721.5,107315.2,144258.4,943036.7,1111909.0,0.0,0.0
max,743.0,92445520.0,59585040.0,49585040.0,356015900.0,356179300.0,1.0,1.0


## Research Agent: AI Analyst

### Processing

In [9]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder

In [10]:
# Seed NumPy's global random number generator so NumPy-based randomness is repeatable.
np.random.seed(42)

In [11]:
def preprocess_data(df):
    """Clean the transaction frame and derive the model inputs.

    Rows containing missing values are dropped, the ``type`` column is
    replaced by integer labels, and three engineered columns are added:
    the balance change on the originating account, the balance change on
    the destination account, and the ratio of the amount to the
    originator's old balance.

    Args:
        df: Transaction DataFrame containing the raw columns used below,
            including ``isFraud``.

    Returns:
        tuple: ``(X, y, frame)`` where ``X`` holds the feature columns,
        ``y`` is the ``isFraud`` column and ``frame`` is the full processed
        DataFrame (encoded ``type`` plus the engineered columns).
    """
    frame = df.dropna()

    # Replace the transaction type strings with integer labels.
    encoder = LabelEncoder()
    frame['type'] = encoder.fit_transform(frame['type'])

    # Balance deltas on both sides of the transaction.
    frame['balance_change_orig'] = frame['newbalanceOrig'].sub(frame['oldbalanceOrg'])
    frame['balance_change_dest'] = frame['newbalanceDest'].sub(frame['oldbalanceDest'])

    # The small epsilon keeps the ratio finite when the old balance is zero.
    frame['amount_to_orig_balance'] = frame['amount'].div(frame['oldbalanceOrg'].add(1e-6))

    feature_columns = [
        'step',
        'type',
        'amount',
        'oldbalanceOrg',
        'newbalanceOrig',
        'oldbalanceDest',
        'newbalanceDest',
        'balance_change_orig',
        'balance_change_dest',
        'amount_to_orig_balance',
    ]
    return frame[feature_columns], frame['isFraud'], frame

# NOTE: this rebinds `df` to the processed frame returned by preprocess_data.
X, y, df = preprocess_data(df)

### Modeling (Random Forest)

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [13]:
def train_model(X, y):
    """Fit a random forest on a 70/30 split and score it on the held-out part.

    Args:
        X: Feature matrix.
        y: Target labels aligned with ``X``.

    Returns:
        tuple: ``(model, report, cm, X_test, y_test)`` - the fitted
        classifier, the classification report as a dict, the confusion
        matrix, and the held-out features and labels.
    """
    split = train_test_split(X, y, test_size=0.3, random_state=42)
    X_train, X_test, y_train, y_test = split

    # n_jobs=-1 lets scikit-learn use every available core.
    forest = RandomForestClassifier(n_estimators=50, random_state=42, n_jobs=-1)
    forest.fit(X_train, y_train)

    predictions = forest.predict(X_test)
    return (
        forest,
        classification_report(y_test, predictions, output_dict=True),
        confusion_matrix(y_test, predictions),
        X_test,
        y_test,
    )

# Train on the prepared features and keep the held-out split for later inspection.
model, report, cm, X_test, y_test = train_model(X, y)
# `report` is the dict form of the classification report.
print(report)

{'0': {'precision': 0.9999947543983059, 'recall': 0.9999989508752586, 'f1-score': 0.9999968526323796, 'support': 1906351.0}, '1': {'precision': 0.9991759373712402, 'recall': 0.9958932238193019, 'f1-score': 0.9975318798848211, 'support': 2435.0}, 'accuracy': 0.9999937132816356, 'macro avg': {'precision': 0.999585345884773, 'recall': 0.9979460873472803, 'f1-score': 0.9987643662586003, 'support': 1908786.0}, 'weighted avg': {'precision': 0.9999937098498541, 'recall': 0.9999937132816356, 'f1-score': 0.9999937081161057, 'support': 1908786.0}}


### LangChain Analyst

In [14]:
from langchain.agents import initialize_agent, AgentType
from langchain.prompts import PromptTemplate
from langchain.llms import OpenAI

In [15]:
import langchain
# Show the installed LangChain version.
langchain.__version__

'0.0.232'

In [18]:
from dotenv import load_dotenv

In [23]:
def create_ai_analyst(df, report, model_name="gpt-3.5-turbo-instruct", temperature=0.7):
    """Ask an OpenAI model for a written analysis of the data and model metrics.

    The prompt is built from ``df.describe()`` and the string form of
    ``report`` and sent to the LLM in a single call.

    Args:
        df: DataFrame whose ``describe()`` table is sent as the dataset summary.
        report: Model performance metrics; converted with ``str()``.
        model_name: OpenAI model to use. Defaults to the value that was
            previously hard-coded.
        temperature: Sampling temperature passed to the LLM.

    Returns:
        str: The LLM's answer, or a string starting with
        ``"Error in AI Analyst: "`` if the LLM call fails (best effort: the
        rest of the pipeline keeps running).

    Raises:
        ValueError: If ``OPENAI_API_KEY`` is not set, or the LLM client
            cannot be constructed.
    """
    # Load environment variables from .env file
    load_dotenv()

    # Retrieve OpenAI API key
    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        raise ValueError("OPENAI_API_KEY not found in .env file. Please set it in the .env file.")

    # NOTE(review): the recorded run with langchain 0.0.232 and the default
    # model failed with "This is not a chat model and thus not supported in
    # the v1/chat/completions endpoint". Presumably that LangChain release
    # sends every "gpt-3.5-turbo*" name to the chat endpoint - confirm, then
    # either upgrade LangChain or pass a chat model via `model_name`.
    try:
        llm = OpenAI(
            model_name=model_name,
            openai_api_key=api_key,
            temperature=temperature
        )
    except Exception as e:
        # Chain explicitly so the original cause stays in the traceback.
        raise ValueError(f"Failed to initialize OpenAI LLM: {str(e)}") from e

    # Define prompt template
    prompt = PromptTemplate(
        input_variables=["dataset_summary", "model_performance"],
        template="""You are an AI Analyst specializing in financial fraud detection. 
        Analyze the provided dataset summary and model performance, then provide insights and recommendations.
        Dataset summary: {dataset_summary}
        Model performance: {model_performance}
        Provide a concise analysis, including key findings, potential issues, and suggestions for improvement."""
    )

    # Prepare inputs
    dataset_summary = df.describe().to_string()
    model_performance = str(report)

    # Run LLM directly; a failure is reported as text instead of raising.
    try:
        return llm(prompt.format(dataset_summary=dataset_summary, model_performance=model_performance))
    except Exception as e:
        return f"Error in AI Analyst: {str(e)}"

# Ask the LLM for a written analysis; if the LLM call fails this is an "Error in AI Analyst: ..." string.
analysis = create_ai_analyst(df, report)
print(analysis)



Error in AI Analyst: This is not a chat model and thus not supported in the v1/chat/completions endpoint. Did you mean to use v1/completions?


### Visualization

In [None]:
import io
from datetime import datetime

In [None]:
def _figure_to_png(fig):
    """Serialise a Matplotlib figure to an in-memory PNG, rewound to the start."""
    png = io.BytesIO()
    fig.savefig(png, format='png')
    png.seek(0)
    return png


def generate_visualizations(df, cm):
    """Render the three diagnostic plots and return them as in-memory PNGs.

    The plots are a correlation heatmap of the numeric columns, a count
    plot of ``isFraud``, and a heatmap of the confusion matrix. Each figure
    is saved to its buffer *before* it is shown, because showing a figure
    can clear it and leave the saved image blank.

    Args:
        df: Processed DataFrame; must contain an ``isFraud`` column.
        cm: Confusion matrix of integer counts.

    Returns:
        list[tuple[str, io.BytesIO]]: ``(file name, PNG buffer)`` pairs in
        the order correlation heatmap, fraud distribution, confusion matrix.
        Every buffer is positioned at its start.
    """
    # Recent Matplotlib releases name the style 'seaborn-v0_8'; older ones
    # only know 'seaborn'. Use whichever is installed and skip styling if
    # neither is available.
    for style_name in ('seaborn-v0_8', 'seaborn'):
        if style_name in plt.style.available:
            plt.style.use(style_name)
            break

    visualizations = []

    # Correlation heatmap. Restrict to numeric columns: the frame also holds
    # string columns, which DataFrame.corr() rejects on pandas 2.x.
    fig, ax = plt.subplots(figsize=(10, 8))
    corr = df.select_dtypes(include='number').corr()
    sns.heatmap(corr, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
    ax.set_title('Correlation Heatmap of Features')
    visualizations.append(('correlation_heatmap.png', _figure_to_png(fig)))
    plt.show()
    plt.close(fig)

    # Class balance of the target.
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.countplot(x='isFraud', data=df, ax=ax)
    ax.set_title('Distribution of Fraudulent vs Non-Fraudulent Transactions')
    visualizations.append(('fraud_distribution.png', _figure_to_png(fig)))
    plt.show()
    plt.close(fig)

    # Confusion matrix of the trained model.
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
    ax.set_title('Confusion Matrix')
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    visualizations.append(('confusion_matrix.png', _figure_to_png(fig)))
    plt.show()
    plt.close(fig)

    return visualizations

# Build the plots; each entry is a (file name, in-memory PNG buffer) pair.
visualizations = generate_visualizations(df, cm)

### PDF Report

In [None]:
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas

In [None]:
def generate_pdf_report(analysis, visualizations):
    """Build the PDF report, write it to ``report.pdf`` and return it.

    The first page carries a title, a timestamp and up to the first 20
    lines of ``analysis``, followed by the images stacked downwards. A new
    page is started when the next image would no longer fit.

    Args:
        analysis: Analysis text; split on newlines.
        visualizations: Iterable of ``(file name, PNG buffer)`` pairs. The
            images are read from the buffers, not from disk.

    Returns:
        io.BytesIO: The finished PDF, positioned at its start.
    """
    # Local import keeps this block self-contained; reportlab is already a
    # dependency of the notebook.
    from reportlab.lib.utils import ImageReader

    image_height = 150
    buffer = io.BytesIO()
    c = canvas.Canvas(buffer, pagesize=letter)
    c.setFont("Helvetica", 12)
    c.drawString(50, 750, "Fraud Detection Analysis Report")
    c.drawString(50, 730, f"Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    c.drawString(50, 710, "Summary of Analysis:")

    text = c.beginText(50, 690)
    text.setFont("Helvetica", 10)
    for line in analysis.split('\n')[:20]:
        text.textLine(line)
    c.drawText(text)

    y_pos = 500
    for _img_name, img_buffer in visualizations:
        if y_pos - image_height < 0:
            # Out of room on this page: continue at the top of a new one.
            c.showPage()
            y_pos = 750
        # The caller may already have read the buffer; rewind before use.
        img_buffer.seek(0)
        c.drawImage(ImageReader(img_buffer), 50, y_pos - image_height,
                    width=500, height=image_height)
        y_pos -= image_height + 20

    c.showPage()
    c.save()

    with open('report.pdf', 'wb') as f:
        f.write(buffer.getvalue())

    # Hand the PDF back so callers can store it elsewhere as well.
    buffer.seek(0)
    return buffer

# Assemble the PDF report from the analysis text and the plot list.
generate_pdf_report(analysis, visualizations)

### Flask

In [None]:
from flask import Flask, render_template, send_file

In [None]:
app = Flask(__name__)


@app.route('/')
def index():
    """Render the landing page from the ``index.html`` template."""
    return render_template('index.html')


@app.route('/visualization/<img_name>')
def serve_image(img_name):
    """Serve one PNG from the ``static`` directory.

    ``img_name`` comes straight from the URL, so it is resolved with
    ``send_from_directory``, which refuses paths that would leave the
    ``static`` directory.
    """
    from flask import send_from_directory

    return send_from_directory('static', img_name, mimetype='image/png')


@app.route('/report')
def serve_report():
    """Send ``static/report.pdf`` to the client as a download."""
    return send_file('static/report.pdf', as_attachment=True)

### Main

In [24]:
import gc

In [25]:
def main(sample_size=None):
    """
    Main function optimized for Jupyter Notebook.

    Loads the dataset, preprocesses it, trains the model, requests the
    written analysis, renders the plots, and writes the plots and the PDF
    report into the ``static`` directory.

    Args:
        sample_size (int, optional): Number of rows to sample from dataset. If None, use full dataset.
    Returns:
        dict: Results including model, report, confusion matrix, analysis, and visualizations.
    """
    import shutil  # only needed for the PDF fallback below

    # Load dataset
    data_path = os.path.join('data', 'PS_20174392719_1491204439457_log.csv')
    df = pd.read_csv(data_path)
    if sample_size:
        df = df.sample(n=sample_size, random_state=42)

    # Preprocess, train, analyze, and generate outputs
    X, y, df = preprocess_data(df)
    model, report, cm, X_test, y_test = train_model(X, y)
    analysis = create_ai_analyst(df, report)
    visualizations = generate_visualizations(df, cm)

    # Save visualizations. getvalue() copies the bytes without moving the
    # read position, so the same buffers are still usable for the PDF.
    os.makedirs('static', exist_ok=True)
    for img_name, img_buffer in visualizations:
        with open(f'static/{img_name}', 'wb') as f:
            f.write(img_buffer.getvalue())

    # Save PDF. generate_pdf_report may hand the PDF back as a buffer, or
    # only write 'report.pdf' to the working directory and return None;
    # support both.
    pdf_buffer = generate_pdf_report(analysis, visualizations)
    if pdf_buffer is not None:
        with open('static/report.pdf', 'wb') as f:
            f.write(pdf_buffer.getvalue())
    else:
        shutil.copyfile('report.pdf', 'static/report.pdf')

    # Clear memory
    gc.collect()

    return {
        'model': model,
        'report': report,
        'confusion_matrix': cm,
        'analysis': analysis,
        'visualizations': visualizations
    }

In [26]:
# Run the whole pipeline on the full dataset (no sample_size given).
if __name__ == "__main__":
    main()

KeyboardInterrupt: 