# In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import LabelEncoder, RobustScaler
import smtplib
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart
import logging
import requests
import openai
from jira import JIRA
import os
import sys

# Setup logging for better debugging
logging.basicConfig(level=logging.INFO)

# Credentials are read from environment variables when they are set, so real
# secrets never need to be committed. The original placeholder literals are
# kept as fallbacks, so behaviour is unchanged when the variables are absent.

# OpenAI API key
openai.api_key = os.getenv("OPENAI_API_KEY", "your-openai-api-key")

# JIRA Setup
jira_url = os.getenv("JIRA_URL", "https://your-domain.atlassian.net")
jira_user = os.getenv("JIRA_USER", "your-email@example.com")
jira_token = os.getenv("JIRA_API_TOKEN", "your-jira-api-token")
# NOTE(review): the client is constructed at import time; presumably this
# contacts the JIRA server immediately - confirm and consider lazy creation.
jira = JIRA(jira_url, basic_auth=(jira_user, jira_token))

# Setup Agentic AI API key
AGENTIC_API_KEY = os.getenv("AGENTIC_API_KEY", "your-agentic-api-key")

# Load and preprocess data
def load_data_tool(file_path):
    """Read the input CSV, validate its schema and label-encode the key columns.

    The process is terminated with ``sys.exit(1)`` (after logging an error)
    when the file does not exist, cannot be parsed, or lacks a required column.

    Args:
        file_path: Path of the CSV file to load.

    Returns:
        A tuple ``(df, le_account, le_au, le_company)``: the cleaned frame plus
        the fitted ``LabelEncoder`` objects for ``Account``, ``AU`` and
        ``Company`` (the ``Match Status`` encoder is not returned).
    """
    if not os.path.exists(file_path):
        logging.error(f"File {file_path} not found.")
        sys.exit(1)

    category_dtypes = dict.fromkeys(("Account", "AU", "Company"), "category")
    try:
        frame = pd.read_csv(file_path, dtype=category_dtypes)
    except Exception as e:
        logging.error(f"Error loading file {file_path}: {str(e)}")
        sys.exit(1)

    required_cols = ['Asofdate', 'Company', 'Account', 'AU', 'Match Status',
                     'GL Balance', 'IHub balance', 'Balance difference']
    absent = [name for name in required_cols if name not in frame.columns]
    if absent:
        logging.error(f"Missing columns {absent}")
        sys.exit(1)

    # Rows with a missing value in any required column are discarded.
    frame.dropna(subset=required_cols, inplace=True)

    # Encode each categorical column with its own encoder so the caller can
    # later map the integer codes back to the original labels.
    encoders = {}
    for column in ('Company', 'Account', 'AU', 'Match Status'):
        encoders[column] = LabelEncoder()
        frame[column] = encoders[column].fit_transform(frame[column].astype(str))

    # Coercion happens after the dropna above, so unparseable dates remain in
    # the frame as NaT rather than being removed.
    frame['Asofdate'] = pd.to_datetime(frame['Asofdate'], errors='coerce')

    return frame, encoders['Account'], encoders['AU'], encoders['Company']

# Feature Engineering with LLM enhancement
def engineer_features_with_llm(df):
    """Derive per-key change and volatility features for the anomaly model.

    Despite the name, this function makes no LLM call; it only performs
    pandas transformations. The three balance columns are coerced to numeric
    in place on the frame passed in (non-numeric values become 0) before the
    frame is sorted, so the caller's object is modified for those columns.

    Args:
        df: Frame containing ``Asofdate``, ``Company``, ``Account``, ``AU``
            and the three balance columns.

    Returns:
        A tuple ``(df, features)``: the frame sorted by ``Asofdate`` with the
        derived columns added, and the list of feature column names.
    """
    key_columns = ['Company', 'Account', 'AU']
    balance_columns = ['GL Balance', 'IHub balance', 'Balance difference']
    change_sources = {
        'GL_Change': 'GL Balance',
        'IHub_Change': 'IHub balance',
        'Diff_Change': 'Balance difference',
    }
    std_sources = {'GL_Std': 'GL Balance', 'IHub_Std': 'IHub balance'}

    for name in balance_columns:
        numeric = pd.to_numeric(df[name], errors='coerce')
        df[name] = numeric.fillna(0)

    ordered = df.sort_values('Asofdate')
    per_key = ordered.groupby(key_columns, group_keys=False)

    def rolling_std(series):
        # Std over up to the last three observations; a lone observation has
        # no sample std, so it is reported as 0.
        return series.rolling(window=3, min_periods=1).std().fillna(0)

    # The first row of each key has no predecessor, hence the fill with 0.
    ordered['Days_Since_Last'] = per_key['Asofdate'].diff().dt.days.fillna(0)
    for target, source in change_sources.items():
        ordered[target] = per_key[source].diff().fillna(0)
    for target, source in std_sources.items():
        ordered[target] = per_key[source].transform(rolling_std)

    features = (key_columns + ['Days_Since_Last'] + balance_columns
                + list(change_sources) + list(std_sources))
    return ordered, features

# Estimate contamination dynamically
def estimate_contamination(df, feature='Balance difference', z_threshold=3):
    """Estimate the outlier share of ``feature`` with a z-score rule.

    A value counts as extreme when it lies more than ``z_threshold`` standard
    deviations away from the column mean. The resulting share is clamped to
    the range [0.01, 0.5].

    Args:
        df: Frame holding the column to inspect.
        feature: Name of the numeric column to analyse.
        z_threshold: Number of standard deviations that marks a value extreme.

    Returns:
        The contamination estimate as a float in [0.01, 0.5]. An empty frame
        yields the lower bound 0.01.
    """
    total_rows = len(df)
    if total_rows == 0:
        # Avoid 0/0 -> NaN, which is not a usable contamination value.
        logging.warning("No rows available to estimate contamination; using 0.0100")
        return 0.01

    values = df[feature]
    mean_value = values.mean()
    std_value = values.std()

    # Centre the values before comparing: testing |x| against mean + z*std
    # misfires whenever the mean is far from zero (e.g. a negative mean flags
    # nearly every row). A NaN std (single row) compares False for every row.
    deviation = (values - mean_value).abs()
    extreme_count = int((deviation > z_threshold * std_value).sum())

    contamination = min(max(extreme_count / total_rows, 0.01), 0.5)  # Ensure reasonable contamination level
    logging.info(f"Estimated contamination: {contamination:.4f}")
    return contamination

# Train Isolation Forest model
def train_model_tool(df, features):
    """Fit an Isolation Forest on the scaled features and label every row.

    The frame passed in is modified: an ``Anomaly_Score`` column (the model's
    decision function) and an ``Anomaly`` column (1 = anomaly, 0 = normal)
    are added.

    Args:
        df: Frame containing all columns listed in ``features``.
        features: Names of the columns to train on.

    Returns:
        A tuple ``(model, df)`` with the fitted ``IsolationForest`` and the
        annotated frame.
    """
    scaled_matrix = RobustScaler().fit_transform(df[features])
    outlier_share = estimate_contamination(df)

    forest = IsolationForest(
        n_estimators=200,
        contamination=outlier_share,
        random_state=42,
        n_jobs=-1,
    )
    forest.fit(scaled_matrix)

    df['Anomaly_Score'] = forest.decision_function(scaled_matrix)
    # predict() reports anomalies as -1 and normal rows as 1; store 1/0 instead.
    raw_labels = forest.predict(scaled_matrix)
    df['Anomaly'] = (raw_labels == -1).astype('int64')

    return forest, df

# Generate enhanced explanation using LLM
def generate_enhanced_explanation(anomaly_data, model="text-davinci-003", max_tokens=300):
    """Ask the LLM for a narrative explanation of one anomalous row.

    Args:
        anomaly_data: Mapping (dict or pandas Series) providing the balance
            and engineered-feature values referenced in the prompt.
        model: Completion model name sent to the API.
        max_tokens: Upper bound on the length of the generated text.

    Returns:
        The stripped explanation text, or a fixed fallback message when the
        API request fails, so one failed request does not abort processing of
        the remaining anomalies.

    Raises:
        KeyError: If ``anomaly_data`` lacks one of the fields used in the prompt.
    """
    prompt = f"""
    Anomaly Detected:
    - GL Balance: {anomaly_data['GL Balance']}
    - IHub Balance: {anomaly_data['IHub balance']}
    - Balance Difference: {anomaly_data['Balance difference']}
    - Days Since Last Transaction: {anomaly_data['Days_Since_Last']}
    - GL Balance Change: {anomaly_data['GL_Change']}
    - IHub Balance Change: {anomaly_data['IHub_Change']}
    - Difference Change: {anomaly_data['Diff_Change']}
    - GL Balance Standard Deviation: {anomaly_data['GL_Std']}
    - IHub Balance Standard Deviation: {anomaly_data['IHub_Std']}
   
    Please provide an in-depth explanation for this anomaly. Include potential causes for discrepancies in the GL balance and IHub balance, taking into account changes over time, the relationship between the two, and any possible outliers. The anomaly might indicate accounting errors, data inconsistencies, or financial reporting issues.
    """

    # NOTE(review): `openai.Completion.create` is the pre-1.0 SDK interface and
    # "text-davinci-003" is a legacy model - confirm the pinned SDK version and
    # that the model is still served before relying on this call.
    try:
        response = openai.Completion.create(
            model=model,
            prompt=prompt,
            max_tokens=max_tokens
        )
        return response['choices'][0]['text'].strip()
    except Exception as e:
        # Same log-and-continue policy as the task/email helpers in this file.
        logging.error(f"Failed to generate explanation: {str(e)}")
        return "Explanation unavailable: the language-model request failed."

# Create task in Agentic AI
def create_agentic_task(anomaly, due_in_days=7, timeout=30):
    """Create an investigation task for one anomaly via the Agentic AI HTTP API.

    Failures (HTTP errors or exceptions raised while sending) are logged and
    not re-raised.

    Args:
        anomaly: Mapping (dict or pandas Series) describing the anomaly. An
            ``'id'`` entry is used when present; otherwise the row label
            (``Series.name``) is used, falling back to ``'unknown'``. The
            ``'explanation'`` entry, if any, becomes the task description.
        due_in_days: Number of days from today at which the task is due.
        timeout: Seconds to wait for the HTTP request before giving up.
    """
    # Local import: the file does not import datetime at module level.
    from datetime import date, timedelta

    api_url = "https://api.agentic.ai/tasks"
    headers = {
        "Authorization": f"Bearer {AGENTIC_API_KEY}",
        "Content-Type": "application/json"
    }

    # Rows produced by DataFrame.iterrows() carry no 'id' key, so indexing
    # anomaly['id'] directly raised KeyError; fall back to the row label.
    row_label = getattr(anomaly, 'name', None)
    anomaly_id = anomaly.get('id', row_label if row_label is not None else 'unknown')

    # Create task data based on the anomaly details
    task_data = {
        "task_name": f"Investigate Anomaly: {anomaly_id}",
        "description": anomaly.get('explanation', ''),
        "severity": "High",  # Adjust severity based on anomaly characteristics
        # Relative to today so the due date is never already in the past.
        "due_date": (date.today() + timedelta(days=due_in_days)).isoformat()
    }

    try:
        # A timeout prevents an unresponsive endpoint from hanging the run.
        response = requests.post(api_url, json=task_data, headers=headers, timeout=timeout)
        # Accept any non-error status (e.g. 201 Created), not only 200.
        if response.ok:
            logging.info("Task created successfully in Agentic AI.")
        else:
            logging.error(f"Error creating task in Agentic AI: {response.text}")
    except Exception as e:
        logging.error(f"Exception while creating task in Agentic AI: {str(e)}")

# Create JIRA ticket for anomaly
def create_jira_ticket(anomaly, project_key='YOUR_PROJECT_KEY'):
    """Open a JIRA task describing one anomaly.

    A failure while creating the issue is logged and not re-raised, matching
    the log-and-continue handling of the other notification helpers here.

    Args:
        anomaly: Mapping (dict or pandas Series) describing the anomaly. An
            ``'id'`` entry is used when present; otherwise the row label
            (``Series.name``) is used, falling back to ``'unknown'``. The
            ticket text comes from ``'description'`` or, failing that,
            ``'explanation'``.
        project_key: Key of the JIRA project in which the issue is created.
    """
    # Rows coming from DataFrame.iterrows() have neither 'id' nor
    # 'description', so direct indexing raised KeyError; use safe fallbacks.
    row_label = getattr(anomaly, 'name', None)
    anomaly_id = anomaly.get('id', row_label if row_label is not None else 'unknown')
    details = anomaly.get('description', anomaly.get('explanation', ''))

    summary = f"Anomaly Detected: {anomaly_id}"
    description = f"Details of anomaly: {details}\nSeverity: High"

    try:
        new_issue = jira.create_issue(
            project=project_key,
            summary=summary,
            description=description,
            issuetype={'name': 'Task'}
        )
    except Exception as e:
        logging.error(f"Failed to create JIRA ticket: {str(e)}")
        return
    logging.info(f"JIRA ticket created: {new_issue.key}")

# Save anomalies to CSV
def save_anomalies_to_csv(anomalies, file_name="anomalies_output.csv"):
    """Write ``anomalies`` to ``file_name`` as CSV without the index and log the path.

    Args:
        anomalies: Frame of anomalous rows to persist.
        file_name: Destination path of the CSV file.
    """
    write_options = {"index": False}
    anomalies.to_csv(file_name, **write_options)
    logging.info(f"Anomalies saved to {file_name}")

# Email anomaly results
def send_email_tool(anomalies, recipient_email, le_account, le_au, le_company):
    """Explain, ticket, save and email the detected anomalies.

    For every anomalous row an LLM explanation is generated and a JIRA ticket
    plus an Agentic AI task are created. The rows, now including an
    ``explanation`` column, are then written to CSV and sent as an HTML table
    by email. Nothing is done when ``anomalies`` is empty. A failure while
    sending the email is logged and not re-raised.

    Args:
        anomalies: Frame of rows flagged as anomalies (label-encoded keys).
        recipient_email: Address that receives the report.
        le_account: Fitted encoder used to decode the ``Account`` column.
        le_au: Fitted encoder used to decode the ``AU`` column.
        le_company: Fitted encoder used to decode the ``Company`` column.
    """
    if anomalies.empty:
        logging.info("No anomalies detected. Skipping email.")
        return

    # Prefer credentials from the environment; the original placeholders are
    # kept as fallbacks so behaviour is unchanged when the variables are unset.
    sender_email = os.getenv("SENDER_EMAIL", "youremailid")
    sender_password = os.getenv("SENDER_APP_PASSWORD", "your-app-password")

    # Work on a copy so the caller's frame is left untouched.
    anomalies = anomalies.copy()

    # Replace each encoded column outright. Writing strings into the integer
    # columns through .loc[:, col] is an incompatible-dtype assignment that
    # newer pandas versions warn about.
    for column, encoder in (('Company', le_company), ('Account', le_account), ('AU', le_au)):
        anomalies[column] = encoder.inverse_transform(anomalies[column].astype(int)).astype(str)

    # Generate enhanced explanations and tasks for anomalies
    explanations = []
    for idx, row in anomalies.iterrows():
        explanation = generate_enhanced_explanation(row)
        explanations.append(explanation)

        # iterrows() yields copies, so the fields the ticket helpers read are
        # attached to a per-row record instead of being lost on the copy.
        record = row.copy()
        record['id'] = idx
        record['explanation'] = explanation
        record['description'] = explanation

        # Create JIRA ticket and Agentic AI task
        create_jira_ticket(record)
        create_agentic_task(record)

    # Persist the explanations so they appear in both the CSV and the email.
    anomalies['explanation'] = explanations

    # Save anomalies to CSV
    save_anomalies_to_csv(anomalies)

    subject = "Anomalies Detected"
    body = anomalies.to_html()

    msg = MIMEMultipart()
    msg['From'] = sender_email
    msg['To'] = recipient_email
    msg['Subject'] = subject
    msg.attach(MIMEText(body, 'html'))

    try:
        with smtplib.SMTP_SSL('smtp.gmail.com', 465) as server:
            server.login(sender_email, sender_password)
            server.sendmail(sender_email, recipient_email, msg.as_string())
        logging.info("Email sent successfully!")
    except Exception as e:
        logging.error(f"Failed to send email: {str(e)}")

# Feedback loop for anomaly detection reframing
def feedback_loop_and_retrain(df, feedback_data, model, features):
    """Retrain the Isolation Forest with a contamination rate tuned by feedback.

    The statistical contamination estimate is scaled by the share of reviewed
    anomalies that reviewers confirmed as real, so a high false-positive rate
    lowers the number of rows the retrained model flags. The result is kept
    within [0.01, 0.5], the same bounds ``estimate_contamination`` applies.
    This scaling is a simple heuristic. Previously the feedback subsets were
    computed but never used.

    Args:
        df: Frame containing the columns listed in ``features``.
        feedback_data: Frame with a ``feedback`` column where 1 marks a
            confirmed anomaly and 0 a false positive; other values are ignored.
        model: The previously trained model. It is not used; the parameter is
            kept so existing callers keep working.
        features: Names of the columns to train on.

    Returns:
        A newly fitted ``IsolationForest``.
    """
    logging.info("Applying feedback to adjust anomaly detection model...")

    confirmed = int((feedback_data['feedback'] == 1).sum())  # Feedback = 1 means anomaly is real
    rejected = int((feedback_data['feedback'] == 0).sum())  # Feedback = 0 means false positive
    reviewed = confirmed + rejected

    contamination = estimate_contamination(df)
    if reviewed:
        precision = confirmed / reviewed
        contamination = min(max(contamination * precision, 0.01), 0.5)
        logging.info(
            f"Feedback: {confirmed} confirmed, {rejected} false positives; "
            f"adjusted contamination: {contamination:.4f}"
        )

    # NOTE(review): train_model_tool fits on RobustScaler-scaled features while
    # this model is fitted on the raw df[features]; confirm which input space
    # downstream scoring is expected to use.
    model = IsolationForest(n_estimators=200, contamination=contamination, random_state=42, n_jobs=-1)
    model.fit(df[features])
    logging.info("Model retrained with feedback.")
    return model

# Main Execution
def main(file_path, feedback_file_path):
    """Run the pipeline: load, engineer features, train, apply feedback, notify.

    Args:
        file_path: Path of the main CSV data file.
        feedback_file_path: Path of a CSV file with a ``feedback`` column.

    The process exits with status 1 when the feedback file lacks the
    ``feedback`` column (and in the cases handled by ``load_data_tool``).
    """
    # Load the main data
    df, le_account, le_au, le_company = load_data_tool(file_path)

    # Load feedback data from a separate file (e.g., feedback_file.csv)
    feedback_data = pd.read_csv(feedback_file_path)

    # Ensure that feedback data has the expected column
    if 'feedback' not in feedback_data.columns:
        logging.error("Feedback data must contain a 'feedback' column.")
        sys.exit(1)

    # Feature engineering
    df, features = engineer_features_with_llm(df)

    # Train initial model (also adds the Anomaly / Anomaly_Score columns)
    model, df = train_model_tool(df, features)

    # Apply feedback loop to retrain the model
    if feedback_data is not None and not feedback_data.empty:
        model = feedback_loop_and_retrain(df, feedback_data, model, features)

        # Re-score with the retrained model. Without this the retrained model
        # was discarded and the report always reflected the initial model.
        # The retrained model is fitted on the raw df[features], so the same
        # unscaled columns are used here.
        df['Anomaly_Score'] = model.decision_function(df[features])
        df['Anomaly'] = (model.predict(df[features]) == -1).astype('int64')

    # Send email with the detected anomalies
    send_email_tool(df[df['Anomaly'] == 1], "recipient@example.com", le_account, le_au, le_company)


# Example of providing file paths:
file_path = "/path/to/your/file.csv"  # Main data file path
feedback_file_path = "/path/to/your/feedback_file.csv"  # Feedback data file path

# Run the pipeline only when this file is executed directly (script or
# notebook cell), so importing the module does not trigger a full run.
if __name__ == "__main__":
    main(file_path, feedback_file_path)
