In [50]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import logging
import os
import sys
import traceback
from pandas import Period
from datetime import datetime, timedelta
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

def setup_logging(run_dir):
    """Attach a DEBUG-level file handler to the root logger.

    The log file is named ``debug_<ddmmYYYY_HHMM>.log`` and is created inside
    ``run_dir``. Progress messages are echoed to stdout with ``print``.
    """
    timestamp = datetime.now().strftime("%d%m%Y_%H%M")
    log_path = os.path.join(run_dir, f"debug_{timestamp}.log")
    print(f"Attempting to create log file at: {os.path.abspath(log_path)}")

    # Build the handler together with its formatter in one go.
    handler = logging.FileHandler(log_path)
    handler.setLevel(logging.DEBUG)
    handler.setFormatter(
        logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    )

    # The root logger is used so every logging.<level>() call is captured.
    root = logging.getLogger()
    root.setLevel(logging.DEBUG)
    root.addHandler(handler)

    print("Logging setup complete")

def safe_execute(func):
    """Decorator that runs ``func`` and turns any exception into ``None``.

    Failures are logged on the root logger (message plus traceback) instead of
    propagating, so callers must be prepared to receive ``None``.
    ``functools.wraps`` preserves the wrapped function's ``__name__`` and
    docstring, which the bare wrapper used to hide.
    """
    import functools  # local import keeps this block self-contained

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except Exception as e:
            # exc_info=True attaches the traceback to the same log record.
            logging.error(f"Error in {func.__name__}: {str(e)}", exc_info=True)
            return None
    return wrapper

@safe_execute
def time_series_decomposition(series, period):
    """Run an additive seasonal decomposition of ``series``.

    A non-datetime index is converted with ``pd.to_datetime`` and assigned back
    onto the passed-in series before decomposing. Returns the
    ``seasonal_decompose`` result.
    """
    has_datetime_index = isinstance(series.index, pd.DatetimeIndex)
    if not has_datetime_index:
        series.index = pd.to_datetime(series.index)
    return seasonal_decompose(series, model='additive', period=period)

@safe_execute
def check_stationarity(series):
    """Return True when element 1 of the ``adfuller`` result is at most 0.05."""
    adf_output = adfuller(series)
    p_value = adf_output[1]
    return p_value <= 0.05

@safe_execute
def rolling_statistics(series, window):
    """Return ``(rolling mean, rolling std)`` of ``series`` for ``window``."""
    windowed = series.rolling(window=window)
    return windowed.mean(), windowed.std()

@safe_execute
def correlation_analysis(kyc_data):
    """Return the correlation matrix of the numeric KYC columns.

    Starts from the int64/float64 columns and adds ``pass_rate``,
    ``days_to_expiry`` and ``age`` when they exist. ``days_to_expiry`` and
    ``age`` are derived from the date columns (and written onto ``kyc_data``)
    when they are missing but their source columns are present.

    Args:
        kyc_data: DataFrame of KYC attempts.

    Returns:
        DataFrame produced by ``DataFrame.corr()`` over the selected columns.
    """
    candidates = kyc_data.select_dtypes(include=['int64', 'float64']).columns.tolist()

    if 'pass_rate' in kyc_data.columns:
        candidates.append('pass_rate')

    if 'days_to_expiry' in kyc_data.columns:
        candidates.append('days_to_expiry')
    elif 'date_of_expiry' in kyc_data.columns and 'created_at' in kyc_data.columns:
        until_expiry = pd.to_datetime(kyc_data['date_of_expiry']) - pd.to_datetime(kyc_data['created_at'])
        kyc_data['days_to_expiry'] = until_expiry.dt.days
        candidates.append('days_to_expiry')

    if 'age' in kyc_data.columns:
        candidates.append('age')
    elif 'date_of_birth' in kyc_data.columns and 'created_at' in kyc_data.columns:
        lifetime = pd.to_datetime(kyc_data['created_at']) - pd.to_datetime(kyc_data['date_of_birth'])
        # pandas >= 2.0 rejects astype('<m8[Y]') on timedeltas, so compute whole
        # years from days instead (365.2425 = mean Gregorian year length).
        kyc_data['age'] = lifetime.dt.days // 365.2425
        candidates.append('age')

    # dict.fromkeys de-duplicates while keeping a deterministic column order.
    ordered = list(dict.fromkeys(candidates))
    # Keep only columns that exist and are numeric, so corr() behaves the same
    # across pandas versions (newer versions raise on non-numeric data).
    numeric_cols = [
        col for col in ordered
        if col in kyc_data.columns and pd.api.types.is_numeric_dtype(kyc_data[col])
    ]

    return kyc_data[numeric_cols].corr()

@safe_execute
def trend_analysis(series):
    """Compute 7- and 30-point moving averages plus a linear trend line.

    The trend line is a degree-1 ``np.polyfit`` over the positions
    ``0..len(series)-1``, evaluated at those same positions.
    """
    positions = np.arange(len(series))
    coefficients = np.polyfit(positions, series.values, 1)
    fitted_line = np.poly1d(coefficients)(positions)

    return {
        'ma_short': series.rolling(window=7).mean(),
        'ma_long': series.rolling(window=30).mean(),
        'trend_line': fitted_line,
    }

@safe_execute
def seasonal_patterns(series, period):
    """Average ``series`` within each calendar period (e.g. ``'M'``).

    A non-datetime index is coerced with ``pd.to_datetime`` and assigned back
    onto the passed-in series. Returns an empty Series when nothing is left
    after ``dropna``.
    """
    index_is_datetime = isinstance(series.index, pd.DatetimeIndex)
    if not index_is_datetime:
        series.index = pd.to_datetime(series.index, errors='coerce')

    # dropna() removes entries whose *value* is missing.
    remaining = series.dropna()
    if not len(remaining):
        logging.warning("No valid dates for seasonal analysis")
        return pd.Series()

    period_keys = remaining.index.to_period(period)
    return remaining.groupby(period_keys).mean()

@safe_execute
def anomaly_detection(series, window=20, threshold=2):
    """Flag points whose rolling z-score exceeds ``threshold`` in magnitude.

    Returns a boolean mask aligned with ``series``.
    """
    windowed = series.rolling(window=window)
    deviation = series - windowed.mean()
    z_scores = deviation / windowed.std()
    return np.abs(z_scores) > threshold

@safe_execute
def interdependency_analysis(kyc_data):
    """Compute pass rates per date, split by document type and by country.

    Each table is the result of grouping by ``['date', <dimension>]``, applying
    ``calculate_pass_rate`` and unstacking the dimension into columns.
    """
    result_columns = ['result_facial', 'result_doc']

    def pass_rates_over_time(dimension):
        grouped = kyc_data.groupby(['date', dimension])[result_columns]
        return grouped.apply(calculate_pass_rate).unstack()

    return {
        'doc_type_pass_rates': pass_rates_over_time('document_type'),
        'country_pass_rates': pass_rates_over_time('issuing_country'),
    }

def validate_dates(kyc_data):
    """Ensure ``kyc_data`` has a datetime ``date`` column and drop bad rows.

    If ``date`` is missing it is derived from ``created_at``. The column is
    then coerced to datetime (written onto ``kyc_data``); rows whose date could
    not be parsed are excluded from the result and reported with a warning.

    Args:
        kyc_data: DataFrame with a ``date`` or ``created_at`` column.

    Returns:
        An independent copy of the rows with a valid ``date``. Returning a copy
        (not a filtered slice) lets callers add columns to the result without
        triggering pandas' SettingWithCopyWarning.

    Raises:
        ValueError: if neither ``date`` nor ``created_at`` is present.
    """
    if 'date' not in kyc_data.columns:
        if 'created_at' not in kyc_data.columns:
            raise ValueError("Neither 'date' nor 'created_at' column found in the DataFrame")
        kyc_data['date'] = pd.to_datetime(kyc_data['created_at']).dt.date

    kyc_data['date'] = pd.to_datetime(kyc_data['date'], errors='coerce')
    invalid_dates = kyc_data['date'].isnull()
    if invalid_dates.any():
        logging.warning(f"Found {invalid_dates.sum()} invalid dates. These will be excluded from analysis.")
    return kyc_data[~invalid_dates].copy()

# create markdown file
def create_markdown_file(filename, content):
    """Write ``content`` to ``filename``, replacing any existing file.

    The file is always written as UTF-8 so the report does not depend on the
    platform's default encoding.
    """
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(content)

# load data
def load_data(facial_similarity_path, doc_reports_path, properties_path):
    """Read the three CSV files and merge them into one KYC DataFrame.

    Facial and document reports are inner-joined on ``user_id`` and
    ``attempt_id`` (overlapping columns get ``_facial`` / ``_doc`` suffixes),
    ``created_at_facial`` is renamed to ``created_at``, and the properties
    table is left-joined on ``user_id``.
    """
    facial_df = pd.read_csv(facial_similarity_path)
    doc_df = pd.read_csv(doc_reports_path)
    properties_df = pd.read_csv(properties_path)

    logging.debug(f"Columns in facial_similarity: {facial_df.columns}")
    logging.debug(f"Columns in doc_reports: {doc_df.columns}")

    reports = facial_df.merge(
        doc_df, on=['user_id', 'attempt_id'], suffixes=('_facial', '_doc')
    ).rename(columns={'created_at_facial': 'created_at'})
    logging.debug(f"Columns after first merge: {reports.columns}")

    kyc_data = reports.merge(properties_df, on='user_id', how='left')
    logging.debug(f"Final columns in kyc_data: {kyc_data.columns}")

    return kyc_data

# calculate pass rates
def calculate_pass_rate(df):
    """Return the share of rows where both checks are ``'clear'``.

    A Series argument is simply averaged. For a DataFrame the result is the
    fraction of rows with ``result_facial`` and ``result_doc`` both equal to
    ``'clear'``; an empty DataFrame yields 0.
    """
    if isinstance(df, pd.Series):
        return df.mean()

    both_clear = (df['result_facial'] == 'clear') & (df['result_doc'] == 'clear')
    n_passed = len(df[both_clear])
    n_total = len(df)
    if n_total > 0:
        return n_passed / n_total
    return 0

# get failure reason
def get_failure_reason(row):
    """Classify one KYC attempt by which check(s) were not ``'clear'``.

    Args:
        row: Mapping/Series with ``result_facial`` and ``result_doc``.

    Returns:
        ``'Both'``, ``'Facial Similarity'`` or ``'Document Check'`` for failed
        attempts, and ``'Passed'`` when both checks are clear. The callers
        filter on ``failure_reason != 'Passed'``, so passing rows must carry
        exactly that label or they would be counted as failures.
    """
    facial_failed = row['result_facial'] != 'clear'
    doc_failed = row['result_doc'] != 'clear'

    if facial_failed and doc_failed:
        return 'Both'
    if facial_failed:
        return 'Facial Similarity'
    if doc_failed:
        return 'Document Check'
    return 'Passed'


# standardise gender labels
def standardize_gender(gender):
    """Map a raw gender value to ``'Male'``, ``'Female'`` or ``'Other'``.

    The value is stringified and upper-cased before matching, so anything that
    is not MALE/M or FEMALE/F (including missing values) becomes ``'Other'``.
    """
    canonical_labels = {
        'MALE': 'Male',
        'M': 'Male',
        'FEMALE': 'Female',
        'F': 'Female',
    }
    return canonical_labels.get(str(gender).upper(), 'Other')

def _save_current_figure(run_dir, filename):
    """Save the current matplotlib figure as ``run_dir/filename`` and close it."""
    plt.savefig(os.path.join(run_dir, filename))
    plt.close()


def generate_new_visualizations(oct_2017_analysis, ts_decomposition, rolling_mean, rolling_std,
                                trend_results, seasonal_patterns_result, anomalies,
                                interdependencies, daily_pass_rates, run_dir):
    """Render the time-series related charts as PNG files in ``run_dir``.

    Several inputs come from ``safe_execute``-wrapped steps and are ``None``
    when that step failed; the matching chart is then skipped with a warning
    instead of aborting all remaining charts.

    Args:
        oct_2017_analysis: dict with a ``'daily_pass_rates'`` Series.
        ts_decomposition: decomposition result exposing ``.plot()``, or None.
        rolling_mean, rolling_std: rolling statistics series, or None.
        trend_results: dict with ``'ma_short'``, ``'ma_long'`` and
            ``'trend_line'``, or None.
        seasonal_patterns_result: Series of per-period means, or None.
        anomalies: boolean mask aligned with ``daily_pass_rates``, or None.
        interdependencies: dict with ``'doc_type_pass_rates'`` and
            ``'country_pass_rates'`` tables, or None.
        daily_pass_rates: Series of pass rates.
        run_dir: directory receiving the PNG files.
    """
    # October 2017 daily pass rates
    oct_daily = oct_2017_analysis['daily_pass_rates']
    if oct_daily is not None and not oct_daily.empty:
        plt.figure(figsize=(12, 6))
        oct_daily.plot()
        plt.title('October 2017 Daily Pass Rates')
        _save_current_figure(run_dir, 'oct_2017_pass_rates.png')
    else:
        logging.warning("Skipping October 2017 pass rate plot: no data")

    # Time series decomposition (draws its own figure)
    if ts_decomposition is not None:
        ts_decomposition.plot()
        _save_current_figure(run_dir, 'ts_decomposition.png')
    else:
        logging.warning("Skipping decomposition plot: no result available")

    # Rolling statistics
    if rolling_mean is not None and rolling_std is not None:
        plt.figure(figsize=(12, 6))
        plt.plot(rolling_mean, label='Rolling Mean')
        plt.plot(rolling_std, label='Rolling Std')
        plt.legend()
        plt.title('Rolling Statistics')
        _save_current_figure(run_dir, 'rolling_statistics.png')
    else:
        logging.warning("Skipping rolling statistics plot: no result available")

    # Trend analysis
    if trend_results is not None:
        plt.figure(figsize=(12, 6))
        plt.plot(trend_results['ma_short'], label='7-day MA')
        plt.plot(trend_results['ma_long'], label='30-day MA')
        # 'trend_line' is a bare array; plot it against the same index as the
        # moving averages so all three curves share one x-axis.
        plt.plot(trend_results['ma_short'].index, trend_results['trend_line'], label='Trend Line')
        plt.legend()
        plt.title('Trend Analysis')
        _save_current_figure(run_dir, 'trend_analysis.png')
    else:
        logging.warning("Skipping trend plot: no result available")

    # Seasonal patterns
    if seasonal_patterns_result is not None and not seasonal_patterns_result.empty:
        seasonal_patterns_result.plot(kind='bar')
        plt.title('Seasonal Patterns')
        _save_current_figure(run_dir, 'seasonal_patterns.png')
    else:
        logging.warning("Skipping seasonal patterns plot: no data")

    # Anomaly detection
    if anomalies is not None:
        plt.figure(figsize=(12, 6))
        plt.plot(daily_pass_rates)
        plt.scatter(daily_pass_rates.index[anomalies], daily_pass_rates[anomalies], color='red', label='Anomalies')
        plt.legend()
        plt.title('Anomaly Detection')
        _save_current_figure(run_dir, 'anomaly_detection.png')
    else:
        logging.warning("Skipping anomaly plot: no result available")

    # Interdependencies over time
    if interdependencies is not None:
        interdependency_charts = (
            ('doc_type_pass_rates', 'Pass Rates by Document Type Over Time',
             'doc_type_pass_rates_over_time.png'),
            ('country_pass_rates', 'Pass Rates by Country Over Time',
             'country_pass_rates_over_time.png'),
        )
        for key, title, filename in interdependency_charts:
            interdependencies[key].plot(figsize=(12, 6))
            plt.title(title)
            _save_current_figure(run_dir, filename)
    else:
        logging.warning("Skipping interdependency plots: no result available")

# perform analysis
def perform_analysis(kyc_data, run_dir):
    """Run the full KYC analysis, save the charts and return the results.

    Derived columns (``created_at``, ``date``, ``days_to_expiry``,
    ``failure_reason`` and a standardised ``gender``) are written onto
    ``kyc_data`` itself. Every chart is saved as a PNG inside ``run_dir``.

    Args:
        kyc_data: merged KYC DataFrame; must contain ``created_at_facial`` or
            ``created_at_doc`` plus the result, document, country, gender and
            expiry columns used below.
        run_dir: existing directory that receives the PNG files.

    Returns:
        dict of analysis results (pass rates, failure counts, time-series
        results, correlation matrix, interdependencies, ...). Entries produced
        by ``safe_execute``-wrapped steps may be ``None``.

    Raises:
        ValueError: if no creation timestamp column is found.
        Exception: any other failure is logged and re-raised.
    """
    try:
        print("Columns available in perform_analysis:", kyc_data.columns)
        logging.info(f"Type of kyc_data: {type(kyc_data)}")
        logging.info(f"Columns in kyc_data: {kyc_data.columns.tolist()}")
        logging.info(f"Data types of columns: {kyc_data.dtypes}")
        if 'created_at_facial' in kyc_data.columns:
            kyc_data['created_at'] = pd.to_datetime(kyc_data['created_at_facial'])
        elif 'created_at_doc' in kyc_data.columns:
            kyc_data['created_at'] = pd.to_datetime(kyc_data['created_at_doc'])
        else:
            raise ValueError("Neither 'created_at_facial' nor 'created_at_doc' found in the DataFrame")

        kyc_data['date'] = kyc_data['created_at'].dt.date
        # '9999-01-01' is treated as a missing expiry date.
        kyc_data['date_of_expiry'] = pd.to_datetime(kyc_data['date_of_expiry'].replace('9999-01-01', pd.NaT), errors='coerce')
        kyc_data['date_of_expiry'] = kyc_data['date_of_expiry'].dt.tz_localize('UTC')
        kyc_data['gender'] = kyc_data['gender'].apply(standardize_gender)
        kyc_data['failure_reason'] = kyc_data.apply(get_failure_reason, axis=1)

        # Make created_at tz-aware so it can be subtracted from date_of_expiry.
        kyc_data['created_at'] = pd.to_datetime(kyc_data['created_at'], utc=True)

        overall_pass_rate = calculate_pass_rate(kyc_data)
        print(f"Overall KYC Pass Rate: {overall_pass_rate:.2%}")

        # Daily pass rates
        daily_pass_rates = kyc_data.groupby('date')[['result_facial', 'result_doc']].apply(calculate_pass_rate)

        plt.figure(figsize=(12, 6))
        daily_pass_rates.plot()
        plt.title('Daily KYC Pass Rates')
        plt.xlabel('Date')
        plt.ylabel('Pass Rate')
        plt.xticks(rotation=45, ha='right')
        plt.subplots_adjust(bottom=0.2)  # Adjust the bottom margin
        plt.tight_layout()
        plt.savefig(os.path.join(run_dir, f'daily_pass_rates.png'))
        plt.close()

        # Failure reasons (rows labelled 'Passed' are not failures)
        failure_counts = kyc_data[kyc_data['failure_reason'] != 'Passed']['failure_reason'].value_counts()

        plt.figure(figsize=(15, 8))  # Increase figure width
        plt.bar(failure_counts.index, failure_counts.values)
        plt.title('KYC Failure Reasons')
        plt.xlabel('Reason')
        plt.ylabel('Count')
        plt.xticks(rotation=45, ha='right')  # Rotate labels and align right
        plt.tight_layout()  # Adjust layout to prevent cutting off labels
        plt.savefig(os.path.join(run_dir, 'failure_reasons.png'), bbox_inches='tight')
        plt.close()

        # Document types
        doc_type_pass_rates = kyc_data.groupby('document_type')[['result_facial', 'result_doc']].apply(calculate_pass_rate)

        # Document type pass rates
        plt.figure(figsize=(15, 8))  # Increase figure size
        doc_type_pass_rates.plot(kind='bar')
        plt.title('Pass Rates by Document Type')
        plt.xlabel('Document Type')
        plt.ylabel('Pass Rate')
        # Tick rotation and layout must come after the bar plot, which sets its
        # own tick rotation and would otherwise override these settings.
        plt.xticks(rotation=45, ha='right')
        plt.tight_layout()
        plt.savefig(os.path.join(run_dir, 'doc_type_pass_rates.png'), bbox_inches='tight')
        plt.close()

        # Issuing countries
        country_pass_rates = kyc_data.groupby('issuing_country')[['result_facial', 'result_doc']].apply(calculate_pass_rate)
        top_10_countries = country_pass_rates.nlargest(10)
        bottom_10_countries = country_pass_rates.nsmallest(10)

        plt.figure(figsize=(12, 6))
        top_10_countries.plot(kind='bar')
        plt.title('Top 10 Countries by KYC Pass Rate')
        plt.xlabel('Issuing Country')
        plt.ylabel('Pass Rate')
        # Saved only into run_dir; no stray copy in the working directory.
        plt.savefig(os.path.join(run_dir, f'top_10_countries.png'))
        plt.close()

        # Gender analysis
        gender_pass_rates = kyc_data.groupby('gender')[['result_facial', 'result_doc']].apply(calculate_pass_rate)

        plt.figure(figsize=(8, 6))
        gender_pass_rates.plot(kind='bar')
        plt.title('Pass Rates by Gender')
        plt.xlabel('Gender')
        plt.ylabel('Pass Rate')
        plt.savefig(os.path.join(run_dir, f'gender_pass_rates.png'))
        plt.close()

        # Document expiry analysis
        kyc_data['date_of_expiry'] = pd.to_datetime(kyc_data['date_of_expiry'])
        kyc_data['days_to_expiry'] = (kyc_data['date_of_expiry'] - kyc_data['created_at']).dt.days

        plt.figure(figsize=(12, 8))  # Increase figure size
        sns.boxplot(x='failure_reason', y='days_to_expiry', data=kyc_data)
        plt.title('Days to Document Expiry vs KYC Outcome')
        plt.xlabel('KYC Outcome')
        plt.ylabel('Days to Expiry')
        plt.xticks(rotation=45, ha='right')  # Rotate labels and align
        plt.tight_layout()  # Adjust layout to fit labels
        plt.savefig(os.path.join(run_dir, 'expiry_vs_outcome.png'), bbox_inches='tight')
        plt.close()

        plt.figure(figsize=(10, 6))
        sns.barplot(x=failure_counts.index, y=failure_counts.values)
        plt.title('KYC Failure Reasons (Including Data Anomaly)')
        plt.xlabel('Reason')
        plt.ylabel('Count')
        plt.xticks(rotation=45, ha='right')
        plt.tight_layout()
        plt.savefig(os.path.join(run_dir, f'failure_reasons_with_anomaly.png'))
        plt.close()

        # Analyze October 2017 anomaly
        oct_2017_analysis = analyze_october_2017_anomaly(kyc_data)

        # Perform time-series analysis
        logging.info(f"Type of column selector: {type(['result_facial', 'result_doc'])}")
        logging.info(f"Content of column selector: {['result_facial', 'result_doc']}")
        try:
            daily_pass_rates = kyc_data.groupby('date')[['result_facial', 'result_doc']].apply(calculate_pass_rate)
            logging.info(f"Groupby operation successful. Shape of daily_pass_rates: {daily_pass_rates.shape}")
        except Exception as e:
            logging.error(f"Error in groupby operation: {str(e)}")
            logging.info(f"First few rows of kyc_data: {kyc_data.head().to_dict()}")
            raise

        # Decompose time series
        ts_decomposition = time_series_decomposition(daily_pass_rates, period=30)

        # Check stationarity
        is_stationary = check_stationarity(daily_pass_rates)

        # Calculate rolling statistics. The step returns None on failure, so
        # unpack defensively rather than raising a TypeError here.
        rolling_result = rolling_statistics(daily_pass_rates, window=7)
        if rolling_result is not None:
            rolling_mean, rolling_std = rolling_result
        else:
            rolling_mean, rolling_std = None, None

        # Perform correlation analysis
        corr_matrix = correlation_analysis(kyc_data)

        # Analyze trends
        trend_results = trend_analysis(daily_pass_rates)

        # Identify seasonal patterns
        seasonal_patterns_result = seasonal_patterns(daily_pass_rates, 'M')

        # Detect anomalies
        anomalies = anomaly_detection(daily_pass_rates)

        # Analyze interdependencies
        interdependencies = interdependency_analysis(kyc_data)

        # Generate visualizations for new analyses
        generate_new_visualizations(oct_2017_analysis, ts_decomposition, rolling_mean, rolling_std,
                                    trend_results, seasonal_patterns_result, anomalies,
                                    interdependencies, daily_pass_rates, run_dir)

        # Compile all results
        analysis_results = {
            'overall_pass_rate': overall_pass_rate,
            'daily_pass_rates': daily_pass_rates,
            'failure_counts': failure_counts,
            'doc_type_pass_rates': doc_type_pass_rates,
            'top_10_countries': top_10_countries,
            'bottom_10_countries': bottom_10_countries,
            'gender_pass_rates': gender_pass_rates,
            'oct_2017_analysis': oct_2017_analysis,
            'is_stationary': is_stationary,
            'correlation_matrix': corr_matrix,
            'trend_results': trend_results,
            'seasonal_patterns': seasonal_patterns_result,
            'anomalies': anomalies,
            'interdependencies': interdependencies
        }

        return analysis_results

    except Exception as e:
        logging.error(f"Error in perform_analysis: {str(e)}")
        logging.exception("Exception details:")
        raise  # Re-raise the exception after logging

# generate conclusions
def generate_conclusions(analysis_results):
    """Turn the analysis results into a list of conclusion sentences.

    Entries that are missing or ``None`` (the value a failed
    ``safe_execute``-wrapped step produces) are skipped instead of raising or
    being reported as a finding.

    Args:
        analysis_results: dict as returned by ``perform_analysis``.

    Returns:
        list[str] of conclusions, each optionally tagged with a chart name.
    """
    conclusions = []

    def result(key):
        # Missing key and failed step (None) are treated the same way.
        return analysis_results.get(key)

    overall = result('overall_pass_rate')
    if overall is not None:
        conclusions.append(f"The overall KYC pass rate is {overall:.2%}. [daily_pass_rates.png]")

    daily = result('daily_pass_rates')
    if daily is not None and len(daily) > 1:
        direction = 'increased' if daily.iloc[-1] > daily.iloc[0] else 'decreased'
        conclusions.append(f"The pass rate has {direction} over time. [daily_pass_rates.png]")

    failure_counts = result('failure_counts')
    if failure_counts is not None and not failure_counts.empty:
        conclusions.append(f"The main reason for KYC failures is {failure_counts.index[0]}. [failure_reasons.png]")

    doc_rates = result('doc_type_pass_rates')
    if doc_rates is not None and not doc_rates.empty:
        conclusions.append(f"The document type with the highest pass rate is {doc_rates.idxmax()}. [doc_type_pass_rates.png]")
        conclusions.append(f"The document type with the lowest pass rate is {doc_rates.idxmin()}. [doc_type_pass_rates.png]")

    top_countries = result('top_10_countries')
    if top_countries is not None and not top_countries.empty:
        conclusions.append(f"The country with the highest pass rate is {top_countries.index[0]}. [top_10_countries.png]")

    bottom_countries = result('bottom_10_countries')
    if bottom_countries is not None and not bottom_countries.empty:
        conclusions.append(f"The country with the lowest pass rate is {bottom_countries.index[0]}. [top_10_countries.png]")

    gender_rates = result('gender_pass_rates')
    if gender_rates is not None and 'Male' in gender_rates and 'Female' in gender_rates:
        leader = 'Male' if gender_rates['Male'] > gender_rates['Female'] else 'Female'
        conclusions.append(f"{leader} applicants have a higher pass rate. [gender_pass_rates.png]")

    is_stationary = result('is_stationary')
    if is_stationary is not None:
        conclusions.append(f"The KYC pass rates {'are' if is_stationary else 'are not'} stationary over time.")

    corr_matrix = result('correlation_matrix')
    if corr_matrix is not None and 'pass_rate' in corr_matrix:
        # Exclude the trivial self-correlation and rank by magnitude so strong
        # negative relationships are not overlooked.
        others = corr_matrix['pass_rate'].drop(labels='pass_rate', errors='ignore').dropna()
        if not others.empty:
            conclusions.append(f"The strongest correlation with pass rates is observed for {others.abs().idxmax()}.")

    seasonal = result('seasonal_patterns')
    if seasonal is not None and not seasonal.empty:
        try:
            max_period = seasonal.idxmax()
            # freqstr is a plain string, which avoids comparing an offset
            # object against the 'M' alias.
            if isinstance(max_period, pd.Period) and max_period.freqstr == 'M':
                period_str = max_period.strftime('%B %Y')
            else:
                period_str = str(max_period)
            conclusions.append(f"Seasonal patterns show highest pass rates in {period_str}.")
        except Exception as e:
            logging.warning(f"Could not determine highest seasonal pass rate: {str(e)}")

    anomalies = result('anomalies')
    if anomalies is not None:
        conclusions.append(f"{int(np.sum(anomalies))} anomalies were detected in the pass rates time series.")

    return conclusions

# generate recommendations
def generate_recommendations(analysis_results):
    """Build the list of recommendation sentences for the report.

    Data-driven recommendations are added only when the matching result is
    present, not ``None`` and not empty; the generic recommendations are always
    included.

    Args:
        analysis_results: dict as returned by ``perform_analysis``.

    Returns:
        list[str] of recommendations, each optionally tagged with a chart name.
    """
    recommendations = []

    failure_counts = analysis_results.get('failure_counts')
    if failure_counts is not None and not failure_counts.empty:
        recommendations.append(f"Focus on improving the {failure_counts.index[0]} check process, as it's the main reason for failures. [failure_reasons.png]")

    doc_rates = analysis_results.get('doc_type_pass_rates')
    if doc_rates is not None and not doc_rates.empty:
        recommendations.append(f"Enhance the verification process for {doc_rates.idxmin()} documents, which have the lowest pass rate. [doc_type_pass_rates.png]")

    bottom_countries = analysis_results.get('bottom_10_countries')
    if bottom_countries is not None and not bottom_countries.empty:
        recommendations.append(f"Investigate why {bottom_countries.index[0]} has the lowest pass rate and implement country-specific improvements. [top_10_countries.png]")

    recommendations.extend([
        "Provide clearer instructions to users on how to take high-quality photos for both documents and facial images. [failure_reasons.png]",
        "Implement a feedback loop to continuously monitor and improve the KYC process. [daily_pass_rates.png]"
    ])

    gender_rates = analysis_results.get('gender_pass_rates')
    if gender_rates is not None and 'Male' in gender_rates and 'Female' in gender_rates:
        # Recommend work on whichever group currently has the lower pass rate.
        lagging = 'female' if gender_rates['Male'] > gender_rates['Female'] else 'male'
        recommendations.append(f"Address the gender disparity in pass rates, focusing on improving the process for {lagging} applicants. [gender_pass_rates.png]")

    recommendations.extend([
        "Consider implementing a system to remind users to update their documents well before expiry. [expiry_vs_outcome.png]",
        "Develop a risk-based approach, potentially fast-tracking applications from low-risk countries and age groups. [top_10_countries.png]",
        "Invest in machine learning models to predict KYC outcomes and identify high-risk applications early in the process. [failure_reasons.png]",
        "Implement strategies to address seasonal variations in pass rates.",
        "Investigate and address the root causes of detected anomalies in pass rates."
    ])

    # The interdependency step yields None when it failed; guard before lookup.
    interdependencies = analysis_results.get('interdependencies')
    if interdependencies is not None and 'doc_type_pass_rates' in interdependencies:
        doc_rates_over_time = interdependencies['doc_type_pass_rates']
        if doc_rates_over_time is not None and not doc_rates_over_time.empty:
            min_doc_type = doc_rates_over_time.min().idxmin()
            recommendations.append(f"Focus on improving {min_doc_type} document verification process, which shows consistently lower pass rates.")

    return recommendations

def analyze_october_2017_anomaly(kyc_data):
    """Summarise KYC outcomes for attempts created during October 2017.

    Args:
        kyc_data: DataFrame with ``created_at``, ``date``, ``failure_reason``,
            ``document_type``, ``issuing_country``, ``gender`` and the two
            result columns.

    Returns:
        dict with daily pass rates, failure-reason counts and pass rates by
        document type, country and gender for that month.
    """
    # Half-open interval [1 Oct, 1 Nov): comparing with '<= 2017-10-31' would
    # cut off at midnight and drop almost all attempts made on 31 October.
    start_date = '2017-10-01'
    end_date_exclusive = '2017-11-01'

    in_october = (kyc_data['created_at'] >= start_date) & (kyc_data['created_at'] < end_date_exclusive)
    oct_data = kyc_data[in_october]

    result_columns = ['result_facial', 'result_doc']

    def pass_rate_by(column):
        # Pass rate of the October rows grouped by one column.
        return oct_data.groupby(column)[result_columns].apply(calculate_pass_rate)

    # Rows labelled 'Passed' are not failures and are left out of the counts.
    failure_reasons = oct_data[oct_data['failure_reason'] != 'Passed']['failure_reason'].value_counts()

    return {
        'daily_pass_rates': pass_rate_by('date'),
        'failure_reasons': failure_reasons,
        'doc_type_pass_rates': pass_rate_by('document_type'),
        'country_pass_rates': pass_rate_by('issuing_country'),
        'gender_pass_rates': pass_rate_by('gender')
    }

# analysis
def run_analysis(facial_similarity_path, doc_reports_path, properties_path):
    """Run the whole pipeline and write the Markdown report.

    Creates a timestamped ``analysis_run_<ddmmYYYY_HHMM>`` directory, sets up
    file logging in it, loads and validates the data, runs the analysis and
    writes ``analysis_output.md`` next to the generated charts.

    Any exception is logged (with traceback) and swallowed; in that case the
    Markdown built so far is returned.

    Args:
        facial_similarity_path: CSV path of the facial similarity reports.
        doc_reports_path: CSV path of the document reports.
        properties_path: CSV path of the document properties.

    Returns:
        str: the Markdown report content.
    """
    logging.info("Starting run_analysis function")

    current_time = datetime.now().strftime("%d%m%Y_%H%M")
    run_dir = f"analysis_run_{current_time}"
    os.makedirs(run_dir, exist_ok=True)

    setup_logging(run_dir)

    logging.info("Starting analysis")

    # Initialised before the try block so there is always something to return.
    markdown_content = "# KYC Analysis Report\n\n"

    try:
        kyc_data = load_data(facial_similarity_path, doc_reports_path, properties_path)
        kyc_data = validate_dates(kyc_data)

        analysis_results = perform_analysis(kyc_data, run_dir)
        conclusions = generate_conclusions(analysis_results)
        recommendations = generate_recommendations(analysis_results)

        # The report is assembled once, section by section.
        logging.info("Initialized markdown_content")

        markdown_content += "## Conclusions\n\n"
        for i, conclusion in enumerate(conclusions, 1):
            markdown_content += f"{i}. {conclusion}\n"

        markdown_content += "\n## Recommendations\n\n"
        for i, recommendation in enumerate(recommendations, 1):
            markdown_content += f"{i}. {recommendation}\n"

        markdown_content += "\n## Supporting Graphs\n\n"
        # sorted() makes the graph order reproducible; os.listdir order is not.
        for graph in sorted(os.listdir(run_dir)):
            if graph.endswith('.png'):
                markdown_content += f"![{graph.split('.')[0]}]({graph})\n\n"

        # The content generators return None when they fail internally, so
        # each section falls back to a fixed sentence instead of "+= None".
        markdown_content += "\n## October 2017 Anomaly Analysis\n\n"
        oct_content = None
        if 'oct_2017_analysis' in analysis_results:
            oct_content = generate_oct_2017_analysis_content(analysis_results['oct_2017_analysis'])
        markdown_content += oct_content or "October 2017 analysis not available.\n"

        markdown_content += "\n## Time Series Analysis\n\n"
        time_series_content = generate_time_series_analysis_content(analysis_results)
        if not time_series_content:
            logging.warning("Time series analysis content generation returned empty string")
        markdown_content += time_series_content or "Time series analysis unavailable.\n"

        markdown_content += "\n## Interdependency Analysis\n\n"
        interdependency_content = None
        if analysis_results.get('interdependencies') is not None:
            interdependency_content = generate_interdependency_analysis_content(analysis_results['interdependencies'])
        markdown_content += interdependency_content or "Interdependency analysis not available.\n"

        logging.info(f"Total markdown content length: {len(markdown_content)}")

        # Create markdown file in the run directory
        markdown_filename = os.path.join(run_dir, "analysis_output.md")
        create_markdown_file(markdown_filename, markdown_content)

        logging.info(f"Analysis complete. Results written to {markdown_filename}")

    except Exception as e:
        logging.error(f"Error in run_analysis: {str(e)}")
        logging.exception("Exception details:")  # This will log the full stack trace

    return markdown_content

@safe_execute
def generate_oct_2017_analysis_content(oct_2017_analysis):
    """Return the Markdown lines describing the October 2017 results.

    Empty or missing result tables are skipped: calling ``index[0]`` or
    ``idxmax()`` on an empty Series raises, which would otherwise discard the
    whole section.

    Args:
        oct_2017_analysis: dict produced by ``analyze_october_2017_anomaly``.

    Returns:
        str with one line per available finding (may be empty).
    """
    content = ""
    if 'daily_pass_rates' in oct_2017_analysis:
        content += "Daily pass rates analysis for October 2017 is available.\n"

    failure_reasons = oct_2017_analysis.get('failure_reasons')
    if failure_reasons is not None and not failure_reasons.empty:
        content += f"Top failure reason: {failure_reasons.index[0]}\n"

    best_of = (
        ('doc_type_pass_rates', 'Document type'),
        ('country_pass_rates', 'Country'),
        ('gender_pass_rates', 'Gender'),
    )
    for key, label in best_of:
        rates = oct_2017_analysis.get(key)
        if rates is None:
            continue
        rates = rates.dropna()  # idxmax() needs at least one real value
        if not rates.empty:
            content += f"{label} with highest pass rate: {rates.idxmax()}\n"
    return content

@safe_execute
def generate_time_series_analysis_content(analysis_results):
    """Return the Markdown lines for the time-series section of the report.

    Results that are ``None`` (a failed analysis step) are not reported as
    available. The anomaly mask may be a list, a NumPy array or a pandas
    Series; the last is what ``anomaly_detection`` returns.

    Args:
        analysis_results: dict as returned by ``perform_analysis``.

    Returns:
        str with one line per finding; a fallback sentence if nothing could be
        generated.
    """
    logging.info("Starting time series analysis content generation")
    logging.debug(f"Analysis results keys: {analysis_results.keys()}")
    content = ""

    try:
        if 'is_stationary' in analysis_results:
            is_stationary = analysis_results['is_stationary']
            if is_stationary is None:
                # Failed stationarity test: do not claim "non-stationary".
                content += "Stationarity of the time series could not be determined.\n"
            else:
                content += f"The time series is {'stationary' if is_stationary else 'non-stationary'}.\n"

        if analysis_results.get('trend_results') is not None:
            content += "Trend analysis results are available.\n"

        seasonal = analysis_results.get('seasonal_patterns')
        if seasonal is not None:
            if not seasonal.empty:
                try:
                    max_period = seasonal.idxmax()
                    if isinstance(max_period, Period) and max_period.freqstr == 'M':
                        period_str = max_period.strftime('%B %Y')
                    else:
                        period_str = str(max_period)
                    content += f"Seasonal patterns analysis is available. Highest pass rates in {period_str}.\n"
                except Exception as e:
                    logging.error(f"Error processing seasonal patterns: {str(e)}")
                    content += "Seasonal patterns analysis is available, but there was an error processing the results.\n"
            else:
                content += "Seasonal patterns analysis is available, but no patterns were found.\n"

        anomalies = analysis_results.get('anomalies')
        if anomalies is not None:
            if isinstance(anomalies, (list, np.ndarray, pd.Series)):
                anomaly_count = int(np.sum(anomalies))
                content += f"Anomaly detection found {anomaly_count} anomalies in the time series.\n"
            else:
                logging.warning(f"Unexpected type for anomalies: {type(anomalies)}")
                content += "Anomaly detection results are available, but in an unexpected format.\n"
        elif 'anomalies' in analysis_results:
            logging.warning("Anomaly detection produced no result")

        # Add detailed logging for troubleshooting
        for key, value in analysis_results.items():
            if isinstance(value, (pd.Series, pd.DataFrame)):
                logging.debug(f"Data type of '{key}': {type(value)}")
                logging.debug(f"Index type of '{key}': {type(value.index)}")
                if len(value) > 0:
                    logging.debug(f"First element type of '{key}': {type(value.iloc[0])}")
                    logging.debug(f"First element of '{key}': {value.iloc[0]}")
                    if isinstance(value.index[0], Period):
                        logging.debug(f"First index element of '{key}' is a Period object")
    except Exception as e:
        # Report where the failure happened (last frame of the traceback).
        error_info = traceback.extract_tb(sys.exc_info()[2])
        filename, line_number, _, _ = error_info[-1]
        logging.error(f"Error in {filename}, line {line_number}: {str(e)}")
        content += f"Error occurred during time series analysis content generation in {filename}, line {line_number}.\n"

    if not content:
        content = "No time series analysis content could be generated.\n"

    logging.info(f"Completed time series analysis content generation. Content length: {len(content)}")
    return content

@safe_execute
def generate_interdependency_analysis_content(interdependencies):
    """Build report text listing which interdependency results are present.

    Args:
        interdependencies: Container supporting ``in`` (typically a dict)
            keyed by result name, or ``None`` when no results were produced.

    Returns:
        str: One newline-terminated sentence per recognised key that is
        present, always in the same order (document type first, then
        country). An empty string if no recognised key is present or if
        ``interdependencies`` is ``None``.
    """
    if interdependencies is None:
        # Functions wrapped in safe_execute return None on failure, so a None
        # may reach this point. Treat it as "no results" instead of letting
        # the membership tests below raise TypeError.
        logging.warning("Interdependency results are None; no content generated")
        return ""

    # (result key, sentence emitted when that key is present), in output order.
    availability_lines = (
        ('doc_type_pass_rates', "Pass rates by document type over time are available.\n"),
        ('country_pass_rates', "Pass rates by country over time are available.\n"),
    )
    return "".join(
        sentence for key, sentence in availability_lines if key in interdependencies
    )

# Notebook entry point: invoke run_analysis with the three input CSV file names
# (facial similarity reports, document reports, properties), given as relative
# paths. NOTE(review): presumably resolved against the current working
# directory — confirm against run_analysis.
run_analysis('facial_similarity_reports.csv', 'doc_reports.csv', 'properties.csv')

INFO:root:Starting run_analysis function
INFO:root:Starting analysis


Attempting to create log file at: /Users/davidlee/_Code/rev/cs1/analysis_run_14082024_1903/debug_14082024_1903.log
Logging setup complete


DEBUG:root:Columns in facial_similarity: Index(['Unnamed: 0', 'user_id', 'result', 'face_comparison_result',
       'created_at', 'facial_image_integrity_result',
       'visual_authenticity_result', 'properties', 'attempt_id'],
      dtype='object')
DEBUG:root:Columns in doc_reports: Index(['Unnamed: 0', 'user_id', 'result', 'visual_authenticity_result',
       'image_integrity_result', 'face_detection_result',
       'image_quality_result', 'created_at', 'supported_document_result',
       'conclusive_document_quality_result', 'colour_picture_result',
       'data_validation_result', 'data_consistency_result',
       'data_comparison_result', 'attempt_id', 'police_record_result',
       'compromised_document_result', 'properties', 'sub_result'],
      dtype='object')
DEBUG:root:Columns after first merge: Index(['Unnamed: 0_facial', 'user_id', 'result_facial',
       'face_comparison_result', 'created_at', 'facial_image_integrity_result',
       'visual_authenticity_result_facial', 'p

Columns available in perform_analysis: Index(['Unnamed: 0_facial', 'user_id', 'result_facial',
       'face_comparison_result', 'created_at', 'facial_image_integrity_result',
       'visual_authenticity_result_facial', 'properties_facial', 'attempt_id',
       'Unnamed: 0_doc', 'result_doc', 'visual_authenticity_result_doc',
       'image_integrity_result_x', 'face_detection_result_x',
       'image_quality_result_x', 'created_at_doc',
       'supported_document_result_x', 'conclusive_document_quality_result_x',
       'colour_picture_result_x', 'data_validation_result_x',
       'data_consistency_result_x', 'data_comparison_result_x',
       'police_record_result_x', 'compromised_document_result_x',
       'properties_doc', 'sub_result_x', 'result',
       'visual_authenticity_result', 'image_integrity_result_y',
       'face_detection_result_y', 'image_quality_result_y',
       'supported_document_result_y', 'conclusive_document_quality_result_y',
       'colour_picture_result_y', 'd

INFO:root:Type of column selector: <class 'list'>
INFO:root:Content of column selector: ['result_facial', 'result_doc']
INFO:root:Groupby operation successful. Shape of daily_pass_rates: (161,)
