In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from fpdf import FPDF
import os

# Load the dataset
def load_dataset(file_path, sep=','):
    return pd.read_csv(file_path, sep=sep)

# Clean specific column
def clean_column(df, column):
    df[column] = df[column].astype(str)
    df = df[(df[column] != 'Null') & (df[column] != 'Error') & (df[column].notnull()) & (df[column] != 'nan')]
    df[column] = df[column].str.replace(r' >= 0.1', '', regex=True)
    df[column] = df[column].str.replace(r'%', '', regex=True)
    try:
        df[column] = pd.to_numeric(df[column])
    except ValueError:
        pass
    return df[column]

# Generate a summary for numeric columns
def summarize_numeric_column(df, column):
    summary = {
        'Type': 'Numeric',
        'Mean': df[column].mean(),
        'Median': df[column].median(),
        'Standard Deviation': df[column].std(),
    }
    return summary

# Generate a summary for categorical columns
def summarize_categorical_column(df, column):
    value_counts = df[column].value_counts()
    summary = {
        'Type': 'Categorical',
        'Unique Values': len(value_counts),
        'Top Categories': value_counts.head(5).to_dict(),
    }
    return summary

# Visualize distributions of numeric columns
def plot_numeric_distributions(df, output_dir):
    numeric_cols = df.select_dtypes(include=['number']).columns
    for col in numeric_cols:
        plt.figure(figsize=(8, 6))
        sns.histplot(df[col].dropna(), kde=True, bins=30, color='blue')
        plt.title(f'Distribution of {col}')
        plt.xlabel(col)
        plt.ylabel('Frequency')
        plot_path = os.path.join(output_dir, f'{col}_distribution.png')
        plt.savefig(plot_path)
        plt.close()

# Visualize bar plot for models
def plot_model_distribution(df, output_dir):
    plt.figure(figsize=(10, 8))
    sns.countplot(data=df, x='model', palette='Set2')
    plt.title('Model Distribution')
    plt.xlabel('Model')
    plt.ylabel('Count')
    plt.legend(title='Models')
    plot_path = os.path.join(output_dir, 'model_distribution.png')
    plt.savefig(plot_path)
    plt.close()

# Generate Markdown report
def generate_markdown_report(df, report_path, output_dir):
    rows, cols = df.shape
    with open(report_path, 'w', encoding='utf-8') as file:
        file.write(f"# Dataset Analysis Report\n\n")
        file.write(f"## Dataset Overview\n")
        file.write(f"- **Number of Rows**: {rows}\n")
        file.write(f"- **Number of Columns**: {cols}\n\n")

        file.write("## Missing Values\n\n")
        missing_values = df.isnull().sum()
        missing_percentage = (missing_values / rows) * 100
        file.write("Column | Missing Values | Missing Percentage\n")
        file.write("--- | --- | ---\n")
        for col, val in missing_values.items():
            file.write(f"{col} | {val} | {missing_percentage[col]:.2f}%\n")

        file.write("\n## Column Summaries\n")
        for column in df.columns:
            file.write(f"\n### Column: {column}\n")
            if pd.api.types.is_numeric_dtype(df[column]):
                summary = summarize_numeric_column(df, column)
                file.write(f"- **Type**: {summary['Type']}\n")
                file.write(f"- **Mean**: {summary['Mean']:.2f}\n")
                file.write(f"- **Median**: {summary['Median']:.2f}\n")
                file.write(f"- **Standard Deviation**: {summary['Standard Deviation']:.2f}\n")
            else:
                summary = summarize_categorical_column(df, column)
                file.write(f"- **Type**: {summary['Type']}\n")
                file.write(f"- **Unique Values**: {summary['Unique Values']}\n")
                file.write("- **Top Categories**:\n")
                for k, v in summary['Top Categories'].items():
                    file.write(f"  - {k}: {v}\n")

        file.write("\n## Visualizations\n")
        file.write("Distribution plots for numeric columns have been saved in the output directory.\n")
        file.write("Model distribution plot has been saved in the output directory.\n")

# Generate PDF report
class PDF(FPDF):
    def header(self):
        self.set_font('Arial', 'B', 16)
        self.cell(0, 10, 'Dataset Analysis Report', 0, 1, 'C')
        self.ln(10)
    
    def chapter_title(self, title):
        self.set_font('Arial', 'B', 14)
        self.cell(0, 10, title, 0, 1, 'L')
        self.ln(5)

    def chapter_body(self, body):
        self.set_font('Arial', '', 12)
        self.multi_cell(0, 10, body)
        self.ln(5)
    
    def table_header(self, title):
        self.set_font('Arial', 'B', 12)
        self.cell(0, 10, title, 0, 1, 'L')
        self.ln(5)

    def table_row(self, col1, col2, col3):
        self.set_font('Arial', '', 12)
        self.cell(60, 10, str(col1), 1)
        self.cell(60, 10, str(col2), 1)
        self.cell(60, 10, str(col3), 1)
        self.ln(10)

def generate_pdf_report(df, report_path, output_dir):
    pdf = PDF()
    pdf.add_page()

    # Overview section
    pdf.chapter_title("Dataset Overview")
    rows, cols = df.shape
    overview = f"Number of Rows: {rows}\nNumber of Columns: {cols}\n"
    pdf.chapter_body(overview)

    # Missing values section
    pdf.table_header("Missing Values")
    pdf.table_row("Column", "Missing Values", "Missing Percentage")
    missing_values = df.isnull().sum()
    missing_percentage = (missing_values / rows) * 100
    for col, val in missing_values.items():
        pdf.table_row(col, val, f"{missing_percentage[col]:.2f}%")

    # Column summaries section
    pdf.chapter_title("Column Summaries")
    for column in df.columns:
        pdf.chapter_title(f"Column: {column}")
        if pd.api.types.is_numeric_dtype(df[column]):
            summary = summarize_numeric_column(df, column)
            column_summary = (f"- Type: {summary['Type']}\n- Mean: {summary['Mean']:.2f}\n"
                              f"- Median: {summary['Median']:.2f}\n- Standard Deviation: {summary['Standard Deviation']:.2f}\n")
        else:
            summary = summarize_categorical_column(df, column)
            top_categories = '\n'.join([f"  - {k}: {v}" for k, v in summary['Top Categories'].items()])
            if not (column == "pdb" or column == "fasta"):
                column_summary = (f"- Type: {summary['Type']}\n- Unique Values: {summary['Unique Values']}\n"
                                  f"- Top Categories:\n{top_categories}\n")
        pdf.chapter_body(column_summary)

    # Visualizations section
    pdf.chapter_title("Visualizations")
    numeric_cols = df.select_dtypes(include=['number']).columns
    for col in numeric_cols:
        image_path = os.path.join(output_dir, f'{col}_distribution.png')
        if os.path.exists(image_path):
            pdf.add_page()
            pdf.chapter_title(f"Distribution of {col}")
            pdf.image(image_path, x=10, y=40, w=180)
            pdf.ln(85)

    model_image_path = os.path.join(output_dir, 'model_distribution.png')
    if os.path.exists(model_image_path):
        pdf.add_page()
        pdf.chapter_title("Model Distribution")
        pdf.image(model_image_path, x=10, y=40, w=180)
        pdf.ln(85)

    pdf.output(report_path)
    print(f"PDF report generated at {report_path}")

# Main execution
def main():
    input_csv = '../data/csv/validation_test.csv'
    output_dir = './output_report/'
    markdown_report_path = os.path.join(output_dir, 'Dataset_Report.md')
    pdf_report_path = os.path.join(output_dir, 'Dataset_Report.pdf')

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    df = load_dataset(input_csv, sep=';')

    # Clean and analyze the dataset
    for col in df.columns:
        df[col] = clean_column(df, col)

    # Generate plots
    plot_numeric_distributions(df, output_dir)
    plot_model_distribution(df, output_dir)

    # Generate markdown report
    generate_markdown_report(df, markdown_report_path, output_dir)
    print(f"Markdown report generated at {markdown_report_path}")

    # Generate PDF report
    generate_pdf_report(df, pdf_report_path, output_dir)

if __name__ == '__main__':
    main()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column] = df[column].str.replace(r' >= 0.1', '', regex=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column] = df[column].str.replace(r'%', '', regex=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column] = pd.to_numeric(df[column])
A value is trying to be set on a copy of a s

Markdown report generated at ./output_report/Dataset_Report.md
PDF report generated at ./output_report/Dataset_Report.pdf
