In [6]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from fpdf import FPDF
import numpy as np

# Function to load dataset
def load_data(filepath):
    """Read a CSV source into a pandas DataFrame.

    Args:
        filepath: Location of the CSV data; handed straight to
            ``pd.read_csv`` with no extra options.

    Returns:
        The DataFrame produced by ``pd.read_csv``.
    """
    loaded_frame = pd.read_csv(filepath)
    return loaded_frame

# Function to identify column types
def identify_column_types(data):
    """Split the column names of ``data`` into numerical and categorical groups.

    Args:
        data: DataFrame whose columns are classified by dtype.

    Returns:
        A ``(numerical_cols, categorical_cols)`` tuple of lists of column
        names. Numerical columns are those selected by ``np.number``;
        categorical columns are those with ``object`` or ``category`` dtype.
        Columns of any other dtype appear in neither list.
    """
    numeric_part = data.select_dtypes(include=[np.number])
    categorical_part = data.select_dtypes(include=[object, 'category'])
    return numeric_part.columns.tolist(), categorical_part.columns.tolist()

# Function to handle missing data
def handle_missing_data(data):
    """Report the percentage of missing values per column.

    Despite the name, nothing is imputed or dropped: ``data`` is only
    inspected.

    Args:
        data: DataFrame to inspect.

    Returns:
        A Series indexed by column name holding the percentage (0-100) of
        null entries, restricted to columns with at least one null.
    """
    null_counts = data.isnull().sum()
    percent_missing = null_counts / len(data) * 100
    has_missing = percent_missing > 0
    return percent_missing[has_missing]

# Function to generate statistical summary
def generate_stat_summary(data, numerical_cols):
    """Return descriptive statistics for the numerical columns of ``data``.

    Args:
        data: Source DataFrame.
        numerical_cols: Names of the columns to summarise.

    Returns:
        The result of ``DataFrame.describe()`` on the selected columns, or an
        empty DataFrame when ``numerical_cols`` is empty.
    """
    # describe() raises ValueError on a frame without columns, which would
    # abort the whole report for a dataset that has no numeric columns.
    if len(numerical_cols) == 0:
        return pd.DataFrame()
    return data[numerical_cols].describe()

# Function to generate correlation matrix
def generate_correlation_matrix(data, numerical_cols):
    """Compute pairwise correlations between the given numerical columns.

    Args:
        data: Source DataFrame.
        numerical_cols: Names of the columns to correlate.

    Returns:
        The square DataFrame returned by ``DataFrame.corr()`` (default
        settings) for the selected columns.
    """
    numeric_subset = data[numerical_cols]
    correlation_table = numeric_subset.corr()
    return correlation_table

# Function to detect outliers using IQR
def detect_outliers(data, numerical_cols, iqr_multiplier=1.5):
    """Find outlier rows per column using the interquartile-range (IQR) rule.

    A value is an outlier when it lies below ``Q1 - iqr_multiplier * IQR`` or
    above ``Q3 + iqr_multiplier * IQR``, where Q1 and Q3 are the 25th and 75th
    percentiles of the column. Missing values are never flagged because both
    comparisons evaluate to False for NaN.

    Args:
        data: Source DataFrame.
        numerical_cols: Names of the numerical columns to check.
        iqr_multiplier: Width of the fences in units of IQR. Defaults to 1.5;
            a larger value flags only more extreme points.

    Returns:
        A dict mapping each column name to a DataFrame containing the full
        rows of ``data`` whose value in that column is an outlier (empty when
        there are none).
    """
    outliers = {}
    for col in numerical_cols:
        q1 = data[col].quantile(0.25)
        q3 = data[col].quantile(0.75)
        iqr = q3 - q1
        lower_fence = q1 - iqr_multiplier * iqr
        upper_fence = q3 + iqr_multiplier * iqr
        is_outlier = (data[col] < lower_fence) | (data[col] > upper_fence)
        outliers[col] = data[is_outlier]
    return outliers

# Visualization: Histogram for Numerical Data
def plot_numerical_distributions(data, numerical_cols):
    """Save a grid of histograms for the numerical columns.

    Draws one 30-bin histogram per column via ``DataFrame.hist`` and writes
    the resulting figure to 'numerical_distributions.png' in the current
    working directory, then closes the figure.

    Args:
        data: Source DataFrame.
        numerical_cols: Names of the numerical columns to plot.
    """
    numeric_subset = data[numerical_cols]
    numeric_subset.hist(figsize=(15, 10), bins=30)
    # savefig()/close() operate on the current figure, i.e. the one that
    # hist() has just created.
    plt.savefig('numerical_distributions.png')
    plt.close()

# Visualization: Correlation Heatmap
def plot_correlation_heatmap(data, numerical_cols):
    """Save an annotated heatmap of the numerical columns' correlations.

    The figure is written to 'correlation_heatmap.png' in the current working
    directory and then closed.

    Args:
        data: Source DataFrame.
        numerical_cols: Names of the numerical columns to correlate.
    """
    plt.figure(figsize=(10, 8))
    correlations = data[numerical_cols].corr()
    sns.heatmap(correlations, cmap='coolwarm', annot=True)
    plt.savefig('correlation_heatmap.png')
    plt.close()

# Visualization: Categorical Data Bar Plots
def plot_categorical_data(data, categorical_cols):
    """Save one horizontal count plot per categorical column.

    Each figure is written to '<column>_distribution.png' in the current
    working directory and closed before the next one is drawn.

    Args:
        data: Source DataFrame.
        categorical_cols: Names of the categorical columns to plot.
    """
    for col in categorical_cols:
        # NOTE(review): the column name is used verbatim as the file name, so
        # names containing path separators or other characters that are
        # invalid in file names will fail at savefig().
        image_path = f'{col}_distribution.png'
        plot_title = f"Distribution of {col}"

        plt.figure(figsize=(8, 5))
        sns.countplot(y=data[col])
        plt.title(plot_title)
        plt.savefig(image_path)
        plt.close()

# Generate PDF Report
class PDFReport(FPDF):
    """FPDF document with helpers for the report's titles, text and images.

    All text is drawn with the built-in 'Arial' font, which only covers the
    Latin-1 character set. Text handed to the helpers is therefore sanitised
    first: characters outside Latin-1 (which can easily occur in dataset
    values or column names) are replaced by '?' instead of making PDF
    generation fail with an encoding error.
    """

    @staticmethod
    def _latin1_safe(text):
        """Return ``text`` as a str with non-Latin-1 characters replaced by '?'."""
        return str(text).encode('latin-1', 'replace').decode('latin-1')

    def header(self):
        """Draw the centred report title (FPDF's page-header hook)."""
        self.set_font('Arial', 'B', 12)
        self.cell(0, 10, 'Automatic Data Analysis Report', 0, 1, 'C')

    def chapter_title(self, title):
        """Write ``title`` as a bold, left-aligned heading followed by a gap."""
        self.set_font('Arial', 'B', 12)
        self.cell(0, 10, self._latin1_safe(title), 0, 1, 'L')
        self.ln(10)

    def chapter_body(self, body):
        """Write ``body`` as wrapped regular text followed by a line break."""
        self.set_font('Arial', '', 12)
        self.multi_cell(0, 10, self._latin1_safe(body))
        self.ln()

    def insert_image(self, image_path):
        """Place the image at ``image_path`` at x=10 with width 190, then add a gap."""
        self.image(image_path, x=10, w=190)
        self.ln(10)

# Main function to generate the analysis report
def generate_report(data, output_path='dynamic_data_analysis_report.pdf'):
    """Build the PDF analysis report for ``data`` and write it to disk.

    The report contains, in order: the percentage of missing values per
    column, a statistical summary, a correlation heatmap, IQR-based outliers
    and histograms for the numerical columns, and count plots for the
    categorical columns. Sections that need numerical (or categorical)
    columns are replaced by a short note when the dataset has none, instead
    of failing inside pandas/seaborn on an empty column selection.

    Side effects: the plotting helpers write PNG files
    ('correlation_heatmap.png', 'numerical_distributions.png' and
    '<column>_distribution.png') to the current working directory, and the
    finished PDF is written to ``output_path``.

    Args:
        data: DataFrame to analyse.
        output_path: Destination of the PDF file. Defaults to
            'dynamic_data_analysis_report.pdf'.
    """
    pdf = PDFReport()

    # Title page
    pdf.add_page()
    pdf.chapter_title('Dataset Overview')

    # Missing data: show a readable message rather than the repr of an empty
    # Series when no column has missing values.
    missing_data = handle_missing_data(data)
    if missing_data.empty:
        missing_text = 'No missing values found.'
    else:
        missing_text = str(missing_data)
    pdf.chapter_body('Missing Data Percentage:\n' + missing_text)

    # Identifying column types
    numerical_cols, categorical_cols = identify_column_types(data)

    if numerical_cols:
        # Statistical Summary
        pdf.chapter_title('Statistical Summary for Numerical Columns')
        stat_summary = generate_stat_summary(data, numerical_cols)
        pdf.chapter_body(str(stat_summary))

        # Correlation Heatmap
        pdf.chapter_title('Correlation Matrix for Numerical Data')
        plot_correlation_heatmap(data, numerical_cols)
        pdf.insert_image('correlation_heatmap.png')

        # Outlier Detection
        outliers = detect_outliers(data, numerical_cols)
        pdf.chapter_title('Outlier Detection')
        for col, outlier_data in outliers.items():
            pdf.chapter_body(f'Outliers in {col}:\n' + str(outlier_data))

        # Distributions of Numerical Data
        pdf.chapter_title('Distributions of Numerical Data')
        plot_numerical_distributions(data, numerical_cols)
        pdf.insert_image('numerical_distributions.png')
    else:
        # describe(), the heatmap and hist() all fail on an empty selection.
        pdf.chapter_title('Numerical Data')
        pdf.chapter_body('No numerical columns found; summary, correlation, '
                         'outlier and distribution sections were skipped.')

    # Categorical Data Distributions
    pdf.chapter_title('Distributions of Categorical Data')
    if categorical_cols:
        plot_categorical_data(data, categorical_cols)
        for col in categorical_cols:
            pdf.insert_image(f'{col}_distribution.png')
    else:
        pdf.chapter_body('No categorical columns found.')

    # Save PDF report
    pdf.output(output_path)

# Example usage. This runs at cell level (there is no active
# `if __name__ == "__main__":` guard), so executing the cell always builds the report.
# Load the dataset (replace 'Forwards.csv' with the path to your own CSV file).
data = load_data('Forwards.csv')

# Generate the report
generate_report(data)


In [2]:
pip install fpdf

Collecting fpdf
  Downloading fpdf-1.7.2.tar.gz (39 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: fpdf
  Building wheel for fpdf (setup.py): started
  Building wheel for fpdf (setup.py): finished with status 'done'
  Created wheel for fpdf: filename=fpdf-1.7.2-py2.py3-none-any.whl size=40713 sha256=29b55b9a176ca9b732456a4ad142d6e4201e5367cbce83733768ec2a67c6978b
  Stored in directory: c:\users\darshi\appdata\local\pip\cache\wheels\65\4f\66\bbda9866da446a72e206d6484cd97381cbc7859a7068541c36
Successfully built fpdf
Installing collected packages: fpdf
Successfully installed fpdf-1.7.2
Note: you may need to restart the kernel to use updated packages.
