Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random

from statsmodels.graphics.gofplots import qqplot
import scipy.stats as stats

import os
from os import listdir
from os.path import isfile, join
import glob
# Render up to 50 columns when a DataFrame is displayed in the notebook.
pd.options.display.max_columns = 50

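Function to plot every column of a given dataframe against the timesteps, one subplot per column (columns whose names start with an excluded prefix are skipped)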

In [6]:
def plot_values(df, t, average, exclude_cols=()):
    """Plot each non-excluded column of ``df`` against ``t`` on a 3-wide subplot grid.

    Args:
        df: DataFrame whose columns are plotted, one subplot per column.
            Column names must be strings (they are prefix-matched and used
            as subplot titles).
        t: x-axis values passed to every subplot.
        average: if truthy, the legend entry reads ``"<column>: (average)"``,
            otherwise ``"<column>: (sum)"``. Only the label changes; the data
            is plotted exactly as given.
        exclude_cols: iterable of name prefixes; a column whose name starts
            with any of them is skipped.

    Raises:
        ValueError: if no column is left to plot after applying ``exclude_cols``.
    """
    # str.startswith accepts a tuple of prefixes; an empty tuple matches nothing.
    prefixes = tuple(exclude_cols)
    cols = [col for col in df.columns if not col.startswith(prefixes)]
    if not cols:
        # Without this guard matplotlib fails later with an opaque
        # "number of rows must be positive" error.
        raise ValueError("plot_values: no columns left to plot after applying exclude_cols")

    num_cols = 3
    num_rows = (len(cols) + num_cols - 1) // num_cols  # ceiling division
    _, axs = plt.subplots(nrows=num_rows, ncols=num_cols, figsize=(15, 5 * num_rows))
    # ncols is fixed at 3, so subplots always returns an array here.
    axs = axs.flatten()

    for ax, column in zip(axs, cols):
        label = f"{column}: (average)" if average else f"{column}: (sum)"
        # Random hex colour per subplot; seed `random` beforehand for
        # reproducible colours.
        color = "#" + ''.join(random.choice('0123456789ABCDEF') for _ in range(6))
        ax.plot(t, df[column], label=label, color=color)
        ax.set_xlabel('Timestep')
        ax.set_ylabel('Values')
        ax.set_title(column)
        ax.legend()
        ax.grid(True)

    # The grid is padded to a multiple of 3; hide the cells that got no data
    # so the figure does not end with empty frames.
    for ax in axs[len(cols):]:
        ax.set_visible(False)

    plt.tight_layout()
    plt.show()


Function to plot histograms of a dataframe column at the given timesteps

In [2]:
def plot_histogram(df, times, cols):
    """Draw one 25-bin histogram per timestep, side by side in a single row.

    Rows are selected with ``df['date'] == time``, so ``df`` must have a
    ``'date'`` column.

    Args:
        df: DataFrame containing a ``'date'`` column and the columns in ``cols``.
        times: sequence of values to match against ``df['date']``; one panel
            is drawn per entry.
        cols: sequence of column names. Either one name per entry of ``times``
            (paired positionally) or a single name reused for every timestep.
            With a single timestep only ``cols[0]`` is used.

    Raises:
        ValueError: if ``times`` is empty, or if several timesteps are given
            and ``cols`` has neither one entry nor one entry per timestep.
    """
    num_panels = len(times)
    if num_panels == 0:
        raise ValueError("plot_histogram: 'times' must contain at least one timestep")

    # Decide which column goes in which panel.
    if len(cols) == num_panels:
        panel_cols = list(cols)
    elif len(cols) == 1 or num_panels == 1:
        panel_cols = [cols[0]] * num_panels
    else:
        raise ValueError(
            "plot_histogram: 'cols' must hold one column or one column per timestep "
            f"(got {len(cols)} columns for {num_panels} timesteps)"
        )

    single = num_panels == 1
    # squeeze=False always yields a 2-D axes array, so the single- and
    # multi-panel cases share one code path.
    _, axs = plt.subplots(
        nrows=1,
        ncols=num_panels,
        figsize=(8, 5) if single else (15, 5),
        squeeze=False,
    )

    for ax, time, col in zip(axs[0], times, panel_cols):
        ax.hist(df.loc[df['date'] == time, col], bins=25)
        ax.set_xlabel(col)
        ax.set_ylabel('Frequency')
        ax.set_title(f'Distribution of {col} at t={time}')

    if not single:
        # Keeps neighbouring panels' labels from overlapping.
        plt.tight_layout()
    plt.show()


Function to output distribution properties and histogram plots for given dataframe, column, and times


In [4]:


def analyze_distribution(df, col_name, dates):
    """Summarise and plot the distribution of one column at each requested date.

    For every entry of ``dates`` the rows with ``df['date'] == date`` are
    selected, summary statistics are computed for ``col_name``, a
    Shapiro-Wilk normality test is run, and ``plot_histogram`` is called for
    that date.

    Args:
        df: DataFrame with a ``'date'`` column and a ``col_name`` column.
        col_name: name of the column to analyse.
        dates: iterable of values to match against ``df['date']``.

    Returns:
        A list with one dict per date, in input order, with keys:
        ``'date'``, ``'mean'``, ``'std'``, ``'min'``, ``'max'``,
        ``'uniform_distribution_bounds'`` (the interval ``mean ± 3·std``
        clipped to the observed ``[min, max]``) and ``'normality_p_value'``
        (Shapiro-Wilk p-value, or NaN when fewer than 3 non-missing
        observations are available).
    """
    results = []
    for date in dates:
        # Restrict to the rows recorded at this date.
        df_date = df[df['date'] == date]
        values = df_date[col_name]

        # pandas skips NaN in all of these by default.
        mean = values.mean()
        std = values.std()
        min_val = values.min()
        max_val = values.max()

        # Three-sigma interval around the mean, clipped so it never extends
        # past the values actually observed.
        uniform_min = max(mean - 3 * std, min_val)
        uniform_max = min(mean + 3 * std, max_val)

        # Shapiro-Wilk needs at least 3 observations and does not skip NaN,
        # so drop missing values first (matching the statistics above) and
        # report NaN instead of failing on a tiny or empty group.
        sample = values.dropna()
        if len(sample) >= 3:
            _, p_value = stats.shapiro(sample)
        else:
            p_value = float('nan')

        results.append({
            'date': date,
            'mean': mean,
            'std': std,
            'min': min_val,
            'max': max_val,
            'uniform_distribution_bounds': (uniform_min, uniform_max),
            'normality_p_value': p_value,
        })

        plot_histogram(df_date, [date], [col_name])

    return results


Function to plot multiple columns together in the same plot, each min-max normalized to [0, 1]

In [5]:
def plot_columns(df, *col_names):
    """Plot several columns of ``df`` on one axes after min-max scaling each to [0, 1].

    Every column is rescaled independently as ``(x - min) / (max - min)`` so
    that series with different magnitudes can be compared by shape. A column
    whose values are all equal has no range to scale by and is drawn as a
    flat line at 0.

    Args:
        df: DataFrame holding the columns; its index is used as the x axis.
        *col_names: names of the columns to plot. Colours cycle through a
            fixed 7-colour palette in the order given.
    """
    _, ax = plt.subplots(figsize=(10, 6))

    # Single-letter matplotlib colours; reused cyclically past 7 columns.
    colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k']

    for i, col_name in enumerate(col_names):
        column = df[col_name]
        low = column.min()
        span = column.max() - low
        if span == 0:
            # Constant column: dividing by a zero range would turn every
            # value into NaN and the line would silently vanish.
            normalized_column = column - low
        else:
            normalized_column = (column - low) / span

        ax.plot(df.index, normalized_column, color=colors[i % len(colors)], label=col_name)

    ax.set_xlabel('Index')
    ax.set_ylabel('Normalized value')
    if col_names:
        # legend() warns when there are no labelled lines to list.
        ax.legend()

    plt.show()
