# D-1 Group Project

Import libraries

In [None]:
import scipy
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

Read in the data from the csv file and put it into a pandas DataFrame. Then, modify and clean the data and put it into a new csv.

In [None]:
def read_and_clean_data(input_path="data.csv", output_path="cleaned_data.csv"):
    """Read the raw tide CSV, reduce it to two columns, and write it back out.

    The raw file is expected to contain the columns "Time in hours:minutes"
    (formatted as HH:MM) and "Tide Height in feet". The cleaned file has
    exactly two columns: "minutes" (minutes since midnight) and "tide_height".

    Args:
        input_path: Path of the raw CSV file to read.
        output_path: Path the cleaned CSV file is written to (no index column).
    """
    new_col_names = {
        "Day of the year": "day",
        "Time in hours:minutes": "24_hr",
        "Tide Height in feet": "tide_height",
    }
    df = pd.read_csv(input_path).rename(columns=new_col_names)

    # convert the HH:MM clock time into minutes since midnight
    clock = pd.to_datetime(df["24_hr"], format="%H:%M")
    df["minutes"] = clock.dt.hour * 60 + clock.dt.minute

    # NOTE: the day of the year is intentionally not folded into the time
    # axis, so every observation is placed on a single 0-1439 minute scale.
    # Selecting the two output columns discards everything else.
    cleaned = df[["minutes", "tide_height"]]

    cleaned.to_csv(output_path, index=False)

In [None]:
# run the cleaning step so that cleaned_data.csv exists before it is read back in below
read_and_clean_data()

Create the python function for the oscillatory function. The formula is a slightly modified version of the one given by the TA.

In [None]:
def oscillatory_func(t, A, B, C, D):
    """Evaluate the sinusoidal model ``A * sin(B * t + C) + D``.

    Args:
        t: Scalar or array of input values at which to evaluate the model.
        A: Amplitude of the sine term.
        B: Angular frequency multiplying ``t``.
        C: Phase offset added inside the sine.
        D: Constant vertical offset.

    Returns:
        The model value(s), with the same shape as ``t``.
    """
    phase = B * t + C
    wave = A * np.sin(phase)
    return wave + D

Function that fits the oscillation function to the data using `scipy` and returns an array with the points on the fitted curve.

In [None]:
def fit_data(df):
    """Fit the oscillatory model to the tide data and return the fitted heights.

    Args:
        df: DataFrame with a "minutes" column (the independent variable) and
            a "tide_height" column (the measured values).

    Returns:
        A NumPy array with the fitted model evaluated at every value of
        ``df["minutes"]``, in the same order as the rows of ``df``.

    Raises:
        RuntimeError: Raised by ``curve_fit`` if the least-squares
            minimization does not converge.
    """
    # Import the submodule explicitly: a bare `import scipy` does not make
    # `scipy.optimize` available on older SciPy releases.
    from scipy.optimize import curve_fit

    # initial guess for A, B, C, D from experimenting with Desmos graphing calculator
    initial_guess = [-4.4, 0.005, -3.6, 2.4]

    # sigma (standard deviation of the height measurements) given by the TA
    rmse = 0.25

    minutes = df["minutes"].to_numpy()
    heights = df["tide_height"].to_numpy()

    # curve_fit's `sigma` expects the standard deviation of each y value, not
    # its reciprocal. Because the value is the same for every point the
    # fitted parameters are unaffected, but passing the real uncertainty keeps
    # the call correct if the covariance or absolute_sigma is ever used.
    uncertainties = np.full(len(minutes), rmse)

    params, _ = curve_fit(
        oscillatory_func,
        minutes,
        heights,
        p0=initial_guess,
        sigma=uncertainties,
    )

    # evaluate the model with the optimized A, B, C, D to get the fitted curve
    return np.asarray(oscillatory_func(minutes, *params))

This function takes in the dataframe and the data from the fitted curve and creates a plot. The figure is saved as a PDF.

In [None]:
def plot_data_and_curve(df, fitted_curve):
    """Plot the measured heights together with the fitted curve and save a PDF.

    Args:
        df: DataFrame providing the "minutes" and "tide_height" columns.
        fitted_curve: Fitted heights, one per row of ``df`` in the same order.
    """
    minutes = df["minutes"]

    # figure 1 holds the data / best-fit comparison
    fig = plt.figure(1)
    ax = fig.gca()

    # measurements as points, model as a red line
    ax.scatter(minutes, df["tide_height"], label="Original Data")
    ax.plot(minutes, fitted_curve, "r-", label="Fitted Curve")

    ax.set_xlabel("Time (minutes)")
    ax.set_ylabel("Height (feet)")
    ax.legend()

    # save the figure as a PDF
    fig.savefig("Part2.pdf", bbox_inches="tight", dpi=400)

This function takes an array as an argument, creates a histogram of its values, and writes their standard deviation (to 3 decimal places) onto the plot. The figure is saved as a PDF.

In [None]:
def plot_residuals_hist(arr):
    """Plot a histogram of ``arr`` annotated with its standard deviation.

    The histogram uses bins of width 0.5 starting at the smallest value. The
    standard deviation (``np.std``, i.e. the population form) is written on
    the plot to 3 decimal places, and the figure is saved as "Part3.pdf".

    Args:
        arr: One-dimensional array-like of numeric values (e.g. residuals).

    Raises:
        ValueError: If ``arr`` is empty.
    """
    values = np.asarray(arr, dtype=float)
    if values.size == 0:
        raise ValueError("cannot plot a histogram of an empty array")

    # bin edges every `w` units, extended so the largest value is covered
    w = 0.5
    lowest = values.min()
    edges = np.arange(lowest, values.max() + w, w)
    if edges.size < 2:
        # all values identical: arange yields a single edge, so add the
        # closing edge by hand to get one valid bin
        edges = np.array([lowest, lowest + w])

    # create a histogram for the data
    fig = plt.figure(2)
    ax = fig.gca()
    ax.hist(values, bins=edges, edgecolor="black")
    ax.set_xlabel("Residuals")
    ax.set_ylabel("Frequency")
    ax.set_title("Histogram of Residuals")

    # Anchor the label in axes coordinates so it stays inside the plot
    # whatever the data range is; ".3f" gives 3 decimal places (the previous
    # "4.3" spec produced 3 significant digits instead).
    ax.text(
        0.03,
        0.95,
        f"Standard Deviation = {np.std(values):.3f}",
        transform=ax.transAxes,
        verticalalignment="top",
    )

    # save the figure as a PDF
    fig.savefig("Part3.pdf", bbox_inches="tight", dpi=400)

Data is read from the `cleaned_data.csv` file into a pandas DataFrame. The values are then sorted by the "minutes" column to prepare for plotting and working with the data. After that, the `fit_data()` function is called to get the best-fit curve using the oscillatory function we defined above. The output of that function is used by `plot_data_and_curve()` to plot the curve of best fit and the original data points. The residuals are calculated as the difference between the original data and the points on the best-fit curve returned by `fit_data()`. The `plot_residuals_hist()` function is then called to create a histogram of the residuals.

In [None]:
# load the cleaned data, ordered by the "minutes" column
df = pd.read_csv("cleaned_data.csv").sort_values(by="minutes")

# evaluate the best-fit curve at every observation
best_fit_curve_vals = fit_data(df)

# overlay the best-fit curve on the original data points
plot_data_and_curve(df, best_fit_curve_vals)

# residual = observed height minus the corresponding point on the best-fit curve
residuals = df["tide_height"].sub(best_fit_curve_vals)

# show how the residuals are distributed
plot_residuals_hist(residuals)