<a href="https://colab.research.google.com/github/clp2454/QM2/blob/main/group%20project/Correlation_GP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [20]:
#Importing the data sets and assigning them each a dataframe
import pandas as pd
import scipy.stats
import numpy as np

# --- Cocoa price (series PCOCOUSDM on fred.stlouisfed.org) ---

url = "https://fred.stlouisfed.org/series/PCOCOUSDM#0"
try:
    # First attempt: parse the response as CSV.
    cocoa_price = pd.read_csv(url)
except pd.errors.ParserError:
    # The response did not parse as CSV, so fall back to scraping HTML tables.
    cocoa_price_list = pd.read_html(url)
    # Bail out early if nothing was scraped; otherwise take the first table,
    # which is expected to hold the observations.
    if not cocoa_price_list:
        raise Exception("No tables found in HTML")
    cocoa_price = cocoa_price_list[0]

# Discard the column labelled 2 (noted by the author as containing only NaN).
cocoa_price = cocoa_price.drop([2], axis="columns")

# Repeating the process with the chocolate price (series PCU3113531135).
# The steps deliberately mirror the cocoa price loading: read, fall back to
# HTML if CSV parsing fails, then drop the column labelled 2.

url = "https://fred.stlouisfed.org/series/PCU3113531135"
try:
    # Trying to read as CSV first.
    # NOTE: no column is dropped here. A previous `drop(columns=[1])` on this
    # path would have removed (or failed to find) the value column labelled 1,
    # which is converted to numeric later in this cell.
    chocolate_price = pd.read_csv(url)
except pd.errors.ParserError:
    # If CSV parsing fails, trying to read as HTML
    chocolate_price_list = pd.read_html(url)
    # Usually the first table is the data, but we add a check to verify
    if len(chocolate_price_list) > 0:
        chocolate_price = chocolate_price_list[0]
    else:
        raise Exception("No tables found in HTML")

# Removing the column labelled 2, as is done for the cocoa price table.
chocolate_price = chocolate_price.drop(columns=[2])

# Column 0 holds the observation month: rename it to 'Date' in both tables so
# they can later be compared on a shared key.
cocoa_price = cocoa_price.rename(columns={0: 'Date'})
chocolate_price = chocolate_price.rename(columns={0: 'Date'})

# Apply the same type conversions to each table in turn.
# errors='coerce' turns any value that cannot be parsed into NaT / NaN
# instead of raising.
for price_table in (cocoa_price, chocolate_price):
    # Dates are parsed with the '%b %Y:' pattern (abbreviated month, year, colon).
    price_table['Date'] = pd.to_datetime(price_table['Date'], format='%b %Y:', errors='coerce')
    # Column 1 holds the price values; make sure they are numeric.
    price_table[1] = pd.to_numeric(price_table[1], errors='coerce')

# Preview the first rows of each cleaned table.
print(cocoa_price.head())
print(chocolate_price.head())

        Date           1
0 2024-11-01  7919.32827
1 2024-10-01  6582.85802
2 2024-09-01  6421.82566
3 2024-08-01  6791.95382
4 2024-07-01  7164.62804
        Date        1
0 2024-11-01  168.539
1 2024-10-01  159.194
2 2024-09-01  158.634
3 2024-08-01  156.803
4 2024-07-01  154.422


In [21]:
# Calculating the Pearson Correlation Coefficient for cocoa price vs chocolate price

# Merge the DataFrames on the 'Date' column.
# Rows whose 'Date' is null (e.g. coerced to NaT during parsing) are dropped
# first: pandas.merge matches null keys against each other, so they would
# otherwise be cross-paired and add spurious observations to the correlation.
# Both value columns are labelled 1, so the merge suffixes them as
# '1_x' (cocoa, left) and '1_y' (chocolate, right).
merged_data = pd.merge(
    cocoa_price.dropna(subset=['Date']),
    chocolate_price.dropna(subset=['Date']),
    on='Date',
    how='inner',
)
merged_data = merged_data.rename(columns={'1_x': 'Cocoa Price', '1_y': 'Chocolate Price'})

# Remove rows with NaN or inf in 'Cocoa Price' and 'Chocolate Price' columns
merged_data = merged_data.replace([np.inf, -np.inf], np.nan).dropna(subset=['Cocoa Price', 'Chocolate Price'])


# Calculate the Pearson correlation coefficient and p-value on the paired,
# date-aligned observations that survived the cleaning above.
cocoa_prices = merged_data['Cocoa Price']
chocolate_prices = merged_data['Chocolate Price']
correlation_coefficient, p_value = scipy.stats.pearsonr(cocoa_prices, chocolate_prices)

# Print the results
print(f"Pearson Correlation Coefficient: {correlation_coefficient}")
print(f"P-value: {p_value}")


Pearson Correlation Coefficient: 0.6760945569290754
P-value: 0.21020585710193565
