# Comparisons

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from seaborn import set_style
from datetime import datetime, timedelta
set_style("whitegrid")

In [27]:
dst_dates = pd.read_csv('data/DST_fri_mon.csv')

In [28]:
dst_fri_dates = list(dst_dates['fall_fri_before'])+list(dst_dates['spring_fri_before'])

# Function

In [71]:
def cleaner(stock):
    # drop null (for Yahoo, it seems that it's 'high' that is null)
    stock = stock.dropna(subset=['High'])
    # reverses the dataframe; Yahoo is from more recent to latest; I wanted to reverse the chronology
    stock = stock.iloc[::-1].reset_index(drop=True)

    # Exclude the 'Date' column and convert the rest to numeric
    stock.loc[:, stock.columns != 'Date'] = stock.loc[:, stock.columns != 'Date'].applymap(
        lambda x: pd.to_numeric(str(x).replace(',', ''), errors='coerce')
    )

    stock['datetime'] = pd.to_datetime(stock['Date'], errors='coerce')

    # Filter for Fridays and Mondays
    filtered_stock= stock[stock['datetime'].dt.dayofweek.isin([0, 4])]

    # Create a new column 'DayType' indicating 'Monday' or 'Friday'
    filtered_stock['DayType'] = np.where(filtered_stock['datetime'].dt.dayofweek == 0, 'Monday', 'Friday')

    stock=filtered_stock
    
    # Initialize a list to store the valid rows
    valid_rows = []

    # Iterate through the group (stock data) to check Friday-Monday intervals
    for i in range(len(stock) - 1):
        current_day = stock.iloc[i]['DayType']
        next_day = stock.iloc[i + 1]['DayType']
        current_date = stock.iloc[i]['datetime']
        next_date = stock.iloc[i + 1]['datetime']

        # Check if current day is Friday and next day is Monday
        if current_day == 'Friday' and next_day == 'Monday':
            # Calculate the difference in days
            days_diff = (next_date - current_date).days
            # If the difference is within 3 days, add both rows to the valid list
            if days_diff <= 3:
                valid_rows.append(stock.iloc[i])       # Add the Friday row
                valid_rows.append(stock.iloc[i + 1])   # Add the Monday row

    # Create a new DataFrame from the valid rows
    valid_pairs_df = pd.DataFrame(valid_rows).reset_index(drop=True)
    
    # Initialize a list to store the combined rows
    combined_rows = []

    # Iterate through the valid pairs (step by 2 because each pair has a Friday and a Monday)
    for i in range(0, len(valid_pairs_df), 2):
        # Get the Friday and Monday rows
        friday_row = valid_pairs_df.iloc[i]
        monday_row = valid_pairs_df.iloc[i + 1]

        # Combine the data into a single row (prefix columns with 'Friday_' and 'Monday_')
        combined_data = {
            'Friday_date': friday_row['datetime'],     # Friday date
            'Monday_date': monday_row['datetime'],     # Monday date
            'Friday_day': friday_row['DayType'],       # Should be 'Friday'
            'Monday_day': monday_row['DayType'],       # Should be 'Monday'
        }

        # Add all other columns, prefixing them with 'Friday_' or 'Monday_'
        for col in friday_row.index:
            if col not in ['datetime', 'DayType']:
                combined_data[f'Friday_{col}'] = friday_row[col]
                combined_data[f'Monday_{col}'] = monday_row[col]

        # Append the combined data to the list
        combined_rows.append(combined_data)

    # Create a new DataFrame from the combined rows
    combined_df = pd.DataFrame(combined_rows)

    combined_df.drop(columns=['Friday_day', 'Monday_day'], inplace=True)

    combined_df.drop(columns=['Friday_Date', 'Monday_Date'], inplace=True)

    stock = combined_df
    stock['y'] = stock['Friday_date'].isin(dst_fri_dates).astype(int)

    return stock

## CIB: Bancolombia S.A., Finance (commercial banks)

In [67]:
CIB = pd.read_csv('data/CIB.csv')

In [72]:
CIB.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,26-Nov-24,33.11,33.06,32.69,32.87,32.87,93609
1,25-Nov-24,33.01,33.63,32.84,33.33,33.33,507800
2,22-Nov-24,32.8,32.99,32.61,32.79,32.79,85300
3,21-Nov-24,33.3,33.61,32.76,32.76,32.76,184400
4,20-Nov-24,32.86,33.5,32.76,33.47,33.47,375800


In [74]:
CIB=cleaner(CIB)

  stock.loc[:, stock.columns != 'Date'] = stock.loc[:, stock.columns != 'Date'].applymap(
  stock['datetime'] = pd.to_datetime(stock['Date'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_stock['DayType'] = np.where(filtered_stock['datetime'].dt.dayofweek == 0, 'Monday', 'Friday')
  stock['y'] = stock['Friday_date'].isin(dst_fri_dates).astype(int)


In [76]:
CIB.to_csv('data/CIB.csv')

In [84]:
CIB.head()

Unnamed: 0,Friday_date,Monday_date,Friday_Open,Monday_Open,Friday_High,Monday_High,Friday_Low,Monday_Low,Friday_Close,Monday_Close,Friday_Adj Close,Monday_Adj Close,Friday_Volume,Monday_Volume,y
0,2019-01-04,2019-01-07,39.49,40.74,40.88,41.32,39.49,40.5,40.78,40.69,28.21,28.15,195200,173500,0
1,2019-01-11,2019-01-14,43.02,43.0,43.5,43.66,42.77,42.82,43.48,43.6,30.08,30.16,401800,169500,0
2,2019-01-25,2019-01-28,43.87,43.11,44.0,43.58,43.24,43.01,43.58,43.35,30.14,29.99,257100,348200,0
3,2019-02-01,2019-02-04,44.41,44.61,45.08,46.54,44.3,44.61,45.0,46.5,31.13,32.16,267900,281900,0
4,2019-02-08,2019-02-11,46.4,46.0,46.56,46.03,45.76,45.54,46.16,45.56,31.93,31.51,129900,136500,0


# S&P500 ^GSPC

In [85]:
sp500=pd.read_csv('data/SP500.csv')

In [90]:
sp500=cleaner(sp500)

  stock.loc[:, stock.columns != 'Date'] = stock.loc[:, stock.columns != 'Date'].applymap(
  stock['datetime'] = pd.to_datetime(stock['Date'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_stock['DayType'] = np.where(filtered_stock['datetime'].dt.dayofweek == 0, 'Monday', 'Friday')
  stock['y'] = stock['Friday_date'].isin(dst_fri_dates).astype(int)


In [91]:
sp500.head()

Unnamed: 0,Friday_date,Monday_date,Friday_Open,Monday_Open,Friday_High,Monday_High,Friday_Low,Monday_Low,Friday_Close,Monday_Close,Friday_Adj Close,Monday_Adj Close,Friday_Volume,Monday_Volume,y
0,2019-01-04,2019-01-07,2474.33,2535.61,2538.07,2566.16,2474.33,2524.56,2531.94,2549.69,2531.94,2549.69,4234140000,4133120000,0
1,2019-01-11,2019-01-14,2588.11,2580.31,2596.27,2589.32,2577.4,2570.41,2596.26,2582.61,2596.26,2582.61,3447460000,3689370000,0
2,2019-01-25,2019-01-28,2657.44,2644.97,2672.38,2644.97,2657.33,2624.06,2664.76,2643.85,2664.76,2643.85,3821000000,3630820000,0
3,2019-02-01,2019-02-04,2702.32,2706.49,2716.66,2724.99,2696.88,2698.75,2706.53,2724.87,2706.53,2724.87,3782490000,3369450000,0
4,2019-02-08,2019-02-11,2692.36,2712.4,2708.07,2718.05,2681.83,2703.79,2707.88,2709.8,2707.88,2709.8,3649510000,3395330000,0


In [92]:
sp500.to_csv('data/sp500.csv')

# AHH: Armada Hoffler Properties, Inc., Finance (real estate)

In [98]:
AHH=pd.read_csv('data/AHH.csv')

In [100]:
AHH = AHH.iloc[::-1].reset_index(drop=True)

In [101]:
AHH=cleaner(AHH)

  stock.loc[:, stock.columns != 'Date'] = stock.loc[:, stock.columns != 'Date'].applymap(
  stock['datetime'] = pd.to_datetime(stock['Date'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_stock['DayType'] = np.where(filtered_stock['datetime'].dt.dayofweek == 0, 'Monday', 'Friday')
  stock['y'] = stock['Friday_date'].isin(dst_fri_dates).astype(int)


In [102]:
AHH.head()

Unnamed: 0,Friday_date,Monday_date,Friday_Open,Monday_Open,Friday_High,Monday_High,Friday_Low,Monday_Low,Friday_Close,Monday_Close,Friday_Adj Close,Monday_Adj Close,Friday_Volume,Monday_Volume,y
0,2019-01-04,2019-01-07,13.84,14.22,14.15,14.39,13.71,14.02,14.07,14.3,10.17,10.34,202700,306200,0
1,2019-01-11,2019-01-14,14.79,14.73,14.85,14.95,14.64,14.58,14.81,14.62,10.7,10.57,201000,361700,0
2,2019-01-25,2019-01-28,14.43,14.42,14.6,14.7,14.41,14.38,14.51,14.62,10.49,10.57,185900,229200,0
3,2019-02-01,2019-02-04,15.03,14.97,15.1,15.3,14.76,14.93,14.99,15.3,10.83,11.06,119500,148000,0
4,2019-02-08,2019-02-11,15.08,15.02,15.36,15.45,15.08,15.02,15.19,15.43,10.98,11.15,194300,239400,0


In [103]:
AHH.to_csv('data/AHH.csv')

# MCI: Bairings Corporate Investors, Finance

In [123]:
MCI=pd.read_csv('data/MCI.csv')

In [124]:
MCI=cleaner(MCI)

  stock.loc[:, stock.columns != 'Date'] = stock.loc[:, stock.columns != 'Date'].applymap(
  stock['datetime'] = pd.to_datetime(stock['Date'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_stock['DayType'] = np.where(filtered_stock['datetime'].dt.dayofweek == 0, 'Monday', 'Friday')
  stock['y'] = stock['Friday_date'].isin(dst_fri_dates).astype(int)


In [125]:
MCI.to_csv('data/MCI.csv')

# ALOT: Astronova, Inc., Technology (computer peripheral equipment)

In [104]:
ALOT=pd.read_csv('data/ALOT.csv')

In [107]:
ALOT=cleaner(ALOT)

  stock.loc[:, stock.columns != 'Date'] = stock.loc[:, stock.columns != 'Date'].applymap(
  stock['datetime'] = pd.to_datetime(stock['Date'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_stock['DayType'] = np.where(filtered_stock['datetime'].dt.dayofweek == 0, 'Monday', 'Friday')
  stock['y'] = stock['Friday_date'].isin(dst_fri_dates).astype(int)


In [109]:
ALOT.to_csv('data/ALOT.csv')

# EMKR: Emcore Corporation, Technology (semiconductors)

In [110]:
EMKR=pd.read_csv('data/EMKR.csv')

In [111]:
EMKR=cleaner(EMKR)

  stock.loc[:, stock.columns != 'Date'] = stock.loc[:, stock.columns != 'Date'].applymap(
  stock['datetime'] = pd.to_datetime(stock['Date'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_stock['DayType'] = np.where(filtered_stock['datetime'].dt.dayofweek == 0, 'Monday', 'Friday')
  stock['y'] = stock['Friday_date'].isin(dst_fri_dates).astype(int)


In [113]:
EMKR.to_csv('data/EMKR.csv')

# HHC: Howard Hughes Corporation, Consumer Services (real estate investment trusts)

They seem to have changed their name to Howard Hughes Holdings and the ticker is now HHH

In [114]:
HHC=pd.read_csv('data/HHC.csv')

In [115]:
HHC=cleaner(HHC)

  stock.loc[:, stock.columns != 'Date'] = stock.loc[:, stock.columns != 'Date'].applymap(
  stock['datetime'] = pd.to_datetime(stock['Date'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_stock['DayType'] = np.where(filtered_stock['datetime'].dt.dayofweek == 0, 'Monday', 'Friday')
  stock['y'] = stock['Friday_date'].isin(dst_fri_dates).astype(int)


In [116]:
HHC.to_csv('data/HHC.csv')

# NSC: Norfolk Souther Corp, Transportation (railroads)

In [117]:
NSC=pd.read_csv('data/NSC.csv')

In [118]:
NSC = cleaner(NSC)

  stock.loc[:, stock.columns != 'Date'] = stock.loc[:, stock.columns != 'Date'].applymap(
  stock['datetime'] = pd.to_datetime(stock['Date'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_stock['DayType'] = np.where(filtered_stock['datetime'].dt.dayofweek == 0, 'Monday', 'Friday')
  stock['y'] = stock['Friday_date'].isin(dst_fri_dates).astype(int)


In [119]:
NSC.to_csv('data/NSC.csv')

# UTL: Unitil Corp., Public Utilities (power generation)

In [120]:
UTL=pd.read_csv('data/UTL.csv')

In [121]:
UTL=cleaner(UTL)

  stock.loc[:, stock.columns != 'Date'] = stock.loc[:, stock.columns != 'Date'].applymap(
  stock['datetime'] = pd.to_datetime(stock['Date'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_stock['DayType'] = np.where(filtered_stock['datetime'].dt.dayofweek == 0, 'Monday', 'Friday')
  stock['y'] = stock['Friday_date'].isin(dst_fri_dates).astype(int)


In [122]:
UTL.to_csv('data/UTL.csv')