# Import libraries

In [289]:
import kagglehub
import pandas as pd
from kagglehub import KaggleDatasetAdapter

# Load the data into a pandas DataFrame

In [290]:
# Read the single CSV file 'indexData.csv' from the Kaggle dataset directly
# into a pandas DataFrame via the PANDAS adapter.
df = kagglehub.dataset_load(
    KaggleDatasetAdapter.PANDAS, 'mattiuzc/stock-exchange-data', path='indexData.csv'
)

# Show the first rows as a quick sanity check of the load.
df.head()

Unnamed: 0,Index,Date,Open,High,Low,Close,Adj Close,Volume
0,NYA,1965-12-31,528.690002,528.690002,528.690002,528.690002,528.690002,0.0
1,NYA,1966-01-03,527.210022,527.210022,527.210022,527.210022,527.210022,0.0
2,NYA,1966-01-04,527.840027,527.840027,527.840027,527.840027,527.840027,0.0
3,NYA,1966-01-05,531.119995,531.119995,531.119995,531.119995,531.119995,0.0
4,NYA,1966-01-06,532.070007,532.070007,532.070007,532.070007,532.070007,0.0


# Filter data based on index and date range

In [291]:
# Filter rows where Index is NYA, IXIC or GSPTSE.
# .copy() makes the filtered result an independent frame, so the Date
# assignment below writes to this frame rather than to a slice of the loaded
# data (assigning into such a slice triggers pandas' SettingWithCopyWarning).
df = df[df['Index'].isin(['NYA', 'IXIC', 'GSPTSE'])].copy()

# Convert the Date column from object to datetime
df['Date'] = pd.to_datetime(df['Date'])

# Split up the dataset into pre-covid / post-covid time frames.
# Both bounds of each window are inclusive.
precovid_start_date = '2019-03-01'
precovid_end_date = '2020-02-29'

post_covid_start_date = '2020-03-01'
post_covid_end_date = '2021-05-31'

# Series.between is inclusive on both ends by default, i.e. the same as
# (Date >= start) & (Date <= end).
precovid_df = df[df['Date'].between(precovid_start_date, precovid_end_date)]
postcovid_df = df[df['Date'].between(post_covid_start_date, post_covid_end_date)]

# Report the actual first/last dates present in each window (these can differ
# from the nominal bounds when a bound has no row in the data).
print(f"precovid_df min date = {precovid_df['Date'].min()}, precovid_df max date = {precovid_df['Date'].max()}")
print(f"post_covid_df min date = {postcovid_df['Date'].min()}, postcovid_df max date = {postcovid_df['Date'].max()}")

precovid_df min date = 2019-03-01 00:00:00, precovid_df max date = 2020-02-28 00:00:00
post_covid_df min date = 2020-03-02 00:00:00, postcovid_df max date = 2021-05-31 00:00:00


# Check if there are any missing / NaN values

In [292]:
# Count missing values per column in the pre-COVID frame.
precovid_df.isnull().sum()

Index        0
Date         0
Open         0
High         0
Low          0
Close        0
Adj Close    0
Volume       0
dtype: int64

In [293]:
# Count missing values per column in the post-COVID frame.
postcovid_df.isnull().sum()

Index        0
Date         0
Open         0
High         0
Low          0
Close        0
Adj Close    0
Volume       0
dtype: int64

# Sort by Index and Date

In [294]:
# Order rows by Index, then by Date within each Index; ignore_index=True
# renumbers the row labels 0..n-1 in the sorted order.
precovid_df = precovid_df.sort_values(by=['Index', 'Date'], ignore_index=True)
postcovid_df = postcovid_df.sort_values(by=['Index', 'Date'], ignore_index=True)

In [295]:
# Preview the first rows of the sorted pre-COVID frame.
precovid_df.head()

Unnamed: 0,Index,Date,Open,High,Low,Close,Adj Close,Volume
0,GSPTSE,2019-03-01,16086.29981,16122.09961,16058.09961,16068.29981,16068.29981,24300350000.0
1,GSPTSE,2019-03-04,16088.59961,16093.5,15955.0,16038.09961,16038.09961,28635800000.0
2,GSPTSE,2019-03-05,16077.29981,16105.29981,16068.59961,16086.5,16086.5,25650400000.0
3,GSPTSE,2019-03-06,16098.7002,16145.79981,16077.59961,16092.09961,16092.09961,27253180000.0
4,GSPTSE,2019-03-07,16079.0,16085.40039,15981.79981,16056.5,16056.5,27171920000.0


In [296]:
# List the column labels of the post-COVID frame.
print(postcovid_df.columns)

Index(['Index', 'Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume'], dtype='object')


# Add rows for days when the market is closed (e.g. weekends, holidays) and a Market Closed column

In [297]:
def add_market_closure_rows(df):
    """Expand a frame to one row per calendar day and flag the inserted days.

    The daily calendar runs from the earliest to the latest value found in
    ``df['Date']`` (inclusive). Days that had no row in ``df`` are inserted,
    marked with ``Market Closed = True`` and filled with the values of the
    most recent preceding row. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'Date' column (unique datetime values) and an 'Open'
        column. Intended to be called with the rows of a single index.

    Returns
    -------
    pandas.DataFrame
        New frame with 'Date' as the first column, the remaining original
        columns, and a trailing boolean 'Market Closed' column.
    """
    # Daily calendar spanning the dates actually present in the data.
    first_day = df['Date'].min()
    last_day = df['Date'].max()
    calendar = pd.date_range(start=first_day, end=last_day, freq='D')

    expanded = (
        df.set_index('Date')
        # Inserted days come back with NaN in every column ...
        .reindex(calendar)
        # ... so a missing 'Open' identifies them; flag before filling.
        .assign(**{'Market Closed': lambda frame: frame['Open'].isna()})
        # Carry the last known values forward onto the inserted days.
        .ffill()
        # Turn the date index back into a regular 'Date' column.
        .reset_index()
        .rename(columns={'index': 'Date'})
    )
    return expanded

# Expand every index separately so forward-filling never crosses from one
# index into another, then stack the per-index results with fresh row labels.
#
# The groups are iterated explicitly instead of using groupby(...).apply(...):
# pandas 2.2 deprecated apply operating on the grouping column, and without
# that column the helper's output (and therefore the final frame) would lose
# 'Index'. Iterating a groupby always yields the complete sub-frame, and the
# groups come out in sorted key order, matching the previous row order.
precovid_df = pd.concat(
    [add_market_closure_rows(group) for _, group in precovid_df.groupby('Index')],
    ignore_index=True,
)
postcovid_df = pd.concat(
    [add_market_closure_rows(group) for _, group in postcovid_df.groupby('Index')],
    ignore_index=True,
)
precovid_df.head(20)


  precovid_df = precovid_df.groupby('Index').apply(add_market_closure_rows).reset_index(drop=True)
  postcovid_df = postcovid_df.groupby('Index').apply(add_market_closure_rows).reset_index(drop=True)


Unnamed: 0,Date,Index,Open,High,Low,Close,Adj Close,Volume,Market Closed
0,2019-03-01,GSPTSE,16086.29981,16122.09961,16058.09961,16068.29981,16068.29981,24300350000.0,False
1,2019-03-02,GSPTSE,16086.29981,16122.09961,16058.09961,16068.29981,16068.29981,24300350000.0,True
2,2019-03-03,GSPTSE,16086.29981,16122.09961,16058.09961,16068.29981,16068.29981,24300350000.0,True
3,2019-03-04,GSPTSE,16088.59961,16093.5,15955.0,16038.09961,16038.09961,28635800000.0,False
4,2019-03-05,GSPTSE,16077.29981,16105.29981,16068.59961,16086.5,16086.5,25650400000.0,False
5,2019-03-06,GSPTSE,16098.7002,16145.79981,16077.59961,16092.09961,16092.09961,27253180000.0,False
6,2019-03-07,GSPTSE,16079.0,16085.40039,15981.79981,16056.5,16056.5,27171920000.0,False
7,2019-03-08,GSPTSE,15958.09961,16000.79981,15891.90039,15996.2002,15996.2002,25541290000.0,False
8,2019-03-09,GSPTSE,15958.09961,16000.79981,15891.90039,15996.2002,15996.2002,25541290000.0,True
9,2019-03-10,GSPTSE,15958.09961,16000.79981,15891.90039,15996.2002,15996.2002,25541290000.0,True


# Calculate 10-day and 21-day moving averages

In [298]:
def add_adj_close_moving_averages(frame, windows=(10, 21)):
    """Return a copy of ``frame`` with trailing 'Adj Close' moving averages.

    For each window length in ``windows`` a column named
    'Adj Close Moving Average <window>' is added (existing columns with the
    same name are replaced). Averages are computed separately for each value
    of 'Index' over the last ``window`` rows of that group.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame with 'Index' and 'Adj Close' columns, ordered by date within
        each index.
    windows : iterable of int, default (10, 21)
        Window lengths, in rows.

    Returns
    -------
    pandas.DataFrame
        New frame; ``frame`` itself is left untouched.
    """
    adj_close_by_index = frame.groupby('Index')['Adj Close']
    averages = {
        # The window length drives both the column name and the rolling
        # window, so the two can no longer disagree (previously the
        # post-COVID "21" column was computed with window=10).
        # min_periods=1: the first rows of each group average whatever rows
        # are available so far instead of yielding NaN.
        f'Adj Close Moving Average {window}': adj_close_by_index.transform(
            lambda prices, window=window: prices.rolling(window=window, min_periods=1).mean()
        )
        for window in windows
    }
    return frame.assign(**averages)


precovid_df = add_adj_close_moving_averages(precovid_df)
postcovid_df = add_adj_close_moving_averages(postcovid_df)

In [299]:
# Inspect the first 20 pre-COVID rows, now including the moving-average columns.
precovid_df.head(20)

Unnamed: 0,Date,Index,Open,High,Low,Close,Adj Close,Volume,Market Closed,Adj Close Moving Average 10,Adj Close Moving Average 21
0,2019-03-01,GSPTSE,16086.29981,16122.09961,16058.09961,16068.29981,16068.29981,24300350000.0,False,16068.29981,16068.29981
1,2019-03-02,GSPTSE,16086.29981,16122.09961,16058.09961,16068.29981,16068.29981,24300350000.0,True,16068.29981,16068.29981
2,2019-03-03,GSPTSE,16086.29981,16122.09961,16058.09961,16068.29981,16068.29981,24300350000.0,True,16068.29981,16068.29981
3,2019-03-04,GSPTSE,16088.59961,16093.5,15955.0,16038.09961,16038.09961,28635800000.0,False,16060.74976,16060.74976
4,2019-03-05,GSPTSE,16077.29981,16105.29981,16068.59961,16086.5,16086.5,25650400000.0,False,16065.899808,16065.899808
5,2019-03-06,GSPTSE,16098.7002,16145.79981,16077.59961,16092.09961,16092.09961,27253180000.0,False,16070.266442,16070.266442
6,2019-03-07,GSPTSE,16079.0,16085.40039,15981.79981,16056.5,16056.5,27171920000.0,False,16068.299807,16068.299807
7,2019-03-08,GSPTSE,15958.09961,16000.79981,15891.90039,15996.2002,15996.2002,25541290000.0,False,16059.287356,16059.287356
8,2019-03-09,GSPTSE,15958.09961,16000.79981,15891.90039,15996.2002,15996.2002,25541290000.0,True,16052.277672,16052.277672
9,2019-03-10,GSPTSE,15958.09961,16000.79981,15891.90039,15996.2002,15996.2002,25541290000.0,True,16046.669925,16046.669925


In [300]:
# Inspect the first 20 post-COVID rows, now including the moving-average columns.
postcovid_df.head(20)

Unnamed: 0,Date,Index,Open,High,Low,Close,Adj Close,Volume,Market Closed,Adj Close Moving Average 10,Adj Close Moving Average 21
0,2020-03-02,GSPTSE,16325.0,16566.69922,16166.29981,16553.30078,16553.30078,37472150000.0,False,16553.30078,16553.30078
1,2020-03-03,GSPTSE,16674.90039,16798.19922,16378.29981,16423.59961,16423.59961,39312790000.0,False,16488.450195,16488.450195
2,2020-03-04,GSPTSE,16659.90039,16779.5,16539.40039,16779.5,16779.5,27371860000.0,False,16585.466797,16585.466797
3,2020-03-05,GSPTSE,16541.59961,16684.90039,16456.69922,16554.0,16554.0,31301230000.0,False,16577.600097,16577.600097
4,2020-03-06,GSPTSE,16222.09961,16261.5,16015.79981,16175.0,16175.0,37470380000.0,False,16497.080078,16497.080078
5,2020-03-07,GSPTSE,16222.09961,16261.5,16015.79981,16175.0,16175.0,37470380000.0,True,16443.400065,16443.400065
6,2020-03-08,GSPTSE,16222.09961,16261.5,16015.79981,16175.0,16175.0,37470380000.0,True,16405.057199,16405.057199
7,2020-03-09,GSPTSE,15221.79981,15240.29981,14498.29981,14514.2002,14514.2002,55922740000.0,False,16168.700074,16168.700074
8,2020-03-10,GSPTSE,15019.0,15048.79981,14481.90039,14958.09961,14958.09961,57100130000.0,False,16034.188911,16034.188911
9,2020-03-11,GSPTSE,14639.7002,14714.2002,14185.2002,14270.09961,14270.09961,46899090000.0,False,15857.779981,15857.779981


# Save results to CSV files

In [301]:
# Write each processed frame to its own CSV under ../data; index=False keeps
# the positional row labels out of the files.
for csv_path, frame in (
    ('../data/precovid.csv', precovid_df),
    ('../data/postcovid.csv', postcovid_df),
):
    frame.to_csv(csv_path, index=False)