In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from seaborn import set_style
from datetime import datetime, timedelta
set_style("whitegrid")
import warnings
import contextlib
import io

In [2]:
# huge dataset with 20 million rows
df = pd.read_csv('data/historical_stock_prices.csv')
df

Unnamed: 0,ticker,open,close,adj_close,low,high,volume,date
0,AHH,11.50,11.58,8.493155,11.25,11.68,4633900,2013-05-08
1,AHH,11.66,11.55,8.471151,11.50,11.66,275800,2013-05-09
2,AHH,11.55,11.60,8.507822,11.50,11.60,277100,2013-05-10
3,AHH,11.63,11.65,8.544494,11.55,11.65,147400,2013-05-13
4,AHH,11.60,11.53,8.456484,11.50,11.60,184100,2013-05-14
...,...,...,...,...,...,...,...,...
20973884,NZF,14.60,14.59,14.590000,14.58,14.62,137500,2018-08-20
20973885,NZF,14.60,14.58,14.580000,14.57,14.61,151200,2018-08-21
20973886,NZF,14.58,14.59,14.590000,14.57,14.63,185400,2018-08-22
20973887,NZF,14.60,14.57,14.570000,14.57,14.64,135600,2018-08-23


In [3]:
# turn a date string into a datetime object; then the library can tell us the day of the week of the date
# thought it would be useful; 0=Monday,...,6=Sunday
datetime.strptime(df['date'].loc[2],'%Y-%m-%d').weekday()

4

In [4]:
df.head()

Unnamed: 0,ticker,open,close,adj_close,low,high,volume,date
0,AHH,11.5,11.58,8.493155,11.25,11.68,4633900,2013-05-08
1,AHH,11.66,11.55,8.471151,11.5,11.66,275800,2013-05-09
2,AHH,11.55,11.6,8.507822,11.5,11.6,277100,2013-05-10
3,AHH,11.63,11.65,8.544494,11.55,11.65,147400,2013-05-13
4,AHH,11.6,11.53,8.456484,11.5,11.6,184100,2013-05-14


In [5]:
# getting the codes that tell us what the stocks are; e.g. PIH = Property Insurance Holdings
df2=pd.read_csv('data/historical_stocks.csv')
df2.head()

Unnamed: 0,ticker,exchange,name,sector,industry
0,PIH,NASDAQ,"1347 PROPERTY INSURANCE HOLDINGS, INC.",FINANCE,PROPERTY-CASUALTY INSURERS
1,PIHPP,NASDAQ,"1347 PROPERTY INSURANCE HOLDINGS, INC.",FINANCE,PROPERTY-CASUALTY INSURERS
2,TURN,NASDAQ,180 DEGREE CAPITAL CORP.,FINANCE,FINANCE/INVESTORS SERVICES
3,FLWS,NASDAQ,"1-800 FLOWERS.COM, INC.",CONSUMER SERVICES,OTHER SPECIALTY STORES
4,FCCY,NASDAQ,1ST CONSTITUTION BANCORP (NJ),FINANCE,SAVINGS INSTITUTIONS


In [6]:
# learn what the sectors are; seems there are 11 sectors if we don't count, nan, Misc, and Sector
df2['sector'].unique()

array(['FINANCE', 'CONSUMER SERVICES', 'TECHNOLOGY', 'PUBLIC UTILITIES',
       'CAPITAL GOODS', 'BASIC INDUSTRIES', 'HEALTH CARE',
       'CONSUMER DURABLES', nan, 'ENERGY', 'MISCELLANEOUS', 'SECTOR',
       'TRANSPORTATION', 'CONSUMER NON-DURABLES'], dtype=object)

In [7]:
# Calculate the percentage of each sector
sector_percentages = df2['sector'].value_counts(normalize=True) * 100
print(sector_percentages)

sector
FINANCE                  20.358566
CONSUMER SERVICES        15.856574
HEALTH CARE              15.617530
TECHNOLOGY               12.091633
CAPITAL GOODS             7.011952
ENERGY                    5.697211
PUBLIC UTILITIES          5.438247
BASIC INDUSTRIES          5.418327
CONSUMER NON-DURABLES     4.462151
CONSUMER DURABLES         2.868526
MISCELLANEOUS             2.768924
TRANSPORTATION            2.390438
SECTOR                    0.019920
Name: proportion, dtype: float64


In [50]:
# get all the stocks in transportation
tech_tickers = df2[df2['sector']=='TECHNOLOGY'][['ticker','name']]
tech_tickers.head(20)

Unnamed: 0,ticker,name
6,VNET,"21VIANET GROUP, INC."
7,TWOU,"2U, INC."
8,JOBS,"51JOB, INC."
22,ACIA,"ACACIA COMMUNICATIONS, INC."
37,ACIW,"ACI WORLDWIDE, INC."
39,ACMR,"ACM RESEARCH, INC."
43,ATVI,"ACTIVISION BLIZZARD, INC"
44,ACXM,ACXIOM CORPORATION
50,IOTS,ADESTO TECHNOLOGIES CORPORATION
52,ADBE,ADOBE SYSTEMS INCORPORATED


In [8]:
def get_stocks(df, df2, stock_code):
    # get prices for a particular stock; this function takes in two data frames where one has the prices
    # the other has the codes
    print(df2[df2['ticker']==stock_code])
    return df[df['ticker']==stock_code].reset_index()

In [12]:
df_tech=pd.DataFrame()

for stock in tech_tickers['ticker'].values:
    with contextlib.redirect_stdout(io.StringIO()):
        df_stock = get_stocks(df,df2,stock)
    if df_stock.empty:
        print("Skipping empty DataFrame:", stock)
        continue
    df_tech = pd.concat([df_tech,df_stock],ignore_index=True)

Skipping empty DataFrame: AMRHW
Skipping empty DataFrame: CNIT
Skipping empty DataFrame: CHUBA
Skipping empty DataFrame: CHUBK
Skipping empty DataFrame: CVONW
Skipping empty DataFrame: GFNSL
Skipping empty DataFrame: MSCC
Skipping empty DataFrame: STLRU
Skipping empty DataFrame: STLRW
Skipping empty DataFrame: VDSI


In [56]:
df_tech.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2254333 entries, 0 to 2254332
Data columns (total 10 columns):
 #   Column     Dtype         
---  ------     -----         
 0   index      int64         
 1   ticker     object        
 2   open       float64       
 3   close      float64       
 4   adj_close  float64       
 5   low        float64       
 6   high       float64       
 7   volume     int64         
 8   date       object        
 9   datetime   datetime64[ns]
dtypes: datetime64[ns](1), float64(5), int64(2), object(2)
memory usage: 172.0+ MB


In [20]:
df_tech['datetime'] = pd.to_datetime(df_tech['date'], errors='coerce')

In [23]:
df_tech.to_csv('data/tech1970-2018.csv', index=False)

In [57]:
# Filter for Fridays and Mondays
filtered_tech = df_tech[df_tech['datetime'].dt.dayofweek.isin([0, 4])]

# Create a new column 'DayType' indicating 'Monday' or 'Friday'
filtered_tech['DayType'] = np.where(filtered_tech['datetime'].dt.dayofweek == 0, 'Monday', 'Friday')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_tech['DayType'] = np.where(filtered_tech['datetime'].dt.dayofweek == 0, 'Monday', 'Friday')


In [58]:
filtered_tech.info()

<class 'pandas.core.frame.DataFrame'>
Index: 874970 entries, 1 to 2254332
Data columns (total 11 columns):
 #   Column     Non-Null Count   Dtype         
---  ------     --------------   -----         
 0   index      874970 non-null  int64         
 1   ticker     874970 non-null  object        
 2   open       874970 non-null  float64       
 3   close      874970 non-null  float64       
 4   adj_close  874970 non-null  float64       
 5   low        874970 non-null  float64       
 6   high       874970 non-null  float64       
 7   volume     874970 non-null  int64         
 8   date       874970 non-null  object        
 9   datetime   874970 non-null  datetime64[ns]
 10  DayType    874970 non-null  object        
dtypes: datetime64[ns](1), float64(5), int64(2), object(3)
memory usage: 80.1+ MB


In [59]:
# Drop the first and last row of the DataFrame
tech = filtered_tech


In [60]:
tech.head()

Unnamed: 0,index,ticker,open,close,adj_close,low,high,volume,date,datetime,DayType
1,3114942,VNET,19.4,17.75,17.75,17.32,19.5,2323400,2011-04-25,2011-04-25,Monday
5,3114963,VNET,15.95,15.51,15.51,15.3,16.42,1343900,2011-04-29,2011-04-29,Friday
6,3114964,VNET,15.74,16.68,16.68,15.54,17.0,1471100,2011-05-02,2011-05-02,Monday
10,3114968,VNET,13.75,13.75,13.75,13.16,14.2,1012800,2011-05-06,2011-05-06,Friday
11,3114969,VNET,13.7,14.26,14.26,13.12,14.4,631700,2011-05-09,2011-05-09,Monday


In [61]:
tech = tech.sort_values(by=['ticker', 'datetime']).reset_index(drop=True)

In [62]:
tech.head(10)

Unnamed: 0,index,ticker,open,close,adj_close,low,high,volume,date,datetime,DayType
0,15579777,AABA,1.052083,1.375,1.375,1.020833,1.791667,408720000,1996-04-12,1996-04-12,Friday
1,15579790,AABA,1.489583,1.34375,1.34375,1.25,1.5,79219200,1996-04-15,1996-04-15,Monday
2,15579831,AABA,1.255208,1.203125,1.203125,1.197917,1.28125,12913600,1996-04-19,1996-04-19,Friday
3,15579841,AABA,1.208333,1.177083,1.177083,1.145833,1.208333,8041600,1996-04-22,1996-04-22,Monday
4,15579885,AABA,1.333333,1.322917,1.322917,1.302083,1.34375,7561600,1996-04-26,1996-04-26,Friday
5,15579899,AABA,1.3125,1.291667,1.291667,1.270833,1.333333,5928000,1996-04-29,1996-04-29,Monday
6,15579943,AABA,1.34375,1.333333,1.333333,1.302083,1.354167,6116800,1996-05-03,1996-05-03,Friday
7,15579953,AABA,1.354167,1.255208,1.255208,1.223958,1.354167,8214400,1996-05-06,1996-05-06,Monday
8,15579990,AABA,1.28125,1.302083,1.302083,1.270833,1.322917,5875200,1996-05-10,1996-05-10,Friday
9,15579991,AABA,1.307292,1.260417,1.260417,1.25,1.3125,2747200,1996-05-13,1996-05-13,Monday


In [66]:
# the Mondays and Fridays are not put in the same row

tech.to_csv('data/Weekends_Tech_Stocks.csv', index=False)

In [64]:
df2[df2['ticker']=='AAPL']


Unnamed: 0,ticker,exchange,name,sector,industry
195,AAPL,NASDAQ,APPLE INC.,TECHNOLOGY,COMPUTER MANUFACTURING


In [65]:
tech[tech['ticker']=='AAPL']

Unnamed: 0,index,ticker,open,close,adj_close,low,high,volume,date,datetime,DayType
4684,948,AAPL,0.513393,0.513393,0.023186,0.513393,0.515625,117258400,1980-12-12,1980-12-12,Friday
4685,960,AAPL,0.488839,0.486607,0.021977,0.486607,0.488839,43971200,1980-12-15,1980-12-15,Monday
4686,1000,AAPL,0.504464,0.504464,0.022783,0.504464,0.506696,12157600,1980-12-19,1980-12-19,Friday
4687,1008,AAPL,0.529018,0.529018,0.023892,0.529018,0.531250,9340800,1980-12-22,1980-12-22,Monday
4688,1040,AAPL,0.633929,0.633929,0.028630,0.633929,0.636161,13893600,1980-12-26,1980-12-26,Friday
...,...,...,...,...,...,...,...,...,...,...,...
8376,100485,AAPL,207.360001,207.529999,207.529999,206.669998,209.100006,24611200,2018-08-10,2018-08-10,Friday
8377,100486,AAPL,207.699997,208.869995,208.869995,207.699997,210.949997,25869100,2018-08-13,2018-08-13,Monday
8378,100490,AAPL,213.440002,217.580002,217.580002,213.160004,217.949997,35427000,2018-08-17,2018-08-17,Friday
8379,100491,AAPL,218.100006,215.460007,215.460007,215.110001,219.179993,30287700,2018-08-20,2018-08-20,Monday


Having gotten the data, let's do some exploration

In [3]:
tech=pd.read_csv('data/Weekends_Tech_Stocks.csv')

In [6]:
# oops, the datetime column is of strings
tech.iloc[0]['datetime']

'1996-04-12'

In [7]:
tech['datetime'] = pd.to_datetime(tech['datetime'], errors='coerce')

In [8]:
tech.iloc[0]['datetime']

Timestamp('1996-04-12 00:00:00')

In [9]:
tech.to_csv('data/Weekends_Tech_Stocks.csv',index=False)

We'll first check that the Fridays and Mondays pair up correctly.

In [11]:
# Initialize a list to store any invalid Friday-Monday pairs
invalid_pairs = []

# Group by the 'ticker' column (each stock)
for ticker, group in tech.groupby('ticker'):
    # Ensure the group is sorted by 'datetime'
    group = group.sort_values(by='datetime').reset_index(drop=True)

    # Iterate through the group (stock data) to check Friday-Monday intervals
    for i in range(len(group) - 1):
        current_day = group.iloc[i]['DayType']
        next_day = group.iloc[i + 1]['DayType']
        current_date = group.iloc[i]['datetime']
        next_date = group.iloc[i + 1]['datetime']

        # Check if current day is Friday and next day is Monday
        if current_day == 'Friday' and next_day == 'Monday':
            # Calculate the difference in days
            days_diff = (next_date - current_date).days
            # Check if the difference is more than 3 days (which would be invalid)
            if days_diff > 3:
                invalid_pairs.append((ticker, current_date, next_date, days_diff))

# Output the results
if invalid_pairs:
    print("Invalid Friday-Monday pairs:")
    for pair in invalid_pairs:
        print(f"Stock: {pair[0]}, Friday: {pair[1]} is followed by Monday: {pair[2]} ({pair[3]} days apart).")
else:
    print("All Friday-Monday pairs are valid (within 3 days).")


Invalid Friday-Monday pairs:
Stock: AAN, Friday: 1989-02-24 00:00:00 is followed by Monday: 1989-03-06 00:00:00 (10 days apart).
Stock: AAN, Friday: 1991-03-22 00:00:00 is followed by Monday: 1991-04-01 00:00:00 (10 days apart).
Stock: AAN, Friday: 1991-06-28 00:00:00 is followed by Monday: 1991-07-08 00:00:00 (10 days apart).
Stock: AAN, Friday: 1991-10-04 00:00:00 is followed by Monday: 1991-10-14 00:00:00 (10 days apart).
Stock: AAN, Friday: 1993-02-12 00:00:00 is followed by Monday: 1993-02-22 00:00:00 (10 days apart).
Stock: AAN, Friday: 1993-03-05 00:00:00 is followed by Monday: 1993-03-15 00:00:00 (10 days apart).
Stock: AAN, Friday: 1993-04-02 00:00:00 is followed by Monday: 1993-04-12 00:00:00 (10 days apart).
Stock: AAN, Friday: 1993-07-09 00:00:00 is followed by Monday: 1993-07-19 00:00:00 (10 days apart).
Stock: AAN, Friday: 1993-07-23 00:00:00 is followed by Monday: 1993-08-02 00:00:00 (10 days apart).
Stock: AAN, Friday: 1993-09-03 00:00:00 is followed by Monday: 1993-09-

In [12]:
len(invalid_pairs)

1646

In [17]:
tech[tech['ticker']=='AAN'].iloc[90:110]

Unnamed: 0,index,ticker,open,close,adj_close,low,high,volume,date,datetime,DayType
2278,6176876,AAN,0.518519,0.555556,8.121267e+18,0.518519,0.555556,68800,1988-01-15,1988-01-15,Friday
2279,6179560,AAN,0.703704,0.703704,352780000000000.0,0.703704,0.703704,6700,1989-01-27,1989-01-27,Friday
2280,6179562,AAN,0.703704,0.703704,352780000000000.0,0.703704,0.703704,16200,1989-01-30,1989-01-30,Monday
2281,6179694,AAN,0.740741,0.75,375989000000000.0,0.740741,0.759259,473800,1989-02-03,1989-02-03,Friday
2282,6179698,AAN,0.759259,0.814815,408482100000000.0,0.759259,0.814815,201100,1989-02-06,1989-02-06,Monday
2283,6179876,AAN,0.777778,0.759259,380631100000000.0,0.740741,0.777778,48600,1989-02-10,1989-02-10,Friday
2284,6179894,AAN,0.759259,0.759259,380631100000000.0,0.759259,0.759259,13500,1989-02-13,1989-02-13,Monday
2285,6179936,AAN,0.759259,0.740741,371347400000000.0,0.740741,0.759259,33700,1989-02-17,1989-02-17,Friday
2286,6179980,AAN,0.759259,0.740741,371347400000000.0,0.722222,0.759259,70200,1989-02-24,1989-02-24,Friday
2287,6180046,AAN,0.722222,0.722222,362063900000000.0,0.722222,0.740741,56700,1989-03-06,1989-03-06,Monday


In [18]:
# Initialize a list to store the valid rows
valid_rows = []

# Group by the 'ticker' column (each stock)
for ticker, group in tech.groupby('ticker'):
    # Ensure the group is sorted by 'datetime'
    group = group.sort_values(by='datetime').reset_index(drop=True)

    # Iterate through the group (stock data) to check Friday-Monday intervals
    for i in range(len(group) - 1):
        current_day = group.iloc[i]['DayType']
        next_day = group.iloc[i + 1]['DayType']
        current_date = group.iloc[i]['datetime']
        next_date = group.iloc[i + 1]['datetime']

        # Check if current day is Friday and next day is Monday
        if current_day == 'Friday' and next_day == 'Monday':
            # Calculate the difference in days
            days_diff = (next_date - current_date).days
            # If the difference is within 3 days, add both rows to the valid list
            if days_diff <= 3:
                valid_rows.append(group.iloc[i])       # Add the Friday row
                valid_rows.append(group.iloc[i + 1])   # Add the Monday row

# Create a new DataFrame from the valid rows
valid_pairs_df = pd.DataFrame(valid_rows).reset_index(drop=True)

# Output the new DataFrame
print(valid_pairs_df.head())
print(f"Total valid pairs: {len(valid_pairs_df) // 2}")

# Save the valid pairs to a CSV file
valid_pairs_df.to_csv("tech_valid_pairs.csv", index=False)
print("Filtered data with valid Friday-Monday pairs saved to 'tech_valid_pairs.csv'.")


      index ticker      open     close  adj_close       low      high  \
0  15579777   AABA  1.052083  1.375000   1.375000  1.020833  1.791667   
1  15579790   AABA  1.489583  1.343750   1.343750  1.250000  1.500000   
2  15579831   AABA  1.255208  1.203125   1.203125  1.197917  1.281250   
3  15579841   AABA  1.208333  1.177083   1.177083  1.145833  1.208333   
4  15579885   AABA  1.333333  1.322917   1.322917  1.302083  1.343750   

      volume        date   datetime DayType  
0  408720000  1996-04-12 1996-04-12  Friday  
1   79219200  1996-04-15 1996-04-15  Monday  
2   12913600  1996-04-19 1996-04-19  Friday  
3    8041600  1996-04-22 1996-04-22  Monday  
4    7561600  1996-04-26 1996-04-26  Friday  
Total valid pairs: 402142
Filtered data with valid Friday-Monday pairs saved to 'tech_valid_pairs.csv'.


In [20]:
valid_tech=pd.read_csv('tech_valid_pairs.csv')

In [21]:
valid_tech.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 804284 entries, 0 to 804283
Data columns (total 11 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   index      804284 non-null  int64  
 1   ticker     804284 non-null  object 
 2   open       804284 non-null  float64
 3   close      804284 non-null  float64
 4   adj_close  804284 non-null  float64
 5   low        804284 non-null  float64
 6   high       804284 non-null  float64
 7   volume     804284 non-null  int64  
 8   date       804284 non-null  object 
 9   datetime   804284 non-null  object 
 10  DayType    804284 non-null  object 
dtypes: float64(5), int64(2), object(4)
memory usage: 67.5+ MB


In [22]:
len(valid_tech)

804284

In [26]:
valid_tech[valid_tech['ticker']=='AAN'].iloc[80:90]

Unnamed: 0,index,ticker,open,close,adj_close,low,high,volume,date,datetime,DayType
2120,6179694,AAN,0.740741,0.75,375989000000000.0,0.740741,0.759259,473800,1989-02-03,1989-02-03,Friday
2121,6179698,AAN,0.759259,0.814815,408482100000000.0,0.759259,0.814815,201100,1989-02-06,1989-02-06,Monday
2122,6179876,AAN,0.777778,0.759259,380631100000000.0,0.740741,0.777778,48600,1989-02-10,1989-02-10,Friday
2123,6179894,AAN,0.759259,0.759259,380631100000000.0,0.759259,0.759259,13500,1989-02-13,1989-02-13,Monday
2124,6180087,AAN,0.722222,0.740741,371347400000000.0,0.722222,0.740741,136300,1989-03-10,1989-03-10,Friday
2125,6180091,AAN,0.740741,0.759259,380631100000000.0,0.740741,0.759259,41800,1989-03-13,1989-03-13,Monday
2126,6180131,AAN,0.759259,0.740741,371347400000000.0,0.740741,0.759259,36400,1989-03-17,1989-03-17,Friday
2127,6180149,AAN,0.796296,0.796296,399198400000000.0,0.796296,0.796296,14800,1989-03-20,1989-03-20,Monday
2128,6180231,AAN,0.777778,0.777778,389914800000000.0,0.777778,0.777778,1300,1989-03-31,1989-03-31,Friday
2129,6180235,AAN,0.777778,0.777778,389914800000000.0,0.777778,0.814815,32400,1989-04-03,1989-04-03,Monday


In [27]:
# Initialize a list to store the combined rows
combined_rows = []

# Iterate through the valid pairs (step by 2 because each pair has a Friday and a Monday)
for i in range(0, len(valid_pairs_df), 2):
    # Get the Friday and Monday rows
    friday_row = valid_pairs_df.iloc[i]
    monday_row = valid_pairs_df.iloc[i + 1]

    # Combine the data into a single row (prefix columns with 'Friday_' and 'Monday_')
    combined_data = {
        'ticker': friday_row['ticker'],            # Stock ticker (same for both)
        'Friday_date': friday_row['datetime'],     # Friday date
        'Monday_date': monday_row['datetime'],     # Monday date
        'Friday_day': friday_row['DayType'],       # Should be 'Friday'
        'Monday_day': monday_row['DayType'],       # Should be 'Monday'
    }

    # Add all other columns, prefixing them with 'Friday_' or 'Monday_'
    for col in friday_row.index:
        if col not in ['ticker', 'datetime', 'DayType']:
            combined_data[f'Friday_{col}'] = friday_row[col]
            combined_data[f'Monday_{col}'] = monday_row[col]

    # Append the combined data to the list
    combined_rows.append(combined_data)

# Create a new DataFrame from the combined rows
combined_df = pd.DataFrame(combined_rows)

# Output the new DataFrame
print(combined_df.head())
print(f"New DataFrame shape: {combined_df.shape}")

# Save the combined DataFrame to a CSV file
combined_df.to_csv("tech_combined_pairs.csv", index=False)
print("Combined Friday-Monday pairs saved to 'tech_combined_pairs.csv'.")


  ticker Friday_date Monday_date Friday_day Monday_day  Friday_index  \
0   AABA  1996-04-12  1996-04-15     Friday     Monday      15579777   
1   AABA  1996-04-19  1996-04-22     Friday     Monday      15579831   
2   AABA  1996-04-26  1996-04-29     Friday     Monday      15579885   
3   AABA  1996-05-03  1996-05-06     Friday     Monday      15579943   
4   AABA  1996-05-10  1996-05-13     Friday     Monday      15579990   

   Monday_index  Friday_open  Monday_open  Friday_close  Monday_close  \
0      15579790     1.052083     1.489583      1.375000      1.343750   
1      15579841     1.255208     1.208333      1.203125      1.177083   
2      15579899     1.333333     1.312500      1.322917      1.291667   
3      15579953     1.343750     1.354167      1.333333      1.255208   
4      15579991     1.281250     1.307292      1.302083      1.260417   

   Friday_adj_close  Monday_adj_close  Friday_low  Monday_low  Friday_high  \
0          1.375000          1.343750    1.020833 

In [28]:
combined_tech = pd.read_csv('tech_combined_pairs.csv')

In [30]:
combined_tech.sample(5)

Unnamed: 0,ticker,Friday_date,Monday_date,Friday_day,Monday_day,Friday_index,Monday_index,Friday_open,Monday_open,Friday_close,Monday_close,Friday_adj_close,Monday_adj_close,Friday_low,Monday_low,Friday_high,Monday_high,Friday_volume,Monday_volume
356024,TISA,2018-06-15,2018-06-18,Friday,Monday,8084801,8084802,1.07,1.01,1.01,1.03,1.01,1.03,1.0,1.01,1.07,1.05,28000,1000
137145,FNSR,2004-04-02,2004-04-05,Friday,Monday,10238712,10238714,18.0,17.6,18.0,18.4,18.0,18.4,17.440001,17.6,18.16,18.639999,340200,240900
202618,LLL,2016-11-11,2016-11-14,Friday,Monday,20167894,20167896,150.149994,150.899994,150.929993,153.929993,145.939758,148.840576,148.75,150.130005,151.350006,154.570007,876000,1183500
126901,ETN,1979-01-12,1979-01-15,Friday,Monday,16832903,16832918,1.789631,1.789631,1.783606,1.801683,0.021398,0.021615,1.783606,1.789631,1.789631,1.813734,378800,420400
98471,CYOU,2018-03-16,2018-03-19,Friday,Monday,19180857,19180873,28.24,27.68,27.559999,27.709999,18.77817,18.880373,27.5,27.17,28.530001,27.85,317900,83100


In [32]:
combined_df.drop(columns=['Friday_day', 'Monday_day'], inplace=True)

# Save the updated DataFrame back to the CSV file
combined_df.to_csv("tech_combined_pairs.csv", index=False)

In [33]:
combined_tech = pd.read_csv('tech_combined_pairs.csv')

In [34]:
combined_tech.sample(5)

Unnamed: 0,ticker,Friday_date,Monday_date,Friday_index,Monday_index,Friday_open,Monday_open,Friday_close,Monday_close,Friday_adj_close,Monday_adj_close,Friday_low,Monday_low,Friday_high,Monday_high,Friday_volume,Monday_volume
391525,WSCI,1997-03-14,1997-03-17,16522573,16522574,3.375,3.375,3.375,3.625,2.230348,2.395559,3.375,3.375,3.375,3.625,900,14600
321305,SMTC,1983-06-17,1983-06-20,19561422,19561461,1.15625,1.1875,1.171875,1.25,1.171875,1.25,1.15625,1.1875,1.21875,1.25,60800,72800
346666,SYMC,2005-10-07,2005-10-10,4158084,4158098,21.620001,22.120001,21.92,22.15,15.705549,15.870338,21.42,22.049999,22.040001,22.4,12415800,11292700
391973,WSCI,2010-08-06,2010-08-09,16552684,16552696,3.01,3.06,3.15,3.1,2.644477,2.602501,3.0,3.05,3.27,3.15,13600,10900
175630,IPG,2002-09-13,2002-09-16,1426589,1426599,18.200001,17.870001,17.889999,17.75,14.740666,14.625314,17.620001,17.5,18.200001,18.1,1857200,1098000


In [35]:
combined_tech.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 402142 entries, 0 to 402141
Data columns (total 17 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   ticker            402142 non-null  object 
 1   Friday_date       402142 non-null  object 
 2   Monday_date       402142 non-null  object 
 3   Friday_index      402142 non-null  int64  
 4   Monday_index      402142 non-null  int64  
 5   Friday_open       402142 non-null  float64
 6   Monday_open       402142 non-null  float64
 7   Friday_close      402142 non-null  float64
 8   Monday_close      402142 non-null  float64
 9   Friday_adj_close  402142 non-null  float64
 10  Monday_adj_close  402142 non-null  float64
 11  Friday_low        402142 non-null  float64
 12  Monday_low        402142 non-null  float64
 13  Friday_high       402142 non-null  float64
 14  Monday_high       402142 non-null  float64
 15  Friday_volume     402142 non-null  int64  
 16  Monday_volume     40

In [36]:
dst_dates = pd.read_csv('data/DST_fri_mon.csv')

In [37]:
dst_fri_dates = list(dst_dates['fall_fri_before'])+list(dst_dates['spring_fri_before'])

In [38]:
# this gives a 1 or 0 for whether a date is DST or not

combined_tech['y'] = combined_tech['Friday_date'].isin(dst_fri_dates).astype(int)

In [40]:
combined_tech.to_csv("data/Weekends_Tech_Stocks.csv", index=False)

In [41]:
tech = pd.read_csv('data/Weekends_Tech_Stocks.csv')

In [42]:
tech.head()

Unnamed: 0,ticker,Friday_date,Monday_date,Friday_index,Monday_index,Friday_open,Monday_open,Friday_close,Monday_close,Friday_adj_close,Monday_adj_close,Friday_low,Monday_low,Friday_high,Monday_high,Friday_volume,Monday_volume,y
0,AABA,1996-04-12,1996-04-15,15579777,15579790,1.052083,1.489583,1.375,1.34375,1.375,1.34375,1.020833,1.25,1.791667,1.5,408720000,79219200,0
1,AABA,1996-04-19,1996-04-22,15579831,15579841,1.255208,1.208333,1.203125,1.177083,1.203125,1.177083,1.197917,1.145833,1.28125,1.208333,12913600,8041600,0
2,AABA,1996-04-26,1996-04-29,15579885,15579899,1.333333,1.3125,1.322917,1.291667,1.322917,1.291667,1.302083,1.270833,1.34375,1.333333,7561600,5928000,0
3,AABA,1996-05-03,1996-05-06,15579943,15579953,1.34375,1.354167,1.333333,1.255208,1.333333,1.255208,1.302083,1.223958,1.354167,1.354167,6116800,8214400,0
4,AABA,1996-05-10,1996-05-13,15579990,15579991,1.28125,1.307292,1.302083,1.260417,1.302083,1.260417,1.270833,1.25,1.322917,1.3125,5875200,2747200,0


In [44]:
# Proportion of nonDST to DST
tech['y'].value_counts(normalize=True)

y
0    0.95791
1    0.04209
Name: proportion, dtype: float64