In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from seaborn import set_style
from datetime import datetime, timedelta
set_style("whitegrid")
import warnings
import contextlib
import io

In [2]:
# huge dataset with 20 million rows
df = pd.read_csv('historical_stock_prices.csv')
df

Unnamed: 0,ticker,open,close,adj_close,low,high,volume,date
0,AHH,11.50,11.58,8.493155,11.25,11.68,4633900,2013-05-08
1,AHH,11.66,11.55,8.471151,11.50,11.66,275800,2013-05-09
2,AHH,11.55,11.60,8.507822,11.50,11.60,277100,2013-05-10
3,AHH,11.63,11.65,8.544494,11.55,11.65,147400,2013-05-13
4,AHH,11.60,11.53,8.456484,11.50,11.60,184100,2013-05-14
...,...,...,...,...,...,...,...,...
20973884,NZF,14.60,14.59,14.590000,14.58,14.62,137500,2018-08-20
20973885,NZF,14.60,14.58,14.580000,14.57,14.61,151200,2018-08-21
20973886,NZF,14.58,14.59,14.590000,14.57,14.63,185400,2018-08-22
20973887,NZF,14.60,14.57,14.570000,14.57,14.64,135600,2018-08-23


In [3]:
# turn a date string into a datetime object; then the library can tell us the day of the week of the date
# thought it would be useful; 0=Monday,...,6=Sunday
datetime.strptime(df['date'].loc[2],'%Y-%m-%d').weekday()

4

In [4]:
df.head()

Unnamed: 0,ticker,open,close,adj_close,low,high,volume,date
0,AHH,11.5,11.58,8.493155,11.25,11.68,4633900,2013-05-08
1,AHH,11.66,11.55,8.471151,11.5,11.66,275800,2013-05-09
2,AHH,11.55,11.6,8.507822,11.5,11.6,277100,2013-05-10
3,AHH,11.63,11.65,8.544494,11.55,11.65,147400,2013-05-13
4,AHH,11.6,11.53,8.456484,11.5,11.6,184100,2013-05-14


In [5]:
# getting the codes that tell us what the stocks are; e.g. PIH = Property Insurance Holdings
df2=pd.read_csv('historical_stocks.csv')
df2.head()

Unnamed: 0,ticker,exchange,name,sector,industry
0,PIH,NASDAQ,"1347 PROPERTY INSURANCE HOLDINGS, INC.",FINANCE,PROPERTY-CASUALTY INSURERS
1,PIHPP,NASDAQ,"1347 PROPERTY INSURANCE HOLDINGS, INC.",FINANCE,PROPERTY-CASUALTY INSURERS
2,TURN,NASDAQ,180 DEGREE CAPITAL CORP.,FINANCE,FINANCE/INVESTORS SERVICES
3,FLWS,NASDAQ,"1-800 FLOWERS.COM, INC.",CONSUMER SERVICES,OTHER SPECIALTY STORES
4,FCCY,NASDAQ,1ST CONSTITUTION BANCORP (NJ),FINANCE,SAVINGS INSTITUTIONS


In [6]:
# learn what the sectors are; seems there are 11 sectors if we don't count, nan, Misc, and Sector
df2['sector'].unique()

array(['FINANCE', 'CONSUMER SERVICES', 'TECHNOLOGY', 'PUBLIC UTILITIES',
       'CAPITAL GOODS', 'BASIC INDUSTRIES', 'HEALTH CARE',
       'CONSUMER DURABLES', nan, 'ENERGY', 'MISCELLANEOUS', 'SECTOR',
       'TRANSPORTATION', 'CONSUMER NON-DURABLES'], dtype=object)

In [7]:
# Calculate the percentage of each sector
sector_percentages = df2['sector'].value_counts(normalize=True) * 100
print(sector_percentages)

sector
FINANCE                  20.358566
CONSUMER SERVICES        15.856574
HEALTH CARE              15.617530
TECHNOLOGY               12.091633
CAPITAL GOODS             7.011952
ENERGY                    5.697211
PUBLIC UTILITIES          5.438247
BASIC INDUSTRIES          5.418327
CONSUMER NON-DURABLES     4.462151
CONSUMER DURABLES         2.868526
MISCELLANEOUS             2.768924
TRANSPORTATION            2.390438
SECTOR                    0.019920
Name: proportion, dtype: float64


In [8]:
# get all the stocks in transportation
health_tickers = df2[df2['sector']=='HEALTH CARE'][['ticker','name']]
health_tickers.head(20)

Unnamed: 0,ticker,name
14,ABEO,ABEONA THERAPEUTICS INC.
15,ABEOW,ABEONA THERAPEUTICS INC.
17,ABMD,"ABIOMED, INC."
18,ABLX,ABLYNX NV
21,ACIU,AC IMMUNE SA
24,ACHC,"ACADIA HEALTHCARE COMPANY, INC."
25,ACAD,ACADIA PHARMACEUTICALS INC.
26,ACST,"ACASTI PHARMA, INC."
28,XLRN,ACCELERON PHARMA INC.
30,ARAY,ACCURAY INCORPORATED


In [10]:
def get_stocks(df, df2, stock_code):
    # get prices for a particular stock; this function takes in two data frames where one has the prices
    # the other has the codes
    print(df2[df2['ticker']==stock_code])
    return df[df['ticker']==stock_code].reset_index()

In [12]:
df_health=pd.DataFrame()

for stock in health_tickers['ticker'].values:
    with contextlib.redirect_stdout(io.StringIO()):
        df_stock = get_stocks(df,df2,stock)
    if df_stock.empty:
        print("Skipping empty DataFrame:", stock)
        continue
    df_health = pd.concat([df_health,df_stock],ignore_index=True)

Skipping empty DataFrame: ABEOW
Skipping empty DataFrame: ABLX
Skipping empty DataFrame: ADXSW
Skipping empty DataFrame: ARMO
Skipping empty DataFrame: AVXS
Skipping empty DataFrame: AHPAU
Skipping empty DataFrame: AHPAW
Skipping empty DataFrame: BNTCW
Skipping empty DataFrame: BVXVW
Skipping empty DataFrame: CRME
Skipping empty DataFrame: CLRBW
Skipping empty DataFrame: CLRBZ
Skipping empty DataFrame: CERCW
Skipping empty DataFrame: CHEKW
Skipping empty DataFrame: CHEKZ
Skipping empty DataFrame: CTXRW
Skipping empty DataFrame: CYHHZ
Skipping empty DataFrame: CXRX
Skipping empty DataFrame: CYTXW
Skipping empty DataFrame: DRIOW
Skipping empty DataFrame: EBIO
Skipping empty DataFrame: NDRAW
Skipping empty DataFrame: EYEGW
Skipping empty DataFrame: IMRNW
Skipping empty DataFrame: KTOVW
Skipping empty DataFrame: MTFBW
Skipping empty DataFrame: MYNDW
Skipping empty DataFrame: NUROW
Skipping empty DataFrame: ONSIW
Skipping empty DataFrame: ONSIZ
Skipping empty DataFrame: ONTXW
Skipping empty

In [13]:
df_health.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2093693 entries, 0 to 2093692
Data columns (total 9 columns):
 #   Column     Dtype  
---  ------     -----  
 0   index      int64  
 1   ticker     object 
 2   open       float64
 3   close      float64
 4   adj_close  float64
 5   low        float64
 6   high       float64
 7   volume     int64  
 8   date       object 
dtypes: float64(5), int64(2), object(2)
memory usage: 143.8+ MB


In [14]:
df_health['datetime'] = pd.to_datetime(df_health['date'], errors='coerce')

In [15]:
df_health.to_csv('health1970-2018.csv', index=False)

In [16]:
# Filter for Fridays and Mondays
filtered_health = df_health[df_health['datetime'].dt.dayofweek.isin([0, 4])]

# Create a new column 'DayType' indicating 'Monday' or 'Friday'
filtered_health['DayType'] = np.where(filtered_health['datetime'].dt.dayofweek == 0, 'Monday', 'Friday')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_health['DayType'] = np.where(filtered_health['datetime'].dt.dayofweek == 0, 'Monday', 'Friday')


In [17]:
filtered_health.info()

<class 'pandas.core.frame.DataFrame'>
Index: 812379 entries, 3 to 2093692
Data columns (total 11 columns):
 #   Column     Non-Null Count   Dtype         
---  ------     --------------   -----         
 0   index      812379 non-null  int64         
 1   ticker     812379 non-null  object        
 2   open       812379 non-null  float64       
 3   close      812379 non-null  float64       
 4   adj_close  812379 non-null  float64       
 5   low        812379 non-null  float64       
 6   high       812379 non-null  float64       
 7   volume     812379 non-null  int64         
 8   date       812379 non-null  object        
 9   datetime   812379 non-null  datetime64[ns]
 10  DayType    812379 non-null  object        
dtypes: datetime64[ns](1), float64(5), int64(2), object(3)
memory usage: 74.4+ MB


In [18]:
# Drop the first and last row of the DataFrame
health = filtered_health


In [19]:
health.head()

Unnamed: 0,index,ticker,open,close,adj_close,low,high,volume,date,datetime,DayType
3,17579679,ABEO,265.625,281.25,257.842102,265.625,328.125,100,1998-09-18,1998-09-18,Friday
4,17579734,ABEO,406.25,562.5,515.684204,406.25,562.5,100,1998-09-25,1998-09-25,Friday
8,17580849,ABEO,687.5,656.25,601.631592,617.174988,687.5,200,1999-02-08,1999-02-08,Monday
10,17581752,ABEO,789.049988,750.0,687.578979,703.125,789.049988,100,1999-06-11,1999-06-11,Friday
11,17581761,ABEO,750.0,687.5,630.280701,687.5,875.0,400,1999-06-14,1999-06-14,Monday


In [20]:
health = health.sort_values(by=['ticker', 'datetime']).reset_index(drop=True)

In [25]:
health.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 812379 entries, 0 to 812378
Data columns (total 11 columns):
 #   Column     Non-Null Count   Dtype         
---  ------     --------------   -----         
 0   index      812379 non-null  int64         
 1   ticker     812379 non-null  object        
 2   open       812379 non-null  float64       
 3   close      812379 non-null  float64       
 4   adj_close  812379 non-null  float64       
 5   low        812379 non-null  float64       
 6   high       812379 non-null  float64       
 7   volume     812379 non-null  int64         
 8   date       812379 non-null  object        
 9   datetime   812379 non-null  datetime64[ns]
 10  DayType    812379 non-null  object        
dtypes: datetime64[ns](1), float64(5), int64(2), object(3)
memory usage: 68.2+ MB


In [22]:
# Warning: the Mondays and Fridays are not put in the same row

health.to_csv('Weekends_Health_Stocks.csv', index=False)

In [23]:
# get all the stocks in transportation
df2[df2['ticker']=='ABEO']


Unnamed: 0,ticker,exchange,name,sector,industry
14,ABEO,NASDAQ,ABEONA THERAPEUTICS INC.,HEALTH CARE,MAJOR PHARMACEUTICALS


In [24]:
health[health['ticker']=='ABEO']

Unnamed: 0,index,ticker,open,close,adj_close,low,high,volume,date,datetime,DayType
2898,17579679,ABEO,265.625000,281.25,257.842102,265.625000,328.125000,100,1998-09-18,1998-09-18,Friday
2899,17579734,ABEO,406.250000,562.50,515.684204,406.250000,562.500000,100,1998-09-25,1998-09-25,Friday
2900,17580849,ABEO,687.500000,656.25,601.631592,617.174988,687.500000,200,1999-02-08,1999-02-08,Monday
2901,17581752,ABEO,789.049988,750.00,687.578979,703.125000,789.049988,100,1999-06-11,1999-06-11,Friday
2902,17581761,ABEO,750.000000,687.50,630.280701,687.500000,875.000000,400,1999-06-14,1999-06-14,Monday
...,...,...,...,...,...,...,...,...,...,...,...
4269,17620446,ABEO,14.300000,13.65,13.650000,13.600000,14.750000,406800,2018-08-10,2018-08-10,Friday
4270,17620448,ABEO,13.600000,12.90,12.900000,12.850000,14.080000,760500,2018-08-13,2018-08-13,Monday
4271,17620484,ABEO,13.850000,13.25,13.250000,13.000000,14.250000,401500,2018-08-17,2018-08-17,Friday
4272,17620501,ABEO,13.150000,13.65,13.650000,13.150000,13.750000,360200,2018-08-20,2018-08-20,Monday
