In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from seaborn import set_style
from datetime import datetime, timedelta
set_style("whitegrid")
import warnings
import contextlib
import io

In [2]:
# huge dataset with 20 million rows
df = pd.read_csv('data/historical_stock_prices.csv')
df

Unnamed: 0,ticker,open,close,adj_close,low,high,volume,date
0,AHH,11.50,11.58,8.493155,11.25,11.68,4633900,2013-05-08
1,AHH,11.66,11.55,8.471151,11.50,11.66,275800,2013-05-09
2,AHH,11.55,11.60,8.507822,11.50,11.60,277100,2013-05-10
3,AHH,11.63,11.65,8.544494,11.55,11.65,147400,2013-05-13
4,AHH,11.60,11.53,8.456484,11.50,11.60,184100,2013-05-14
...,...,...,...,...,...,...,...,...
20973884,NZF,14.60,14.59,14.590000,14.58,14.62,137500,2018-08-20
20973885,NZF,14.60,14.58,14.580000,14.57,14.61,151200,2018-08-21
20973886,NZF,14.58,14.59,14.590000,14.57,14.63,185400,2018-08-22
20973887,NZF,14.60,14.57,14.570000,14.57,14.64,135600,2018-08-23


In [3]:
# turn a date string into a datetime object; then the library can tell us the day of the week of the date
# thought it would be useful; 0=Monday,...,6=Sunday
datetime.strptime(df['date'].loc[2],'%Y-%m-%d').weekday()

4

In [4]:
df.head()

Unnamed: 0,ticker,open,close,adj_close,low,high,volume,date
0,AHH,11.5,11.58,8.493155,11.25,11.68,4633900,2013-05-08
1,AHH,11.66,11.55,8.471151,11.5,11.66,275800,2013-05-09
2,AHH,11.55,11.6,8.507822,11.5,11.6,277100,2013-05-10
3,AHH,11.63,11.65,8.544494,11.55,11.65,147400,2013-05-13
4,AHH,11.6,11.53,8.456484,11.5,11.6,184100,2013-05-14


In [5]:
# getting the codes that tell us what the stocks are; e.g. PIH = Property Insurance Holdings
df2=pd.read_csv('data/historical_stocks.csv')
df2.head()

Unnamed: 0,ticker,exchange,name,sector,industry
0,PIH,NASDAQ,"1347 PROPERTY INSURANCE HOLDINGS, INC.",FINANCE,PROPERTY-CASUALTY INSURERS
1,PIHPP,NASDAQ,"1347 PROPERTY INSURANCE HOLDINGS, INC.",FINANCE,PROPERTY-CASUALTY INSURERS
2,TURN,NASDAQ,180 DEGREE CAPITAL CORP.,FINANCE,FINANCE/INVESTORS SERVICES
3,FLWS,NASDAQ,"1-800 FLOWERS.COM, INC.",CONSUMER SERVICES,OTHER SPECIALTY STORES
4,FCCY,NASDAQ,1ST CONSTITUTION BANCORP (NJ),FINANCE,SAVINGS INSTITUTIONS


In [6]:
# learn what the sectors are; seems there are 11 sectors if we don't count, nan, Misc, and Sector
df2['sector'].unique()

array(['FINANCE', 'CONSUMER SERVICES', 'TECHNOLOGY', 'PUBLIC UTILITIES',
       'CAPITAL GOODS', 'BASIC INDUSTRIES', 'HEALTH CARE',
       'CONSUMER DURABLES', nan, 'ENERGY', 'MISCELLANEOUS', 'SECTOR',
       'TRANSPORTATION', 'CONSUMER NON-DURABLES'], dtype=object)

In [7]:
# Calculate the percentage of each sector
sector_percentages = df2['sector'].value_counts(normalize=True) * 100
print(sector_percentages)

sector
FINANCE                  20.358566
CONSUMER SERVICES        15.856574
HEALTH CARE              15.617530
TECHNOLOGY               12.091633
CAPITAL GOODS             7.011952
ENERGY                    5.697211
PUBLIC UTILITIES          5.438247
BASIC INDUSTRIES          5.418327
CONSUMER NON-DURABLES     4.462151
CONSUMER DURABLES         2.868526
MISCELLANEOUS             2.768924
TRANSPORTATION            2.390438
SECTOR                    0.019920
Name: proportion, dtype: float64


In [50]:
# get all the stocks in transportation
tech_tickers = df2[df2['sector']=='TECHNOLOGY'][['ticker','name']]
tech_tickers.head(20)

Unnamed: 0,ticker,name
6,VNET,"21VIANET GROUP, INC."
7,TWOU,"2U, INC."
8,JOBS,"51JOB, INC."
22,ACIA,"ACACIA COMMUNICATIONS, INC."
37,ACIW,"ACI WORLDWIDE, INC."
39,ACMR,"ACM RESEARCH, INC."
43,ATVI,"ACTIVISION BLIZZARD, INC"
44,ACXM,ACXIOM CORPORATION
50,IOTS,ADESTO TECHNOLOGIES CORPORATION
52,ADBE,ADOBE SYSTEMS INCORPORATED


In [8]:
def get_stocks(df, df2, stock_code):
    # get prices for a particular stock; this function takes in two data frames where one has the prices
    # the other has the codes
    print(df2[df2['ticker']==stock_code])
    return df[df['ticker']==stock_code].reset_index()

In [12]:
df_tech=pd.DataFrame()

for stock in tech_tickers['ticker'].values:
    with contextlib.redirect_stdout(io.StringIO()):
        df_stock = get_stocks(df,df2,stock)
    if df_stock.empty:
        print("Skipping empty DataFrame:", stock)
        continue
    df_tech = pd.concat([df_tech,df_stock],ignore_index=True)

Skipping empty DataFrame: AMRHW
Skipping empty DataFrame: CNIT
Skipping empty DataFrame: CHUBA
Skipping empty DataFrame: CHUBK
Skipping empty DataFrame: CVONW
Skipping empty DataFrame: GFNSL
Skipping empty DataFrame: MSCC
Skipping empty DataFrame: STLRU
Skipping empty DataFrame: STLRW
Skipping empty DataFrame: VDSI


In [56]:
df_tech.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2254333 entries, 0 to 2254332
Data columns (total 10 columns):
 #   Column     Dtype         
---  ------     -----         
 0   index      int64         
 1   ticker     object        
 2   open       float64       
 3   close      float64       
 4   adj_close  float64       
 5   low        float64       
 6   high       float64       
 7   volume     int64         
 8   date       object        
 9   datetime   datetime64[ns]
dtypes: datetime64[ns](1), float64(5), int64(2), object(2)
memory usage: 172.0+ MB


In [20]:
df_tech['datetime'] = pd.to_datetime(df_tech['date'], errors='coerce')

In [23]:
df_tech.to_csv('data/tech1970-2018.csv', index=False)

In [57]:
# Filter for Fridays and Mondays
filtered_tech = df_tech[df_tech['datetime'].dt.dayofweek.isin([0, 4])]

# Create a new column 'DayType' indicating 'Monday' or 'Friday'
filtered_tech['DayType'] = np.where(filtered_tech['datetime'].dt.dayofweek == 0, 'Monday', 'Friday')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_tech['DayType'] = np.where(filtered_tech['datetime'].dt.dayofweek == 0, 'Monday', 'Friday')


In [58]:
filtered_tech.info()

<class 'pandas.core.frame.DataFrame'>
Index: 874970 entries, 1 to 2254332
Data columns (total 11 columns):
 #   Column     Non-Null Count   Dtype         
---  ------     --------------   -----         
 0   index      874970 non-null  int64         
 1   ticker     874970 non-null  object        
 2   open       874970 non-null  float64       
 3   close      874970 non-null  float64       
 4   adj_close  874970 non-null  float64       
 5   low        874970 non-null  float64       
 6   high       874970 non-null  float64       
 7   volume     874970 non-null  int64         
 8   date       874970 non-null  object        
 9   datetime   874970 non-null  datetime64[ns]
 10  DayType    874970 non-null  object        
dtypes: datetime64[ns](1), float64(5), int64(2), object(3)
memory usage: 80.1+ MB


In [59]:
# Drop the first and last row of the DataFrame
tech = filtered_tech


In [60]:
tech.head()

Unnamed: 0,index,ticker,open,close,adj_close,low,high,volume,date,datetime,DayType
1,3114942,VNET,19.4,17.75,17.75,17.32,19.5,2323400,2011-04-25,2011-04-25,Monday
5,3114963,VNET,15.95,15.51,15.51,15.3,16.42,1343900,2011-04-29,2011-04-29,Friday
6,3114964,VNET,15.74,16.68,16.68,15.54,17.0,1471100,2011-05-02,2011-05-02,Monday
10,3114968,VNET,13.75,13.75,13.75,13.16,14.2,1012800,2011-05-06,2011-05-06,Friday
11,3114969,VNET,13.7,14.26,14.26,13.12,14.4,631700,2011-05-09,2011-05-09,Monday


In [61]:
tech = tech.sort_values(by=['ticker', 'datetime']).reset_index(drop=True)

In [62]:
tech.head(10)

Unnamed: 0,index,ticker,open,close,adj_close,low,high,volume,date,datetime,DayType
0,15579777,AABA,1.052083,1.375,1.375,1.020833,1.791667,408720000,1996-04-12,1996-04-12,Friday
1,15579790,AABA,1.489583,1.34375,1.34375,1.25,1.5,79219200,1996-04-15,1996-04-15,Monday
2,15579831,AABA,1.255208,1.203125,1.203125,1.197917,1.28125,12913600,1996-04-19,1996-04-19,Friday
3,15579841,AABA,1.208333,1.177083,1.177083,1.145833,1.208333,8041600,1996-04-22,1996-04-22,Monday
4,15579885,AABA,1.333333,1.322917,1.322917,1.302083,1.34375,7561600,1996-04-26,1996-04-26,Friday
5,15579899,AABA,1.3125,1.291667,1.291667,1.270833,1.333333,5928000,1996-04-29,1996-04-29,Monday
6,15579943,AABA,1.34375,1.333333,1.333333,1.302083,1.354167,6116800,1996-05-03,1996-05-03,Friday
7,15579953,AABA,1.354167,1.255208,1.255208,1.223958,1.354167,8214400,1996-05-06,1996-05-06,Monday
8,15579990,AABA,1.28125,1.302083,1.302083,1.270833,1.322917,5875200,1996-05-10,1996-05-10,Friday
9,15579991,AABA,1.307292,1.260417,1.260417,1.25,1.3125,2747200,1996-05-13,1996-05-13,Monday


In [66]:
# the Mondays and Fridays are not put in the same row

tech.to_csv('data/Weekends_Tech_Stocks.csv', index=False)

In [64]:
df2[df2['ticker']=='AAPL']


Unnamed: 0,ticker,exchange,name,sector,industry
195,AAPL,NASDAQ,APPLE INC.,TECHNOLOGY,COMPUTER MANUFACTURING


In [65]:
tech[tech['ticker']=='AAPL']

Unnamed: 0,index,ticker,open,close,adj_close,low,high,volume,date,datetime,DayType
4684,948,AAPL,0.513393,0.513393,0.023186,0.513393,0.515625,117258400,1980-12-12,1980-12-12,Friday
4685,960,AAPL,0.488839,0.486607,0.021977,0.486607,0.488839,43971200,1980-12-15,1980-12-15,Monday
4686,1000,AAPL,0.504464,0.504464,0.022783,0.504464,0.506696,12157600,1980-12-19,1980-12-19,Friday
4687,1008,AAPL,0.529018,0.529018,0.023892,0.529018,0.531250,9340800,1980-12-22,1980-12-22,Monday
4688,1040,AAPL,0.633929,0.633929,0.028630,0.633929,0.636161,13893600,1980-12-26,1980-12-26,Friday
...,...,...,...,...,...,...,...,...,...,...,...
8376,100485,AAPL,207.360001,207.529999,207.529999,206.669998,209.100006,24611200,2018-08-10,2018-08-10,Friday
8377,100486,AAPL,207.699997,208.869995,208.869995,207.699997,210.949997,25869100,2018-08-13,2018-08-13,Monday
8378,100490,AAPL,213.440002,217.580002,217.580002,213.160004,217.949997,35427000,2018-08-17,2018-08-17,Friday
8379,100491,AAPL,218.100006,215.460007,215.460007,215.110001,219.179993,30287700,2018-08-20,2018-08-20,Monday
