# NYSE ML Project Data Cleaning and Wrangling
In this notebook I will clean and wrangle the data from the data files I will be using for this project. The output will be a single file to use for the machine learning model analysis.

In [1]:
import pandas as pd

In [2]:
# 2018 closing prices are tab-separated; the remaining inputs are plain CSV.
close_2018_df = pd.read_csv("./data/2018-close-price.tsv", sep="\t")
# Historical split-adjusted prices.
price_df = pd.read_csv("./data/prices-split-adjusted.csv")
# Company fundamentals.
fundamentals_df = pd.read_csv("./data/fundamentals.csv")
# Securities listing.
securities_df = pd.read_csv("./data/securities.csv")

### Clean 2018-close-price

In [3]:
# Dimensions of the raw frame, before any rows are removed.
close_2018_df.shape

(999, 2)

In [4]:
# List dtypes and non-null counts per column to spot missing values.
close_2018_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 999 entries, 0 to 998
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Ticker Symbol  501 non-null    object 
 1   2018 Close     436 non-null    float64
dtypes: float64(1), object(1)
memory usage: 15.7+ KB


In [5]:
# Discard every row whose ticker symbol is missing, then confirm the new size.
close_2018_df = close_2018_df.dropna(subset=['Ticker Symbol'])
close_2018_df.shape

(501, 2)

In [6]:
# Discard tickers that have no 2018 closing price, then confirm the new size.
close_2018_df = close_2018_df.dropna(subset=['2018 Close'])
close_2018_df.shape

(436, 2)

In [7]:
# Count the distinct tickers; a count equal to the number of rows means
# no ticker appears more than once.
close_2018_df['Ticker Symbol'].nunique(dropna=False)

436

### Clean prices-split-adjusted

In [8]:
# Check dtypes and non-null counts of the price data.
price_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 851264 entries, 0 to 851263
Data columns (total 7 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   date    851264 non-null  object 
 1   symbol  851264 non-null  object 
 2   open    851264 non-null  float64
 3   close   851264 non-null  float64
 4   low     851264 non-null  float64
 5   high    851264 non-null  float64
 6   volume  851264 non-null  int64  
dtypes: float64(4), int64(1), object(2)
memory usage: 45.5+ MB


In [10]:
# Summary statistics, transposed so each column is a row. Only the numeric
# columns appear because `date` is still a string (object) column here.
price_df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
open,851264.0,64.99362,75.20389,1.66,31.27,48.46,75.12,1584.44
close,851264.0,65.01191,75.20122,1.59,31.29278,48.48,75.14,1578.13
low,851264.0,64.33654,74.45952,1.5,30.94,47.97,74.4,1549.94
high,851264.0,65.63975,75.90686,1.81,31.62,48.96,75.85,1600.93
volume,851264.0,5415113.0,12494680.0,0.0,1221500.0,2476250.0,5222500.0,859643400.0


In [11]:
# Peek at the first rows to see how the raw `date` strings are formatted.
price_df.head()

Unnamed: 0,date,symbol,open,close,low,high,volume
0,1/5/2016,WLTW,123.43,125.839996,122.309998,126.25,2163600
1,1/6/2016,WLTW,125.239998,119.980003,119.940002,125.540001,2386400
2,1/7/2016,WLTW,116.379997,114.949997,114.93,119.739998,2489500
3,1/8/2016,WLTW,115.480003,116.620003,113.5,117.440002,2006300
4,1/11/2016,WLTW,117.010002,114.970001,114.089996,117.330002,1408600


In [12]:
# Parse the month/day/year strings in `date` into real datetimes, then
# verify the resulting dtype.
price_df = price_df.assign(
    date=pd.to_datetime(price_df['date'], format="%m/%d/%Y")
)
price_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 851264 entries, 0 to 851263
Data columns (total 7 columns):
 #   Column  Non-Null Count   Dtype         
---  ------  --------------   -----         
 0   date    851264 non-null  datetime64[ns]
 1   symbol  851264 non-null  object        
 2   open    851264 non-null  float64       
 3   close   851264 non-null  float64       
 4   low     851264 non-null  float64       
 5   high    851264 non-null  float64       
 6   volume  851264 non-null  int64         
dtypes: datetime64[ns](1), float64(4), int64(1), object(1)
memory usage: 45.5+ MB


In [15]:
# Repeat the summary now that `date` is datetime64, so the date range
# covered by the data is included.
price_df.describe().T

Unnamed: 0,count,mean,min,25%,50%,75%,max,std
date,851264.0,2013-07-21 02:56:29.294037760,2010-01-04 00:00:00,2011-10-19 00:00:00,2013-08-02 00:00:00,2015-04-24 00:00:00,2016-12-30 00:00:00,
open,851264.0,64.993618,1.66,31.27,48.459999,75.120003,1584.439941,75.203893
close,851264.0,65.011913,1.59,31.292776,48.48,75.139999,1578.130005,75.201216
low,851264.0,64.336541,1.5,30.940001,47.970001,74.400002,1549.939941,74.459518
high,851264.0,65.639748,1.81,31.620001,48.959999,75.849998,1600.930054,75.906861
volume,851264.0,5415112.640027,0.0,1221500.0,2476250.0,5222500.0,859643400.0,12494681.433084


In [16]:
# Keep only rows dated 2013-01-01 or later.
# The cutoff is an explicit Timestamp with an inclusive bound, so it states the
# intent directly and does not depend on how pandas would interpret a
# slash-separated date string (month-first vs. day-first). Because `date` holds
# date-only values, this selects the same rows as `> 2012-12-31`.
price_df = price_df[price_df['date'] >= pd.Timestamp("2013-01-01")]
price_df.describe().T

Unnamed: 0,count,mean,min,25%,50%,75%,max,std
date,497186.0,2015-01-06 05:07:56.253152512,2013-01-02 00:00:00,2014-01-07 00:00:00,2015-01-09 00:00:00,2016-01-07 00:00:00,2016-12-30 00:00:00,
open,497186.0,77.618982,1.66,38.09,58.48,87.639999,1584.439941,88.696438
close,497186.0,77.638583,1.59,38.099998,58.509998,87.669998,1578.130005,88.68694
low,497186.0,76.873468,1.5,37.709999,57.939999,86.860001,1549.939941,87.825628
high,497186.0,78.354314,1.81,38.48,59.029999,88.43,1600.930054,89.509429
volume,497186.0,4576383.987884,0.0,1134000.0,2223500.0,4581900.0,616620500.0,9137975.550856


In [18]:
# Keep only the price rows whose symbol is still present in the cleaned
# 2018 close-price data, then report the remaining size.
tickers_2018 = close_2018_df['Ticker Symbol']
price_df = price_df.loc[lambda frame: frame['symbol'].isin(tickers_2018)]
price_df.shape

(433302, 7)

### Clean fundamentals