# Pandas: the Python structured data library

Pandas (allegedly) stands for **Pan**el **da**ta (**s**?) and lets you easily manipulate 'spreadsheet-like' data in Python.

In [None]:
!pip install pandas

In [None]:
import pandas as pd

## Series: kind of like a `list` and `dict` put together

In [None]:
s = pd.Series([1,2,3])
s

In [None]:
import numpy as np

s = pd.Series([1,2,3], dtype=np.int8)
s

In [None]:
s[1] = 3.14
s

In [None]:
'a b c'.split()

In [None]:
s = pd.Series([1,2,3], index='a b c'.split())
s

In [None]:
s[0]

In [None]:
s['a']

In [None]:
s.a

## DataFrame -- the main data type



In [None]:
# Four labelled rows by three labelled columns; rows 'z' and 'w' hold the
# same values.
df = pd.DataFrame(
    data=[[1, 2, 3], [4, 5, 6], [7, 8, 9], [7, 8, 9]],
    index='x y z w'.split(),
    columns='a b c'.split(),
)
df

In [None]:
df.columns

In [None]:
df.index

In [None]:
df['a']

In [None]:
df.a

In [None]:
type(df.a)

Multiple columns

In [None]:
cola = pd.Series([1, 4, 7])
colb = pd.Series([2, 5, 8])
colc = pd.Series([3, 6, 9], dtype=np.float32)
pd.DataFrame({'a': cola, 'b': colb, 'c': colc})

Manipulating dataframes

In [None]:
df

In [None]:
cols = ['a', 'b']
df_test = df[cols]
df_test

In [None]:
cols = ['a']
df_test = df[cols]
df_test

In [None]:
df_test.shape

In [None]:
df_test = df[['a']]
df_test

In [None]:
df_test = df['a']
df_test

In [None]:
df_test.shape

Indexing ambiguity

In [None]:
s = pd.Series([1,2,3], index=[1,2,3])
s

In [None]:
s[1]  # label/index value

In [None]:
s[1:3]  # position/offset

## Indexing using `.loc` and `.iloc`

In [None]:
s.loc[1]   # uses index 'label'

In [None]:
s.iloc[1]  # uses the offset ('integer index value')

In [None]:
df

In [None]:
df['a']

In [None]:
df.loc['x']

In [None]:
df.iloc[0]

In [None]:
df.loc['x', 'a']

In [None]:
df.loc['x', :]  # retrieve all columns

In [None]:
df.loc[:, 'a']  # retrieve all rows

In [None]:
df.loc['x':'y']   # includes both endpoints (df.loc[x] and df.loc[y])

In [None]:
df.iloc[0:2]     # excludes the right endpoint (df.iloc[2])

In [None]:
df.info(memory_usage='deep')

In [None]:
import sys
sys.getsizeof(5.5)

In [None]:
12*24  # presumably 12 values (df is 4 rows x 3 columns) x 24 bytes, the typical sys.getsizeof of a Python float -- TODO confirm

In [None]:
df.loc[:, 'b'] = 200

In [None]:
df

In [None]:
df['a']

In [None]:
df2 = df[['a']]
df2

In [None]:
len(df2)

In [None]:
df2.shape

In [None]:
df['a']

In [None]:
df['a'].shape

In [None]:
df['c'] = 22

In [None]:
df

## Reading CSV data

Most of the time, we *won't* be building `DataFrame`s out of the basic constructor, but rather using one of the readers built in to Pandas. One of these is `read_csv`:

In [None]:
df = pd.read_csv('./data/closing-prices.csv')
df.head() # Only show the first few rows  aka df.iloc[:5]

In [None]:
!head data/closing-prices.csv

The CSV reader is pretty good about inferring types, but not perfect. We can check lots of things about the structure of a `DataFrame` with the `.info()` method:

In [None]:
df.info(memory_usage='deep')

In [None]:
ls -hl data/closing-prices.csv

In [None]:
float('nan')

In [None]:
np.nan == np.nan

In [None]:
np.nan is np.nan

In [None]:
5 * np.nan

In [None]:
!ls -lh ./data/closing-prices.csv

In [None]:
import csv

# Load the whole CSV as a list of rows, each row a list of strings.
# The csv module does its own newline handling, so its documentation says to
# open the file with newline=''; otherwise newlines embedded in quoted fields
# may not be interpreted correctly.
with open('./data/closing-prices.csv', newline='') as f:
    rows = list(csv.reader(f))

In [None]:
len(rows)

In [None]:
rows[:5]

In [None]:
!pip install pympler

In [None]:
import pympler

In [None]:
import pympler.asizeof

In [None]:
pympler.asizeof.asizeof(rows)

The first column was read in as an `object` (meaning Pandas couldn't be more specific about its type, usually what happens with string data). Let's tell Pandas that column is a date:

In [None]:
pd.to_datetime('4/13/22')

In [None]:
pd.to_datetime('2022-04-13T01:36')

In [None]:
pd.to_datetime(df['Unnamed: 0'])

In [None]:
df['Unnamed: 0'] = pd.to_datetime(df['Unnamed: 0'])
df.info(memory_usage='deep')

We can also parse datetimes during the import:

In [None]:
df = pd.read_csv('./data/closing-prices.csv', parse_dates=[0])
df.info()

In [None]:
ls -lh ./data/closing-prices.csv

In [None]:
df.iloc[:5] # also df.head()

In [None]:
df.tail()

We can set the index of the dataframe as well:

In [None]:
df = df.set_index('Unnamed: 0')  # also df.set_index('Unnamed: 0', inplace=True)
df.head()

In [None]:
df.info(memory_usage='deep')

It's even better if we do it when we read in the frame:

In [None]:
df = pd.read_csv('./data/closing-prices.csv', index_col=0, parse_dates=[0])
df.info()

In [None]:
df.head()

In [None]:
df.loc['1/3/14']

In [None]:
df.loc['Jan 3 2014']

In [None]:
df.loc['2014-01-03']

In [None]:
df.iloc[1]

In [None]:
!cp ./data/closing-prices.csv ./data/closing-prices-2.csv
!gzip -f ./data/closing-prices-2.csv

In [None]:
!ls -lh ./data/closing-prices-2.csv.gz

In [None]:
df = pd.read_csv(
    './data/closing-prices-2.csv.gz', 
    index_col=0, 
    parse_dates=[0], 
    dtype=np.float16,
)
df.info()

(If you install `s3fs`, you can even read CSVs from `s3://BUCKET/KEY/...csv.gz` URLs!)

## Reading from external APIs

There are some data sources for market data available in the pandas_datareader package:

In [None]:
!pip install -U "pandas<1.5"

In [None]:
import pandas as pd

In [None]:
!pip install -U pandas_datareader

In [None]:
from datetime import datetime

import pandas_datareader.data as web

start, end = datetime(2016, 1, 1), datetime(2022, 1, 1)
data = web.DataReader(
    ['F', 'TSLA', 'GOOG', 'IBM', 'AAPL', 'CRM', 'NTNX'], 
    'yahoo', start, end,
)
data.head()

In [None]:
data.tail()

In [None]:
data['Close'].tail()

In [None]:
data.loc[:, ('Close', "CRM")]

In [None]:
data.columns

In [None]:
data.columns.levels

In [None]:
dfs = {
    attr: data[attr]
    for attr in data.columns.levels[0]
}

In [None]:
dfs['Volume'].head()

In [None]:
data.columns = data.columns.swaplevel()
data.head()

In [None]:
data['TSLA'].head()

(If Yahoo Finance won't work for us, we can load the same attributes from a local Excel workbook instead.)

In [None]:
# Build one DataFrame per price attribute, each read from the sheet of the
# same name in the local workbook and indexed by that sheet's 'Date' column.
dfs = {
    attr: pd.read_excel('./data/stocks.xlsx', sheet_name=attr, index_col='Date')
    for attr in ['Open', 'High', 'Low', 'Close', 'Volume', 'Adj Close']
}

In [None]:
dfs['Close']

## Writing Excel data

We can write a multi-sheet Excel file using a `pd.ExcelWriter`:

In [None]:
!pip install xlrd xlwt openpyxl

In plain Python, to write a file you might say:

```python
with open(filename, 'w') as fp:
    fp.write(some_data)
```

In [None]:
# ExcelWriter is used as a context manager, so the workbook is saved and
# closed when the `with` block exits. Each DataFrame in `dfs` is written to
# its own sheet, named after its dict key.
with pd.ExcelWriter('./data/stocks.xlsx') as writer:
    for name, sheet in dfs.items():
        # Pass sheet_name by keyword: recent pandas releases deprecate passing
        # it positionally, and it becomes keyword-only in pandas 3.0.
        sheet.to_excel(writer, sheet_name=name)

In [None]:
!file data/stocks.xlsx

In [None]:
!cp data/stocks.xlsx ~/Downloads

## Reading Excel data

We can also read a sheet from an Excel workbook:

In [None]:
closing = pd.read_excel('./data/stocks.xlsx', 'Close', index_col='Date')
closing.head()

In [None]:
closing.info()

You can also use `read_excel(..., engine='odf')` with `odfpy` to read OpenOffice spreadsheets.

In [None]:
!pip install odfpy

## Data from SQL

In [None]:
import pandas as pd
import sqlite3
con = sqlite3.connect('./data/real-estate.db')

In [None]:
transactions = pd.read_sql(
    'SELECT * FROM transactions', con, 
    index_col='index', 
    parse_dates=['sale_date'],
)
transactions.head()

We can even build a quick little bulk load function in a couple of lines of pandas:

In [None]:
stock = pd.read_csv('./data/closing-prices.csv', index_col=[0], parse_dates=True)
stock.to_sql('stock', con, if_exists='replace')

In [None]:
for row in con.execute('select * from stock limit 5'):
    print(row)

In [None]:
con.execute('select count(*) from stock').fetchall()

(For non-sqlite3 databases, you must pass a SQLAlchemy engine object, created with the `sqlalchemy.create_engine` function.)

## Data from HTML

In [None]:
!pip install html5lib

In [None]:
tables = pd.read_html(
    'https://en.wikipedia.org/wiki/Python_(genus)',
)

In [None]:
len(tables)

In [None]:
tables[0]

In [None]:
tables = pd.read_html(
    'https://en.wikipedia.org/wiki/List_of_United_States_cities_by_population',
    match='New York'
)
len(tables)

In [None]:
tables[0]

In [None]:
tables[0].info()

In [None]:
tables[1].head()

In [None]:
tables[1].info()

## Data from JSON APIs

In [None]:
!pip install requests

In [None]:
import os

import requests

# OpenWeatherMap API key. Set the OPENWEATHERMAP_APPID environment variable to
# use your own key instead of committing one to the notebook.
# I don't have any idea whose API key the fallback is, but they're free, so....
APPID = os.environ.get('OPENWEATHERMAP_APPID', '10d4440bbaa8581bb8da9bd1fbea5617')
UNITS = 'metric'
city = 'Dublin'
resp = requests.get(
    # https so the API key in the query string is not sent in clear text.
    'https://api.openweathermap.org/data/2.5/forecast',
    params={
        'q': city,
        'units': UNITS,
        'appid': APPID,
    },
    # requests has no default timeout; without one a stalled server would
    # hang this cell indefinitely.
    timeout=10,
)
# Raise on 4xx/5xx responses (bad key, unknown city, rate limit) here, rather
# than failing later with a KeyError when the error payload is indexed.
resp.raise_for_status()
data = resp.json()

In [None]:
data

In [None]:
data['list'][0]

In [None]:
row = data['list'][0]
{
    'date': row['dt_txt'], 
    **row['main'], 
    **row['weather'][0]
} 

In [None]:
# Flatten every forecast entry into a single flat dict: the 'dt_txt' value
# (stored under the key 'date'), the items of the nested 'main' mapping, and
# the items of the first element of the 'weather' list. With ** unpacking, a
# later mapping overrides an earlier one if they share a key.

raw_data = [
    {
        'date': row['dt_txt'], 
        **row['main'], 
        **row['weather'][0]
    } 
    for row in data['list']
]

In [None]:
raw_data[0]

In [None]:
# raw_data is a list of per-row dicts, which the DataFrame constructor accepts
# directly (one row per dict, columns taken from the keys). DataFrame.from_dict
# is documented for a dict input, not a list of records.
weather = pd.DataFrame(raw_data)
weather.head()

In [None]:
# Convert the 'date' column from strings to timestamps...
weather['date'] = pd.to_datetime(weather['date'])
# ...then make it the index, rebinding the name instead of using inplace=True.
weather = weather.set_index('date')
weather.head()

In [None]:
weather.info(memory_usage='deep')

Much easier...

In [None]:
pd.json_normalize(raw_data).head()

In [None]:
pd.json_normalize(data['list'])

## Writing csv data

In [None]:
weather.to_csv('./data/weather.csv')

In [None]:
!head data/weather.csv

JSON lines

In [None]:
weather.head()

In [None]:
weather.reset_index().to_json('./data/weather.jsonlines', orient='records', lines=True)

In [None]:
!cat data/weather.jsonlines

In [None]:
df = pd.read_json('./data/weather.jsonlines', lines=True).set_index('date')

In [None]:
df.info()

In [None]:
weather.info(memory_usage='deep')

In [None]:
weather.temp * 1.8 + 32

In [None]:
%timeit weather.temp * 1.8 + 32

In [None]:
weather.temp.values

In [None]:
%timeit weather.temp.values * 1.8 + 32

In [None]:
pd.Series(weather.temp.values * 1.8 + 32, index=weather.index)

Open the [Pandas IO Lab][pandas-io-lab]

[pandas-io-lab]: ./pandas-io-lab.ipynb