# Foundations: NumPy
## NumPy Array

In [None]:
matrix = [[1, 2, 3],
          [4, 5, 6],
          [7, 8, 9]]

In [None]:
[[i + 1 for i in row] for row in matrix]

In [None]:
import numpy as np

In [None]:
# Constructing an array with a simple list results in a 1d array
array1 = np.array([10, 100, 1000.])

In [None]:
# Constructing an array with a nested list results in a 2d array
array2 = np.array([[1., 2., 3.],
                   [4., 5., 6.]])

In [None]:
array1.dtype

## Vectorization and Broadcasting

In [None]:
array2 + 1

In [None]:
array2 * array2

In [None]:
array2 * array1

In [None]:
array2 @ array2.T

## Universal Functions (ufunc)

In [None]:
import math
math.sqrt(array2)  # this will raise an error (math.sqrt expects a single number)

In [None]:
np.array([[math.sqrt(i) for i in row] for row in array2])

In [None]:
np.sqrt(array2)

In [None]:
array2.sum(axis=0)  # returns a 1d array

## Getting and Setting Array Elements

In [None]:
array1[2]  # returns a scalar

In [None]:
array2[0, 0]  # returns a scalar

In [None]:
array2[:, 1:]  # returns a 2d array

In [None]:
array2[:, 1]  # returns a 1d array

In [None]:
array2[1, :2] # returns a 1d array

## Useful Array Constructors

In [None]:
np.arange(10).reshape(2, 5)  # 2 rows, 5 columns

In [None]:
np.random.randn(2, 3)  # 2 rows, 3 columns

## View vs. Copy

In [None]:
array2

In [None]:
subset = array2[:, :2]
subset

In [None]:
subset[0, 0] = 1000

In [None]:
subset

In [None]:
array2

# DataFrame and Series
## Introduction

In [None]:
import pandas as pd

# Raw records: one inner list per participant
# (name, age, country, score, continent)
data = [
    ['Mark', 55, 'Italy', 4.5, 'Europe'],
    ['John', 33, 'USA', 6.7, 'America'],
    ['Tim', 41, 'USA', 3.9, 'America'],
    ['Jenny', 12, 'Germany', 9.0, 'Europe'],
]
# The index labels are given explicitly and are deliberately not sorted
df = pd.DataFrame(
    data,
    index=[1001, 1000, 1002, 1003],
    columns=['name', 'age', 'country', 'score', 'continent'],
)
df

In [None]:
df.info()

## Index

In [None]:
df.index

In [None]:
df.index.name = 'user_id'
df

In [None]:
# Turns the index into a regular column and
# replaces it with the default index
df.reset_index()

In [None]:
# Turns 'user_id' into a regular column and
# makes the column 'name' the index
df.reset_index().set_index('name')

In [None]:
df.reindex([999, 1000, 1001, 1004])

In [None]:
df.sort_index()

In [None]:
df.sort_values(['name', 'age'])

## Columns

In [None]:
df.columns

In [None]:
df.columns.name = 'properties'
df

In [None]:
df.rename(columns={'name': 'First Name', 'age': 'Age'})

In [None]:
df.drop(columns=['name', 'country'],
        index=[1000, 1003])

In [None]:
df.T  # shortcut for df.transpose()

In [None]:
df.loc[:, ['continent', 'country', 'name', 'age', 'score']]

# Data Manipulation
## Selecting Data

In [None]:
df.loc[1000, 'name']  # Returns a Scalar

In [None]:
df.loc[[1000, 1002], 'name']  # Returns a Series

In [None]:
df.loc[:1002, ['name', 'country']]  # Returns a DataFrame

In [None]:
df.iloc[0, 0]  # Returns a Scalar

In [None]:
df.iloc[[0, 2], 1]  # Returns a Series

In [None]:
df.iloc[:3, [0, 2]]  # Returns a DataFrame

In [None]:
tf = (df['age'] > 40) & (df['country'] == 'USA')
tf  # true/false

In [None]:
df.loc[tf, :]

In [None]:
df.loc[df.index > 1001, :]

In [None]:
df.loc[df['country'].isin(['Italy', 'Germany']), :]

In [None]:
# Small all-numeric DataFrame: one dict entry per column
numbers = pd.DataFrame(
    data={
        'A': [1.3, 3.2],
        'B': [-4.2, -2.1],
        'C': [-5.5, 3.1],
    }
)
numbers

In [None]:
numbers < 2

In [None]:
numbers[numbers < 2]

In [None]:
# A MultiIndex needs to be sorted, hence the trailing sort_index()
df_multi = (
    df.reset_index()
      .set_index(['continent', 'country'])
      .sort_index()
)
df_multi

In [None]:
df_multi.loc['Europe', :]

In [None]:
df_multi.loc[('Europe', 'Italy'), :]

In [None]:
df_multi.reset_index(level=0)

## Setting Data

In [None]:
df2 = df.copy()

In [None]:
df2.loc[1000, 'name'] = 'JOHN'
df2

In [None]:
df2.loc[[1000, 1001], 'score'] = [3, 4]
df2

In [None]:
tf = (df2['age'] < 20) | (df2['country'] == 'USA')
df2.loc[tf, 'name'] = 'xxx'
df2

In [None]:
numbers2 = numbers.copy()
numbers2

In [None]:
numbers2[numbers2 < 2] = 0
numbers2

In [None]:
df.replace('USA', 'U.S.')

In [None]:
df2.loc[:, 'zeroes'] = 0
df2.loc[:, 'integers'] = [1, 2, 3, 4]
df2

In [None]:
df2 = df.copy()
df2.loc[:, 'double score'] = df2['score'] * 2
df2

## Missing Data

In [None]:
df2 = df.copy()
df2.loc[1000, 'score'] = None
df2.loc[1003, :] = None
df2

In [None]:
df2.dropna()

In [None]:
df2.dropna(how='all')

In [None]:
df2.isna()

In [None]:
df2.fillna({'score': df2['score'].mean()})

## Duplicated Data

In [None]:
df.drop_duplicates(['country', 'continent'])

In [None]:
df['country'].is_unique

In [None]:
df['country'].unique()

## Arithmetic Operations

In [None]:
numbers

In [None]:
numbers + 1

In [None]:
numbers2 = pd.DataFrame(data=[[1, 2], [3, 4]],
                        index=[0, 2],
                        columns=['A', 'D'])
numbers2

In [None]:
numbers + numbers2

In [None]:
numbers.add(numbers2, fill_value=0)

In [None]:
numbers.loc[1, :]

In [None]:
numbers + numbers.loc[1, :]

In [None]:
numbers.add(numbers.loc[:, 'C'], axis=0)

In [None]:
text_df = pd.DataFrame(data=[' mArk ', 'JOHN  ', 'Tim', ' jenny'],
                       columns=['name'])
text_df

In [None]:
cleaned = text_df.loc[:, 'name'].str.strip().str.capitalize()
cleaned

In [None]:
cleaned.str.startswith('J')

## Applying a Function

In [None]:
numbers

In [None]:
def deduct_index(s):
    """Return ``s`` with its own index labels subtracted element-wise."""
    labels = s.index
    return s - labels

numbers.apply(deduct_index, axis=0)

In [None]:
numbers.apply(lambda s: s - s.index, axis=0)

In [None]:
# Format every element as a string with a thousands separator and three
# decimals. DataFrame.applymap is deprecated since pandas 2.1 (renamed to
# DataFrame.map), so the function is mapped column by column via Series.map,
# which works unchanged on both older and newer pandas versions.
numbers.apply(lambda col: col.map(lambda x: f'{x:,.3f}'))

# Combining DataFrames
## Concatenating

In [None]:
data=[[15, 'France', 4.1, 'Becky'],
      [44, 'Canada', 6.1, 'Leanne']]
more_rows = pd.DataFrame(data=data,
                         columns=['age', 'country', 'score', 'name'],
                         index=[1000, 1011])
more_rows

In [None]:
pd.concat([df, more_rows], axis=0)

In [None]:
data=[[3, 4],
      [5, 6]]
more_cols = pd.DataFrame(data=data,
                         columns=['quizzes', 'logins'],
                         index=[1000, 2000])
more_cols

In [None]:
pd.concat([df, more_cols], axis=1)

## Joining and Merging

In [None]:
df1 = pd.DataFrame(data=[[1, 2], [3, 4], [5, 6]],
                   columns=['A', 'B'])
df1

In [None]:
df2 = pd.DataFrame(data=[[10, 20], [30, 40]],
                   columns=['C', 'D'], index=[1, 3])
df2

In [None]:
df1.join(df2, how='inner')

In [None]:
df1.join(df2, how='left')

In [None]:
df1.join(df2, how='right')

In [None]:
df1.join(df2, how='outer')

In [None]:
df1['category'] = ['a', 'b', 'c']
df2['category'] = ['c', 'b']

In [None]:
df1

In [None]:
df2

In [None]:
df1.merge(df2, how='inner', on=['category'])

In [None]:
df1.merge(df2, how='left', on=['category'])

# Data Aggregation and Descriptive Statistics
## Descriptive Statistics

In [None]:
numbers

In [None]:
numbers.sum()

In [None]:
numbers.sum(axis=1)

## Grouping

In [None]:
# Mean of the numeric columns per continent. numeric_only=True skips the
# string columns ('name', 'country'): since pandas 2.0 they are no longer
# dropped silently and would make mean() raise a TypeError.
df.groupby(['continent']).mean(numeric_only=True)

In [None]:
# Mean of the numeric columns per (continent, country) pair. numeric_only=True
# skips the remaining string column ('name'), which would otherwise make
# mean() raise a TypeError on pandas >= 2.0.
df.groupby(['continent', 'country']).mean(numeric_only=True)

In [None]:
# Range (max - min) per continent. The numeric columns are selected
# explicitly because subtracting strings ('name', 'country') raises a
# TypeError; pandas >= 2.0 no longer drops such columns silently.
df.groupby(['continent'])[['age', 'score']].agg(lambda x: x.max() - x.min())

## Pivoting and Melting

In [None]:
# One record per sale: fruit, region and revenue
data = [
    ['Oranges', 'North', 12.30],
    ['Apples', 'South', 10.55],
    ['Oranges', 'South', 22.00],
    ['Bananas', 'South', 5.90],
    ['Bananas', 'North', 31.30],
    ['Oranges', 'North', 13.10],
]

sales = pd.DataFrame(data, columns=['Fruit', 'Region', 'Revenue'])
sales

In [None]:
pivot = pd.pivot_table(sales,
                       index='Fruit', columns='Region',
                       values='Revenue', aggfunc='sum',
                       margins=True, margins_name='Total')
pivot

In [None]:
pd.melt(pivot.iloc[:-1,:-1].reset_index(),
        id_vars='Fruit',
        value_vars=['North', 'South'], value_name='Revenue')

# Plotting
## Matplotlib

In [None]:
# or: %matplotlib notebook
%matplotlib inline

In [None]:
data = pd.DataFrame(data=np.random.rand(4, 4) * 100000,
                    index=['Q1', 'Q2', 'Q3', 'Q4'],
                    columns=['East', 'West', 'North', 'South'])
data.index.name = 'Quarters'
data.columns.name = 'Region'
data

In [None]:
data.plot() # shortcut for data.plot.line()

## Plotly

In [None]:
# Set the plotting backend to Plotly
pd.options.plotting.backend = 'plotly'

In [None]:
data.plot()

In [None]:
data.plot.bar(barmode='group')

# Data Import and Export
## Importing a CSV file

In [None]:
msft = pd.read_csv('MSFT.csv')

In [None]:
msft.info()

In [None]:
# I am selecting a few columns because of space issues
# You can also just run: msft.head()
msft.loc[:, ['Date', 'Adj Close', 'Volume']].head()

In [None]:
msft.loc[:, ['Date', 'Adj Close', 'Volume']].tail(2)

In [None]:
msft.loc[:, ['Adj Close', 'Volume']].describe()

In [None]:
# the line break in the URL is only to make it fit on the page
url = ('https://raw.githubusercontent.com/fzumstein/'
       'python-for-excel/master/ch04/MSFT.csv')
msft = pd.read_csv(url)

In [None]:
msft.loc[:, ['Date', 'Adj Close', 'Volume']].head(2)

## Exporting to a CSV file

In [None]:
df.to_csv('course_participants.csv')

# Time Series
## DatetimeIndex

In [None]:
# This creates a DatetimeIndex based on a start timestamp,
# number of periods and frequency ('D' = daily).
daily_index = pd.date_range('2020-02-28', periods=4, freq='D')
daily_index

In [None]:
# This creates a DatetimeIndex based on start/end timestamp.
# The frequency is set to "weekly on Mondays" ('W-MON').
weekly_index = pd.date_range('2020-01-01', '2020-01-31', freq='W-MON')
weekly_index

In [None]:
# Construct a DataFrame based on the weekly_index
pd.DataFrame(data=1, columns=['dummy'], index=weekly_index)

In [None]:
msft = pd.read_csv('MSFT.csv')

In [None]:
msft.info()

In [None]:
# Replace the whole column with the parsed timestamps. Assigning via
# msft['Date'] swaps in the new datetime64 column; with pandas >= 2.0,
# msft.loc[:, 'Date'] = ... writes into the existing column in place and
# leaves it with its original object dtype.
msft['Date'] = pd.to_datetime(msft['Date'])

In [None]:
msft.dtypes

In [None]:
msft = pd.read_csv('MSFT.csv', index_col='Date', parse_dates=['Date'])

In [None]:
msft.info()

In [None]:
# Replace the column so that its dtype really becomes float. With
# pandas >= 2.0, msft.loc[:, 'Volume'] = ... sets the values in place and
# keeps the column's existing dtype, so the conversion would have no effect.
msft['Volume'] = msft['Volume'].astype('float')
msft['Volume'].dtype

In [None]:
msft = msft.sort_index()

In [None]:
msft.index.date

In [None]:
msft.loc['2019', 'Adj Close']

In [None]:
msft.loc['2019-06':'2020-05', 'Adj Close'].plot()

## Working with Time Zones

In [None]:
msft_tz = msft.loc[:, ['Adj Close']].copy()
msft_tz.index = msft_tz.index + pd.DateOffset(hours=16)
msft_tz.head(2)

In [None]:
msft_tz = msft_tz.tz_localize('America/New_York')
msft_tz.head(2)

In [None]:
msft_tz = msft_tz.tz_convert('UTC')
msft_tz.loc['2020-01-02', 'Adj Close']  # 21:00 without DST

In [None]:
msft_tz.loc['2020-05-01', 'Adj Close']  # 20:00 with DST

## Shifting and Percentage Changes

In [None]:
msft.iloc[:5, 4]

In [None]:
msft.iloc[:5, 4].shift(1)

In [None]:
returns = np.log(msft['Adj Close'] / msft['Adj Close'].shift(1))
returns.head()

In [None]:
returns.plot.hist()

In [None]:
simple_rets = msft[['Adj Close']].pct_change()
simple_rets = simple_rets.rename(columns={'Adj Close': 'returns'})
simple_rets.head()

## Rebasing and Correlation

In [None]:
# Build one single-column DataFrame per ticker and collect them in a list
adj_close_parts = []
for ticker in ('AAPL', 'AMZN', 'GOOGL', 'MSFT'):
    # Only the Date and Adj Close columns are read (usecols); Date is
    # parsed and becomes the index, and the price column is renamed
    # to the ticker symbol
    adj_close = pd.read_csv(
        ticker + '.csv',
        usecols=['Date', 'Adj Close'],
        parse_dates=['Date'],
        index_col='Date',
    ).rename(columns={'Adj Close': ticker})
    adj_close_parts.append(adj_close)

In [None]:
# Combine the 4 DataFrames into a single DataFrame
adj_close = pd.concat(adj_close_parts, axis=1)
adj_close

In [None]:
adj_close = adj_close.dropna()
adj_close.info()

In [None]:
# Use a sample from June 2019 - May 2020
adj_close_sample = adj_close.loc['2019-06':'2020-05', :]
rebased_prices = adj_close_sample / adj_close_sample.iloc[0, :] * 100
rebased_prices.head(2)

In [None]:
rebased_prices.plot()

In [None]:
returns = np.log(adj_close / adj_close.shift(1))
returns.corr()

In [None]:
import plotly.express as px
fig = px.imshow(returns.corr(),
                x=adj_close.columns,
                y=adj_close.columns,
                color_continuous_scale=list(
                    reversed(px.colors.sequential.RdBu)),
                zmin=-1, zmax=1)
fig.show()

## Resampling

In [None]:
# Downsample to monthly frequency, keeping the last row of each month.
# NOTE(review): pandas >= 2.2 deprecates the 'M' alias in favour of 'ME';
# switch once older pandas versions no longer need to be supported.
end_of_month = adj_close.resample('M').last()
end_of_month.head()

In [None]:
end_of_month.resample('D').asfreq().head()  # no transformation

In [None]:
end_of_month.resample('W-FRI').ffill().head()  # forward fill

## Rolling Windows

In [None]:
# Plot the moving average for MSFT with data from 2019
msft19 = msft.loc['2019', ['Adj Close']].copy()
# Add the 25 day moving average as a new column to the DataFrame
msft19['25day average'] = msft19['Adj Close'].rolling(25).mean()
msft19.plot()