##### VanderPlas, Jake. 2016. Python Data Science Handbook: Essential Tools for Working with Data. O'Reilly Media. Available at: https://jakevdp.github.io/PythonDataScienceHandbook/

## Introducing Pandas Objects

In [None]:
import numpy as np
import pandas as pd

The three fundamental Pandas data structures are the Series, DataFrame, and Index.


In [None]:
# A Series built from a plain list of values; no index is given, so pandas
# assigns the default integer index.
data = pd.Series(data=[0.25, 0.5, 0.75, 1.0])
print(data)

In [None]:
type(data)

In [None]:
# Same values, now labelled with an explicit string index.
data = pd.Series(data=[0.25, 0.5, 0.75, 1.0], index=list('abcd'))
data

In [None]:
data['b']

In [None]:
# State populations keyed by state name; the dict keys become the Series index.
population_dict = {
    'California': 38_332_521,
    'Texas': 26_448_193,
    'New York': 19_651_127,
    'Florida': 19_552_860,
    'Illinois': 12_882_135,
}
population = pd.Series(data=population_dict)
population

In [None]:
# Label-based slice: unlike positional slicing, both endpoints are included.
population['California':'Florida']

In [None]:
# State areas keyed by state name, using the same five states as the
# population data.
area_dict = {
    'California': 423_967,
    'Texas': 695_662,
    'New York': 141_297,
    'Florida': 170_312,
    'Illinois': 149_995,
}
area = pd.Series(data=area_dict)
area

In [None]:
# Combine the two Series into one table: each Series becomes a column and
# the rows are aligned on the state-name index.
states = pd.DataFrame(dict(population=population, area=area))
states

In [None]:
type(states)

In [None]:
states.index

In [None]:
states.columns

In [None]:
pd.DataFrame(population, columns=['population'])

##### Derive a DataFrame from a list of dictionaries whose values are computed with a mathematical operation

In [None]:
# Build the list [0, 1, 2] with an explicit loop; the list comprehension in
# the following cell produces the same list in a single expression.
counter = []

for i in range(3):
    counter.append(i)

print(counter)

In [None]:
print([i for i in range(3)])

In [None]:
# One record (dict) per row: column 'a' holds i and column 'b' holds 2 * i.
data = [dict(a=i, b=2 * i) for i in range(3)]
pd.DataFrame(data)

In [None]:
# Records need not share keys: a key absent from a record becomes NaN in
# that row.
pd.DataFrame([{'a': 1, 'b': 2}, {'b': 3, 'c': 4}])

In [None]:
pd.DataFrame(np.random.rand(3, 2),
             columns=['foo', 'bar'],
             index=['a', 'b', 'c'])

## Data Indexing and Selection

In [None]:
# Small labelled Series used for the indexing and selection examples.
data = pd.Series(data=[0.25, 0.5, 0.75, 1.0], index=list('abcd'))
data

In [None]:
'a' in data

In [None]:
data.keys()

In [None]:
list(data.items())

In [None]:
# Assigning to a label that does not exist yet appends a new entry, just as
# assigning to a new key extends a dict.
data['e'] = 1.25
data

In [None]:
# Boolean mask: keep the entries strictly between 0.3 and 0.8.
# The parentheses are required because & binds tighter than the comparisons.
data[(data > 0.3) & (data < 0.8)]

In [None]:
data['a']

In [None]:
data[['a', 'e']]

In [None]:
# Area and population for five states, both indexed by state name.
area = pd.Series({
    'California': 423_967,
    'Texas': 695_662,
    'New York': 141_297,
    'Florida': 170_312,
    'Illinois': 149_995,
})
pop = pd.Series({
    'California': 38_332_521,
    'Texas': 26_448_193,
    'New York': 19_651_127,
    'Florida': 19_552_860,
    'Illinois': 12_882_135,
})
# Column order follows the mapping: 'area' first, then 'pop'.
data = pd.DataFrame(dict(area=area, pop=pop))
data

In [None]:
data.loc['New York','area']

In [None]:
data.loc['New York']

In [None]:
data.loc[:,'area']

In [None]:
# Label-based slicing with .loc includes the end labels: rows up to and
# including 'New York', columns up to and including 'area'.
data.loc[:'New York', :'area']

In [None]:
# Position-based slicing with .iloc excludes the end: rows at positions 1 and 2.
data.iloc[1:3]

In [None]:
data.area

In [None]:
# Population divided by area, computed element-wise and stored as a new
# 'density' column.
data['density'] = data['pop'].div(data['area'])
data

In [None]:
data.values

In [None]:
data.T

In [None]:
data.iloc[:3, :2]

In [None]:
data

In [None]:
# Combine a row mask with a column list: rows whose density exceeds 100,
# restricted to the 'pop' and 'density' columns.
data.loc[data.density > 100, ['pop', 'density']]

In [None]:
data[data.density > 100]

In [None]:
# Positional assignment: row 0, column 2. With the columns built above
# ('area', 'pop', 'density') this overwrites the first row's density.
data.iloc[0, 2] = 90
data

## Operating on Data in Pandas

In [None]:
# Seeded generator so the random examples below are reproducible.
rng = np.random.RandomState(seed=42)
# Four random integers drawn from [0, 10).
ser = pd.Series(rng.randint(low=0, high=10, size=4))
ser

In [None]:
# 3x4 table of random integers from [0, 10), with columns labelled A-D.
df = pd.DataFrame(rng.randint(low=0, high=10, size=(3, 4)),
                  columns=list('ABCD'))
df

In [None]:
np.exp(ser)

In [None]:
np.sin(df * np.pi / 4)

In [None]:
# Two named Series over partly different sets of states: only California and
# Texas appear in both.
area = pd.Series(
    {'Alaska': 1_723_337, 'Texas': 695_662, 'California': 423_967},
    name='area',
)
population = pd.Series(
    {'California': 38_332_521, 'Texas': 26_448_193, 'New York': 19_651_127},
    name='population',
)

In [None]:
# Binary operations align on the index first: the result covers the union of
# both indexes, and labels present in only one Series yield NaN.
population / area

In [None]:
area.index.union(population.index) #or intersection

In [None]:
area.index.intersection(population.index)

In [None]:
# Two integer Series whose indexes overlap only on labels 1 and 2.
A = pd.Series(data=[2, 4, 6], index=[0, 1, 2])
B = pd.Series(data=[1, 3, 5], index=[1, 2, 3])
# Addition aligns on labels; 0 and 3 exist on one side only, so they are NaN.
A + B

In [None]:
# Method form of A + B: a label missing from one Series is treated as 0
# instead of producing NaN.
A.add(B, fill_value=0)

In [None]:
A = pd.DataFrame(rng.randint(0, 20, (2, 2)),
                 columns=list('AB'))
A

In [None]:
B = pd.DataFrame(rng.randint(0, 10, (3, 3)),
                 columns=list('BAC'))
B

In [None]:
A + B

In [None]:
# Mean of every value in A (stack() flattens the frame into a single Series).
fill = A.stack().mean()
print(fill)
# Cells present in only one frame are filled with `fill` before adding;
# cells missing from both frames remain NaN.
A.add(B, fill_value=fill)

#### Python Operator: Pandas Method(s)
- "+" add()
- "-" sub(), subtract()
- "*" mul(), multiply()
- "/" truediv(), div(), divide()
- "//" floordiv()
- "%" mod()
- "**" pow()

In [None]:
A = rng.randint(10, size=(3, 4))
A

In [None]:
type(A)

In [None]:
A[0]

In [None]:
A - A[0]

In [None]:
# Wrap the array in a DataFrame. Subtracting a row (a Series indexed by the
# column labels) aligns on columns, so the first row is subtracted from
# every row.
df = pd.DataFrame(A, columns=list('QRST'))
df - df.iloc[0]

In [None]:
df.iloc[0]

In [None]:
# axis=0 aligns the Series with the row index instead, so column 'R' is
# subtracted from every column.
df.subtract(df['R'], axis=0)

## Handling Missing Data

In [None]:
# A None entry forces NumPy to fall back to dtype=object, i.e. an array of
# generic Python objects.
vals1 = np.array([1, None, 3, 4])
vals1

In [None]:
# Time the same sum over one million elements stored as Python objects and
# as native integers. %timeit is an IPython magic, so this cell only runs in
# IPython/Jupyter.
for dtype in ['object', 'int']:
    print("dtype =", dtype)
    %timeit np.arange(1E6, dtype=dtype).sum()
    print()

In [None]:
x = 1E6
x*1E-3

In [None]:
# Time `x + y` for two integers, then for two strings (concatenation).
# %timeit is an IPython magic, so this cell only runs in IPython/Jupyter.
x = 5
y = 3

%timeit z = x + y
print()

x = 'ahmet'
y = 'mehmet'

%timeit z = x + y
print()

In [None]:
# Expected to raise TypeError: summing the object array performs int + None,
# which Python does not support.
vals1.sum()

In [None]:
vals2 = np.array([1, np.nan, 3, 4]) 
vals2.dtype

In [None]:
1 + np.nan

In [None]:
0 *  np.nan

In [None]:
# NaN propagates through the regular aggregates: all three results are nan.
vals2.sum(), vals2.min(), vals2.max()

In [None]:
# The nan-prefixed variants skip NaN entries instead of propagating them.
np.nansum(vals2), np.nanmin(vals2), np.nanmax(vals2)

In [None]:
pd.Series([1, np.nan, 2, None])

In [None]:
x = pd.Series(range(2),  dtype=int)
x

In [None]:
# Assigning None to an integer Series: pandas stores it as NaN, which
# requires upcasting the Series to a floating-point dtype.
# NOTE(review): recent pandas versions warn about this implicit upcast and
# plan to turn it into an error — confirm against the installed version.
x[0] = None
x

In [None]:
data = pd.Series([1, np.nan, 'hello', None])

In [None]:
data

In [None]:
data.isnull()

In [None]:
data[data.notnull()]

In [None]:
data.dropna()

In [None]:
# 3x3 frame with one missing value in the first row and one in the last row.
df = pd.DataFrame([
    [1, np.nan, 2],
    [2, 3, 5],
    [np.nan, 4, 6],
])
df

In [None]:
df.dropna()

In [None]:
df.dropna(axis=0)

In [None]:
# Add a column labelled 3 that consists entirely of NaN.
df[3] = np.nan
df

In [None]:
# how='all': drop only the columns in which every value is missing.
df.dropna(axis='columns', how='all')

In [None]:
# how='any' (the default): drop every column containing at least one missing value.
df.dropna(axis='columns', how='any')

In [None]:
# thresh=3: keep only the rows that have at least three non-missing values.
df.dropna(axis='rows', thresh=3)

In [None]:
# Series with two gaps (np.nan and None are both treated as missing),
# labelled 'a' through 'e'.
data = pd.Series(
    data=[1, np.nan, 2, None, 3],
    index=['a', 'b', 'c', 'd', 'e'],
)
data

In [None]:
# Replace every missing entry with the constant 0; returns a new Series.
data.fillna(0)

In [None]:
# forward-fill: propagate the last valid value forward over each gap.
# Series.ffill() replaces fillna(method='ffill'), whose `method` argument is
# deprecated in recent pandas releases.
data.ffill()

In [None]:
# back-fill: propagate the next valid value backward over each gap.
# Series.bfill() replaces fillna(method='bfill'), whose `method` argument is
# deprecated in recent pandas releases.
data.bfill()

In [None]:
df

In [None]:
# Forward-fill along each row (axis=1): a missing cell takes the value to its
# left; a cell with no valid value to its left stays NaN.
# DataFrame.ffill() replaces fillna(method='ffill'), whose `method` argument
# is deprecated in recent pandas releases.
df.ffill(axis=1)