In [None]:
import pandas as pd
import numpy as np

In [None]:
# Population figures keyed by state; the dict keys become the Series index.
some_data = dict(Ohio=35000, Texas=71000, Oregon=16000, Utah=5000)
obj1 = pd.Series(data=some_data)
obj1

In [None]:
obj1.index

In [None]:
states = ['California', 'Ohio', 'New York', 'Texas']
obj2 = pd.Series(some_data, index=states)
obj2

In [None]:
obj2.isnull()  # missing data, NaN or None

In [None]:
obj1 + obj2  # sums data, adds missing indexes

In [None]:
obj1.name = 'population'
obj1.index.name = 'state'
obj1

In [None]:
#
# DataFrame
#
# Column-oriented records: three yearly observations for each of two states.
data = dict(
    state=['Ohio'] * 3 + ['Nevada'] * 3,
    year=[2000, 2001, 2002, 2001, 2002, 2003],
    pop=[1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
)
frame = pd.DataFrame(data)
frame

In [None]:
frame.head()

In [None]:
frame.tail()

In [None]:
pd.DataFrame(data, columns=['year', 'state', 'pop'])

In [None]:
frame = pd.DataFrame(data, columns=['year', 'state', 'pop'], index=['one', 'two', 'three', 'four', 'five', 'six'])
frame

In [None]:
print(frame.columns)
print(frame.year)
print(frame['state'])  # fetch column

In [None]:
frame.loc['one']

In [None]:
frame['debt'] = np.arange(6)
frame

In [None]:
frame['debt'] = pd.Series([-1, 20.3], index=['two', 'five'])
frame

In [None]:
frame.state == 'Ohio'

In [None]:
frame['isOhio'] = frame.state == 'Ohio'
frame

In [None]:
del frame['isOhio']
frame

In [None]:
frame.T

In [None]:
frame

In [None]:
index = frame.index
index

In [None]:
# index[2] = 'other stuff'  # TypeError

In [None]:
labels = pd.Index(['something', 'other_something', 'something'])
labels

In [None]:
# Integers 0..3 labelled 'a' through 'd'.
obj = pd.Series(data=np.arange(4), index=list('abcd'))
obj

In [None]:
list('abdef')

In [None]:
obj.reindex(list('abdef'))

In [None]:
obj = pd.Series(list('abc'), index=[0, 2, 4])
obj

In [None]:
obj.reindex(range(6), method='ffill')

In [None]:
obj = pd.Series(range(5), index=list('abcde'))
obj

In [None]:
obj.drop('c')

In [None]:
obj.drop(['a', 'e'])

In [None]:
obj.drop('b', inplace=True)
obj

In [None]:
# 4x4 grid holding 0..15 row by row; rows are labelled by state, columns by ordinal.
df = pd.DataFrame(
    data=np.arange(16).reshape(4, 4),
    index='Ohio Colorado Utah'.split() + ['New York'],
    columns='one two three four'.split(),
)
df

In [None]:
df['three'] > 5

In [None]:
df[df['three'] > 5]

In [None]:
df.loc['Utah']

In [None]:
df.iloc[2]  # row index

In [None]:
df.iloc[:, :3][df.three > 5]

In [None]:
# Arithmetic
df

In [None]:
df + df

In [None]:
df * 2

In [None]:
print(df.index)
df.sort_index()

In [None]:
df.add(df)

In [None]:
df + df

In [None]:
print(df)

df.apply(lambda x: x.max(), axis='columns')

In [None]:
df.apply(max)

In [None]:
#
# Operations between Series and DataFrame
#

# Float frame holding 0.0..11.0 in a 4x3 layout; its first row becomes the
# Series operand for the arithmetic cells that follow.
df = pd.DataFrame(
    data=np.arange(12.0).reshape(4, 3),
    index=['Utah', 'Ohio', 'Texas', 'Oregon'],
    columns=['b', 'd', 'e'],
)
series = df.iloc[0]
print(df, series, sep='\n')

In [None]:
df - series  # broadcasts down the rows

In [None]:
series = pd.Series(range(3), index=['b', 'e', 'f'])
print(series)
print(df)
df + series

In [None]:
print(df)
series = df['d']
series

In [None]:
df.sub(series, axis='index')  # broadcasts across columns

In [None]:
# 4x3 frame of standard-normal draws (no seed set, so values differ per run).
df = pd.DataFrame(
    data=np.random.randn(4, 3),
    index=['Utah', 'Ohio', 'Texas', 'Oregon'],
    columns=['b', 'd', 'e'],
)
df

In [None]:
np.abs(df)

In [None]:
df.apply(lambda x: x.max() - x.min())

In [None]:
df.apply(lambda x: x.max() - x.min(), axis='columns')

In [None]:
def format_func(x):
    """Render a number as a string with exactly two decimal places."""
    return '%.2f' % x


# Elementwise formatting of every cell. Each column is mapped with Series.map
# rather than DataFrame.applymap, which is deprecated since pandas 2.1 (it was
# renamed to DataFrame.map); this spelling gives the same frame of strings on
# both older and newer pandas versions.
df.apply(lambda column: column.map(format_func))

In [None]:
df['e'].map(format_func)

In [None]:
print(df.e)
print(df.e.sort_index())

In [None]:
df.sort_index()

In [None]:
df.sort_index(axis=1, ascending=False) 

In [None]:
print(df.e)
df['e'].sort_values()

In [None]:
df.sort_values(by='e')

In [None]:
#
# Summarizing and computing basic statistics
#
# Two float columns with scattered NaNs; row 'c' is missing in both columns.
df = pd.DataFrame(
    {
        'one': [1.4, 7.1, np.nan, 0.75],
        'two': [np.nan, -4.5, np.nan, -1.3],
    },
    index=['a', 'b', 'c', 'd'],
)
df

In [None]:
df.sum()

In [None]:
df.mean()

In [None]:
df.mean(axis=1, skipna=False)

In [None]:
print(df)
df.idxmax()  # indexes where we'll find max vals

In [None]:
df.describe()  # pure magic

In [None]:
#
# Data manipulations
#
import os  # read files

In [None]:
# Start from the current working directory, climb two levels, then descend
# into 3-class/files; realpath collapses the '..' segments and resolves symlinks.
files_rel_path = os.path.join(os.path.abspath(''), os.pardir, os.pardir, '3-class', 'files')
files_path = os.path.realpath(files_rel_path)
files_path

In [None]:
df = pd.read_csv(os.path.join(files_path, 'ex1.csv'))
df

In [None]:
# Load ex2.csv with explicitly supplied column names, using the 'message'
# column as the row index.
df = pd.read_csv(
    os.path.join(files_path, 'ex2.csv'),
    names=['a', 'b', 'c', 'd', 'message'],
    index_col='message',
)
df

In [None]:
#
# Reading large files in pieces
#
pd.options.display.max_rows = 10

In [None]:
df = pd.read_csv(os.path.join(files_path, 'ex3.csv'))
df

In [None]:
# Read ex3.csv lazily in 1000-row pieces and accumulate how often each value
# of the 'key' column occurs. The reader is used as a context manager so the
# underlying file handle is closed as soon as iteration finishes.
total = pd.Series([], dtype=np.int64)
with pd.read_csv(os.path.join(files_path, 'ex3.csv'), chunksize=1000) as chunker:
    for piece in chunker:
        # fill_value=0 treats a key absent from either side as a zero count
        # instead of producing NaN.
        total = total.add(piece['key'].value_counts(), fill_value=0)
total = total.sort_values(ascending=False)
total

In [None]:
# Writing
df

In [None]:
# Write only the data rows: no index column and no header line.
# `header` is documented as bool or list of str, so pass False rather than None.
df.to_csv(os.path.join(files_path, 'out1.csv'), index=False, header=False)

In [None]:
#
# Preparation, Cleaning
#
s = pd.Series(['asdasd', None, 'csvddsf', np.nan, 'fdafgkwer'])
s

In [None]:
s.isnull()

In [None]:
s[s.notnull()]

In [None]:
s.dropna()

In [None]:
# 4x3 float frame with default integer labels; the third row is entirely NaN.
df = pd.DataFrame(
    [
        [1.0, 6.5, 3.0],
        [2.0, np.nan, np.nan],
        [np.nan, np.nan, np.nan],
        [np.nan, 6.2, 5.0],
    ]
)
df

In [None]:
df.dropna()  # removes rows with any null

In [None]:
df.dropna(how='all')

In [None]:
df

In [None]:
df.dropna(axis=1, how='all')

In [None]:
df.dropna(thresh=2)  # leaves only rows with >= 2 observations (not null)

In [None]:
df.fillna(0)

In [None]:
df

In [None]:
df.fillna({0: 100, 1: -0.5, 2: 0})

In [None]:
print(df)
# Forward fill: each NaN takes the last non-null value above it in the same
# column. DataFrame.ffill() is used instead of fillna(method='ffill'), whose
# `method` argument is deprecated since pandas 2.1.
df.ffill()

In [None]:
print(df)
df.fillna(df.mean())

In [None]:
# Two key columns whose final row repeats the row before it.
df = pd.DataFrame(
    dict(
        k1=['one', 'two', 'one', 'two', 'one', 'two', 'two'],
        k2=[1, 1, 2, 3, 3, 4, 4],
    )
)
df

In [None]:
df.duplicated()

In [None]:
df.drop_duplicates()

In [None]:
df['k3'] = np.arange(7)
df

In [None]:
df.drop_duplicates(['k1'])

In [None]:
df.drop_duplicates(['k1', 'k2'], keep='last')

In [None]:
# Food items (with inconsistent capitalisation) and their weights in kg.
df = pd.DataFrame(
    dict(
        food=['bacon', 'pork', 'Bacon', 'Ham', 'beef', 'bacon', 'ham', 'chicken', 'salmon'],
        kg=[2, 3, 4, 5, 7.5, 8, 3, 4, 4],
    )
)
df

In [None]:
# Lower-case food name -> source animal.
meat_to_animal = dict(
    bacon='pig',
    pork='pig',
    ham='pig',
    beef='cow',
    chicken='chicken',
    salmon='salmon',
)
# Series.map looks each food up in the dict; foods that are not keys of the
# dict (the lookup is case-sensitive) come out as NaN.
df['animal'] = df['food'].map(meat_to_animal)
df

In [None]:
df['animal'] = df.food.str.lower().map(meat_to_animal)
df

In [None]:
df['food'].map(lambda x: meat_to_animal[x.lower()])

In [None]:
data = pd.Series([1., -999, 2., -999, -1000., 3., 4.])
data

In [None]:
data.replace([-999, -1000], [np.nan, 0])

In [None]:
data.replace({-999: np.nan, -1000: 0})

In [None]:
df = pd.DataFrame(np.random.randn(1000, 4))
df.describe()

In [None]:
# finding values exceeding abs 3 in a given column...
col = df[0]
col[np.abs(col) > 3]

In [None]:
# in all cols...
# Keep every row in which at least one column has |value| > 3. `axis` is
# passed by keyword because DataFrame.any() only accepts it as a keyword
# argument from pandas 2.0 on; the positional form `.any(1)` raises TypeError.
df[(np.abs(df) > 3).any(axis=1)]

In [None]:
# let's cap them
df[np.abs(df) > 3] = np.sign(df) * 3
df.describe()

In [None]:
np.sign(df).head()

In [None]:
df.sample(n=5)

In [None]:
#
# Combining, merging, reshaping...
#
# Nine random values under a two-level index: outer letters, inner integers.
s = pd.Series(
    data=np.random.randn(9),
    index=[
        ['a', 'a', 'a', 'b', 'b', 'c', 'c', 'd', 'd'],
        [1, 2, 3, 1, 3, 1, 2, 2, 3],
    ],
)
s

In [None]:
s.unstack().stack()

In [None]:
# Two frames sharing a 'key' column; df1 repeats keys, df2 has each key once.
df1 = pd.DataFrame(dict(key=['b', 'b', 'a', 'c', 'a', 'a', 'b'], data1=range(7)))
df2 = pd.DataFrame(dict(key=['a', 'b', 'd'], data2=range(3)))
print(df1, df2, sep='\n')

In [None]:
pd.merge(df1, df2)  # join on 'key'

In [None]:
pd.merge(df1, df2, how='outer')

In [None]:
#
# Concatenating
#
arr = np.arange(12).reshape((4, 3))
arr

In [None]:
np.concatenate([arr, arr], axis=1)

In [None]:
# Two integer Series with disjoint labels.
s1 = pd.Series({'a': 0, 'b': 1})
s2 = pd.Series({'c': 2, 'd': 3, 'e': 4})
pd.concat([s1, s2])  # stacking along axis 0 yields one longer Series

In [None]:
pd.concat([s1, s2], axis=1)  # produces dataframe

In [None]:
s3 = pd.Series([5, 6, 7], index=['a', 'b', 'f'])
print(s1)
print(s3)

In [None]:
pd.concat([s1, s3], axis=1, join='inner')  # db inner join (intersection)

In [None]:
#
# Plotting!!!
#

In [None]:
import matplotlib.pyplot as plt

In [None]:
data = np.arange(10)
data

In [None]:
plt.plot(data)

In [None]:
# Three panels of a 2x2 grid, created in position order 1, 2, 3.
fig = plt.figure()
ax1, ax2, ax3 = (fig.add_subplot(2, 2, position) for position in (1, 2, 3))

# No axes is named here, so pyplot draws the dashed black random walk on its
# current axes.
plt.plot(np.random.randn(50).cumsum(), 'k--')

In [None]:
# Same three-panel layout, but every plot targets its axes explicitly.
fig = plt.figure()
ax1, ax2, ax3 = (fig.add_subplot(2, 2, position) for position in (1, 2, 3))

# Panel 1: translucent 20-bin histogram of 100 normal draws.
ax1.hist(np.random.randn(100), bins=20, color='k', alpha=0.3)
# Panel 2: the line y = x with added normal noise scaled by 3.
ax2.scatter(np.arange(30), np.arange(30) + 3 * np.random.randn(30))
# Panel 3: dashed black random walk of 50 steps.
ax3.plot(np.random.randn(50).cumsum(), 'k--')