In [None]:
import os
import re

import numpy as np
import pandas as pd

# Directory holding the example CSV files, resolved against the current working directory.
working_dir = os.path.abspath('')
files_path = os.path.join(working_dir, '3-class', 'files')

In [None]:
# Read ex1.csv with default settings: the first line of the file becomes the column header.
df = pd.read_csv(os.path.join(files_path, 'ex1.csv'))   # notice the header
df

In [None]:
# header=None: the file has no header line, so the first line is read as data
# and pandas assigns default integer column labels.
df = pd.read_csv(os.path.join(files_path, 'ex2.csv'), header=None)  # no header, 1st line is already data
df

In [None]:
# Same header-less file, but supply explicit column names (a, b, c, d, message) via `names`.
df = pd.read_csv(os.path.join(files_path, 'ex2.csv'), names = 'a b c d message'.split())
df

In [None]:
# As above, and additionally use the 'message' column as the row index instead of a data column.
df = pd.read_csv(os.path.join(files_path, 'ex2.csv'), names = 'a b c d message'.split(), index_col='message')
df

In [None]:
#
# Reading large files in pieces
#
# Limit DataFrame/Series reprs to 10 rows so large frames are displayed truncated.
pd.set_option('display.max_rows', 10)

In [None]:
# Load ex3.csv in one go; the repr below is truncated by the display.max_rows option.
df = pd.read_csv(os.path.join(files_path, 'ex3.csv'))
df

In [None]:
# With chunksize set, read_csv does not load the file; it returns a reader object
# that yields DataFrames of up to 1000 rows each when iterated.
chunker = pd.read_csv(os.path.join(files_path, 'ex3.csv'), chunksize=1000)
chunker  # lazy loading: rows are only read as the reader is iterated

In [None]:
# Count how often each value of the 'key' column occurs, processing one chunk at a time.
# The chunked reader is opened inside this cell instead of reusing `chunker`: a chunked
# reader is a one-shot iterator, so re-running the cell against an already consumed
# reader would silently produce an empty result. The `with` block also closes the
# underlying file handle once the loop is done.
total = pd.Series([], dtype=np.int64)
with pd.read_csv(os.path.join(files_path, 'ex3.csv'), chunksize=1000) as reader:
    for piece in reader:
        # fill_value=0: a key missing on either side counts as zero instead of producing NaN
        total = total.add(piece['key'].value_counts(), fill_value=0)
total = total.sort_values(ascending=False)  # most frequent keys first
total

In [None]:
# Writing
# Save the current frame as out1.csv without the row index and without a header line.
df.to_csv(os.path.join(files_path, 'out1.csv'), index=False, header=False)

In [None]:
#
# Preparation, Cleaning
#

In [None]:
# Sample Series mixing strings with two kinds of missing marker (None and NaN).
raw_values = ['asdsad', None, 'csc', np.nan, 'vff']
s = pd.Series(raw_values)
s

In [None]:
# Boolean mask of missing entries; both None and NaN count as missing.
s.isnull()

In [None]:
# Keep only the non-missing entries via boolean indexing.
s[s.notnull()]

In [None]:
# Shortcut for the boolean filter above: drop missing entries.
s.dropna()

In [None]:
# 4x3 frame with NaNs spread so that the rows have 3, 1, 0 and 2 valid values respectively.
rows = [
    [1., 6.5, 3.],
    [2., np.nan, np.nan],
    [np.nan, np.nan, np.nan],
    [np.nan, 6.2, 5.],
]
df = pd.DataFrame(rows)
df

In [None]:
df.dropna()  # default how='any': drops every row that contains at least one NaN

In [None]:
df.dropna(how='all')  # drops only the rows in which every value is NaN

In [None]:
df.dropna(axis=1, how='all')  # same rule applied to columns: drops columns that are entirely NaN

In [None]:
df.dropna(thresh=2)  # keeps only rows with at least 2 non-null observations (>= 2, not > 2)

In [None]:
# Replace every NaN with 0; returns a new frame, `df` itself is unchanged.
df.fillna(0)

In [None]:
# A dict gives a different fill value per column, keyed by column label (0, 1, 2 here).
df.fillna({0: 100, 1: -0.5, 2: 0})

In [None]:
# fillna already returns a new frame, so `df` keeps its NaNs and no explicit
# copy or in-place mutation is needed to get a filled duplicate.
another_df = df.fillna(0)
another_df

In [None]:
# Forward fill: within each column, carry the last valid value down into the NaNs below it.
# Uses DataFrame.ffill() because fillna(method='ffill') is deprecated in recent pandas.
df.ffill()

In [None]:
# df.mean() is a Series of per-column means; fillna matches it to the columns by label.
df.fillna(df.mean())  # fill with mean for each column


In [None]:
# Frame whose last row ('two', 4) repeats the row before it, for the duplicate demos.
k1_values = ['one', 'two'] * 3 + ['two']
k2_values = [1, 1, 2, 3, 3, 4, 4]
df = pd.DataFrame({'k1': k1_values, 'k2': k2_values})
df

In [None]:
df.duplicated()  # boolean mask: True for a row that repeats an earlier row (here the last one)

In [None]:
df.drop_duplicates()  # returns the frame without the rows flagged by duplicated()

In [None]:
# Add a column of distinct values 0..6, so no two full rows are identical any more.
df['k3'] = range(7)
df

In [None]:
df.drop_duplicates(['k1'])  # compare only the listed columns; keeps the first row per 'k1' value

In [None]:
# Duplicates judged on the ('k1', 'k2') pair; the first occurrence of each pair is kept.
df.drop_duplicates(['k1', 'k2'])

In [None]:
df.drop_duplicates(['k1', 'k2'], keep='last')  # keep the last occurrence of each pair instead of the first


In [None]:
# Food names with deliberately inconsistent capitalisation ('Bacon', 'Ham') and their weights.
foods = ['bacon', 'pork', 'Bacon', 'Ham', 'beef', 'bacon', 'ham', 'chicken', 'salmon']
weights_kg = [2, 3, 4, 5, 7.5, 8, 3, 4, 4]
df = pd.DataFrame({'food': foods, 'kg': weights_kg})
df

In [None]:
# Lookup table from (lower-case) food name to the animal it comes from.
meat_to_animal = dict(
    bacon='pig',
    pork='pig',
    ham='pig',
    beef='cow',
    chicken='chicken',
    salmon='salmon',
)
# Mapping the raw column is case-sensitive: 'Bacon' and 'Ham' are not keys of the
# table, so those rows end up as NaN in 'animal'.
animal_by_row = df['food'].map(meat_to_animal)
df['animal'] = animal_by_row
df

In [None]:
# Lower-case the food names first so that 'Bacon' and 'Ham' also match the mapping keys.
df['animal'] = df['food'].str.lower().map(meat_to_animal)   # .str exposes vectorised string methods
df

In [None]:
# Same mapping with a function instead of a dict. Note the direct dict lookup raises
# KeyError for a food missing from meat_to_animal, rather than producing NaN.
df['food'].map(lambda x: meat_to_animal[x.lower()])

In [None]:
# Measurements in which -999 and -1000 act as sentinel markers rather than real values.
sentinel_readings = [1., -999, 2., -999, -1000., 3., 4.]
data = pd.Series(sentinel_readings)
data

In [None]:
data.replace(-999, np.nan)  # treat the sentinel -999 as missing; returns a new Series

In [None]:
# Several values replaced by the same substitute: both sentinels become NaN.
data.replace([-999, -1000], np.nan)

In [None]:
# Two parallel lists pair up positionally: -999 -> NaN, -1000 -> 0.
data.replace([-999, -1000], [np.nan, 0])

In [None]:
data.replace({-999: np.nan, -1000: 0})  # same replacement written as an {old: new} dict

In [None]:
# filtering
# 1000 x 4 standard-normal sample. A seeded generator is used so that the data, and
# every outlier shown in the following cells, is identical on each Restart & Run All.
rng = np.random.default_rng(12345)
df = pd.DataFrame(rng.standard_normal((1000, 4)))
df.describe()

In [None]:
# Values of column 0 whose magnitude is greater than 3.
col = df[0]
exceeds_three = col.abs() > 3
col[exceeds_three]

In [None]:
# Rows that contain at least one value with |value| > 3 in any column.
# `axis` must be passed by keyword: pandas 2.x rejects the positional form `.any(1)`.
df[(np.abs(df) > 3).any(axis=1)]   # any(axis=1) reduces across the columns, giving one bool per row

In [None]:
# Cap the outliers: every value with magnitude above 3 is overwritten by +3 or -3,
# np.sign(df) supplying the sign of each original value.
capped_values = np.sign(df) * 3
is_outlier = np.abs(df) > 3
df[is_outlier] = capped_values
df.describe()

In [None]:
# Element-wise sign (-1, 0 or 1) of the frame; first five rows only.
np.sign(df).head()

In [None]:
df.sample(n=5)  # 5 randomly chosen rows (different on every run, no random_state is set)

In [None]:
#
# More string functions
#

In [None]:
# E-mail address per person; Pedro has no address (NaN) to show how the string
# methods below treat missing values.
owners = ['Dave', 'Steve', 'G', 'John', 'Pedro']
emails = ['dave@google.com', 'steve@gmail.com', 'g@pm.me', 'john@gmail.com', np.nan]
data = pd.Series(emails, index=owners)
data

In [None]:
# Per-element substring test; the missing entry stays missing in the result.
data.str.contains('gmail')

In [None]:
# Regular expression with three capture groups for an e-mail address:
# (user part) @ (domain) . (suffix of 2-4 letters).
# The character classes list upper-case letters only, hence re.IGNORECASE below.
pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'  # raw string: backslashes reach the regex engine untouched

# findall returns, per element, a list of (user, domain, suffix) tuples.
data.str.findall(pattern, flags=re.IGNORECASE)

In [None]:
# Take the first match of every element: a (user, domain, suffix) tuple, or NaN
# where the element is missing.
all_matches = data.str.findall(pattern, flags=re.IGNORECASE)
matches = all_matches.str[0]
matches

In [None]:
# Element at position 1 of each match tuple, i.e. the second capture group (the domain).
matches.str.get(1)

In [None]:
data.str.extract(pattern, flags=re.IGNORECASE)  # DataFrame with one column per capture group

In [None]:
#
# Combining, merging, reshaping...
#

In [None]:
# Nine random values indexed by two levels: an outer letter key and an inner integer key.
outer_keys = list('aaabbccdd')
inner_keys = [1, 2, 3, 1, 3, 1, 2, 2, 3]
s = pd.Series(np.random.rand(9), index=[outer_keys, inner_keys])
s

In [None]:
s.index  # a two-level MultiIndex; this is what makes the reshape to a DataFrame below possible

In [None]:
# Pivot the inner index level into columns; (outer, inner) pairs absent from `s` become NaN.
s.unstack()

In [None]:
# stack() is the inverse operation: the columns go back into an inner index level.
s.unstack().stack()

In [None]:
# Two frames sharing a 'key' column: df1 repeats keys a/b and has 'c', which df2 lacks;
# df2 has 'd', which df1 lacks.
df1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'], 'data1': range(7)})
df2 = pd.DataFrame({'key': ['a', 'b', 'd'], 'data2': range(3)})
for frame in (df1, df2):
    print(frame)

In [None]:
# Default merge is an inner join on the columns the two frames share (here only 'key'),
# so keys present in just one frame are dropped.
pd.merge(df1, df2)  # note c and d disappearing (join is being done on 'key')
# equivalent to the explicit form: pd.merge(df1, df2, on='key')

In [None]:
# Outer join: keeps keys from both frames ('c' and 'd' included), with NaN where a side has no match.
pd.merge(df1, df2, how='outer')

In [None]:
#
# Concatenating
#
# Integers 0..11 laid out as a 4-row, 3-column array.
arr = np.arange(12).reshape(4, 3)
arr

In [None]:
# Join side by side along axis 1: result has 4 rows and 6 columns.
np.concatenate([arr, arr], axis=1)

In [None]:
# Default axis=0 stacks the arrays vertically: result has 8 rows and 3 columns.
np.concatenate([arr, arr])

In [None]:
# Two Series with non-overlapping index labels.
s1 = pd.Series({'a': 0, 'b': 1})
s2 = pd.Series({'c': 2, 'd': 3, 'e': 4})
pd.concat([s1, s2])  # along axis 0 the values are appended, giving one longer Series

In [None]:
# Along axis 1 each Series becomes a column of a DataFrame, aligned on the union of
# the index labels, with NaN where a Series has no value for a label.
pd.concat([s1, s2], axis=1)   # produces a dataframe

In [None]:
# s3 shares the labels 'a' and 'b' with s1 and adds 'f'.
s3 = pd.Series({'a': 5, 'b': 6, 'f': 7})
# join='inner' keeps only the labels found in both Series (the intersection), like a database inner join.
pd.concat([s1, s3], axis=1, join='inner')