### Heavily based on ["10 Minutes to pandas"](https://pandas.pydata.org/pandas-docs/stable/10min.html)

In [1]:
import pandas as pd

import numpy as np

# Series, DataFrames

In [2]:
# a pandas series is a vector of data, a column
s = pd.Series([1,3,5,np.nan,6,8])
print(s)

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64


In [3]:
# a DataFrame is a data table, always indexed.
# creating one from a random numpy 2D array (notice the index isn't specified, automatically becomes zero based counter):
df = pd.DataFrame(np.random.randn(6,4), columns = ['A', 'B', 'C', 'D'])
print(df)

          A         B         C         D
0  0.092355 -0.659736  0.840346  2.688674
1  1.519475  0.894653  0.548488 -0.382577
2  3.015042 -1.371197 -0.913546  1.189768
3 -0.277704 -2.481435 -1.607659 -0.947474
4  0.626947  1.031938  0.641178 -0.330257
5  0.361567  0.692051  1.334572 -0.954379


In [4]:
# creating a DataFrame from a very varied dictionary where each key is a column (also see pd.from_dict()):
df2 = pd.DataFrame({'A' : 1.,
                    'B' : pd.Timestamp('20130102'),
                    'C' : pd.Series(1, index = list(range(4)), dtype = 'float32'),
                    'D' : np.array(np.arange(4), dtype = 'int32'),
                    'E' : pd.Categorical(["test", "train", "test", "train"]),
                    'F' : 'foo' })
print(df2)

     A          B    C  D      E    F
0  1.0 2013-01-02  1.0  0   test  foo
1  1.0 2013-01-02  1.0  1  train  foo
2  1.0 2013-01-02  1.0  2   test  foo
3  1.0 2013-01-02  1.0  3  train  foo


In [5]:
# looking at the DataFrame columns types (notice this is a Series!):
print(df2.dtypes)

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object


In [6]:
# reading from CSV
df3 = pd.read_csv("../../datasets/test.csv")
print(df3)

   x   y
0  1   6
1  2   7
2  3   8
3  4   9
4  5  10


# Some info

In [7]:
# get top rows
print(df.head())

          A         B         C         D
0  0.092355 -0.659736  0.840346  2.688674
1  1.519475  0.894653  0.548488 -0.382577
2  3.015042 -1.371197 -0.913546  1.189768
3 -0.277704 -2.481435 -1.607659 -0.947474
4  0.626947  1.031938  0.641178 -0.330257


In [8]:
# get bottom rows
print(df.tail(2))

          A         B         C         D
4  0.626947  1.031938  0.641178 -0.330257
5  0.361567  0.692051  1.334572 -0.954379


In [9]:
# view index
print(df.index)

RangeIndex(start=0, stop=6, step=1)


In [10]:
# view column names
print(df.columns)

Index(['A', 'B', 'C', 'D'], dtype='object')


In [11]:
# get the underlying numpy matrix
print(df.values)

[[ 0.0923551  -0.65973551  0.84034612  2.68867361]
 [ 1.51947522  0.89465268  0.54848789 -0.38257737]
 [ 3.01504218 -1.37119711 -0.91354644  1.18976757]
 [-0.27770354 -2.48143483 -1.60765943 -0.94747427]
 [ 0.62694726  1.03193828  0.64117776 -0.3302569 ]
 [ 0.36156742  0.69205058  1.33457223 -0.95437889]]


In [12]:
# quick statistics summary
print(df.describe())

              A         B         C         D
count  6.000000  6.000000  6.000000  6.000000
mean   0.889614 -0.315621  0.140563  0.210626
std    1.205010  1.429649  1.140158  1.445152
min   -0.277704 -2.481435 -1.607659 -0.954379
25%    0.159658 -1.193332 -0.548038 -0.806250
50%    0.494257  0.016158  0.594833 -0.356417
75%    1.296343  0.844002  0.790554  0.809761
max    3.015042  1.031938  1.334572  2.688674


In [13]:
# transposing a DataFrame
print(df.T)

          0         1         2         3         4         5
A  0.092355  1.519475  3.015042 -0.277704  0.626947  0.361567
B -0.659736  0.894653 -1.371197 -2.481435  1.031938  0.692051
C  0.840346  0.548488 -0.913546 -1.607659  0.641178  1.334572
D  2.688674 -0.382577  1.189768 -0.947474 -0.330257 -0.954379


In [14]:
# sorting by column names
print(df.sort_index(axis = 1, ascending = False))

          D         C         B         A
0  2.688674  0.840346 -0.659736  0.092355
1 -0.382577  0.548488  0.894653  1.519475
2  1.189768 -0.913546 -1.371197  3.015042
3 -0.947474 -1.607659 -2.481435 -0.277704
4 -0.330257  0.641178  1.031938  0.626947
5 -0.954379  1.334572  0.692051  0.361567


In [15]:
# sorting by a specific column values
print(df.sort_values(by = 'B'))

          A         B         C         D
3 -0.277704 -2.481435 -1.607659 -0.947474
2  3.015042 -1.371197 -0.913546  1.189768
0  0.092355 -0.659736  0.840346  2.688674
5  0.361567  0.692051  1.334572 -0.954379
1  1.519475  0.894653  0.548488 -0.382577
4  0.626947  1.031938  0.641178 -0.330257


# Selection

In [16]:
# select a specific column (this will give you a Series!) - pandas people do not recommend this, see loc/iloc
print(df['A'])

0    0.092355
1    1.519475
2    3.015042
3   -0.277704
4    0.626947
5    0.361567
Name: A, dtype: float64


In [17]:
# slicing rows as in a numpy array - pandas people do not recommend this, see loc/iloc
print(df[0:3])
print(df.T['A':'B'])

          A         B         C         D
0  0.092355 -0.659736  0.840346  2.688674
1  1.519475  0.894653  0.548488 -0.382577
2  3.015042 -1.371197 -0.913546  1.189768
          0         1         2         3         4         5
A  0.092355  1.519475  3.015042 -0.277704  0.626947  0.361567
B -0.659736  0.894653 -1.371197 -2.481435  1.031938  0.692051


### Selection by label (`loc`)

In [18]:
# select a specific column, all rows
print(df.loc[:, 'A'])

0    0.092355
1    1.519475
2    3.015042
3   -0.277704
4    0.626947
5    0.361567
Name: A, dtype: float64


In [19]:
# select first 2 rows, specific two columns
print(df.loc[0:1, ['A', 'B']])

          A         B
0  0.092355 -0.659736
1  1.519475  0.894653


In [20]:
# select specific value
print(df.loc[0, 'A'])

# but "at" is preferred
print(df.at[0, 'A'])

0.0923551012183
0.0923551012183


### Selection by position (`iloc`)

In [21]:
# specific row (as a Series)
print(df.iloc[3])

A   -0.277704
B   -2.481435
C   -1.607659
D   -0.947474
Name: 3, dtype: float64


In [22]:
# specific row (as a sub-table)
print(df.iloc[3:4, :])

          A         B         C         D
3 -0.277704 -2.481435 -1.607659 -0.947474


In [23]:
# sub-table
print(df.iloc[3:5, 0:2])

          A         B
3 -0.277704 -2.481435
4  0.626947  1.031938


In [24]:
# integer indexing
print(df.iloc[[1,2,4], [0,2]])

          A         C
1  1.519475  0.548488
2  3.015042 -0.913546
4  0.626947  0.641178


### Boolean indexing

In [25]:
# filter only values answering condition in specific column
print(df[df.A > 0])

          A         B         C         D
0  0.092355 -0.659736  0.840346  2.688674
1  1.519475  0.894653  0.548488 -0.382577
2  3.015042 -1.371197 -0.913546  1.189768
4  0.626947  1.031938  0.641178 -0.330257
5  0.361567  0.692051  1.334572 -0.954379


In [26]:
# if not using specific column...
print(df[df > 0])

          A         B         C         D
0  0.092355       NaN  0.840346  2.688674
1  1.519475  0.894653  0.548488       NaN
2  3.015042       NaN       NaN  1.189768
3       NaN       NaN       NaN       NaN
4  0.626947  1.031938  0.641178       NaN
5  0.361567  0.692051  1.334572       NaN


# Setting

In [27]:
# set a new column s1
s1 = pd.Series([1,2,3,4,5,6])
df['E'] = s1
print(df)

          A         B         C         D  E
0  0.092355 -0.659736  0.840346  2.688674  1
1  1.519475  0.894653  0.548488 -0.382577  2
2  3.015042 -1.371197 -0.913546  1.189768  3
3 -0.277704 -2.481435 -1.607659 -0.947474  4
4  0.626947  1.031938  0.641178 -0.330257  5
5  0.361567  0.692051  1.334572 -0.954379  6


In [28]:
# set a specific value
df.at[0, 'A'] = 0
print(df)

          A         B         C         D  E
0  0.000000 -0.659736  0.840346  2.688674  1
1  1.519475  0.894653  0.548488 -0.382577  2
2  3.015042 -1.371197 -0.913546  1.189768  3
3 -0.277704 -2.481435 -1.607659 -0.947474  4
4  0.626947  1.031938  0.641178 -0.330257  5
5  0.361567  0.692051  1.334572 -0.954379  6


In [29]:
# set a whole column with numpy
df.loc[:, 'D'] = np.array([5] * len(df))
print(df)

          A         B         C  D  E
0  0.000000 -0.659736  0.840346  5  1
1  1.519475  0.894653  0.548488  5  2
2  3.015042 -1.371197 -0.913546  5  3
3 -0.277704 -2.481435 -1.607659  5  4
4  0.626947  1.031938  0.641178  5  5
5  0.361567  0.692051  1.334572  5  6


In [30]:
# set with boolean indexing
df[df < 0] = np.nan
print(df)

          A         B         C  D  E
0  0.000000       NaN  0.840346  5  1
1  1.519475  0.894653  0.548488  5  2
2  3.015042       NaN       NaN  5  3
3       NaN       NaN       NaN  5  4
4  0.626947  1.031938  0.641178  5  5
5  0.361567  0.692051  1.334572  5  6


# Missing Data

In [31]:
# dropping rows with any missing data (see documentation for more)
df1 = df.copy()
print(df1.dropna(how = 'any'))

          A         B         C  D  E
1  1.519475  0.894653  0.548488  5  2
4  0.626947  1.031938  0.641178  5  5
5  0.361567  0.692051  1.334572  5  6


In [32]:
# filling missing values with a specific value
print(df1.fillna(value = 5))

          A         B         C  D  E
0  0.000000  5.000000  0.840346  5  1
1  1.519475  0.894653  0.548488  5  2
2  3.015042  5.000000  5.000000  5  3
3  5.000000  5.000000  5.000000  5  4
4  0.626947  1.031938  0.641178  5  5
5  0.361567  0.692051  1.334572  5  6


In [33]:
# getting a boolean mask of where missing values are (similar to R)
print(pd.isna(df1))

       A      B      C      D      E
0  False   True  False  False  False
1  False  False  False  False  False
2  False   True   True  False  False
3   True   True   True  False  False
4  False  False  False  False  False
5  False  False  False  False  False


# Basic Operations

In [34]:
# apply mean to each column
print(df.mean())

A    1.104606
B    0.872881
C    0.841146
D    5.000000
E    3.500000
dtype: float64


In [37]:
# apply sum to each row
print(df.sum(axis = 1))

0     6.840346
1     9.962616
2    11.015042
3     9.000000
4    12.300063
5    13.388190
dtype: float64


In [38]:
# df.apply a non-pandas function
print(df.apply(np.cumsum))

          A         B         C   D   E
0  0.000000       NaN  0.840346   5   1
1  1.519475  0.894653  1.388834  10   3
2  4.534517       NaN       NaN  15   6
3       NaN       NaN       NaN  20  10
4  5.161465  1.926591  2.030012  25  15
5  5.523032  2.618642  3.364584  30  21


In [39]:
# apply anonumous function
print(df.apply(lambda x: x.max() - x.min()))

A    3.015042
B    0.339888
C    0.786084
D    0.000000
E    5.000000
dtype: float64


### See more in documentation