### Heavily based on ["10 Minutes to pandas"](https://pandas.pydata.org/docs/user_guide/10min.html)

In [1]:
import pandas as pd

import numpy as np

# Series, DataFrames

In [2]:
# A pandas Series is a one-dimensional vector of data - think of it as a single column.
# The NaN entry means the integer inputs end up stored as float64.
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8])
print(s)

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64


In [3]:
# A DataFrame is a two-dimensional data table that always carries an index.
# Here it is built from a random 6x4 numpy array; because no index is passed,
# pandas assigns a zero-based integer counter automatically.
# NOTE: no random seed is set, so the values differ on every run.
df = pd.DataFrame(data=np.random.randn(6, 4), columns=list('ABCD'))
print(df)

          A         B         C         D
0 -0.781913  0.355445 -1.809361 -0.437243
1  1.779803  0.990665  0.936495 -0.358011
2  0.585873 -1.913129  0.176131  0.818582
3  2.558650 -0.231655 -0.855915  0.914393
4  1.626634  0.317939  1.621115  0.096541
5 -0.938682 -0.686302 -0.564103 -1.163258


In [4]:
# Build a DataFrame from a heterogeneous dictionary: each key becomes a column
# (see also pd.DataFrame.from_dict()). Scalar values are repeated on every row,
# and each column keeps its own dtype.
df2 = pd.DataFrame({
    'A': 1.,
    'B': pd.Timestamp('20130102'),
    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
    'D': np.arange(4, dtype='int32'),
    'E': pd.Categorical(["test", "train", "test", "train"]),
    'F': 'foo',
})
print(df2)

     A          B    C  D      E    F
0  1.0 2013-01-02  1.0  0   test  foo
1  1.0 2013-01-02  1.0  1  train  foo
2  1.0 2013-01-02  1.0  2   test  foo
3  1.0 2013-01-02  1.0  3  train  foo


In [5]:
# looking at the dtype of each DataFrame column
# (notice the result is itself a Series, indexed by column name!):
print(df2.dtypes)

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object


In [6]:
# reading from CSV - here the file is fetched directly from a URL,
# so this cell needs network access to run
df3 = pd.read_csv("https://raw.githubusercontent.com/gsimchoni/Intro2DS/master/datasets/test.csv")
print(df3)

   x   y
0  1   6
1  2   7
2  3   8
3  4   9
4  5  10


# Some info

In [7]:
# get top rows (with no argument, head() shows the first 5)
print(df.head())

          A         B         C         D
0 -0.781913  0.355445 -1.809361 -0.437243
1  1.779803  0.990665  0.936495 -0.358011
2  0.585873 -1.913129  0.176131  0.818582
3  2.558650 -0.231655 -0.855915  0.914393
4  1.626634  0.317939  1.621115  0.096541


In [8]:
# get bottom rows (here the last 2)
print(df.tail(2))

          A         B         C         D
4  1.626634  0.317939  1.621115  0.096541
5 -0.938682 -0.686302 -0.564103 -1.163258


In [9]:
# view the row index (a RangeIndex here, because no index was given when df was created)
print(df.index)

RangeIndex(start=0, stop=6, step=1)


In [10]:
# view column names (these are held in an Index object too)
print(df.columns)

Index(['A', 'B', 'C', 'D'], dtype='object')


In [11]:
# get the underlying data as a numpy ndarray;
# to_numpy() is the accessor the pandas documentation recommends over the older .values attribute
print(df.to_numpy())

[[-0.78191322  0.35544534 -1.80936052 -0.43724299]
 [ 1.77980332  0.99066474  0.93649465 -0.35801087]
 [ 0.58587348 -1.91312924  0.17613082  0.81858181]
 [ 2.55864971 -0.23165519 -0.85591468  0.91439348]
 [ 1.62663435  0.31793891  1.62111514  0.09654099]
 [-0.93868177 -0.68630172 -0.56410341 -1.16325769]]


In [12]:
# quick statistics summary per column: count, mean, std, min, quartiles and max
print(df.describe())

              A         B         C         D
count  6.000000  6.000000  6.000000  6.000000
mean   0.805061 -0.194506 -0.082606 -0.021499
std    1.435782  1.016522  1.250576  0.798155
min   -0.938682 -1.913129 -1.809361 -1.163258
25%   -0.439967 -0.572640 -0.782962 -0.417435
50%    1.106254  0.043142 -0.193986 -0.130735
75%    1.741511  0.346069  0.746404  0.638072
max    2.558650  0.990665  1.621115  0.914393


In [13]:
# transposing a DataFrame: column names become the row index and vice versa
print(df.T)

          0         1         2         3         4         5
A -0.781913  1.779803  0.585873  2.558650  1.626634 -0.938682
B  0.355445  0.990665 -1.913129 -0.231655  0.317939 -0.686302
C -1.809361  0.936495  0.176131 -0.855915  1.621115 -0.564103
D -0.437243 -0.358011  0.818582  0.914393  0.096541 -1.163258


In [14]:
# reorder the columns by their names, in descending order
print(df.sort_index(axis='columns', ascending=False))

          D         C         B         A
0 -0.437243 -1.809361  0.355445 -0.781913
1 -0.358011  0.936495  0.990665  1.779803
2  0.818582  0.176131 -1.913129  0.585873
3  0.914393 -0.855915 -0.231655  2.558650
4  0.096541  1.621115  0.317939  1.626634
5 -1.163258 -0.564103 -0.686302 -0.938682


In [15]:
# reorder the rows by the values of one column (ascending by default)
print(df.sort_values('B'))

          A         B         C         D
2  0.585873 -1.913129  0.176131  0.818582
5 -0.938682 -0.686302 -0.564103 -1.163258
3  2.558650 -0.231655 -0.855915  0.914393
4  1.626634  0.317939  1.621115  0.096541
0 -0.781913  0.355445 -1.809361 -0.437243
1  1.779803  0.990665  0.936495 -0.358011


# Selection

In [16]:
# select a specific column by name (this will give you a Series, not a DataFrame!)
# - pandas people do not recommend this, see loc/iloc
print(df['A'])

0   -0.781913
1    1.779803
2    0.585873
3    2.558650
4    1.626634
5   -0.938682
Name: A, dtype: float64


In [17]:
# slicing rows with [] as in a numpy array - pandas people do not recommend this, see loc/iloc
print(df[:3])           # positional slice: the first three rows
print(df.T['A':'B'])    # label slice on the transposed table: both end labels are included

          A         B         C         D
0 -0.781913  0.355445 -1.809361 -0.437243
1  1.779803  0.990665  0.936495 -0.358011
2  0.585873 -1.913129  0.176131  0.818582
          0         1         2         3         4         5
A -0.781913  1.779803  0.585873  2.558650  1.626634 -0.938682
B  0.355445  0.990665 -1.913129 -0.231655  0.317939 -0.686302


### Selection by label (`loc`)

In [18]:
# select a specific column by label, all rows (the bare ":" selects every row); the result is a Series
print(df.loc[:, 'A'])

0   -0.781913
1    1.779803
2    0.585873
3    2.558650
4    1.626634
5   -0.938682
Name: A, dtype: float64


In [19]:
# first 2 rows, two specific columns; loc slices by label and includes the end label,
# so ":1" yields the rows labelled 0 and 1
print(df.loc[:1, ['A', 'B']])

          A         B
0 -0.781913  0.355445
1  1.779803  0.990665


In [20]:
# select a specific value by row label and column label
print(df.loc[0, 'A'])

# but "at" is preferred when reading a single scalar
print(df.at[0, 'A'])

-0.781913216351
-0.781913216351


### Selection by position (`iloc`)

In [21]:
# specific row by integer position (as a Series, indexed by the column names)
print(df.iloc[3])

A    2.558650
B   -0.231655
C   -0.855915
D    0.914393
Name: 3, dtype: float64


In [22]:
# the same row kept as a one-row sub-table: passing a list of positions
# returns a DataFrame instead of a Series
print(df.iloc[[3], :])

         A         B         C         D
3  2.55865 -0.231655 -0.855915  0.914393


In [23]:
# sub-table by position; positional slices exclude the end, so this is rows 3-4, columns 0-1
print(df.iloc[3:5, :2])

          A         B
3  2.558650 -0.231655
4  1.626634  0.317939


In [24]:
# integer indexing: pick arbitrary rows and columns by lists of positions
print(df.iloc[[1,2,4], [0,2]])

          A         C
1  1.779803  0.936495
2  0.585873  0.176131
4  1.626634  1.621115


### Boolean indexing

In [25]:
# keep only the rows whose value in column A satisfies the condition
print(df[df['A'] > 0])

          A         B         C         D
1  1.779803  0.990665  0.936495 -0.358011
2  0.585873 -1.913129  0.176131  0.818582
3  2.558650 -0.231655 -0.855915  0.914393
4  1.626634  0.317939  1.621115  0.096541


In [26]:
# with a condition on the whole table the shape is kept,
# and every cell that fails the condition is shown as NaN
print(df.where(df > 0))

          A         B         C         D
0       NaN  0.355445       NaN       NaN
1  1.779803  0.990665  0.936495       NaN
2  0.585873       NaN  0.176131  0.818582
3  2.558650       NaN       NaN  0.914393
4  1.626634  0.317939  1.621115  0.096541
5       NaN       NaN       NaN       NaN


# Setting

In [27]:
# add a new column E holding the Series s1 (the integers 1 through 6)
s1 = pd.Series(range(1, 7))
df['E'] = s1
print(df)

          A         B         C         D  E
0 -0.781913  0.355445 -1.809361 -0.437243  1
1  1.779803  0.990665  0.936495 -0.358011  2
2  0.585873 -1.913129  0.176131  0.818582  3
3  2.558650 -0.231655 -0.855915  0.914393  4
4  1.626634  0.317939  1.621115  0.096541  5
5 -0.938682 -0.686302 -0.564103 -1.163258  6


In [28]:
# set a specific value, addressed by row label and column label (df is modified in place)
df.at[0, 'A'] = 0
print(df)

          A         B         C         D  E
0  0.000000  0.355445 -1.809361 -0.437243  1
1  1.779803  0.990665  0.936495 -0.358011  2
2  0.585873 -1.913129  0.176131  0.818582  3
3  2.558650 -0.231655 -0.855915  0.914393  4
4  1.626634  0.317939  1.621115  0.096541  5
5 -0.938682 -0.686302 -0.564103 -1.163258  6


In [29]:
# set a whole column with numpy.
# Assigning through df['D'] replaces the column with the new integer array.
# Since pandas 2.0, df.loc[:, 'D'] = ... instead writes into the existing float64
# column, which would display 5.0 rather than the integer 5.
df['D'] = np.array([5] * len(df))
print(df)

          A         B         C  D  E
0  0.000000  0.355445 -1.809361  5  1
1  1.779803  0.990665  0.936495  5  2
2  0.585873 -1.913129  0.176131  5  3
3  2.558650 -0.231655 -0.855915  5  4
4  1.626634  0.317939  1.621115  5  5
5 -0.938682 -0.686302 -0.564103  5  6


In [30]:
# set with boolean indexing: every cell where the condition holds (a negative value)
# is overwritten with NaN, modifying df in place
df[df < 0] = np.nan
print(df)

          A         B         C  D  E
0  0.000000  0.355445       NaN  5  1
1  1.779803  0.990665  0.936495  5  2
2  0.585873       NaN  0.176131  5  3
3  2.558650       NaN       NaN  5  4
4  1.626634  0.317939  1.621115  5  5
5       NaN       NaN       NaN  5  6


# Missing Data

In [31]:
# dropping rows with any missing data (see documentation for more);
# df1 is a copy of df, and dropna returns a new table, so df1 itself keeps its NaN rows
df1 = df.copy()
print(df1.dropna(how = 'any'))

          A         B         C  D  E
1  1.779803  0.990665  0.936495  5  2
4  1.626634  0.317939  1.621115  5  5


In [32]:
# filling missing values with a specific value (returns a new table; df1 is not modified)
print(df1.fillna(value = 5))

          A         B         C  D  E
0  0.000000  0.355445  5.000000  5  1
1  1.779803  0.990665  0.936495  5  2
2  0.585873  5.000000  0.176131  5  3
3  2.558650  5.000000  5.000000  5  4
4  1.626634  0.317939  1.621115  5  5
5  5.000000  5.000000  5.000000  5  6


In [33]:
# boolean mask marking where the missing values are (similar to is.na in R)
print(df1.isna())

       A      B      C      D      E
0  False  False   True  False  False
1  False  False  False  False  False
2  False   True  False  False  False
3  False   True   True  False  False
4  False  False  False  False  False
5   True   True   True  False  False


# Basic Operations

In [34]:
# apply mean to each column (missing values are skipped, not counted as zero)
print(df.mean())

A    1.310192
B    0.554683
C    0.911247
D    5.000000
E    3.500000
dtype: float64


In [35]:
# sum across the columns, giving one total per row (NaN cells add nothing)
print(df.sum(axis='columns'))

0     6.355445
1    10.706963
2     8.762004
3    11.558650
4    13.565688
5    11.000000
dtype: float64


In [36]:
# df.apply a non-pandas function: np.cumsum is called on each column in turn
print(df.apply(np.cumsum))

          A         B         C   D   E
0  0.000000  0.355445       NaN   5   1
1  1.779803  1.346110  0.936495  10   3
2  2.365677       NaN  1.112625  15   6
3  4.924327       NaN       NaN  20  10
4  6.550961  1.664049  2.733741  25  15
5       NaN       NaN       NaN  30  21


In [37]:
# apply an anonymous function to each column: here the range (max minus min) of the column
print(df.apply(lambda col: col.max() - col.min()))

A    2.558650
B    0.672726
C    1.444984
D    0.000000
E    5.000000
dtype: float64


# Basic Manipulation

Like R's `tidyverse`, pandas works best with "tidy", long datasets, in which every column is a single variable and every row is a single observation.

In [38]:
# group by a categorical variable, and get the mean by group for all numeric variables
# (note: this rebinds the name df2, which earlier held a different table)
df2 = pd.DataFrame({
    'categorical': np.tile(['A', 'B'], 5),
    'numerical1': np.random.randn(10),
    'numerical2': 10 + 2 * np.random.randn(10),
})
df2.groupby('categorical').mean()

Unnamed: 0_level_0,numerical1,numerical2
categorical,Unnamed: 1_level_1,Unnamed: 2_level_1
A,0.259247,10.037896
B,-0.457582,11.098038


### See more in documentation