# 10 Minutes to pandas
written by: Ejaz ur Rehman\
Date: 2025-01-29\
Email: ijazfinance@gmail.com

## 1. Object Creation

### 1.1 Creating a Series by passing a list of values, letting pandas create a default RangeIndex.

In [337]:
import pandas as pd

In [338]:
import numpy as np
s = pd.Series([1, 3, 5, np.nan, 6, 8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

### 1.2. Creating a DataFrame by passing a NumPy array with a datetime index using date_range() and labeled columns:

In [339]:
dates = pd.date_range("20130101", periods=30)

dates
df = pd.DataFrame(np.random.randn(30, 4), index=dates, columns=list("ABCD"))

df

Unnamed: 0,A,B,C,D
2013-01-01,0.808254,0.91306,0.600169,0.865391
2013-01-02,0.588575,-0.565803,-1.016182,0.317379
2013-01-03,0.079288,-1.521031,1.087633,0.626385
2013-01-04,-0.499889,0.909325,0.792397,-1.19673
2013-01-05,0.434448,0.215282,-0.091708,0.095273
2013-01-06,-0.173144,-0.482806,-0.439633,0.655965
2013-01-07,-0.180431,-1.090554,0.427637,0.023892
2013-01-08,0.234825,-0.71985,-1.670747,-2.152453
2013-01-09,0.631361,-0.552115,0.575438,-1.234901
2013-01-10,0.307118,1.329634,-0.951555,1.718633


### 1.3. Creating a DataFrame by passing a dictionary of objects where the keys are the column labels and the values are the column values.

In [340]:
df2 = pd.DataFrame(
    {
        "A": 1.0,
        "B": pd.Timestamp("20130102"),
        "C": pd.Series(1, index=list(range(4)), dtype="float32"),
        "D": np.array([3] * 4, dtype="int32"),
        "E": pd.Categorical(["test", "train", "test", "train"]),
        "F": "foo",
    }
)


df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


### 1.4. The columns of the resulting DataFrame have different dtypes:

In [341]:
df2.dtypes


A          float64
B    datetime64[s]
C          float32
D            int32
E         category
F           object
dtype: object

In [342]:
df2.info() 

<class 'pandas.core.frame.DataFrame'>
Index: 4 entries, 0 to 3
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype        
---  ------  --------------  -----        
 0   A       4 non-null      float64      
 1   B       4 non-null      datetime64[s]
 2   C       4 non-null      float32      
 3   D       4 non-null      int32        
 4   E       4 non-null      category     
 5   F       4 non-null      object       
dtypes: category(1), datetime64[s](1), float32(1), float64(1), int32(1), object(1)
memory usage: 288.0+ bytes


## 2. Viewing Data

### 2.1 Use DataFrame.head() and DataFrame.tail() to view the top and bottom rows of the frame respectively:

In [343]:
df.head()

Unnamed: 0,A,B,C,D
2013-01-01,0.808254,0.91306,0.600169,0.865391
2013-01-02,0.588575,-0.565803,-1.016182,0.317379
2013-01-03,0.079288,-1.521031,1.087633,0.626385
2013-01-04,-0.499889,0.909325,0.792397,-1.19673
2013-01-05,0.434448,0.215282,-0.091708,0.095273


In [344]:
df.tail(3)

Unnamed: 0,A,B,C,D
2013-01-28,-0.033195,0.130311,-0.479971,-1.062321
2013-01-29,-0.705078,1.132809,1.396762,0.57024
2013-01-30,-0.444263,-1.070513,1.104254,0.536625


### 2.2 Display the DataFrame.index or DataFrame.columns:

In [345]:
df.index

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06', '2013-01-07', '2013-01-08',
               '2013-01-09', '2013-01-10', '2013-01-11', '2013-01-12',
               '2013-01-13', '2013-01-14', '2013-01-15', '2013-01-16',
               '2013-01-17', '2013-01-18', '2013-01-19', '2013-01-20',
               '2013-01-21', '2013-01-22', '2013-01-23', '2013-01-24',
               '2013-01-25', '2013-01-26', '2013-01-27', '2013-01-28',
               '2013-01-29', '2013-01-30'],
              dtype='datetime64[ns]', freq='D')

In [346]:
df2.index

Index([0, 1, 2, 3], dtype='int64')

In [347]:
df.columns 

Index(['A', 'B', 'C', 'D'], dtype='object')

### 2.3 Return a NumPy representation of the underlying data with DataFrame.to_numpy() without the index or column labels:

In [348]:
df.to_numpy()

array([[ 0.80825443,  0.9130602 ,  0.60016894,  0.86539092],
       [ 0.58857484, -0.56580266, -1.0161822 ,  0.31737938],
       [ 0.07928847, -1.5210307 ,  1.08763279,  0.62638472],
       [-0.49988943,  0.90932536,  0.79239689, -1.19673   ],
       [ 0.43444816,  0.2152822 , -0.09170784,  0.0952728 ],
       [-0.17314403, -0.48280557, -0.43963305,  0.65596489],
       [-0.18043086, -1.09055408,  0.42763709,  0.02389244],
       [ 0.2348247 , -0.7198503 , -1.67074718, -2.15245345],
       [ 0.63136107, -0.5521151 ,  0.57543805, -1.23490106],
       [ 0.30711776,  1.32963375, -0.95155547,  1.71863267],
       [ 0.2887877 , -0.18121447, -1.52838471, -0.15802194],
       [ 0.90267891,  1.57073836,  1.2580685 , -0.13000837],
       [ 0.45198586, -1.06571471,  2.20916828,  1.93364596],
       [-0.40126117, -0.80478854,  0.40784617,  0.65835056],
       [ 0.20239755,  0.37554099,  0.67741347,  1.37012656],
       [ 0.21299148, -1.38609858,  0.42714499, -1.27615432],
       [ 1.21464786,  0.

In [349]:
df2.dtypes 

A          float64
B    datetime64[s]
C          float32
D            int32
E         category
F           object
dtype: object

In [350]:
df2.to_numpy ()

array([[1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo']],
      dtype=object)

### 2.4 describe() shows a quick statistical summary of the data:

In [351]:
df.describe ()


Unnamed: 0,A,B,C,D
count,30.0,30.0,30.0,30.0
mean,0.169192,-0.023488,-0.08714,0.023158
std,0.749515,1.081416,1.101156,1.036738
min,-2.079024,-1.521031,-1.703151,-2.152453
25%,-0.346054,-0.783554,-1.050592,-0.811299
50%,0.261806,-0.295688,0.068442,0.059583
75%,0.620665,0.793977,0.733153,0.64857
max,1.495158,2.169494,2.209168,1.933646


### 2.5 Transposing the data:

In [352]:
df.T

Unnamed: 0,2013-01-01,2013-01-02,2013-01-03,2013-01-04,2013-01-05,2013-01-06,2013-01-07,2013-01-08,2013-01-09,2013-01-10,...,2013-01-21,2013-01-22,2013-01-23,2013-01-24,2013-01-25,2013-01-26,2013-01-27,2013-01-28,2013-01-29,2013-01-30
A,0.808254,0.588575,0.079288,-0.499889,0.434448,-0.173144,-0.180431,0.234825,0.631361,0.307118,...,1.250852,-0.754091,0.419782,-2.079024,-0.784164,1.495158,0.323675,-0.033195,-0.705078,-0.444263
B,0.91306,-0.565803,-1.521031,0.909325,0.215282,-0.482806,-1.090554,-0.71985,-0.552115,1.329634,...,-1.296335,-0.595062,2.169494,-0.410161,0.447934,-1.386317,-0.174098,0.130311,1.132809,-1.070513
C,0.600169,-1.016182,1.087633,0.792397,-0.091708,-0.439633,0.427637,-1.670747,0.575438,-0.951555,...,-1.500373,-1.062061,-0.737269,-0.802826,-1.496244,-1.146498,1.2845,-0.479971,1.396762,1.104254
D,0.865391,0.317379,0.626385,-1.19673,0.095273,0.655965,0.023892,-2.152453,-1.234901,1.718633,...,-0.83588,-1.560322,-0.737556,-0.110415,1.507341,-0.076251,0.161692,-1.062321,0.57024,0.536625


### 2.6 DataFrame.sort_index() sorts by an axis:

In [353]:
df.sort_index(axis=1, ascending=False)
# axis=1 sorts by column labels; axis=0 (the default) sorts by row labels (the index)

Unnamed: 0,D,C,B,A
2013-01-01,0.865391,0.600169,0.91306,0.808254
2013-01-02,0.317379,-1.016182,-0.565803,0.588575
2013-01-03,0.626385,1.087633,-1.521031,0.079288
2013-01-04,-1.19673,0.792397,0.909325,-0.499889
2013-01-05,0.095273,-0.091708,0.215282,0.434448
2013-01-06,0.655965,-0.439633,-0.482806,-0.173144
2013-01-07,0.023892,0.427637,-1.090554,-0.180431
2013-01-08,-2.152453,-1.670747,-0.71985,0.234825
2013-01-09,-1.234901,0.575438,-0.552115,0.631361
2013-01-10,1.718633,-0.951555,1.329634,0.307118


In [354]:
df.sort_index(axis=0, ascending=False)

Unnamed: 0,A,B,C,D
2013-01-30,-0.444263,-1.070513,1.104254,0.536625
2013-01-29,-0.705078,1.132809,1.396762,0.57024
2013-01-28,-0.033195,0.130311,-0.479971,-1.062321
2013-01-27,0.323675,-0.174098,1.2845,0.161692
2013-01-26,1.495158,-1.386317,-1.146498,-0.076251
2013-01-25,-0.784164,0.447934,-1.496244,1.507341
2013-01-24,-2.079024,-0.410161,-0.802826,-0.110415
2013-01-23,0.419782,2.169494,-0.737269,-0.737556
2013-01-22,-0.754091,-0.595062,-1.062061,-1.560322
2013-01-21,1.250852,-1.296335,-1.500373,-0.83588


### 2.7 DataFrame.sort_values() sorts by values:

In [355]:
df.sort_values(by="B")

Unnamed: 0,A,B,C,D
2013-01-03,0.079288,-1.521031,1.087633,0.626385
2013-01-26,1.495158,-1.386317,-1.146498,-0.076251
2013-01-16,0.212991,-1.386099,0.427145,-1.276154
2013-01-21,1.250852,-1.296335,-1.500373,-0.83588
2013-01-07,-0.180431,-1.090554,0.427637,0.023892
2013-01-30,-0.444263,-1.070513,1.104254,0.536625
2013-01-13,0.451986,-1.065715,2.209168,1.933646
2013-01-14,-0.401261,-0.804789,0.407846,0.658351
2013-01-08,0.234825,-0.71985,-1.670747,-2.152453
2013-01-22,-0.754091,-0.595062,-1.062061,-1.560322


In [356]:
df.sort_values(by="A")

Unnamed: 0,A,B,C,D
2013-01-24,-2.079024,-0.410161,-0.802826,-0.110415
2013-01-25,-0.784164,0.447934,-1.496244,1.507341
2013-01-22,-0.754091,-0.595062,-1.062061,-1.560322
2013-01-29,-0.705078,1.132809,1.396762,0.57024
2013-01-18,-0.543786,2.090287,0.751732,-1.056025
2013-01-04,-0.499889,0.909325,0.792397,-1.19673
2013-01-30,-0.444263,-1.070513,1.104254,0.536625
2013-01-14,-0.401261,-0.804789,0.407846,0.658351
2013-01-07,-0.180431,-1.090554,0.427637,0.023892
2013-01-06,-0.173144,-0.482806,-0.439633,0.655965


## 3. Selection

### 3.1 Getitem ([]): 

#### 3.1.1 For a DataFrame, passing a single label selects a column and yields a Series equivalent to df.A:

In [357]:
df["A"]

2013-01-01    0.808254
2013-01-02    0.588575
2013-01-03    0.079288
2013-01-04   -0.499889
2013-01-05    0.434448
2013-01-06   -0.173144
2013-01-07   -0.180431
2013-01-08    0.234825
2013-01-09    0.631361
2013-01-10    0.307118
2013-01-11    0.288788
2013-01-12    0.902679
2013-01-13    0.451986
2013-01-14   -0.401261
2013-01-15    0.202398
2013-01-16    0.212991
2013-01-17    1.214648
2013-01-18   -0.543786
2013-01-19    0.808854
2013-01-20    1.018407
2013-01-21    1.250852
2013-01-22   -0.754091
2013-01-23    0.419782
2013-01-24   -2.079024
2013-01-25   -0.784164
2013-01-26    1.495158
2013-01-27    0.323675
2013-01-28   -0.033195
2013-01-29   -0.705078
2013-01-30   -0.444263
Freq: D, Name: A, dtype: float64

#### 3.1.2 For a DataFrame, passing a slice : selects matching rows:

In [358]:
df[0:3]

Unnamed: 0,A,B,C,D
2013-01-01,0.808254,0.91306,0.600169,0.865391
2013-01-02,0.588575,-0.565803,-1.016182,0.317379
2013-01-03,0.079288,-1.521031,1.087633,0.626385


### 3.2 Selection by Label

#### 3.2.1 Selecting a row matching a label:

In [359]:
df.loc[dates[0]]

A    0.808254
B    0.913060
C    0.600169
D    0.865391
Name: 2013-01-01 00:00:00, dtype: float64

#### 3.2.2 Selecting all rows (:) with the selected column labels:

In [360]:
df.loc[:, ["A", "B"]]

Unnamed: 0,A,B
2013-01-01,0.808254,0.91306
2013-01-02,0.588575,-0.565803
2013-01-03,0.079288,-1.521031
2013-01-04,-0.499889,0.909325
2013-01-05,0.434448,0.215282
2013-01-06,-0.173144,-0.482806
2013-01-07,-0.180431,-1.090554
2013-01-08,0.234825,-0.71985
2013-01-09,0.631361,-0.552115
2013-01-10,0.307118,1.329634


#### 3.2.3 For label slicing, both endpoints are included:

In [361]:
df.loc["20130102":"20130104", ["A", "B"]]

Unnamed: 0,A,B
2013-01-02,0.588575,-0.565803
2013-01-03,0.079288,-1.521031
2013-01-04,-0.499889,0.909325


#### 3.2.4 Selecting a single row and column label returns a scalar:

In [362]:
df.loc[dates[0], "A"]

np.float64(0.8082544256131313)

#### 3.2.5 For getting fast access to a scalar (equivalent to the prior method):

In [363]:
df.at[dates[0], "A"]

np.float64(0.8082544256131313)

### 3.3 Selection by Position

#### 3.3.1 Select via the position of the passed integers:

In [364]:
df.iloc[3]  # the 4th row (position 3), returned as a Series indexed by the column labels

A   -0.499889
B    0.909325
C    0.792397
D   -1.196730
Name: 2013-01-04 00:00:00, dtype: float64

#### 3.3.2 Integer slices act similarly to NumPy/Python:

In [365]:
df.iloc[3:5, 0:2] 

Unnamed: 0,A,B
2013-01-04,-0.499889,0.909325
2013-01-05,0.434448,0.215282


#### 3.3.3 Lists of integer position locations:

In [366]:
df.iloc[[1, 2, 4], [0, 2]]

Unnamed: 0,A,C
2013-01-02,0.588575,-1.016182
2013-01-03,0.079288,1.087633
2013-01-05,0.434448,-0.091708


#### 3.3.4 For slicing rows explicitly:

In [367]:
df.iloc[1:3, :]

Unnamed: 0,A,B,C,D
2013-01-02,0.588575,-0.565803,-1.016182,0.317379
2013-01-03,0.079288,-1.521031,1.087633,0.626385


#### 3.3.5 For slicing columns explicitly:

In [368]:
df.iloc[:, 1:3]

Unnamed: 0,B,C
2013-01-01,0.91306,0.600169
2013-01-02,-0.565803,-1.016182
2013-01-03,-1.521031,1.087633
2013-01-04,0.909325,0.792397
2013-01-05,0.215282,-0.091708
2013-01-06,-0.482806,-0.439633
2013-01-07,-1.090554,0.427637
2013-01-08,-0.71985,-1.670747
2013-01-09,-0.552115,0.575438
2013-01-10,1.329634,-0.951555


#### 3.3.6 For getting a value explicitly:

In [369]:
df.iloc[1, 1]

np.float64(-0.565802661933663)

#### 3.3.7 For getting fast access to a scalar (equivalent to the prior method):

In [370]:
df.iat[1, 1]

np.float64(-0.565802661933663)

### 3.4 Boolean indexing

#### 3.4.1 Select rows where df.A is greater than 0

In [371]:
df[df["A"] > 0]

Unnamed: 0,A,B,C,D
2013-01-01,0.808254,0.91306,0.600169,0.865391
2013-01-02,0.588575,-0.565803,-1.016182,0.317379
2013-01-03,0.079288,-1.521031,1.087633,0.626385
2013-01-05,0.434448,0.215282,-0.091708,0.095273
2013-01-08,0.234825,-0.71985,-1.670747,-2.152453
2013-01-09,0.631361,-0.552115,0.575438,-1.234901
2013-01-10,0.307118,1.329634,-0.951555,1.718633
2013-01-11,0.288788,-0.181214,-1.528385,-0.158022
2013-01-12,0.902679,1.570738,1.258069,-0.130008
2013-01-13,0.451986,-1.065715,2.209168,1.933646


#### 3.4.2 Selecting values from a DataFrame where a boolean condition is met:

In [372]:
df[df > 0]

Unnamed: 0,A,B,C,D
2013-01-01,0.808254,0.91306,0.600169,0.865391
2013-01-02,0.588575,,,0.317379
2013-01-03,0.079288,,1.087633,0.626385
2013-01-04,,0.909325,0.792397,
2013-01-05,0.434448,0.215282,,0.095273
2013-01-06,,,,0.655965
2013-01-07,,,0.427637,0.023892
2013-01-08,0.234825,,,
2013-01-09,0.631361,,0.575438,
2013-01-10,0.307118,1.329634,,1.718633


#### 3.4.3 Using isin() method for filtering:

In [373]:
df2 = df.copy()
df2["E"] = ["one", "one", "two", "three", "four", "three", "one", "one", "two", "three", "four", "three", "one", "one", "two", "three", "four", "three", "one", "one", "two", "three", "four", "three", "one", "one", "two", "three", "four", "three"]
df2

Unnamed: 0,A,B,C,D,E
2013-01-01,0.808254,0.91306,0.600169,0.865391,one
2013-01-02,0.588575,-0.565803,-1.016182,0.317379,one
2013-01-03,0.079288,-1.521031,1.087633,0.626385,two
2013-01-04,-0.499889,0.909325,0.792397,-1.19673,three
2013-01-05,0.434448,0.215282,-0.091708,0.095273,four
2013-01-06,-0.173144,-0.482806,-0.439633,0.655965,three
2013-01-07,-0.180431,-1.090554,0.427637,0.023892,one
2013-01-08,0.234825,-0.71985,-1.670747,-2.152453,one
2013-01-09,0.631361,-0.552115,0.575438,-1.234901,two
2013-01-10,0.307118,1.329634,-0.951555,1.718633,three


In [374]:
df2[df2["E"].isin(["two", "four"])]

Unnamed: 0,A,B,C,D,E
2013-01-03,0.079288,-1.521031,1.087633,0.626385,two
2013-01-05,0.434448,0.215282,-0.091708,0.095273,four
2013-01-09,0.631361,-0.552115,0.575438,-1.234901,two
2013-01-11,0.288788,-0.181214,-1.528385,-0.158022,four
2013-01-15,0.202398,0.375541,0.677413,1.370127,two
2013-01-17,1.214648,0.077284,0.228593,0.126049,four
2013-01-21,1.250852,-1.296335,-1.500373,-0.83588,two
2013-01-23,0.419782,2.169494,-0.737269,-0.737556,four
2013-01-27,0.323675,-0.174098,1.2845,0.161692,two
2013-01-29,-0.705078,1.132809,1.396762,0.57024,four


## 4. Setting

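#### 4.1 Setting a new column automatically aligns the data by the indexes (the cell below only builds the Series `s1`; assigning it, e.g. `df["F"] = s1`, would align it to `df`'s index):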

In [375]:
s1 = pd.Series([1, 2, 3, 4, 5, 6], index=pd.date_range("20130102", periods=6))
s1

2013-01-02    1
2013-01-03    2
2013-01-04    3
2013-01-05    4
2013-01-06    5
2013-01-07    6
Freq: D, dtype: int64

#### 4.2 Setting values by label:

In [376]:
df.at[dates[0], "A"] = 0

#### 4.3 Setting values by position:

In [377]:
df.iat[0, 1] = 0

#### 4.4 Setting by assigning with a NumPy array:

In [378]:
df.loc[:, "D"] = np.array([5] * len(df))

#### 4.5 The result of the prior setting operations:

In [379]:
df

Unnamed: 0,A,B,C,D
2013-01-01,0.0,0.0,0.600169,5.0
2013-01-02,0.588575,-0.565803,-1.016182,5.0
2013-01-03,0.079288,-1.521031,1.087633,5.0
2013-01-04,-0.499889,0.909325,0.792397,5.0
2013-01-05,0.434448,0.215282,-0.091708,5.0
2013-01-06,-0.173144,-0.482806,-0.439633,5.0
2013-01-07,-0.180431,-1.090554,0.427637,5.0
2013-01-08,0.234825,-0.71985,-1.670747,5.0
2013-01-09,0.631361,-0.552115,0.575438,5.0
2013-01-10,0.307118,1.329634,-0.951555,5.0


#### 4.6 A where operation with setting:

In [380]:
df2 = df.copy()
df2[df2 > 0] = -df2
df2

Unnamed: 0,A,B,C,D
2013-01-01,0.0,0.0,-0.600169,-5.0
2013-01-02,-0.588575,-0.565803,-1.016182,-5.0
2013-01-03,-0.079288,-1.521031,-1.087633,-5.0
2013-01-04,-0.499889,-0.909325,-0.792397,-5.0
2013-01-05,-0.434448,-0.215282,-0.091708,-5.0
2013-01-06,-0.173144,-0.482806,-0.439633,-5.0
2013-01-07,-0.180431,-1.090554,-0.427637,-5.0
2013-01-08,-0.234825,-0.71985,-1.670747,-5.0
2013-01-09,-0.631361,-0.552115,-0.575438,-5.0
2013-01-10,-0.307118,-1.329634,-0.951555,-5.0


## 5. Missing Data

#### 5.1 Reindexing allows us to change/add/delete the index on a specified axis. This returns a copy of the data:

In [381]:
df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ["E"])
df1.loc[dates[0] : dates[1], "E"] = 1
df1

Unnamed: 0,A,B,C,D,E
2013-01-01,0.0,0.0,0.600169,5.0,1.0
2013-01-02,0.588575,-0.565803,-1.016182,5.0,1.0
2013-01-03,0.079288,-1.521031,1.087633,5.0,
2013-01-04,-0.499889,0.909325,0.792397,5.0,


#### 5.2 DataFrame.dropna() drops any rows that have missing data:

In [382]:
df1.dropna(how="any")

Unnamed: 0,A,B,C,D,E
2013-01-01,0.0,0.0,0.600169,5.0,1.0
2013-01-02,0.588575,-0.565803,-1.016182,5.0,1.0


#### 5.3 DataFrame.fillna() fills missing data:

In [383]:
df1.fillna(value=5)

Unnamed: 0,A,B,C,D,E
2013-01-01,0.0,0.0,0.600169,5.0,1.0
2013-01-02,0.588575,-0.565803,-1.016182,5.0,1.0
2013-01-03,0.079288,-1.521031,1.087633,5.0,5.0
2013-01-04,-0.499889,0.909325,0.792397,5.0,5.0


#### 5.4 isna() gets the boolean mask where values are nan:

In [384]:
pd.isna(df1)

Unnamed: 0,A,B,C,D,E
2013-01-01,False,False,False,False,False
2013-01-02,False,False,False,False,False
2013-01-03,False,False,False,False,True
2013-01-04,False,False,False,False,True


## 6. Operations

### 6.1 Stats

#### 6.1.1 Calculate the mean value for each column:

In [385]:
df.mean()

A    0.142250
B   -0.053923
C   -0.087140
D    5.000000
dtype: float64

#### 6.1.2 Calculate the mean value for each row:

In [386]:
df.mean(axis=1)

2013-01-01    1.400042
2013-01-02    1.001647
2013-01-03    1.161473
2013-01-04    1.550458
2013-01-05    1.389506
2013-01-06    0.976104
2013-01-07    1.039163
2013-01-08    0.711057
2013-01-09    1.413671
2013-01-10    1.421299
2013-01-11    0.894797
2013-01-12    2.182871
2013-01-13    1.648860
2013-01-14    1.050449
2013-01-15    1.563838
2013-01-16    1.063509
2013-01-17    1.630131
2013-01-18    1.824558
2013-01-19    1.586774
2013-01-20    0.949200
2013-01-21    0.863536
2013-01-22    0.647197
2013-01-23    1.713002
2013-01-24    0.426997
2013-01-25    0.791881
2013-01-26    0.990586
2013-01-27    1.608519
2013-01-28    1.154286
2013-01-29    1.706123
2013-01-30    1.147369
Freq: D, dtype: float64

#### 6.1.3 Operating with another Series or DataFrame with a different index or column will align the result with the union of the index or column labels. In addition, pandas automatically broadcasts along the specified dimension and will fill unaligned labels with np.nan.

In [387]:
s = pd.Series([1, 3, 5, np.nan, 6, 8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32], index=dates).shift(2)
s

2013-01-01     NaN
2013-01-02     NaN
2013-01-03     1.0
2013-01-04     3.0
2013-01-05     5.0
2013-01-06     NaN
2013-01-07     6.0
2013-01-08     8.0
2013-01-09     9.0
2013-01-10    10.0
2013-01-11    11.0
2013-01-12    12.0
2013-01-13    13.0
2013-01-14    14.0
2013-01-15    15.0
2013-01-16    16.0
2013-01-17    17.0
2013-01-18    18.0
2013-01-19    19.0
2013-01-20    20.0
2013-01-21    21.0
2013-01-22    22.0
2013-01-23    23.0
2013-01-24    24.0
2013-01-25    25.0
2013-01-26    26.0
2013-01-27    27.0
2013-01-28    28.0
2013-01-29    29.0
2013-01-30    30.0
Freq: D, dtype: float64

In [388]:
df.sub(s, axis="index")

Unnamed: 0,A,B,C,D
2013-01-01,,,,
2013-01-02,,,,
2013-01-03,-0.920712,-2.521031,0.087633,4.0
2013-01-04,-3.499889,-2.090675,-2.207603,2.0
2013-01-05,-4.565552,-4.784718,-5.091708,0.0
2013-01-06,,,,
2013-01-07,-6.180431,-7.090554,-5.572363,-1.0
2013-01-08,-7.765175,-8.71985,-9.670747,-3.0
2013-01-09,-8.368639,-9.552115,-8.424562,-4.0
2013-01-10,-9.692882,-8.670366,-10.951555,-5.0


### 6.2 User defined Functions

#### 6.2.1 DataFrame.agg() and DataFrame.transform() apply a user-defined function that reduces or broadcasts its result, respectively.

In [389]:
df.agg(lambda x: np.mean(x) * 5.6)

A     0.796602
B    -0.301970
C    -0.487982
D    28.000000
dtype: float64

In [390]:
df.transform(lambda x: x * 101.2)

Unnamed: 0,A,B,C,D
2013-01-01,0.0,0.0,60.737097,506.0
2013-01-02,59.563774,-57.259229,-102.837638,506.0
2013-01-03,8.023993,-153.928307,110.068438,506.0
2013-01-04,-50.588811,92.023726,80.190565,506.0
2013-01-05,43.966154,21.786559,-9.280833,506.0
2013-01-06,-17.522176,-48.859924,-44.490864,506.0
2013-01-07,-18.259603,-110.364073,43.276874,506.0
2013-01-08,23.76426,-72.84885,-169.079614,506.0
2013-01-09,63.893741,-55.874048,58.23433,506.0
2013-01-10,31.080317,134.558935,-96.297413,506.0


### 6.3 Value Counts

In [391]:
s = pd.Series(np.random.randint(0, 7, size=10))
s

0    2
1    0
2    0
3    5
4    0
5    4
6    2
7    0
8    1
9    4
dtype: int32

In [392]:
s.value_counts()

0    4
2    2
4    2
5    1
1    1
Name: count, dtype: int64

### 6.4 String Methods

#### 6.4.1 Series is equipped with a set of string processing methods in the str attribute that make it easy to operate on each element of the array, as in the code snippet below.

In [393]:
s = pd.Series(["A", "B", "C", "Aaba", "Baca", np.nan, "CABA", "dog", "cat"])
s.str.lower()

0       a
1       b
2       c
3    aaba
4    baca
5     NaN
6    caba
7     dog
8     cat
dtype: object

## 7. Merge

### 7.1 Concat

#### 7.1.1 Concatenating pandas objects together row-wise with concat():

In [394]:
df = pd.DataFrame(np.random.randn(10, 4))
df

Unnamed: 0,0,1,2,3
0,-0.441178,-1.133467,0.690161,-0.119986
1,-1.147587,1.377381,-0.864518,-1.879313
2,0.582635,0.959161,-2.65461,-0.710754
3,1.009623,-2.690739,-0.872136,-0.477217
4,0.350455,1.359235,-0.90754,0.787727
5,0.536642,0.963751,-0.7391,0.766017
6,1.160477,-0.367473,-0.030316,-0.666408
7,-1.712969,0.350592,-0.209788,-0.284829
8,-0.46657,-0.511373,1.098968,0.410608
9,-0.81488,0.610935,-0.885488,1.56727


In [395]:
# break it into pieces
pieces = [df[:3], df[3:7], df[7:]]
pd.concat(pieces)

Unnamed: 0,0,1,2,3
0,-0.441178,-1.133467,0.690161,-0.119986
1,-1.147587,1.377381,-0.864518,-1.879313
2,0.582635,0.959161,-2.65461,-0.710754
3,1.009623,-2.690739,-0.872136,-0.477217
4,0.350455,1.359235,-0.90754,0.787727
5,0.536642,0.963751,-0.7391,0.766017
6,1.160477,-0.367473,-0.030316,-0.666408
7,-1.712969,0.350592,-0.209788,-0.284829
8,-0.46657,-0.511373,1.098968,0.410608
9,-0.81488,0.610935,-0.885488,1.56727


### 7.2 Join

#### 7.2.1 merge() enables SQL style join types along specific columns.

In [396]:
left = pd.DataFrame({"key": ["foo", "foo"], "lval": [1, 2]})
right = pd.DataFrame({"key": ["foo", "foo"], "rval": [4, 5]})
left

Unnamed: 0,key,lval
0,foo,1
1,foo,2


In [397]:
right

Unnamed: 0,key,rval
0,foo,4
1,foo,5


In [398]:
pd.merge(left, right, on="key")

Unnamed: 0,key,lval,rval
0,foo,1,4
1,foo,1,5
2,foo,2,4
3,foo,2,5


#### 7.2.2 merge() on unique keys:

In [399]:
left = pd.DataFrame({"key": ["foo", "bar"], "lval": [1, 2]})
right = pd.DataFrame({"key": ["foo", "bar"], "rval": [4, 5]})
left

Unnamed: 0,key,lval
0,foo,1
1,bar,2


In [400]:
right

Unnamed: 0,key,rval
0,foo,4
1,bar,5


In [401]:
pd.merge(left, right, on="key")

Unnamed: 0,key,lval,rval
0,foo,1,4
1,bar,2,5


## 8. Grouping

By “group by” we are referring to a process involving one or more of the following steps:

- Splitting the data into groups based on some criteria
- Applying a function to each group independently
- Combining the results into a data structure

In [402]:
df = pd.DataFrame(
    {
        "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
        "B": ["one", "one", "two", "three", "two", "two", "one", "three"],
        "C": np.random.randn(8),
        "D": np.random.randn(8),
    }
)
df

Unnamed: 0,A,B,C,D
0,foo,one,2.110003,-0.550223
1,bar,one,-0.66708,0.720739
2,foo,two,-0.673888,0.279459
3,bar,three,0.320695,-0.434682
4,foo,two,-0.412611,-0.745979
5,bar,two,-0.796738,0.368489
6,foo,one,1.349312,0.089003
7,foo,three,0.133883,2.342172


#### 8.1 Grouping by a column label, selecting column labels, and then applying the DataFrameGroupBy.sum() function to the resulting groups:

In [403]:
df.groupby("A")[["C", "D"]].sum()

Unnamed: 0_level_0,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,-1.143123,0.654547
foo,2.5067,1.414431


#### 8.2 Grouping by multiple column labels forms a MultiIndex.

In [404]:
df.groupby(["A", "B"]).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,C,D
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,-0.66708,0.720739
bar,three,0.320695,-0.434682
bar,two,-0.796738,0.368489
foo,one,3.459315,-0.46122
foo,three,0.133883,2.342172
foo,two,-1.086499,-0.46652
