Creating a **Series** by passing a list of values

In [None]:
import pandas as pd
import numpy as np

s = pd.Series([1,3,5,np.nan,6,8])

In [None]:
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

Creating a **DataFrame** using Numpy array, with a datetime index and labeld column

In [None]:
dates = pd.date_range("20210106", periods=6)

In [None]:
dates

DatetimeIndex(['2021-01-06', '2021-01-07', '2021-01-08', '2021-01-09',
               '2021-01-10', '2021-01-11'],
              dtype='datetime64[ns]', freq='D')

In [None]:
df = pd.DataFrame(np.random.randn(6,4), index = dates, columns = list("ABCD"))

In [None]:
df

Unnamed: 0,A,B,C,D
2021-01-06,1.067907,-0.30837,-2.556374,-0.479453
2021-01-07,-0.406669,1.527887,-0.530647,-1.747194
2021-01-08,-0.448772,2.024578,-1.654048,0.723015
2021-01-09,-0.348636,-1.013858,-0.022972,-0.691032
2021-01-10,-0.649714,0.099438,0.250337,-0.258712
2021-01-11,-0.354186,0.564028,-0.491385,-0.266737


**DataFrame** from dict()

In [None]:
df2 = pd.DataFrame(
    {
        "A": 1.0, 
        "B": pd.Timestamp("20210305"), 
        "C": pd.Series(1, index = list(range(4)), dtype = "float32"),
        "D": np.array([3] * 4, dtype = "int32"),
        "E": pd.Categorical(["test","train"] * 2),
        "F": "foo"
    })

In [None]:
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2021-03-05,1.0,3,test,foo
1,1.0,2021-03-05,1.0,3,train,foo
2,1.0,2021-03-05,1.0,3,test,foo
3,1.0,2021-03-05,1.0,3,train,foo


In [None]:
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

### Viewing Data

In [None]:
df.head()

Unnamed: 0,A,B,C,D
2021-01-06,1.067907,-0.30837,-2.556374,-0.479453
2021-01-07,-0.406669,1.527887,-0.530647,-1.747194
2021-01-08,-0.448772,2.024578,-1.654048,0.723015
2021-01-09,-0.348636,-1.013858,-0.022972,-0.691032
2021-01-10,-0.649714,0.099438,0.250337,-0.258712


In [None]:
df.tail(3)

Unnamed: 0,A,B,C,D
2021-01-09,-0.348636,-1.013858,-0.022972,-0.691032
2021-01-10,-0.649714,0.099438,0.250337,-0.258712
2021-01-11,-0.354186,0.564028,-0.491385,-0.266737


In [None]:
df.index

DatetimeIndex(['2021-01-06', '2021-01-07', '2021-01-08', '2021-01-09',
               '2021-01-10', '2021-01-11'],
              dtype='datetime64[ns]', freq='D')

In [None]:
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [None]:
df.to_numpy()
# Gives a numpy representation of the DataFrame

array([[ 1.06790673, -0.30837038, -2.55637425, -0.47945282],
       [-0.40666925,  1.52788708, -0.53064658, -1.74719353],
       [-0.44877157,  2.02457801, -1.65404805,  0.72301475],
       [-0.34863644, -1.0138577 , -0.02297185, -0.69103236],
       [-0.64971447,  0.09943791,  0.25033708, -0.25871192],
       [-0.35418648,  0.56402774, -0.49138533, -0.26673704]])

In [None]:
df.describe()
# Shows a quick statistic summary of your data

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,-0.190012,0.482284,-0.834181,-0.453352
std,0.626051,1.139191,1.066022,0.798467
min,-0.649714,-1.013858,-2.556374,-1.747194
25%,-0.438246,-0.206418,-1.373198,-0.638137
50%,-0.380428,0.331733,-0.511016,-0.373095
75%,-0.350024,1.286922,-0.140075,-0.260718
max,1.067907,2.024578,0.250337,0.723015


In [None]:
df.T
# Transposing your data using T

Unnamed: 0,2021-01-06,2021-01-07,2021-01-08,2021-01-09,2021-01-10,2021-01-11
A,1.067907,-0.406669,-0.448772,-0.348636,-0.649714,-0.354186
B,-0.30837,1.527887,2.024578,-1.013858,0.099438,0.564028
C,-2.556374,-0.530647,-1.654048,-0.022972,0.250337,-0.491385
D,-0.479453,-1.747194,0.723015,-0.691032,-0.258712,-0.266737


In [None]:
df.sort_index(axis=1,ascending=False)
# sorting based on axis 1

Unnamed: 0,D,C,B,A
2021-01-06,-0.479453,-2.556374,-0.30837,1.067907
2021-01-07,-1.747194,-0.530647,1.527887,-0.406669
2021-01-08,0.723015,-1.654048,2.024578,-0.448772
2021-01-09,-0.691032,-0.022972,-1.013858,-0.348636
2021-01-10,-0.258712,0.250337,0.099438,-0.649714
2021-01-11,-0.266737,-0.491385,0.564028,-0.354186


In [None]:
df.sort_values(by="C")

Unnamed: 0,A,B,C,D
2021-01-06,1.067907,-0.30837,-2.556374,-0.479453
2021-01-08,-0.448772,2.024578,-1.654048,0.723015
2021-01-07,-0.406669,1.527887,-0.530647,-1.747194
2021-01-11,-0.354186,0.564028,-0.491385,-0.266737
2021-01-09,-0.348636,-1.013858,-0.022972,-0.691032
2021-01-10,-0.649714,0.099438,0.250337,-0.258712


#### Getting Data

In [None]:
df["A"]
# Getting a single column values

2021-01-06    1.067907
2021-01-07   -0.406669
2021-01-08   -0.448772
2021-01-09   -0.348636
2021-01-10   -0.649714
2021-01-11   -0.354186
Freq: D, Name: A, dtype: float64

In [None]:
df[0:3]

Unnamed: 0,A,B,C,D
2021-01-06,1.067907,-0.30837,-2.556374,-0.479453
2021-01-07,-0.406669,1.527887,-0.530647,-1.747194
2021-01-08,-0.448772,2.024578,-1.654048,0.723015


In [None]:
df["20210108":]

Unnamed: 0,A,B,C,D
2021-01-08,-0.448772,2.024578,-1.654048,0.723015
2021-01-09,-0.348636,-1.013858,-0.022972,-0.691032
2021-01-10,-0.649714,0.099438,0.250337,-0.258712
2021-01-11,-0.354186,0.564028,-0.491385,-0.266737


#### Selection by label

In [None]:
df.loc[dates[2]]

A   -0.448772
B    2.024578
C   -1.654048
D    0.723015
Name: 2021-01-08 00:00:00, dtype: float64

In [None]:
df.loc[dates[1],"B"]
# To fetch a scalar value of DataFrame

1.5278870814867447

#### Selection by Position

* row and column

In [None]:
df.iloc[2]

A   -0.448772
B    2.024578
C   -1.654048
D    0.723015
Name: 2021-01-08 00:00:00, dtype: float64

In [None]:
df.iloc[:,1:3]

Unnamed: 0,B,C
2021-01-06,-0.30837,-2.556374
2021-01-07,1.527887,-0.530647
2021-01-08,2.024578,-1.654048
2021-01-09,-1.013858,-0.022972
2021-01-10,0.099438,0.250337
2021-01-11,0.564028,-0.491385


#### Boolean Indexing

In [None]:
df[df["A"] > 0]

Unnamed: 0,A,B,C,D
2021-01-06,1.067907,-0.30837,-2.556374,-0.479453


In [None]:
df[df > 0]

Unnamed: 0,A,B,C,D
2021-01-06,1.067907,,,
2021-01-07,,1.527887,,
2021-01-08,,2.024578,,0.723015
2021-01-09,,,,
2021-01-10,,0.099438,0.250337,
2021-01-11,,0.564028,,


In [None]:
df2 = df.copy()

In [None]:
df2["E"] = ["one","two","three","four","five","six"]

In [None]:
df2

Unnamed: 0,A,B,C,D,E
2021-01-06,1.067907,-0.30837,-2.556374,-0.479453,one
2021-01-07,-0.406669,1.527887,-0.530647,-1.747194,two
2021-01-08,-0.448772,2.024578,-1.654048,0.723015,three
2021-01-09,-0.348636,-1.013858,-0.022972,-0.691032,four
2021-01-10,-0.649714,0.099438,0.250337,-0.258712,five
2021-01-11,-0.354186,0.564028,-0.491385,-0.266737,six


In [None]:
df2[df2["E"].isin(["three","five"])]

Unnamed: 0,A,B,C,D,E
2021-01-08,-0.448772,2.024578,-1.654048,0.723015,three
2021-01-10,-0.649714,0.099438,0.250337,-0.258712,five


#### Setting

In [None]:
s1 = pd.Series([1,2,3,4,5,6], index = pd.date_range("20210106", periods=6))

In [None]:
s1

2021-01-06    1
2021-01-07    2
2021-01-08    3
2021-01-09    4
2021-01-10    5
2021-01-11    6
Freq: D, dtype: int64

In [None]:
df["F"] = s1

In [None]:
df.at[dates[0],"A"] = 2

In [None]:
df.iat[0,1] = 0

In [None]:
df.loc[:,"D"] = np.array([5] * len(df))

In [None]:
df

Unnamed: 0,A,B,C,D,F
2021-01-06,2.0,0.0,-2.556374,5,1
2021-01-07,-0.406669,1.527887,-0.530647,5,2
2021-01-08,-0.448772,2.024578,-1.654048,5,3
2021-01-09,-0.348636,-1.013858,-0.022972,5,4
2021-01-10,-0.649714,0.099438,0.250337,5,5
2021-01-11,-0.354186,0.564028,-0.491385,5,6


In [None]:
df2 = df.copy()

In [None]:
df2[df2 > 0] = -df2

In [None]:
df2

Unnamed: 0,A,B,C,D,F
2021-01-06,-2.0,0.0,-2.556374,-5,-1
2021-01-07,-0.406669,-1.527887,-0.530647,-5,-2
2021-01-08,-0.448772,-2.024578,-1.654048,-5,-3
2021-01-09,-0.348636,-1.013858,-0.022972,-5,-4
2021-01-10,-0.649714,-0.099438,-0.250337,-5,-5
2021-01-11,-0.354186,-0.564028,-0.491385,-5,-6


#### Missing data

In [51]:
df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ["E"])

In [52]:
df1.loc[dates[0] : dates[1], "E"] = 1

In [53]:
df1

Unnamed: 0,A,B,C,D,F,E
2021-01-06,2.0,0.0,-1.233281,5,1,1.0
2021-01-07,-1.039917,0.639161,-0.082168,5,2,1.0
2021-01-08,-1.764941,-1.161919,1.159493,5,3,
2021-01-09,0.625021,-1.13728,-0.706602,5,4,


In [54]:
df1.dropna(how="any")

Unnamed: 0,A,B,C,D,F,E
2021-01-06,2.0,0.0,-1.233281,5,1,1.0
2021-01-07,-1.039917,0.639161,-0.082168,5,2,1.0


In [56]:
df1.fillna(value=5)

Unnamed: 0,A,B,C,D,F,E
2021-01-06,2.0,0.0,-1.233281,5,1,1.0
2021-01-07,-1.039917,0.639161,-0.082168,5,2,1.0
2021-01-08,-1.764941,-1.161919,1.159493,5,3,5.0
2021-01-09,0.625021,-1.13728,-0.706602,5,4,5.0


In [57]:
pd.isna(df1)

Unnamed: 0,A,B,C,D,F,E
2021-01-06,False,False,False,False,False,False
2021-01-07,False,False,False,False,False,False
2021-01-08,False,False,False,False,False,True
2021-01-09,False,False,False,False,False,True


In [59]:
# Operations in general exclude missing data
df.mean()

A   -0.009501
B   -0.203629
C    0.348555
D    5.000000
F    3.500000
dtype: float64

In [60]:
df1.mean(1)

2021-01-06    1.294453
2021-01-07    1.252846
2021-01-08    1.246527
2021-01-09    1.556228
Freq: D, dtype: float64

In [61]:
s = pd.Series([1,3,5,np.nan,6,8], index=dates).shift(2)

In [62]:
s

2021-01-06    NaN
2021-01-07    NaN
2021-01-08    1.0
2021-01-09    3.0
2021-01-10    5.0
2021-01-11    NaN
Freq: D, dtype: float64

In [63]:
df.sub(s, axis="index")

Unnamed: 0,A,B,C,D,F
2021-01-06,,,,,
2021-01-07,,,,,
2021-01-08,-2.764941,-2.161919,0.159493,4.0,2.0
2021-01-09,-2.374979,-4.13728,-3.706602,2.0,1.0
2021-01-10,-5.127768,-3.985697,-2.640157,0.0,0.0
2021-01-11,,,,,


In [64]:
df.apply()

Unnamed: 0,A,B,C,D,F
2021-01-06,2.0,0.0,-1.233281,5,1
2021-01-07,0.960083,0.639161,-1.315449,10,3
2021-01-08,-0.804858,-0.522757,-0.155956,15,6
2021-01-09,-0.179837,-1.660038,-0.862558,20,10
2021-01-10,-0.307605,-0.645735,1.497285,25,15
2021-01-11,-0.057006,-1.221775,2.091331,30,21


#### String Methods

In [65]:
s = pd.Series(["A","B","C","Abba","Baca",np.nan,"CABA"])

In [66]:
s.str.lower()

0       a
1       b
2       c
3    abba
4    baca
5     NaN
6    caba
dtype: object

#### Concat pandas object

* Concat keeps the indexing from the different object as it is.

In [79]:
df = pd.DataFrame(np.random.randn(10,4))

In [69]:
df

Unnamed: 0,0,1,2,3
0,-0.432301,0.590735,1.61509,1.22289
1,0.62739,-0.514128,1.422393,1.678491
2,0.100067,-0.609705,0.761225,-0.224933
3,-0.076163,-0.285814,-1.319732,-1.964078
4,0.095517,-1.10423,-0.731411,1.738359
5,-0.275003,0.971759,0.194789,0.719807
6,-1.520591,0.573221,1.330478,-0.028202
7,1.505982,-0.291092,1.17414,-0.761478
8,1.925473,1.488456,0.065545,-1.343509
9,0.099426,-0.363542,0.271586,-0.895839


In [80]:
pieces = [df[:3],df[3:7],df[7:]]

In [81]:
pd.concat(pieces)

Unnamed: 0,0,1,2,3
0,-0.317809,-0.192941,-1.416179,1.195075
1,0.296277,0.295111,-0.368593,1.354334
2,-0.3562,0.635304,0.091062,1.014288
3,-0.257395,1.146284,0.647549,-0.84046
4,0.810624,-0.299929,0.57741,0.138512
5,-1.947645,-0.871479,-0.067358,0.602588
6,0.615519,-0.339106,-1.318752,0.650193
7,-0.599134,-2.625518,-0.612502,-1.273615
8,-0.511269,-1.407373,1.084909,2.172842
9,0.517993,1.088953,0.847549,2.33975


#### Join

* merge() reindex the two pandas object when merging

In [82]:
left = pd.DataFrame({"key":["foo","foo"], "lval":[1,2]})

In [83]:
right = pd.DataFrame({"key":["foo","foo"], "rval":[4,5]})

In [84]:
  left

Unnamed: 0,key,lval
0,foo,1
1,foo,2


In [85]:
right

Unnamed: 0,key,rval
0,foo,4
1,foo,5


In [86]:
pd.merge(left,right,on="key")

Unnamed: 0,key,lval,rval
0,foo,1,4
1,foo,1,5
2,foo,2,4
3,foo,2,5
