## 10 Minutes to Pandas
Written by: Pooja Dinani\
Email: piyadinani@gmail.com\
Date: 18/11/2023


In [1]:
# Importing libraries
import numpy as np
import pandas as pd 

In [4]:
# Object creation with default RangeIndex
s = pd.Series([1, 3, 5, np.nan, 6, 8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [11]:
# Object creation with datetime index using date_range()
dates = pd.date_range("20131118", periods=6)
dates

DatetimeIndex(['2013-11-18', '2013-11-19', '2013-11-20', '2013-11-21',
               '2013-11-22', '2013-11-23'],
              dtype='datetime64[ns]', freq='D')

In [12]:
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
df

Unnamed: 0,A,B,C,D
2013-11-18,-2.449298,1.961739,0.016516,1.066031
2013-11-19,1.209363,1.659395,-0.414144,-0.254825
2013-11-20,1.177539,-0.094514,0.963315,2.367925
2013-11-21,0.648059,0.799593,0.48362,0.38227
2013-11-22,0.73452,0.398593,0.05705,-1.288149
2013-11-23,0.481939,-0.686211,-0.074445,0.516233


In [14]:
# Object creation by passing dictionary (Key (Column name), Value (Column Value))
df2 = pd.DataFrame(
    {
        "A": 1.0,
        "B": pd.Timestamp("20231118"),
        "C": pd.Series(1, index=list(range(4)), dtype="float32"),
        "D": np.array([3] * 4, dtype="int32"),
        "E": pd.Categorical(["test", "train", "test", "train"]),
        "F": "foo",
    }
)

df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2023-11-18,1.0,3,test,foo
1,1.0,2023-11-18,1.0,3,train,foo
2,1.0,2023-11-18,1.0,3,test,foo
3,1.0,2023-11-18,1.0,3,train,foo


In [16]:
df2.dtypes

A          float64
B    datetime64[s]
C          float32
D            int32
E         category
F           object
dtype: object

In [18]:
# Viewing data
df.head()

Unnamed: 0,A,B,C,D
2013-11-18,-2.449298,1.961739,0.016516,1.066031
2013-11-19,1.209363,1.659395,-0.414144,-0.254825
2013-11-20,1.177539,-0.094514,0.963315,2.367925
2013-11-21,0.648059,0.799593,0.48362,0.38227
2013-11-22,0.73452,0.398593,0.05705,-1.288149


In [19]:
df.tail()

Unnamed: 0,A,B,C,D
2013-11-19,1.209363,1.659395,-0.414144,-0.254825
2013-11-20,1.177539,-0.094514,0.963315,2.367925
2013-11-21,0.648059,0.799593,0.48362,0.38227
2013-11-22,0.73452,0.398593,0.05705,-1.288149
2013-11-23,0.481939,-0.686211,-0.074445,0.516233


In [20]:
df.index

DatetimeIndex(['2013-11-18', '2013-11-19', '2013-11-20', '2013-11-21',
               '2013-11-22', '2013-11-23'],
              dtype='datetime64[ns]', freq='D')

In [21]:
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [22]:
# NumPy representation of the data
df.to_numpy()

array([[-2.4492983 ,  1.96173939,  0.01651636,  1.06603136],
       [ 1.2093626 ,  1.65939548, -0.41414394, -0.25482515],
       [ 1.17753859, -0.0945138 ,  0.96331527,  2.36792457],
       [ 0.64805925,  0.79959287,  0.48361993,  0.38227019],
       [ 0.73452005,  0.39859275,  0.05704983, -1.28814897],
       [ 0.48193884, -0.68621109, -0.07444528,  0.51623311]])

In [24]:
df2.dtypes

A          float64
B    datetime64[s]
C          float32
D            int32
E         category
F           object
dtype: object

In [25]:
df2.to_numpy()

array([[1.0, Timestamp('2023-11-18 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2023-11-18 00:00:00'), 1.0, 3, 'train', 'foo'],
       [1.0, Timestamp('2023-11-18 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2023-11-18 00:00:00'), 1.0, 3, 'train', 'foo']],
      dtype=object)

In [26]:
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.300354,0.673099,0.171985,0.464914
std,1.378311,1.01604,0.482696,1.231544
min,-2.449298,-0.686211,-0.414144,-1.288149
25%,0.523469,0.028763,-0.051705,-0.095551
50%,0.69129,0.599093,0.036783,0.449252
75%,1.066784,1.444445,0.376977,0.928582
max,1.209363,1.961739,0.963315,2.367925


In [27]:
df.T

Unnamed: 0,2013-11-18,2013-11-19,2013-11-20,2013-11-21,2013-11-22,2013-11-23
A,-2.449298,1.209363,1.177539,0.648059,0.73452,0.481939
B,1.961739,1.659395,-0.094514,0.799593,0.398593,-0.686211
C,0.016516,-0.414144,0.963315,0.48362,0.05705,-0.074445
D,1.066031,-0.254825,2.367925,0.38227,-1.288149,0.516233


In [30]:
df.sort_index(axis=1, ascending=False)

Unnamed: 0,D,C,B,A
2013-11-18,1.066031,0.016516,1.961739,-2.449298
2013-11-19,-0.254825,-0.414144,1.659395,1.209363
2013-11-20,2.367925,0.963315,-0.094514,1.177539
2013-11-21,0.38227,0.48362,0.799593,0.648059
2013-11-22,-1.288149,0.05705,0.398593,0.73452
2013-11-23,0.516233,-0.074445,-0.686211,0.481939


In [31]:
df.sort_values(by="B")

Unnamed: 0,A,B,C,D
2013-11-23,0.481939,-0.686211,-0.074445,0.516233
2013-11-20,1.177539,-0.094514,0.963315,2.367925
2013-11-22,0.73452,0.398593,0.05705,-1.288149
2013-11-21,0.648059,0.799593,0.48362,0.38227
2013-11-19,1.209363,1.659395,-0.414144,-0.254825
2013-11-18,-2.449298,1.961739,0.016516,1.066031


In [33]:
# Selection
df[["A", "B"]]

Unnamed: 0,A,B
2013-11-18,-2.449298,1.961739
2013-11-19,1.209363,1.659395
2013-11-20,1.177539,-0.094514
2013-11-21,0.648059,0.799593
2013-11-22,0.73452,0.398593
2013-11-23,0.481939,-0.686211


In [34]:
df

Unnamed: 0,A,B,C,D
2013-11-18,-2.449298,1.961739,0.016516,1.066031
2013-11-19,1.209363,1.659395,-0.414144,-0.254825
2013-11-20,1.177539,-0.094514,0.963315,2.367925
2013-11-21,0.648059,0.799593,0.48362,0.38227
2013-11-22,0.73452,0.398593,0.05705,-1.288149
2013-11-23,0.481939,-0.686211,-0.074445,0.516233


In [38]:
df.shape

(6, 4)

In [44]:
df.iloc[0:3, 0:3]

Unnamed: 0,A,B,C
2013-11-18,-2.449298,1.961739,0.016516
2013-11-19,1.209363,1.659395,-0.414144
2013-11-20,1.177539,-0.094514,0.963315


In [51]:
df["20131118": "20131121"]

Unnamed: 0,A,B,C,D
2013-11-18,-2.449298,1.961739,0.016516,1.066031
2013-11-19,1.209363,1.659395,-0.414144,-0.254825
2013-11-20,1.177539,-0.094514,0.963315,2.367925
2013-11-21,0.648059,0.799593,0.48362,0.38227


In [52]:
# Selection by label
df.loc[dates[0]]

A   -2.449298
B    1.961739
C    0.016516
D    1.066031
Name: 2013-11-18 00:00:00, dtype: float64

In [53]:
df.loc[:, ["A", "B", "C"]]

Unnamed: 0,A,B,C
2013-11-18,-2.449298,1.961739,0.016516
2013-11-19,1.209363,1.659395,-0.414144
2013-11-20,1.177539,-0.094514,0.963315
2013-11-21,0.648059,0.799593,0.48362
2013-11-22,0.73452,0.398593,0.05705
2013-11-23,0.481939,-0.686211,-0.074445


In [57]:
df.loc["20131118": "20131121", ["A", "B"]]

Unnamed: 0,A,B
2013-11-18,-2.449298,1.961739
2013-11-19,1.209363,1.659395
2013-11-20,1.177539,-0.094514
2013-11-21,0.648059,0.799593


In [59]:
df.loc[dates[1],"A"]

1.2093626046789339

In [64]:
# Selection by position
df.iloc[3]

A    0.648059
B    0.799593
C    0.483620
D    0.382270
Name: 2013-11-21 00:00:00, dtype: float64

In [61]:
df.iloc[3:5, 0:2]

Unnamed: 0,A,B
2013-11-21,0.648059,0.799593
2013-11-22,0.73452,0.398593


In [62]:
df.iloc[[1, 2, 4], [0, 2]]

Unnamed: 0,A,C
2013-11-19,1.209363,-0.414144
2013-11-20,1.177539,0.963315
2013-11-22,0.73452,0.05705


In [65]:
df.iloc[1:3, :]

Unnamed: 0,A,B,C,D
2013-11-19,1.209363,1.659395,-0.414144,-0.254825
2013-11-20,1.177539,-0.094514,0.963315,2.367925


In [66]:
df.iloc[:, 1:3]

Unnamed: 0,B,C
2013-11-18,1.961739,0.016516
2013-11-19,1.659395,-0.414144
2013-11-20,-0.094514,0.963315
2013-11-21,0.799593,0.48362
2013-11-22,0.398593,0.05705
2013-11-23,-0.686211,-0.074445


In [67]:
df.iloc[1,1]

1.6593954837714555

In [68]:
df.iat[1,1]

1.6593954837714555

In [70]:
import seaborn as sns
df_titanic = sns.load_dataset('titanic')
df_titanic

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


In [71]:
df_titanic.sample(100)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
374,0,3,female,3.0,3,1,21.0750,S,Third,child,False,,Southampton,no,False
302,0,3,male,19.0,0,0,0.0000,S,Third,man,True,,Southampton,no,True
401,0,3,male,26.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
813,0,3,female,6.0,4,2,31.2750,S,Third,child,False,,Southampton,no,False
691,1,3,female,4.0,0,1,13.4167,C,Third,child,False,,Cherbourg,yes,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
486,1,1,female,35.0,1,0,90.0000,S,First,woman,False,C,Southampton,yes,False
93,0,3,male,26.0,1,2,20.5750,S,Third,man,True,,Southampton,no,False
246,0,3,female,25.0,0,0,7.7750,S,Third,woman,False,,Southampton,no,True
104,0,3,male,37.0,2,0,7.9250,S,Third,man,True,,Southampton,no,False


In [75]:
# Code adaption 
df_titanic[df_titanic["fare"] < 5]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
179,0,3,male,36.0,0,0,0.0,S,Third,man,True,,Southampton,no,True
263,0,1,male,40.0,0,0,0.0,S,First,man,True,B,Southampton,no,True
271,1,3,male,25.0,0,0,0.0,S,Third,man,True,,Southampton,yes,True
277,0,2,male,,0,0,0.0,S,Second,man,True,,Southampton,no,True
302,0,3,male,19.0,0,0,0.0,S,Third,man,True,,Southampton,no,True
378,0,3,male,20.0,0,0,4.0125,C,Third,man,True,,Cherbourg,no,True
413,0,2,male,,0,0,0.0,S,Second,man,True,,Southampton,no,True
466,0,2,male,,0,0,0.0,S,Second,man,True,,Southampton,no,True
481,0,2,male,,0,0,0.0,S,Second,man,True,,Southampton,no,True
597,0,3,male,49.0,0,0,0.0,S,Third,man,True,,Southampton,no,True
