# Pandas - Python Library

In [2]:
# Installing libraries (run once if they are not already installed)
# In a terminal:  pip3 install pandas numpy
# In a notebook:  %pip install pandas numpy

### Importing libraries

In [3]:
import pandas as pd
import numpy as np

In [4]:
# Object creation: a Series is a single labelled column of values.
# The np.nan entry is a float, so the integers are stored as float64 too.
s = pd.Series(
    data=[1, 3, np.nan, 5, 7, 8, 9],
)
s

0    1.0
1    3.0
2    NaN
3    5.0
4    7.0
5    8.0
6    9.0
dtype: float64

In [5]:
# Six consecutive calendar days starting on 2023-01-01 (daily frequency).
dates = pd.date_range(start='20230101', periods=6, freq='D')
dates

DatetimeIndex(['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04',
               '2023-01-05', '2023-01-06'],
              dtype='datetime64[ns]', freq='D')

In [6]:
# 6x4 frame of standard-normal draws, indexed by `dates`, with columns A-D.
# A seeded Generator makes "Restart Kernel -> Run All" reproduce the same numbers;
# the unseeded np.random.randn call produced different values on every run.
rng = np.random.default_rng(42)
df = pd.DataFrame(rng.standard_normal((6, 4)), index=dates, columns=list("ABCD"))
df

Unnamed: 0,A,B,C,D
2023-01-01,0.42984,-2.303945,0.515375,0.579725
2023-01-02,-0.344853,2.235077,0.975512,1.189974
2023-01-03,0.484602,-0.371021,0.273077,-0.722815
2023-01-04,0.613219,-0.630227,0.437497,0.927328
2023-01-05,-0.092483,0.195029,-0.934935,0.58636
2023-01-06,0.451228,0.435695,-0.00357,0.9666


In [7]:
# Creating a DataFrame from a dictionary: every key becomes a column.
# Scalar entries (A, B, F) are repeated for each of the 4 rows defined by
# the array-like entries (C, D, E).
df2 = pd.DataFrame(
    data=dict(
        A=1.0,
        B=pd.Timestamp('20230723'),
        C=pd.Series(1, index=[0, 1, 2, 3], dtype=float),
        D=np.full(4, 3, dtype="int32"),
        E=pd.Categorical(['girl', 'woman', 'girl', 'woman']),
        F='female',
    )
)
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2023-07-23,1.0,3,girl,female
1,1.0,2023-07-23,1.0,3,woman,female
2,1.0,2023-07-23,1.0,3,girl,female
3,1.0,2023-07-23,1.0,3,woman,female


In [8]:
# Each column of a DataFrame keeps its own dtype.
df2.dtypes

A           float64
B    datetime64[ns]
C           float64
D             int32
E          category
F            object
dtype: object

In [9]:
# First 3 rows.
df.head(3)

Unnamed: 0,A,B,C,D
2023-01-01,0.42984,-2.303945,0.515375,0.579725
2023-01-02,-0.344853,2.235077,0.975512,1.189974
2023-01-03,0.484602,-0.371021,0.273077,-0.722815


In [10]:
# Last 2 rows.
df.tail(2)

Unnamed: 0,A,B,C,D
2023-01-05,-0.092483,0.195029,-0.934935,0.58636
2023-01-06,0.451228,0.435695,-0.00357,0.9666


In [11]:
df.index  # Row labels of the frame (here the DatetimeIndex built from `dates`)

DatetimeIndex(['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04',
               '2023-01-05', '2023-01-06'],
              dtype='datetime64[ns]', freq='D')

In [12]:
# Row labels of df2: the integer index 0-3.
df2.index

Index([0, 1, 2, 3], dtype='int64')

In [13]:
df.to_numpy()  # Convert the frame's values to a NumPy array (index and column labels are not included)

array([[ 0.42983952, -2.30394507,  0.51537526,  0.57972472],
       [-0.34485318,  2.23507708,  0.9755117 ,  1.18997399],
       [ 0.48460216, -0.37102057,  0.27307743, -0.72281481],
       [ 0.61321931, -0.63022726,  0.43749663,  0.92732778],
       [-0.09248275,  0.19502924, -0.93493453,  0.58635987],
       [ 0.4512284 ,  0.43569542, -0.00357007,  0.9665998 ]])

In [14]:
# With mixed column dtypes the resulting array has dtype=object.
df2.to_numpy()

array([[1.0, Timestamp('2023-07-23 00:00:00'), 1.0, 3, 'girl', 'female'],
       [1.0, Timestamp('2023-07-23 00:00:00'), 1.0, 3, 'woman', 'female'],
       [1.0, Timestamp('2023-07-23 00:00:00'), 1.0, 3, 'girl', 'female'],
       [1.0, Timestamp('2023-07-23 00:00:00'), 1.0, 3, 'woman', 'female']],
      dtype=object)

In [15]:
# Summary statistics per column: count, mean, std, min, quartiles and max.
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.256926,-0.073232,0.210493,0.587862
std,0.382272,1.485548,0.646758,0.683983
min,-0.344853,-2.303945,-0.934935,-0.722815
25%,0.038098,-0.565426,0.065592,0.581384
50%,0.440534,-0.087996,0.355287,0.756844
75%,0.476259,0.375529,0.495906,0.956782
max,0.613219,2.235077,0.975512,1.189974


In [16]:
df.T  # Transpose: the dates become columns and A-D become rows

Unnamed: 0,2023-01-01,2023-01-02,2023-01-03,2023-01-04,2023-01-05,2023-01-06
A,0.42984,-0.344853,0.484602,0.613219,-0.092483,0.451228
B,-2.303945,2.235077,-0.371021,-0.630227,0.195029,0.435695
C,0.515375,0.975512,0.273077,0.437497,-0.934935,-0.00357
D,0.579725,1.189974,-0.722815,0.927328,0.58636,0.9666


In [17]:
# Sorting by labels: axis=1 sorts the column labels, and ascending=False
# puts them in reverse order (D, C, B, A).
df.sort_index(axis=1, ascending=False)

Unnamed: 0,D,C,B,A
2023-01-01,0.579725,0.515375,-2.303945,0.42984
2023-01-02,1.189974,0.975512,2.235077,-0.344853
2023-01-03,-0.722815,0.273077,-0.371021,0.484602
2023-01-04,0.927328,0.437497,-0.630227,0.613219
2023-01-05,0.58636,-0.934935,0.195029,-0.092483
2023-01-06,0.9666,-0.00357,0.435695,0.451228


In [18]:
# Column labels in ascending order (A, B, C, D).
df.sort_index(axis=1, ascending=True)

Unnamed: 0,A,B,C,D
2023-01-01,0.42984,-2.303945,0.515375,0.579725
2023-01-02,-0.344853,2.235077,0.975512,1.189974
2023-01-03,0.484602,-0.371021,0.273077,-0.722815
2023-01-04,0.613219,-0.630227,0.437497,0.927328
2023-01-05,-0.092483,0.195029,-0.934935,0.58636
2023-01-06,0.451228,0.435695,-0.00357,0.9666


In [19]:
# Sort the rows by the values in column B (ascending is the default).
df.sort_values(by="B")

Unnamed: 0,A,B,C,D
2023-01-01,0.42984,-2.303945,0.515375,0.579725
2023-01-04,0.613219,-0.630227,0.437497,0.927328
2023-01-03,0.484602,-0.371021,0.273077,-0.722815
2023-01-05,-0.092483,0.195029,-0.934935,0.58636
2023-01-06,0.451228,0.435695,-0.00357,0.9666
2023-01-02,-0.344853,2.235077,0.975512,1.189974


In [20]:
# ascending=True is the default, so the row order is the same as sort_values(by="B");
# pass ascending=False to sort column B from largest to smallest instead.
df.sort_values(by="B", ascending=True)

Unnamed: 0,A,B,C,D
2023-01-01,0.42984,-2.303945,0.515375,0.579725
2023-01-04,0.613219,-0.630227,0.437497,0.927328
2023-01-03,0.484602,-0.371021,0.273077,-0.722815
2023-01-05,-0.092483,0.195029,-0.934935,0.58636
2023-01-06,0.451228,0.435695,-0.00357,0.9666
2023-01-02,-0.344853,2.235077,0.975512,1.189974


In [21]:
# Selecting a single column by its label returns that column as a Series.
df["B"]

2023-01-01   -2.303945
2023-01-02    2.235077
2023-01-03   -0.371021
2023-01-04   -0.630227
2023-01-05    0.195029
2023-01-06    0.435695
Freq: D, Name: B, dtype: float64

In [22]:
# Row-wise selection: slicing with [] selects rows by position.
# The stop value (10) is past the end of the 6-row frame, so every row is returned.
df[0:10]

Unnamed: 0,A,B,C,D
2023-01-01,0.42984,-2.303945,0.515375,0.579725
2023-01-02,-0.344853,2.235077,0.975512,1.189974
2023-01-03,0.484602,-0.371021,0.273077,-0.722815
2023-01-04,0.613219,-0.630227,0.437497,0.927328
2023-01-05,-0.092483,0.195029,-0.934935,0.58636
2023-01-06,0.451228,0.435695,-0.00357,0.9666


In [25]:
# Select one row by its index label (the first date); the result is a Series.
df.loc[dates[0]]

A    0.429840
B   -2.303945
C    0.515375
D    0.579725
Name: 2023-01-01 00:00:00, dtype: float64

In [28]:
# All rows (:), restricted to the columns labelled A and B.
df.loc[:, ['A', 'B']]

Unnamed: 0,A,B
2023-01-01,0.42984,-2.303945
2023-01-02,-0.344853,2.235077
2023-01-03,0.484602,-0.371021
2023-01-04,0.613219,-0.630227
2023-01-05,-0.092483,0.195029
2023-01-06,0.451228,0.435695


In [30]:
df.loc['20230102':'20230104', ['A', 'B']]  # Label slice from 2023-01-02 to 2023-01-04 (both endpoints included), columns A & B

Unnamed: 0,A,B
2023-01-02,-0.344853,2.235077
2023-01-03,0.484602,-0.371021
2023-01-04,0.613219,-0.630227


In [31]:
df.loc['20230104', ['A', 'B', 'C']]  # Only the 2023-01-04 row, columns A, B & C (returned as a Series)

A    0.613219
B   -0.630227
C    0.437497
Name: 2023-01-04 00:00:00, dtype: float64

In [33]:
# Scalar access: .at returns the single value at one row label and one column label.
df.at[dates[3], "A"]

0.6132193054559238

In [37]:
df.iloc[3]  # Row at integer position 3, i.e. the 4th row (positions start at 0)

A    0.613219
B   -0.630227
C    0.437497
D    0.927328
Name: 2023-01-04 00:00:00, dtype: float64

In [38]:
# Rows at positions 0-4 (the stop position 5 is excluded).
df.iloc[0:5]

Unnamed: 0,A,B,C,D
2023-01-01,0.42984,-2.303945,0.515375,0.579725
2023-01-02,-0.344853,2.235077,0.975512,1.189974
2023-01-03,0.484602,-0.371021,0.273077,-0.722815
2023-01-04,0.613219,-0.630227,0.437497,0.927328
2023-01-05,-0.092483,0.195029,-0.934935,0.58636


In [39]:
# Rows at positions 0-4 and columns at positions 0-2 (A, B, C).
df.iloc[0:5, 0:3]

Unnamed: 0,A,B,C
2023-01-01,0.42984,-2.303945,0.515375
2023-01-02,-0.344853,2.235077,0.975512
2023-01-03,0.484602,-0.371021,0.273077
2023-01-04,0.613219,-0.630227,0.437497
2023-01-05,-0.092483,0.195029,-0.934935


In [40]:
# Rows at positions 0-4, all columns.
df.iloc[0:5, :]

Unnamed: 0,A,B,C,D
2023-01-01,0.42984,-2.303945,0.515375,0.579725
2023-01-02,-0.344853,2.235077,0.975512,1.189974
2023-01-03,0.484602,-0.371021,0.273077,-0.722815
2023-01-04,0.613219,-0.630227,0.437497,0.927328
2023-01-05,-0.092483,0.195029,-0.934935,0.58636


In [42]:
# All rows, columns at positions 0-1 (A, B).
df.iloc[:, 0:2]

Unnamed: 0,A,B
2023-01-01,0.42984,-2.303945
2023-01-02,-0.344853,2.235077
2023-01-03,0.484602,-0.371021
2023-01-04,0.613219,-0.630227
2023-01-05,-0.092483,0.195029
2023-01-06,0.451228,0.435695


### Boolean Operators

In [43]:
# Keep only the rows where column A is positive.
df[df['A'] > 0]

Unnamed: 0,A,B,C,D
2023-01-01,0.42984,-2.303945,0.515375,0.579725
2023-01-03,0.484602,-0.371021,0.273077,-0.722815
2023-01-04,0.613219,-0.630227,0.437497,0.927328
2023-01-06,0.451228,0.435695,-0.00357,0.9666


In [44]:
# Keep only the rows where column B is greater than 1.2.
df[df['B'] > 1.2]

Unnamed: 0,A,B,C,D
2023-01-02,-0.344853,2.235077,0.975512,1.189974


In [46]:
# Element-wise mask: the shape is unchanged and values that are not > 0 become NaN.
df[df > 0]

Unnamed: 0,A,B,C,D
2023-01-01,0.42984,,0.515375,0.579725
2023-01-02,,2.235077,0.975512,1.189974
2023-01-03,0.484602,,0.273077,
2023-01-04,0.613219,,0.437497,0.927328
2023-01-05,,0.195029,,0.58636
2023-01-06,0.451228,0.435695,,0.9666


In [45]:
# A list of labels inside [] (hence the double brackets) selects several columns.
# The mask only covers A and B, so C and D come back entirely as NaN.
df[df[['A', 'B']] > 0]

Unnamed: 0,A,B,C,D
2023-01-01,0.42984,,,
2023-01-02,,2.235077,,
2023-01-03,0.484602,,,
2023-01-04,0.613219,,,
2023-01-05,,0.195029,,
2023-01-06,0.451228,0.435695,,


In [47]:
# Work on an independent copy so that adding columns does not modify df.
# NOTE: this rebinds df2, which until this cell referred to the dictionary-built frame.
df2 = df.copy()

In [51]:
# Add a string column 'E' with one label per row.
# NOTE(review): the stored output also shows a 'MyColumn' column that no cell in this
# notebook creates - presumably left over from a removed cell; confirm and re-run.
df2['E'] = ['one', 'two', 'three', 'four', 'five', 'six']
df2

Unnamed: 0,A,B,C,D,MyColumn,E
2023-01-01,0.42984,-2.303945,0.515375,0.579725,one,one
2023-01-02,-0.344853,2.235077,0.975512,1.189974,two,two
2023-01-03,0.484602,-0.371021,0.273077,-0.722815,three,three
2023-01-04,0.613219,-0.630227,0.437497,0.927328,four,four
2023-01-05,-0.092483,0.195029,-0.934935,0.58636,five,five
2023-01-06,0.451228,0.435695,-0.00357,0.9666,six,six


In [68]:
# Delete (drop) the 'MyColumn' column if it is present.
# errors='ignore' keeps this cell re-runnable: no cell shown in this notebook creates
# 'MyColumn', so on a fresh kernel - or on a second run of this cell - a plain
# drop(columns='MyColumn') raises KeyError.
df2 = df2.drop(columns='MyColumn', errors='ignore')

In [69]:
# Display df2 after the column drop.
df2

Unnamed: 0,A,B,C,D,E
2023-01-01,0.42984,-2.303945,0.515375,0.579725,one
2023-01-02,-0.344853,2.235077,0.975512,1.189974,two
2023-01-03,0.484602,-0.371021,0.273077,-0.722815,three
2023-01-04,0.613219,-0.630227,0.437497,0.927328,four
2023-01-05,-0.092483,0.195029,-0.934935,0.58636,five
2023-01-06,0.451228,0.435695,-0.00357,0.9666,six


In [71]:
# Add a numeric column 'new' with one value per row.
df2['new'] = [1.1, 1.2, 1.3, 1.4, 1.5, 1.6]

In [72]:
# Display df2 with the 'new' column.
df2

Unnamed: 0,A,B,C,D,E,new
2023-01-01,0.42984,-2.303945,0.515375,0.579725,one,1.1
2023-01-02,-0.344853,2.235077,0.975512,1.189974,two,1.2
2023-01-03,0.484602,-0.371021,0.273077,-0.722815,three,1.3
2023-01-04,0.613219,-0.630227,0.437497,0.927328,four,1.4
2023-01-05,-0.092483,0.195029,-0.934935,0.58636,five,1.5
2023-01-06,0.451228,0.435695,-0.00357,0.9666,six,1.6


In [82]:
# Row-wise total of the four numeric columns, accumulated left to right
# starting from column A (A + B + C + D).
abcd = sum((df2[label] for label in ('B', 'C', 'D')), df2['A'])

In [83]:
# Assignment 1: add a new column named 'mean' holding the row-wise average of A, B, C and D.
df2['mean'] = abcd / 4  # abcd is the sum of the 4 columns, so dividing by 4 gives the average

In [84]:
# Display df2 with the 'mean' column.
df2

Unnamed: 0,A,B,C,D,E,new,mean
2023-01-01,0.42984,-2.303945,0.515375,0.579725,one,1.1,-0.194751
2023-01-02,-0.344853,2.235077,0.975512,1.189974,two,1.2,1.013927
2023-01-03,0.484602,-0.371021,0.273077,-0.722815,three,1.3,-0.084039
2023-01-04,0.613219,-0.630227,0.437497,0.927328,four,1.4,0.336954
2023-01-05,-0.092483,0.195029,-0.934935,0.58636,five,1.5,-0.061507
2023-01-06,0.451228,0.435695,-0.00357,0.9666,six,1.6,0.462488


In [90]:
# Keep only the first two columns (positions 0 and 1: A and B) and rebind df2 to the result.
df2 = df2.iloc[:, 0:2]
df2

Unnamed: 0,A,B
2023-01-01,0.42984,-2.303945
2023-01-02,-0.344853,2.235077
2023-01-03,0.484602,-0.371021
2023-01-04,0.613219,-0.630227
2023-01-05,-0.092483,0.195029
2023-01-06,0.451228,0.435695
