In [1]:
%%html
<h2>Chapter 5 - Getting started with pandas</h2>

In [56]:
import pandas as pd
import numpy as np

In [34]:
%%html
<h3>Pandas Data Structures - Series</h3>

Code examples are taken from <a href="https://github.com/wesm/pydata-book/blob/3rd-edition/ch05.ipynb">https://github.com/wesm/pydata-book/blob/3rd-edition/ch05.ipynb</a>

In [4]:
# series is a one dimension array like object containing sequence of values
obj = pd.Series([4,7,-5, 3])
obj
# left most column indicates the indices of the series

0    4
1    7
2   -5
3    3
dtype: int64

In [8]:
obj.index

RangeIndex(start=0, stop=4, step=1)

In [10]:
# we can manually specify the index
obj2 = pd.Series([4,7,-5,3], index=["d","b","a","c"])
obj2

d    4
b    7
a   -5
c    3
dtype: int64

In [11]:
#We can get the value based on the index
obj2["d"]

4

In [12]:
# we can get multiple values by providing a list
obj2[["d","b"]]

d    4
b    7
dtype: int64

In [13]:
# we can filter

obj2[obj2>0]

d    4
b    7
c    3
dtype: int64

In [14]:
# We can create a series from python dictionary. In this case key become index and value become series's value.

In [15]:
# a series can be converted to dictionary
obj2.to_dict()

{'d': 4, 'b': 7, 'a': -5, 'c': 3}

In [16]:
# we can check got NaN

pd.isna(obj2)

d    False
b    False
a    False
c    False
dtype: bool

In [17]:
obj2.isna()

d    False
b    False
a    False
c    False
dtype: bool

In [18]:
pd.notna(obj2)

d    True
b    True
a    True
c    True
dtype: bool

In [19]:
obj2.notna()

d    True
b    True
a    True
c    True
dtype: bool

In [21]:
# We can do arithmentic - the index ahould be same other wise we will get NaN
obj+ obj2

0   NaN
1   NaN
2   NaN
3   NaN
a   NaN
b   NaN
c   NaN
d   NaN
dtype: float64

In [22]:
obj+obj

0     8
1    14
2   -10
3     6
dtype: int64

In [26]:
# We can add the name to series and index
obj2.index.name="somethig"
obj2.name = "series_name"
obj2

somethig
d    4
b    7
a   -5
c    3
Name: series_name, dtype: int64

In [27]:
# we can change the index by assigning a new value

In [30]:
obj2.index = ["w","x","y","z"]
obj2

w    4
x    7
y   -5
z    3
Name: series_name, dtype: int64

In [32]:
%%html
<h3>Pandas Data Structures - DataFrame</h3>

In [35]:
# A DF represents a rectangular table of data containe sordered,named collection of coluns , each of which can be difeerent value types(numeric string, boolean etc).
#It's similar to a table in the 2 dimensional df. It can be used to represent higher dimenional data in a tabbular formt

In [37]:
data = {"state": ["Ohio", "Ohio", "Ohio", "Nevada", "Nevada", "Nevada"],
        "year": [2000, 2001, 2002, 2001, 2002, 2003],
        "pop": [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}
df = pd.DataFrame(data)
df

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [38]:
df.head()

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9


In [39]:
df.tail()

Unnamed: 0,state,year,pop
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [40]:
# We can arrange th order of the columns
pd.DataFrame(data, columns=["year","state","pop"])

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9
5,2003,Nevada,3.2


In [64]:
# we can pass a column that is not in the original data
df = pd.DataFrame(data, columns=["year","state","pop","debt s"])

In [49]:
%%html
<h4>Data retrieval from column</h4>

In [44]:
# We can retrieve the data in the column using dictionary type or dot attribute type. Note: Column name should ne aligned with variable syntax in in order to use dot attribute type
df["year"]

0    2000
1    2001
2    2002
3    2001
4    2002
5    2003
Name: year, dtype: int64

In [45]:
df.year

0    2000
1    2001
2    2002
3    2001
4    2002
5    2003
Name: year, dtype: int64

In [46]:
df["debt s"]

0    NaN
1    NaN
2    NaN
3    NaN
4    NaN
5    NaN
Name: debt s, dtype: object

In [48]:
df.debt s

SyntaxError: invalid syntax (3402918386.py, line 1)

In [50]:
%%html
<h4>Data retrieval from row</h4>

In [51]:
# we can use loc & iloc(mainly for integer location)
# in the above example index is integer so we can use both loc & iloc

df.loc[1]

year      2001
state     Ohio
pop        1.7
debt s     NaN
Name: 1, dtype: object

In [52]:
df.iloc[1]

year      2001
state     Ohio
pop        1.7
debt s     NaN
Name: 1, dtype: object

In [54]:
# we can modify the column values

df["debt s"] = 16
df

Unnamed: 0,year,state,pop,debt s
0,2000,Ohio,1.5,16
1,2001,Ohio,1.7,16
2,2002,Ohio,3.6,16
3,2001,Nevada,2.4,16
4,2002,Nevada,2.9,16
5,2003,Nevada,3.2,16


In [65]:
df["debt s"] = np.arange(6)
df

Unnamed: 0,year,state,pop,debt s
0,2000,Ohio,1.5,0
1,2001,Ohio,1.7,1
2,2002,Ohio,3.6,2
3,2001,Nevada,2.4,3
4,2002,Nevada,2.9,4
5,2003,Nevada,3.2,5


In [66]:
# we can delete the column
del df["debt s"]
df

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9
5,2003,Nevada,3.2


In [73]:
# we can do transpose which swap rows and columns
new_df = df.T
new_df.index

Index(['year', 'state', 'pop'], dtype='object')

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9
5,2003,Nevada,3.2


In [74]:
populations = {"Ohio": {2000: 1.5, 2001: 1.7, 2002: 3.6},
               "Nevada": {2001: 2.4, 2002: 2.9}}
df3 = pd.DataFrame(populations)
df3

Unnamed: 0,Ohio,Nevada
2000,1.5,
2001,1.7,2.4
2002,3.6,2.9


In [76]:
df3.index.name = "year"
df3.columns.name = "state"
df

state,year,state.1,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9
5,2003,Nevada,3.2


In [77]:
# We can comvert it to numpy
df.to_numpy()

array([[2000, 'Ohio', 1.5],
       [2001, 'Ohio', 1.7],
       [2002, 'Ohio', 3.6],
       [2001, 'Nevada', 2.4],
       [2002, 'Nevada', 2.9],
       [2003, 'Nevada', 3.2]], dtype=object)

In [81]:
%%html
<h3>Index objects</h3>

Index objects are responsible for holding the axis labels inclusing a DF's column names

In [82]:
df.columns

Index(['year', 'state', 'pop'], dtype='object', name='state')

In [83]:
df.index

RangeIndex(start=0, stop=6, step=1)

In [84]:
# row index is refered by df.index and column index is referred by df.columns

In [85]:
%%html
<h3>Reindexing</h3>

In [86]:
obj = pd.Series([4.5, 7.2, -5.3, 3.6], index=["d", "b", "a", "c"])
obj

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

In [88]:
# For the series, we can refer it by directly passing the new index list
obj2 = obj.reindex(["a", "b", "c", "d", "e"])
obj2

a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64

In [92]:
#For dataframes, you need to mention index for the row and columns for the columns
df3 = pd.DataFrame(np.arange(9).reshape((3, 3)),
                     index=["a", "c", "d"],
                     columns=["Ohio", "Texas", "California"])
df3

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [95]:
df3.reindex(index=["a","b", "c","d"])

Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


In [96]:
df3.reindex(columns=["Texas", "Utah", "California"])

Unnamed: 0,Texas,Utah,California
a,1,,2
c,4,,5
d,7,,8


In [102]:
#Another way to reindex is using axis which can be similar to series with only addition of axis=index or columns or 0 or 1
df3 = df3.reindex(["Texas", "Utah", "California"], axis=1)


In [103]:
%%html
<h3>Dropping entries from an Axis</h3>


In [104]:
df3.drop(columns="Utah")

Unnamed: 0,Texas,California
a,1,2
c,4,5
d,7,8


In [108]:
df3 = df3.reindex(columns=["Texas", "Utah", "California"])
df3.drop(columns=["Texas", "Utah"])

Unnamed: 0,California
a,2
c,5
d,8


In [111]:
df3.drop(index=["d"])

Unnamed: 0,Texas,Utah,California
a,1,,2
c,4,,5


In [112]:
%%html
<h3>Indexing, Selection, and Filtering</h3>

In [113]:
%%html
Pandas indexing methods
<img src='images/pandas_slicing.png', width=500/>

In [128]:
# similar to numpy, we can do the slicing with df as  show in the picture above

data = pd.DataFrame(np.arange(16).reshape((4, 4)),
                    index=["Ohio", "Colorado", "Utah", "New York"],
                    columns=["one", "two", "three", "four"])
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [117]:
#df.loc[rows]
data.loc["Ohio"]

one      0
two      1
three    2
four     3
Name: Ohio, dtype: int64

In [119]:
#df.loc[rows,columns]
data.loc["Ohio","one"]

0

In [121]:
# In pnadas end index is inclusive in slicing. if you do just : means all
#df.loc[rowIndexStart: rowIndexEnd, columnIndexStart:columnIndexEnd]

data.loc["Ohio": "Utah", :]

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11


In [122]:
data.loc["Ohio": "Utah", "one":"two"]

Unnamed: 0,one,two
Ohio,0,1
Colorado,4,5
Utah,8,9


In [123]:
data.loc["Ohio": "Utah", "two":]

Unnamed: 0,two,three,four
Ohio,1,2,3
Colorado,5,6,7
Utah,9,10,11


In [124]:
data.loc["Ohio": "Utah", "two"]

Ohio        1
Colorado    5
Utah        9
Name: two, dtype: int64

In [129]:
data.loc[data["three"]>2]

Unnamed: 0,one,two,three,four
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [126]:
# We can do conditional filling - the below fill all the values in the row with 0 if the conditio matches

data.loc[data["four"]>5] = 3
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,3,3,3,3
Utah,3,3,3,3
New York,3,3,3,3


In [131]:
%%html
<h3>Arithmetic & Data Alignment<h3>

In [132]:
# pg.no.152