In [1]:
import numpy as np
import pandas as pd

## Data Structures in Pandas

### Series 

In [4]:
obj = pd.Series([4, 7, -5, 3])
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [5]:
obj.array

<NumpyExtensionArray>
[4, 7, -5, 3]
Length: 4, dtype: int64

In [6]:
obj.index

RangeIndex(start=0, stop=4, step=1)

In [7]:
obj2 = pd.Series([4, 7, -5, 3], index=["d","b","a","c"])
obj2

d    4
b    7
a   -5
c    3
dtype: int64

In [8]:
obj2.index

Index(['d', 'b', 'a', 'c'], dtype='object')

In [9]:
obj2["d"]

4

In [10]:
obj2[["d","a"]]

d    4
a   -5
dtype: int64

In [11]:
obj2[:] = 7
obj2

d    7
b    7
a    7
c    7
dtype: int64

In [12]:
obj2 = pd.Series(np.random.randint(10, size=4))
obj2

0    8
1    2
2    6
3    0
dtype: int32

In [13]:
obj2[obj2 > 3]

0    8
2    6
dtype: int32

In [14]:
obj2 * obj2

0    64
1     4
2    36
3     0
dtype: int32

In [15]:
obj2 + obj2

0    16
1     4
2    12
3     0
dtype: int32

In [16]:
obj2 - obj2

0    0
1    0
2    0
3    0
dtype: int32

In [17]:
np.exp(obj2)

0    2980.957987
1       7.389056
2     403.428793
3       1.000000
dtype: float64

In [18]:
np.mean(obj2)

4.0

In [19]:
obj3 = pd.Series([7,13,0,4], index=["a","b","c","d"])
obj3

a     7
b    13
c     0
d     4
dtype: int64

In [20]:
"d" in obj3

True

In [21]:
"e" in obj3

False

In [22]:
sdata = {"İstanbul": 19000000, "Amasya": 500000, "Giresun": 450000} # series from a dictionary
obj4 = pd.Series(sdata)
obj4

İstanbul    19000000
Amasya        500000
Giresun       450000
dtype: int64

In [23]:
obj4.to_dict() # converting a series back to a dictionary

{'İstanbul': 19000000, 'Amasya': 500000, 'Giresun': 450000}

In [24]:
indexes = ["Amasya", "Giresun", "İstanbul", "Krakow"] # overriding the order of the series
obj4 = pd.Series(sdata, index=indexes)
obj4

Amasya        500000.0
Giresun       450000.0
İstanbul    19000000.0
Krakow             NaN
dtype: float64

In [25]:
# isna and notna can be used for detecting missing data

In [26]:
pd.isna(obj4) 

Amasya      False
Giresun     False
İstanbul    False
Krakow       True
dtype: bool

In [27]:
pd.notna(obj4)

Amasya       True
Giresun      True
İstanbul     True
Krakow      False
dtype: bool

In [28]:
obj4.isna() # also this is an instance method used for same purpose like pd.isna

Amasya      False
Giresun     False
İstanbul    False
Krakow       True
dtype: bool

In [29]:
obj4

Amasya        500000.0
Giresun       450000.0
İstanbul    19000000.0
Krakow             NaN
dtype: float64

In [30]:
obj5 = pd.Series({"İstanbul": 19000000, "Amasya": 500000, "Giresun": 450000})
obj5

İstanbul    19000000
Amasya        500000
Giresun       450000
dtype: int64

In [31]:
obj4 + obj5 # automatic alignment in arithmetic operations

Amasya       1000000.0
Giresun       900000.0
Krakow             NaN
İstanbul    38000000.0
dtype: float64

In [32]:
obj4.name = "population"
obj4.index.name = "province"
obj4

province
Amasya        500000.0
Giresun       450000.0
İstanbul    19000000.0
Krakow             NaN
Name: population, dtype: float64

### DataFrame

In [34]:
data = {
    "name" : ["Doğukan", "Atakan", "Hazal Asu", "Ekinsu", "Berke Kaan", "Ece Dilara", "Yavuzhan", "Muhammed Ali"],
    "height" : [170, 170, 135, 160, 187, 160, 178, 179],
    "weight" : [75.4, 66.8, 23.2, 52.5, 85.0, 55.5, 88.7, 80.4]
}

df1 = pd.DataFrame(data)

In [35]:
df1

Unnamed: 0,name,height,weight
0,Doğukan,170,75.4
1,Atakan,170,66.8
2,Hazal Asu,135,23.2
3,Ekinsu,160,52.5
4,Berke Kaan,187,85.0
5,Ece Dilara,160,55.5
6,Yavuzhan,178,88.7
7,Muhammed Ali,179,80.4


In [36]:
df1.head()

Unnamed: 0,name,height,weight
0,Doğukan,170,75.4
1,Atakan,170,66.8
2,Hazal Asu,135,23.2
3,Ekinsu,160,52.5
4,Berke Kaan,187,85.0


In [37]:
df1.tail()

Unnamed: 0,name,height,weight
3,Ekinsu,160,52.5
4,Berke Kaan,187,85.0
5,Ece Dilara,160,55.5
6,Yavuzhan,178,88.7
7,Muhammed Ali,179,80.4


In [38]:
df2 = pd.DataFrame(data, columns = ["name", "weight", "height"]) # using columns attribute to arrange order of columns
df2

Unnamed: 0,name,weight,height
0,Doğukan,75.4,170
1,Atakan,66.8,170
2,Hazal Asu,23.2,135
3,Ekinsu,52.5,160
4,Berke Kaan,85.0,187
5,Ece Dilara,55.5,160
6,Yavuzhan,88.7,178
7,Muhammed Ali,80.4,179


In [39]:
df3 = pd.DataFrame(data, columns = ["name", "weight", "height", "BMI"]) # passing a column name that it is not in the dictionary.
df3

Unnamed: 0,name,weight,height,BMI
0,Doğukan,75.4,170,
1,Atakan,66.8,170,
2,Hazal Asu,23.2,135,
3,Ekinsu,52.5,160,
4,Berke Kaan,85.0,187,
5,Ece Dilara,55.5,160,
6,Yavuzhan,88.7,178,
7,Muhammed Ali,80.4,179,


In [40]:
df3.columns # displaying columns of a data frame

Index(['name', 'weight', 'height', 'BMI'], dtype='object')

In [41]:
df2

Unnamed: 0,name,weight,height
0,Doğukan,75.4,170
1,Atakan,66.8,170
2,Hazal Asu,23.2,135
3,Ekinsu,52.5,160
4,Berke Kaan,85.0,187
5,Ece Dilara,55.5,160
6,Yavuzhan,88.7,178
7,Muhammed Ali,80.4,179


In [42]:
# retrieving a column from a data frame

In [43]:
df2["name"]

0         Doğukan
1          Atakan
2       Hazal Asu
3          Ekinsu
4      Berke Kaan
5      Ece Dilara
6        Yavuzhan
7    Muhammed Ali
Name: name, dtype: object

In [44]:
df2.name

0         Doğukan
1          Atakan
2       Hazal Asu
3          Ekinsu
4      Berke Kaan
5      Ece Dilara
6        Yavuzhan
7    Muhammed Ali
Name: name, dtype: object

In [45]:
df2["name"][0]

'Doğukan'

In [46]:
# rows can also be retrieved by position or name with special "iloc" and "loc" attributes.

In [47]:
df2.loc[0] # you can use "loc" also for string indexes.

name      Doğukan
weight       75.4
height        170
Name: 0, dtype: object

In [48]:
df2.iloc[0]

name      Doğukan
weight       75.4
height        170
Name: 0, dtype: object

In [49]:
df2["weight"][0] = 75.2 # you can modify a value in a specific column and row and also you can change whole columns, for ex. df2["weight"] = 75

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df2["weight"][0] = 75.2 # you can modify a value in a specific column and row and also you can change whole columns, for ex. df2["weight"] = 75
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.

In [50]:
df2

Unnamed: 0,name,weight,height
0,Doğukan,75.2,170
1,Atakan,66.8,170
2,Hazal Asu,23.2,135
3,Ekinsu,52.5,160
4,Berke Kaan,85.0,187
5,Ece Dilara,55.5,160
6,Yavuzhan,88.7,178
7,Muhammed Ali,80.4,179


In [51]:
df3

Unnamed: 0,name,weight,height,BMI
0,Doğukan,75.4,170,
1,Atakan,66.8,170,
2,Hazal Asu,23.2,135,
3,Ekinsu,52.5,160,
4,Berke Kaan,85.0,187,
5,Ece Dilara,55.5,160,
6,Yavuzhan,88.7,178,
7,Muhammed Ali,80.4,179,


In [52]:
df3["BMI"] = np.random.randint(20, 30, size=8)
df3

Unnamed: 0,name,weight,height,BMI
0,Doğukan,75.4,170,28
1,Atakan,66.8,170,20
2,Hazal Asu,23.2,135,27
3,Ekinsu,52.5,160,22
4,Berke Kaan,85.0,187,20
5,Ece Dilara,55.5,160,24
6,Yavuzhan,88.7,178,29
7,Muhammed Ali,80.4,179,27


In [53]:
del df3["BMI"] # to remove a specific column
df3

Unnamed: 0,name,weight,height
0,Doğukan,75.4,170
1,Atakan,66.8,170
2,Hazal Asu,23.2,135
3,Ekinsu,52.5,160
4,Berke Kaan,85.0,187
5,Ece Dilara,55.5,160
6,Yavuzhan,88.7,178
7,Muhammed Ali,80.4,179


In [54]:
populations = {
    "İstanbul" : {2000: 10000000, 2015: 15000000, 2025: 20000000},
    "Krakow": {2000: 300000, 2015: 700000, 2025: 1800000}
}
df4 = pd.DataFrame(populations)
df4
# If the nested dictionary is passed to the DataFrame, pandas will interpret the outer dictionary keys as the columns, and the inner keys as the row indices.

Unnamed: 0,İstanbul,Krakow
2000,10000000,300000
2015,15000000,700000
2025,20000000,1800000


In [55]:
df4.T # transposing a data frame

Unnamed: 0,2000,2015,2025
İstanbul,10000000,15000000,20000000
Krakow,300000,700000,1800000


In [56]:
df2.index.name = "personal number"
df2

Unnamed: 0_level_0,name,weight,height
personal number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,Doğukan,75.2,170
1,Atakan,66.8,170
2,Hazal Asu,23.2,135
3,Ekinsu,52.5,160
4,Berke Kaan,85.0,187
5,Ece Dilara,55.5,160
6,Yavuzhan,88.7,178
7,Muhammed Ali,80.4,179


In [57]:
df2.columns.name = "personal informations"
df2

personal informations,name,weight,height
personal number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,Doğukan,75.2,170
1,Atakan,66.8,170
2,Hazal Asu,23.2,135
3,Ekinsu,52.5,160
4,Berke Kaan,85.0,187
5,Ece Dilara,55.5,160
6,Yavuzhan,88.7,178
7,Muhammed Ali,80.4,179


In [58]:
df3.to_numpy() # return the data contained in the DataFrame as a two-dimensional ndarray

array([['Doğukan', 75.4, 170],
       ['Atakan', 66.8, 170],
       ['Hazal Asu', 23.2, 135],
       ['Ekinsu', 52.5, 160],
       ['Berke Kaan', 85.0, 187],
       ['Ece Dilara', 55.5, 160],
       ['Yavuzhan', 88.7, 178],
       ['Muhammed Ali', 80.4, 179]], dtype=object)

### Index Objects

In [60]:
myObj = pd.Series(np.arange(3), index=["a","b","c"])
index = myObj.index

In [61]:
index

Index(['a', 'b', 'c'], dtype='object')

In [62]:
index[1:]

Index(['b', 'c'], dtype='object')

In [63]:
# index[0] = "d" # index objects are immutable and thus cannot be modified by the user. Immutability makes it safer to share Index objects among data structures.

In [64]:
labels = pd.Index(np.arange(3))
labels

Index([0, 1, 2], dtype='int32')

In [65]:
myObj2 = pd.Series([1.5, -2.5, 0], index=labels)
myObj2

0    1.5
1   -2.5
2    0.0
dtype: float64

In [66]:
myObj2.index is labels

True

In [67]:
df1

Unnamed: 0,name,height,weight
0,Doğukan,170,75.4
1,Atakan,170,66.8
2,Hazal Asu,135,23.2
3,Ekinsu,160,52.5
4,Berke Kaan,187,85.0
5,Ece Dilara,160,55.5
6,Yavuzhan,178,88.7
7,Muhammed Ali,179,80.4


In [68]:
"name" in df1.columns

True

In [69]:
df4

Unnamed: 0,İstanbul,Krakow
2000,10000000,300000
2015,15000000,700000
2025,20000000,1800000


In [70]:
2000 in df4.index

True

In [71]:
"2000" in df4.index

False

In [72]:
pd.Index(["doğukan", "atakan", "hazal asu", "doğukan"]) # unlike Python sets, a pandas Index can contain duplicate labels.

# and selection with duplicate labels will select all occurences of that label.

Index(['doğukan', 'atakan', 'hazal asu', 'doğukan'], dtype='object')

In [73]:
# some useful index methods and properties: 
# append(), difference(), intersection(), union(), isin(), delete(), drop(), insert(), is_monotonic, is_unique, unique()

## Essential Functionality

### Reindexing

In [76]:
obj = pd.Series([4.5, 7.2, -5.3, 3.6], index=["d", "b", "a", "c"])
obj

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

In [77]:
obj2 = obj.reindex(["a", "b", "c", "d", "e"])
obj2

a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64

In [78]:
obj3 = pd.Series(["blue", "purple", "yellow"], index=[0, 2, 4])
obj3

0      blue
2    purple
4    yellow
dtype: object

In [79]:
obj3.reindex(np.arange(6), method="ffill") # other method is bfill (fill backward)

0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object

In [80]:
df = pd.DataFrame(np.arange(9).reshape((3, 3)), index=["a", "b", "c"], columns=["Ohio", "Texas", "California"])
df

Unnamed: 0,Ohio,Texas,California
a,0,1,2
b,3,4,5
c,6,7,8


In [81]:
df2 = df.reindex(index=["a", "b", "c", "d"])
df2

Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
b,3.0,4.0,5.0
c,6.0,7.0,8.0
d,,,


In [82]:
states = ["Texas", "Utah", "California"]
df.reindex(columns=states)

Unnamed: 0,Texas,Utah,California
a,1,,2
b,4,,5
c,7,,8


In [83]:
df

Unnamed: 0,Ohio,Texas,California
a,0,1,2
b,3,4,5
c,6,7,8


In [84]:
df.reindex(states, axis="columns")

Unnamed: 0,Texas,Utah,California
a,1,,2
b,4,,5
c,7,,8


In [85]:
df.reindex(states, axis="rows")

Unnamed: 0,Ohio,Texas,California
Texas,,,
Utah,,,
California,,,


In [86]:
df.loc[["c","b","a"], ["California", "Texas", "Ohio"]]

Unnamed: 0,California,Texas,Ohio
c,8,7,6
b,5,4,3
a,2,1,0


### Dropping Entries from an Axis

In [88]:
obj = pd.Series(np.arange(5.), index=["a", "b", "c", "d", "e"])
obj

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64

In [89]:
new_obj = obj.drop("c")
new_obj

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

In [90]:
obj.drop(["d","c"])

a    0.0
b    1.0
e    4.0
dtype: float64

In [91]:
df = pd.DataFrame(data)
df

Unnamed: 0,name,height,weight
0,Doğukan,170,75.4
1,Atakan,170,66.8
2,Hazal Asu,135,23.2
3,Ekinsu,160,52.5
4,Berke Kaan,187,85.0
5,Ece Dilara,160,55.5
6,Yavuzhan,178,88.7
7,Muhammed Ali,179,80.4


In [92]:
df.drop(index=[4, 5, 7])

Unnamed: 0,name,height,weight
0,Doğukan,170,75.4
1,Atakan,170,66.8
2,Hazal Asu,135,23.2
3,Ekinsu,160,52.5
6,Yavuzhan,178,88.7


In [93]:
df.drop([0, 1, 2], axis=0) # you do not need to write axis=0 because it is default.

Unnamed: 0,name,height,weight
3,Ekinsu,160,52.5
4,Berke Kaan,187,85.0
5,Ece Dilara,160,55.5
6,Yavuzhan,178,88.7
7,Muhammed Ali,179,80.4


In [94]:
df.drop(columns=["height"])

Unnamed: 0,name,weight
0,Doğukan,75.4
1,Atakan,66.8
2,Hazal Asu,23.2
3,Ekinsu,52.5
4,Berke Kaan,85.0
5,Ece Dilara,55.5
6,Yavuzhan,88.7
7,Muhammed Ali,80.4


In [95]:
df.drop(["weight"], axis=1)

Unnamed: 0,name,height
0,Doğukan,170
1,Atakan,170
2,Hazal Asu,135
3,Ekinsu,160
4,Berke Kaan,187
5,Ece Dilara,160
6,Yavuzhan,178
7,Muhammed Ali,179


### Indexing, Selection, and Filtering

In [97]:
obj = pd.Series(np.arange(4.), index=["a", "b", "c", "d"])
obj

a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64

In [98]:
obj["b"]

1.0

In [99]:
obj[1]

  obj[1]


1.0

In [100]:
obj[1:3]

b    1.0
c    2.0
dtype: float64

In [101]:
obj[["a","d"]]

a    0.0
d    3.0
dtype: float64

In [102]:
obj[[0, 1]]

  obj[[0, 1]]


a    0.0
b    1.0
dtype: float64

In [103]:
obj[obj < 2]

a    0.0
b    1.0
dtype: float64

In [104]:
obj.loc[["b","a"]] # preferred way to select index values

b    1.0
a    0.0
dtype: float64

In [105]:
obj1 = pd.Series([1, 2, 3], index=[2, 0, 1])
obj2 = pd.Series([1, 2, 3], index=["a", "b", "c"])

In [106]:
obj1

2    1
0    2
1    3
dtype: int64

In [107]:
obj2

a    1
b    2
c    3
dtype: int64

In [108]:
obj1[[0, 1, 2]]

0    2
1    3
2    1
dtype: int64

In [109]:
obj2[[0, 1, 2]]

  obj2[[0, 1, 2]]


a    1
b    2
c    3
dtype: int64

In [110]:
# obj.loc[[0, 1, 2]] # will throw an error instead use obj.iloc[[0,1,2]]
obj.iloc[[0,1,2]]

a    0.0
b    1.0
c    2.0
dtype: float64

In [111]:
obj1.loc[[0, 1, 2]]

0    2
1    3
2    1
dtype: int64

In [112]:
obj2.loc["b":"c"]

b    2
c    3
dtype: int64

In [113]:
# slicing by using loc is differently working than Python built-in slicing, the former works inclusively, latter is not.

In [114]:
obj1.loc[2:0]

2    1
0    2
dtype: int64

In [115]:
obj1[0:1]

2    1
dtype: int64

In [116]:
# modifying a part of series using loc

In [117]:
obj1

2    1
0    2
1    3
dtype: int64

In [118]:
obj1.loc[2:0] = 7
obj1

2    7
0    7
1    3
dtype: int64

In [119]:
# indexing into a data frame

In [120]:
 df

Unnamed: 0,name,height,weight
0,Doğukan,170,75.4
1,Atakan,170,66.8
2,Hazal Asu,135,23.2
3,Ekinsu,160,52.5
4,Berke Kaan,187,85.0
5,Ece Dilara,160,55.5
6,Yavuzhan,178,88.7
7,Muhammed Ali,179,80.4


In [121]:
df["name"]

0         Doğukan
1          Atakan
2       Hazal Asu
3          Ekinsu
4      Berke Kaan
5      Ece Dilara
6        Yavuzhan
7    Muhammed Ali
Name: name, dtype: object

In [122]:
df[["name", "height"]]

Unnamed: 0,name,height
0,Doğukan,170
1,Atakan,170
2,Hazal Asu,135
3,Ekinsu,160
4,Berke Kaan,187
5,Ece Dilara,160
6,Yavuzhan,178
7,Muhammed Ali,179


In [123]:
df[0:3]

Unnamed: 0,name,height,weight
0,Doğukan,170,75.4
1,Atakan,170,66.8
2,Hazal Asu,135,23.2


In [124]:
df[df["height"] > 175]

Unnamed: 0,name,height,weight
4,Berke Kaan,187,85.0
6,Yavuzhan,178,88.7
7,Muhammed Ali,179,80.4


In [125]:
df[(df["height"] >= 170 ) & (df["height"] < 180)]

Unnamed: 0,name,height,weight
0,Doğukan,170,75.4
1,Atakan,170,66.8
6,Yavuzhan,178,88.7
7,Muhammed Ali,179,80.4


In [126]:
df[["height", "weight"]] < 175

Unnamed: 0,height,weight
0,True,True
1,True,True
2,True,True
3,True,True
4,False,True
5,True,True
6,False,True
7,False,True


In [127]:
df2 = pd.DataFrame(data)
df2

Unnamed: 0,name,height,weight
0,Doğukan,170,75.4
1,Atakan,170,66.8
2,Hazal Asu,135,23.2
3,Ekinsu,160,52.5
4,Berke Kaan,187,85.0
5,Ece Dilara,160,55.5
6,Yavuzhan,178,88.7
7,Muhammed Ali,179,80.4


In [128]:
df2.loc[0]

name      Doğukan
height        170
weight       75.4
Name: 0, dtype: object

In [129]:
df2.loc[[0, 3]]

Unnamed: 0,name,height,weight
0,Doğukan,170,75.4
3,Ekinsu,160,52.5


In [130]:
df2.loc[[0, 3], ["name"]] # combining both row and column selection in loc

Unnamed: 0,name
0,Doğukan
3,Ekinsu


In [131]:
df4 = df4.T
df4

Unnamed: 0,2000,2015,2025
İstanbul,10000000,15000000,20000000
Krakow,300000,700000,1800000


In [132]:
df4.loc["İstanbul"]

2000    10000000
2015    15000000
2025    20000000
Name: İstanbul, dtype: int64

In [133]:
df4.iloc[0]

2000    10000000
2015    15000000
2025    20000000
Name: İstanbul, dtype: int64

In [134]:
df4.iloc[[0,1]]

Unnamed: 0,2000,2015,2025
İstanbul,10000000,15000000,20000000
Krakow,300000,700000,1800000


In [135]:
df4.iloc[:, [0, 2]]

Unnamed: 0,2000,2025
İstanbul,10000000,20000000
Krakow,300000,1800000


In [136]:
df4.iloc[:1, :]

Unnamed: 0,2000,2015,2025
İstanbul,10000000,15000000,20000000


In [137]:
df4.iloc[:, 2][df4[2025] > 15000000] # selecting provinces which have populations greater than 15 millions.

İstanbul    20000000
Name: 2025, dtype: int64

In [138]:
df.iloc[-1] # df[-1] will not work, instead it throw an error

name      Muhammed Ali
height             179
weight            80.4
Name: 7, dtype: object

### Arithmetic and Data Alignment

In [140]:
s1 = pd.Series([7.3, -2.5, 3.4, 1.5], index=["a", "c", "d", "e"])
s2 = pd.Series([-2.1, 3.6, -1.5, 4, 3.1], index=["a", "c", "e", "f", "g"])

In [141]:
s1

a    7.3
c   -2.5
d    3.4
e    1.5
dtype: float64

In [142]:
s2

a   -2.1
c    3.6
e   -1.5
f    4.0
g    3.1
dtype: float64

In [143]:
s1 + s2

a    5.2
c    1.1
d    NaN
e    0.0
f    NaN
g    NaN
dtype: float64

In [144]:
df1 = pd.DataFrame(np.arange(9.).reshape((3,3)), columns=list("bcd"), index=["Ohio", "Texas", "Colorado"])
df2 = pd.DataFrame(np.arange(12.).reshape((4,3)), columns=list("bde"), index=["Utah", "Ohio", "Texas", "Oregon"])

In [145]:
df1

Unnamed: 0,b,c,d
Ohio,0.0,1.0,2.0
Texas,3.0,4.0,5.0
Colorado,6.0,7.0,8.0


In [146]:
df2

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [147]:
df1 + df2

Unnamed: 0,b,c,d,e
Colorado,,,,
Ohio,3.0,,6.0,
Oregon,,,,
Texas,9.0,,12.0,
Utah,,,,


#### Arithmetic methods with fill values

In [149]:
df1 = pd.DataFrame(np.arange(12.).reshape((3,4)), columns=list("abcd"))
df2 = pd.DataFrame(np.arange(20.).reshape((4,5)), columns=list("abcde"))

In [150]:
df2

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,4.0
1,5.0,6.0,7.0,8.0,9.0
2,10.0,11.0,12.0,13.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [151]:
df2.loc[1, "b"] = np.nan

In [152]:
df2

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,4.0
1,5.0,,7.0,8.0,9.0
2,10.0,11.0,12.0,13.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [153]:
df1

Unnamed: 0,a,b,c,d
0,0.0,1.0,2.0,3.0
1,4.0,5.0,6.0,7.0
2,8.0,9.0,10.0,11.0


In [154]:
df1 + df2

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,
1,9.0,,13.0,15.0,
2,18.0,20.0,22.0,24.0,
3,,,,,


In [155]:
df1.add(df2, fill_value=0) # fill_value method will substitutes the passed value on it for any missing values in the operation.

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,4.0
1,9.0,5.0,13.0,15.0,9.0
2,18.0,20.0,22.0,24.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [156]:
# each method starting with the letter r are for reversed arithmetic operations.

In [157]:
1 / df1

Unnamed: 0,a,b,c,d
0,inf,1.0,0.5,0.333333
1,0.25,0.2,0.166667,0.142857
2,0.125,0.111111,0.1,0.090909


In [158]:
df1.rdiv(1) # above and this statement are equivalent.

Unnamed: 0,a,b,c,d
0,inf,1.0,0.5,0.333333
1,0.25,0.2,0.166667,0.142857
2,0.125,0.111111,0.1,0.090909


In [159]:
# also when reindexing a Series or DataFrame you can specify a different fill value.
df1.reindex(columns=df2.columns, fill_value=7)

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,7
1,4.0,5.0,6.0,7.0,7
2,8.0,9.0,10.0,11.0,7


In [160]:
# other common and flexible arithmetic methods are: 
# add/radd, sub/rsub, div/rdiv, floordiv/rfloordiv, mul/rmul, pow/rpow

In [161]:
df1

Unnamed: 0,a,b,c,d
0,0.0,1.0,2.0,3.0
1,4.0,5.0,6.0,7.0
2,8.0,9.0,10.0,11.0


In [162]:
df1.add(10)

Unnamed: 0,a,b,c,d
0,10.0,11.0,12.0,13.0
1,14.0,15.0,16.0,17.0
2,18.0,19.0,20.0,21.0


In [163]:
df1.sub(10)

Unnamed: 0,a,b,c,d
0,-10.0,-9.0,-8.0,-7.0
1,-6.0,-5.0,-4.0,-3.0
2,-2.0,-1.0,0.0,1.0


In [164]:
df1.mul(3)

Unnamed: 0,a,b,c,d
0,0.0,3.0,6.0,9.0
1,12.0,15.0,18.0,21.0
2,24.0,27.0,30.0,33.0


#### Operations between DataFrame and Series

In [166]:
arr = np.arange(12.).reshape((3,4))
arr

array([[ 0.,  1.,  2.,  3.],
       [ 4.,  5.,  6.,  7.],
       [ 8.,  9., 10., 11.]])

In [167]:
arr[0]

array([0., 1., 2., 3.])

In [168]:
arr - arr[0]

array([[0., 0., 0., 0.],
       [4., 4., 4., 4.],
       [8., 8., 8., 8.]])

In [169]:
# Operations between a DataFrame and a Series are similar:
frame = pd.DataFrame(np.arange(12.).reshape((4,3)), columns=list("bde"), index=["Utah", "Ohio", "Texas", "Oregon"])
frame

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [170]:
series = frame.iloc[0]
series

b    0.0
d    1.0
e    2.0
Name: Utah, dtype: float64

In [171]:
frame - series

Unnamed: 0,b,d,e
Utah,0.0,0.0,0.0
Ohio,3.0,3.0,3.0
Texas,6.0,6.0,6.0
Oregon,9.0,9.0,9.0


In [172]:
frame + series

Unnamed: 0,b,d,e
Utah,0.0,2.0,4.0
Ohio,3.0,5.0,7.0
Texas,6.0,8.0,10.0
Oregon,9.0,11.0,13.0


In [173]:
series2 = pd.Series(np.arange(3), index=["b","e","f"])
series2

b    0
e    1
f    2
dtype: int32

In [174]:
frame

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [175]:
frame + series2 # if an index value is not found either the df's columns or the series' index, the objects will be reindexed to form the union.

Unnamed: 0,b,d,e,f
Utah,0.0,,3.0,
Ohio,3.0,,6.0,
Texas,6.0,,9.0,
Oregon,9.0,,12.0,


In [176]:
series3 = frame["d"]
series3

Utah       1.0
Ohio       4.0
Texas      7.0
Oregon    10.0
Name: d, dtype: float64

In [177]:
frame.sub(series3, axis="index")

Unnamed: 0,b,d,e
Utah,-1.0,0.0,1.0
Ohio,-1.0,0.0,1.0
Texas,-1.0,0.0,1.0
Oregon,-1.0,0.0,1.0


### Function Application and Mapping

In [179]:
frame = pd.DataFrame(np.random.standard_normal((4,3)), columns=list("bde"), index=["Utah", "Ohio", "Texas", "Oregon"])
frame

Unnamed: 0,b,d,e
Utah,-0.085377,0.122968,-1.166457
Ohio,0.6558,-1.639551,-1.069526
Texas,0.054946,0.196788,-0.511177
Oregon,0.851057,1.065464,0.149282


In [180]:
np.abs(frame)

Unnamed: 0,b,d,e
Utah,0.085377,0.122968,1.166457
Ohio,0.6558,1.639551,1.069526
Texas,0.054946,0.196788,0.511177
Oregon,0.851057,1.065464,0.149282


In [181]:
def f1(x):
    return x.max() - x.min()

In [182]:
frame.apply(f1)

b    0.936435
d    2.705015
e    1.315739
dtype: float64

In [183]:
def f2(x):
    return x.mean()

In [184]:
frame.apply(f2)

b    0.369106
d   -0.063583
e   -0.649469
dtype: float64

In [185]:
frame.apply(f2, axis="columns") 

Utah     -0.376289
Ohio     -0.684426
Texas    -0.086481
Oregon    0.688601
dtype: float64

In [186]:
frame2 = pd.DataFrame(np.arange(12.).reshape((3,4)))
frame2

Unnamed: 0,0,1,2,3
0,0.0,1.0,2.0,3.0
1,4.0,5.0,6.0,7.0
2,8.0,9.0,10.0,11.0


In [187]:
frame2.apply(f2)

0    4.0
1    5.0
2    6.0
3    7.0
dtype: float64

In [188]:
frame2.apply(f2, axis="columns") # the function will be invoked once per row

0    1.5
1    5.5
2    9.5
dtype: float64

In [189]:
def f3(x):
    return pd.Series([x.min(), x.max()], index=["min","max"])

frame2.apply(f3)

Unnamed: 0,0,1,2,3
min,0.0,1.0,2.0,3.0
max,8.0,9.0,10.0,11.0


In [190]:
def my_format(x):
    return f"{x:.2f}"

In [191]:
frame2.applymap(my_format)

  frame2.applymap(my_format)


Unnamed: 0,0,1,2,3
0,0.0,1.0,2.0,3.0
1,4.0,5.0,6.0,7.0
2,8.0,9.0,10.0,11.0


In [192]:
frame2[0].map(my_format)

0    0.00
1    4.00
2    8.00
Name: 0, dtype: object

### Sorting and Ranking

In [194]:
obj = pd.Series(np.arange(4.), index=["d","a","c","b"])
obj

d    0.0
a    1.0
c    2.0
b    3.0
dtype: float64

In [195]:
obj.sort_index()

a    1.0
b    3.0
c    2.0
d    0.0
dtype: float64

In [196]:
frame = pd.DataFrame(np.arange(8).reshape((2,4)), index=["three","one"], columns = ["d","a","c","b"])
frame

Unnamed: 0,d,a,c,b
three,0,1,2,3
one,4,5,6,7


In [197]:
frame.sort_index()

Unnamed: 0,d,a,c,b
one,4,5,6,7
three,0,1,2,3


In [198]:
frame.sort_index(axis="columns")

Unnamed: 0,a,b,c,d
three,1,3,2,0
one,5,7,6,4


In [199]:
frame.sort_index(axis="columns", ascending=False)

Unnamed: 0,d,c,b,a
three,0,2,3,1
one,4,6,7,5


In [200]:
obj = pd.Series([4, 7, -3, 2])
obj.sort_values() # if you want to sort your object according to its values

2   -3
3    2
0    4
1    7
dtype: int64

In [391]:
obj = pd.Series([4, np.nan, 7, np.nan, -3, 2])
obj.sort_values()

4   -3.0
5    2.0
0    4.0
2    7.0
1    NaN
3    NaN
dtype: float64

In [393]:
obj.sort_values(na_position="first")

1    NaN
3    NaN
4   -3.0
5    2.0
0    4.0
2    7.0
dtype: float64

In [395]:
frame = pd.DataFrame({
    "b": [4, 7, -3, 2],
    "a": [0, 1, 0, 1]
})

frame

Unnamed: 0,b,a
0,4,0
1,7,1
2,-3,0
3,2,1


In [397]:
frame.sort_values("b")

Unnamed: 0,b,a
2,-3,0
3,2,1
0,4,0
1,7,1


In [401]:
frame.sort_values(["b","a"]) # in order to sort multiple columns, pass a list of name of columns
frame

Unnamed: 0,b,a
0,4,0
1,7,1
2,-3,0
3,2,1


In [405]:
obj = pd.Series([7, -5, 7, 4, 2, 0, 4])
obj.rank()

0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64

In [415]:
obj

0    7
1   -5
2    7
3    4
4    2
5    0
6    4
dtype: int64

In [407]:
obj.rank(method="first")

0    6.0
1    1.0
2    7.0
3    4.0
4    3.0
5    2.0
6    5.0
dtype: float64

In [409]:
obj.rank(ascending=False)

0    1.5
1    7.0
2    1.5
3    3.5
4    5.0
5    6.0
6    3.5
dtype: float64

In [411]:
frame = pd.DataFrame({
    "b": [4.3, 7, -3, 2],
    "a": [0, 1, 0, 1],
    "c": [-2, 5, 8, -2.5]
})

frame

Unnamed: 0,b,a,c
0,4.3,0,-2.0
1,7.0,1,5.0
2,-3.0,0,8.0
3,2.0,1,-2.5


In [413]:
frame.rank(axis="columns")

Unnamed: 0,b,a,c
0,3.0,2.0,1.0
1,3.0,1.0,2.0
2,1.0,2.0,3.0
3,3.0,2.0,1.0


### Axis Indexes with Duplicate Labels

In [419]:
obj = pd.Series(np.arange(5), index=["a", "a", "b", "b", "c"])
obj

a    0
a    1
b    2
b    3
c    4
dtype: int32

In [423]:
obj.index.is_unique # tell us whether or not labels are unique

False

In [429]:
obj["a"] # indexing a label with multiple entries returns a Series, while single entries reutnr a scalar value like below

a    0
a    1
dtype: int32

In [427]:
obj["c"]

4

In [431]:
df = pd.DataFrame(np.random.standard_normal((5,3)), index=["a","a","b","b","c"])
df

Unnamed: 0,0,1,2
a,1.033892,-0.516807,0.137354
a,-0.573334,0.361826,0.900257
b,0.138303,-1.162369,0.827953
b,0.409495,1.495077,1.767411
c,0.370605,-0.617271,0.012261


In [433]:
df.loc["a"]

Unnamed: 0,0,1,2
a,1.033892,-0.516807,0.137354
a,-0.573334,0.361826,0.900257


In [435]:
df.loc["c"]

0    0.370605
1   -0.617271
2    0.012261
Name: c, dtype: float64

## Summarizing and Computing Descriptive Statistics

In [438]:
df = pd.DataFrame([
    [1.4, np.nan],
    [7.1, -4.5],
    [np.nan, np.nan],
    [0.75, -1.3],
], index=["a", "b", "c", "d"], columns=["one", "two"])

df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [440]:
df.sum()

one    9.25
two   -5.80
dtype: float64

In [444]:
df.sum(axis=1) # or df.sum(axis="columns")

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

In [446]:
df.sum(axis="index")

one    9.25
two   -5.80
dtype: float64

In [450]:
df.sum(axis="index", skipna=False)

one   NaN
two   NaN
dtype: float64

In [456]:
df.sum(axis="columns", skipna=False) # by default, skipna is True

a     NaN
b    2.60
c     NaN
d   -0.55
dtype: float64

In [454]:
df.mean(axis="columns")

a    1.400
b    1.300
c      NaN
d   -0.275
dtype: float64

In [462]:
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [460]:
df.idxmax()

one    b
two    d
dtype: object

In [464]:
df.cumsum()

Unnamed: 0,one,two
a,1.4,
b,8.5,-4.5
c,,
d,9.25,-5.8


In [466]:
df.describe()

Unnamed: 0,one,two
count,3.0,2.0
mean,3.083333,-2.9
std,3.493685,2.262742
min,0.75,-4.5
25%,1.075,-3.7
50%,1.4,-2.9
75%,4.25,-2.1
max,7.1,-1.3


In [468]:
obj = pd.Series(["a", "a", "b", "c"] * 4)
obj

0     a
1     a
2     b
3     c
4     a
5     a
6     b
7     c
8     a
9     a
10    b
11    c
12    a
13    a
14    b
15    c
dtype: object

In [470]:
obj.describe()

count     16
unique     3
top        a
freq       8
dtype: object

In [472]:
df.count()

one    3
two    2
dtype: int64

In [474]:
df.mean()

one    3.083333
two   -2.900000
dtype: float64

In [476]:
df.std()

one    3.493685
two    2.262742
dtype: float64

### Unique Values, Value Counts, and Membership

In [480]:
obj = pd.Series(["c", "a", "d", "a", "a", "b", "b", "c", "c"])

In [492]:
uniques = obj.unique() # gives you an array of the unique values in a Series
uniques

array(['c', 'a', 'd', 'b'], dtype=object)

In [494]:
uniques.sort()
uniques

array(['a', 'b', 'c', 'd'], dtype=object)

In [496]:
obj.value_counts() # computes value frequencies

c    3
a    3
b    2
d    1
Name: count, dtype: int64

In [499]:
pd.value_counts(obj.to_numpy(), sort=False)

  pd.value_counts(obj.to_numpy(), sort=False)


c    3
a    3
d    1
b    2
Name: count, dtype: int64

In [501]:
obj

0    c
1    a
2    d
3    a
4    a
5    b
6    b
7    c
8    c
dtype: object

In [505]:
mask = obj.isin(["b", "c"]) # isin performs a vectorized set membership check and can be useful in filtering a dataset down to a subset of values in a Series or column in a DataFrame
mask

0     True
1    False
2    False
3    False
4    False
5     True
6     True
7     True
8     True
dtype: bool

In [507]:
obj[mask]

0    c
5    b
6    b
7    c
8    c
dtype: object

In [509]:
to_match = pd.Series(["c", "a", "b", "b", "c", "a"])
unique_vals = pd.Series(["c", "b", "a"])
indices = pd.Index(unique_vals).get_indexer(to_match)
indices

array([0, 2, 1, 1, 0, 2], dtype=int64)

In [511]:
data = pd.DataFrame({
    "Qui1": [1, 3, 4, 3, 4],
    "Qui2": [2, 3, 1, 2, 3],
    "Qui3": [1, 5, 2, 4, 4]
})

data

Unnamed: 0,Qui1,Qui2,Qui3
0,1,2,1
1,3,3,5
2,4,1,2
3,3,2,4
4,4,3,4


In [513]:
data["Qui1"].value_counts().sort_index()

Qui1
1    1
3    2
4    2
Name: count, dtype: int64

In [517]:
result = data.apply(pd.value_counts).fillna(0)
result

  result = data.apply(pd.value_counts).fillna(0)


Unnamed: 0,Qui1,Qui2,Qui3
1,1.0,1.0,1.0
2,0.0,2.0,1.0
3,2.0,2.0,0.0
4,2.0,0.0,2.0
5,0.0,0.0,1.0


In [519]:
data = pd.DataFrame({
    "a": [1, 1, 1, 2, 2],
    "b": [0, 0, 1, 0, 0]
})

data

Unnamed: 0,a,b
0,1,0
1,1,0
2,1,1
3,2,0
4,2,0


In [521]:
data.value_counts()

a  b
1  0    2
2  0    2
1  1    1
Name: count, dtype: int64