In [1]:
import numpy as np
import pandas as pd

## Data Structures in Pandas

### Series 

In [4]:
obj = pd.Series([4, 7, -5, 3])
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [5]:
obj.array

<NumpyExtensionArray>
[4, 7, -5, 3]
Length: 4, dtype: int64

In [6]:
obj.index

RangeIndex(start=0, stop=4, step=1)

In [7]:
obj2 = pd.Series([4, 7, -5, 3], index=["d","b","a","c"])
obj2

d    4
b    7
a   -5
c    3
dtype: int64

In [8]:
obj2.index

Index(['d', 'b', 'a', 'c'], dtype='object')

In [9]:
obj2["d"]

4

In [10]:
obj2[["d","a"]]

d    4
a   -5
dtype: int64

In [11]:
obj2[:] = 7
obj2

d    7
b    7
a    7
c    7
dtype: int64

In [12]:
obj2 = pd.Series(np.random.randint(10, size=4))
obj2

0    0
1    7
2    3
3    1
dtype: int32

In [13]:
obj2[obj2 > 3]

1    7
dtype: int32

In [14]:
obj2 * obj2

0     0
1    49
2     9
3     1
dtype: int32

In [15]:
obj2 + obj2

0     0
1    14
2     6
3     2
dtype: int32

In [16]:
obj2 - obj2

0    0
1    0
2    0
3    0
dtype: int32

In [17]:
np.exp(obj2)

0       1.000000
1    1096.633158
2      20.085537
3       2.718282
dtype: float64

In [18]:
np.mean(obj2)

2.75

In [19]:
obj3 = pd.Series([7,13,0,4], index=["a","b","c","d"])
obj3

a     7
b    13
c     0
d     4
dtype: int64

In [20]:
"d" in obj3

True

In [21]:
"e" in obj3

False

In [22]:
sdata = {"İstanbul": 19000000, "Amasya": 500000, "Giresun": 450000} # series from a dictionary
obj4 = pd.Series(sdata)
obj4

İstanbul    19000000
Amasya        500000
Giresun       450000
dtype: int64

In [23]:
obj4.to_dict() # converting a series back to a dictionary

{'İstanbul': 19000000, 'Amasya': 500000, 'Giresun': 450000}

In [24]:
indexes = ["Amasya", "Giresun", "İstanbul", "Krakow"] # overriding the order of the series
obj4 = pd.Series(sdata, index=indexes)
obj4

Amasya        500000.0
Giresun       450000.0
İstanbul    19000000.0
Krakow             NaN
dtype: float64

In [25]:
# isna and notna can be used for detecting missing data

In [26]:
pd.isna(obj4) 

Amasya      False
Giresun     False
İstanbul    False
Krakow       True
dtype: bool

In [27]:
pd.notna(obj4)

Amasya       True
Giresun      True
İstanbul     True
Krakow      False
dtype: bool

In [28]:
obj4.isna() # also this is an instance method used for same purpose like pd.isna

Amasya      False
Giresun     False
İstanbul    False
Krakow       True
dtype: bool

In [29]:
obj4

Amasya        500000.0
Giresun       450000.0
İstanbul    19000000.0
Krakow             NaN
dtype: float64

In [30]:
obj5 = pd.Series({"İstanbul": 19000000, "Amasya": 500000, "Giresun": 450000})
obj5

İstanbul    19000000
Amasya        500000
Giresun       450000
dtype: int64

In [31]:
obj4 + obj5 # automatic alignment in arithmetic operations

Amasya       1000000.0
Giresun       900000.0
Krakow             NaN
İstanbul    38000000.0
dtype: float64

In [32]:
obj4.name = "population"
obj4.index.name = "province"
obj4

province
Amasya        500000.0
Giresun       450000.0
İstanbul    19000000.0
Krakow             NaN
Name: population, dtype: float64

### DataFrame

In [34]:
data = {
    "name" : ["Doğukan", "Atakan", "Hazal Asu", "Ekinsu", "Berke Kaan", "Ece Dilara", "Yavuzhan", "Muhammed Ali"],
    "height" : [170, 170, 135, 160, 187, 160, 178, 179],
    "weight" : [75.4, 66.8, 23.2, 52.5, 85.0, 55.5, 88.7, 80.4]
}

df1 = pd.DataFrame(data)

In [35]:
df1

Unnamed: 0,name,height,weight
0,Doğukan,170,75.4
1,Atakan,170,66.8
2,Hazal Asu,135,23.2
3,Ekinsu,160,52.5
4,Berke Kaan,187,85.0
5,Ece Dilara,160,55.5
6,Yavuzhan,178,88.7
7,Muhammed Ali,179,80.4


In [36]:
df1.head()

Unnamed: 0,name,height,weight
0,Doğukan,170,75.4
1,Atakan,170,66.8
2,Hazal Asu,135,23.2
3,Ekinsu,160,52.5
4,Berke Kaan,187,85.0


In [37]:
df1.tail()

Unnamed: 0,name,height,weight
3,Ekinsu,160,52.5
4,Berke Kaan,187,85.0
5,Ece Dilara,160,55.5
6,Yavuzhan,178,88.7
7,Muhammed Ali,179,80.4


In [38]:
df2 = pd.DataFrame(data, columns = ["name", "weight", "height"]) # using columns attribute to arrange order of columns
df2

Unnamed: 0,name,weight,height
0,Doğukan,75.4,170
1,Atakan,66.8,170
2,Hazal Asu,23.2,135
3,Ekinsu,52.5,160
4,Berke Kaan,85.0,187
5,Ece Dilara,55.5,160
6,Yavuzhan,88.7,178
7,Muhammed Ali,80.4,179


In [39]:
df3 = pd.DataFrame(data, columns = ["name", "weight", "height", "BMI"]) # passing a column name that it is not in the dictionary.
df3

Unnamed: 0,name,weight,height,BMI
0,Doğukan,75.4,170,
1,Atakan,66.8,170,
2,Hazal Asu,23.2,135,
3,Ekinsu,52.5,160,
4,Berke Kaan,85.0,187,
5,Ece Dilara,55.5,160,
6,Yavuzhan,88.7,178,
7,Muhammed Ali,80.4,179,


In [40]:
df3.columns # displaying columns of a data frame

Index(['name', 'weight', 'height', 'BMI'], dtype='object')

In [41]:
df2

Unnamed: 0,name,weight,height
0,Doğukan,75.4,170
1,Atakan,66.8,170
2,Hazal Asu,23.2,135
3,Ekinsu,52.5,160
4,Berke Kaan,85.0,187
5,Ece Dilara,55.5,160
6,Yavuzhan,88.7,178
7,Muhammed Ali,80.4,179


In [42]:
# retrieving a column from a data frame

In [43]:
df2["name"]

0         Doğukan
1          Atakan
2       Hazal Asu
3          Ekinsu
4      Berke Kaan
5      Ece Dilara
6        Yavuzhan
7    Muhammed Ali
Name: name, dtype: object

In [44]:
df2.name

0         Doğukan
1          Atakan
2       Hazal Asu
3          Ekinsu
4      Berke Kaan
5      Ece Dilara
6        Yavuzhan
7    Muhammed Ali
Name: name, dtype: object

In [45]:
df2["name"][0]

'Doğukan'

In [46]:
# rows can also be retrieved by position or name with special "iloc" and "loc" attributes.

In [47]:
df2.loc[0] # you can use "loc" also for string indexes.

name      Doğukan
weight       75.4
height        170
Name: 0, dtype: object

In [48]:
df2.iloc[0]

name      Doğukan
weight       75.4
height        170
Name: 0, dtype: object

In [49]:
df2["weight"][0] = 75.2 # you can modify a value in a specific column and row and also you can change whole columns, for ex. df2["weight"] = 75

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df2["weight"][0] = 75.2 # you can modify a value in a specific column and row and also you can change whole columns, for ex. df2["weight"] = 75
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.

In [50]:
df2

Unnamed: 0,name,weight,height
0,Doğukan,75.2,170
1,Atakan,66.8,170
2,Hazal Asu,23.2,135
3,Ekinsu,52.5,160
4,Berke Kaan,85.0,187
5,Ece Dilara,55.5,160
6,Yavuzhan,88.7,178
7,Muhammed Ali,80.4,179


In [51]:
df3

Unnamed: 0,name,weight,height,BMI
0,Doğukan,75.4,170,
1,Atakan,66.8,170,
2,Hazal Asu,23.2,135,
3,Ekinsu,52.5,160,
4,Berke Kaan,85.0,187,
5,Ece Dilara,55.5,160,
6,Yavuzhan,88.7,178,
7,Muhammed Ali,80.4,179,


In [52]:
df3["BMI"] = np.random.randint(20, 30, size=8)
df3

Unnamed: 0,name,weight,height,BMI
0,Doğukan,75.4,170,25
1,Atakan,66.8,170,29
2,Hazal Asu,23.2,135,20
3,Ekinsu,52.5,160,22
4,Berke Kaan,85.0,187,25
5,Ece Dilara,55.5,160,20
6,Yavuzhan,88.7,178,24
7,Muhammed Ali,80.4,179,20


In [53]:
del df3["BMI"] # to remove a specific column
df3

Unnamed: 0,name,weight,height
0,Doğukan,75.4,170
1,Atakan,66.8,170
2,Hazal Asu,23.2,135
3,Ekinsu,52.5,160
4,Berke Kaan,85.0,187
5,Ece Dilara,55.5,160
6,Yavuzhan,88.7,178
7,Muhammed Ali,80.4,179


In [54]:
populations = {
    "İstanbul" : {2000: 10000000, 2015: 15000000, 2025: 20000000},
    "Krakow": {2000: 300000, 2015: 700000, 2025: 1800000}
}
df4 = pd.DataFrame(populations)
df4
# If the nested dictionary is passed to the DataFrame, pandas will interpret the outer dictionary keys as the columns, and the inner keys as the row indices.

Unnamed: 0,İstanbul,Krakow
2000,10000000,300000
2015,15000000,700000
2025,20000000,1800000


In [55]:
df4.T # transposing a data frame

Unnamed: 0,2000,2015,2025
İstanbul,10000000,15000000,20000000
Krakow,300000,700000,1800000


In [56]:
df2.index.name = "personal number"
df2

Unnamed: 0_level_0,name,weight,height
personal number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,Doğukan,75.2,170
1,Atakan,66.8,170
2,Hazal Asu,23.2,135
3,Ekinsu,52.5,160
4,Berke Kaan,85.0,187
5,Ece Dilara,55.5,160
6,Yavuzhan,88.7,178
7,Muhammed Ali,80.4,179


In [57]:
df2.columns.name = "personal informations"
df2

personal informations,name,weight,height
personal number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,Doğukan,75.2,170
1,Atakan,66.8,170
2,Hazal Asu,23.2,135
3,Ekinsu,52.5,160
4,Berke Kaan,85.0,187
5,Ece Dilara,55.5,160
6,Yavuzhan,88.7,178
7,Muhammed Ali,80.4,179


In [58]:
df3.to_numpy() # return the data contained in the DataFrame as a two-dimensional ndarray

array([['Doğukan', 75.4, 170],
       ['Atakan', 66.8, 170],
       ['Hazal Asu', 23.2, 135],
       ['Ekinsu', 52.5, 160],
       ['Berke Kaan', 85.0, 187],
       ['Ece Dilara', 55.5, 160],
       ['Yavuzhan', 88.7, 178],
       ['Muhammed Ali', 80.4, 179]], dtype=object)

### Index Objects

In [60]:
myObj = pd.Series(np.arange(3), index=["a","b","c"])
index = myObj.index

In [61]:
index

Index(['a', 'b', 'c'], dtype='object')

In [62]:
index[1:]

Index(['b', 'c'], dtype='object')

In [63]:
# index[0] = "d" # index objects are immutable and thus cannot be modified by the user. Immutability makes it safer to share Index objects among data structures.

In [64]:
labels = pd.Index(np.arange(3))
labels

Index([0, 1, 2], dtype='int32')

In [65]:
myObj2 = pd.Series([1.5, -2.5, 0], index=labels)
myObj2

0    1.5
1   -2.5
2    0.0
dtype: float64

In [66]:
myObj2.index is labels

True

In [67]:
df1

Unnamed: 0,name,height,weight
0,Doğukan,170,75.4
1,Atakan,170,66.8
2,Hazal Asu,135,23.2
3,Ekinsu,160,52.5
4,Berke Kaan,187,85.0
5,Ece Dilara,160,55.5
6,Yavuzhan,178,88.7
7,Muhammed Ali,179,80.4


In [68]:
"name" in df1.columns

True

In [69]:
df4

Unnamed: 0,İstanbul,Krakow
2000,10000000,300000
2015,15000000,700000
2025,20000000,1800000


In [70]:
2000 in df4.index

True

In [71]:
"2000" in df4.index

False

In [72]:
pd.Index(["doğukan", "atakan", "hazal asu", "doğukan"]) # unlike Python sets, a pandas Index can contain duplicate labels.

# and selection with duplicate labels will select all occurences of that label.

Index(['doğukan', 'atakan', 'hazal asu', 'doğukan'], dtype='object')

In [73]:
# some useful index methods and properties: 
# append(), difference(), intersection(), union(), isin(), delete(), drop(), insert(), is_monotonic, is_unique, unique()

## Essential Functionality

### Reindexing

In [146]:
obj = pd.Series([4.5, 7.2, -5.3, 3.6], index=["d", "b", "a", "c"])
obj

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

In [148]:
obj2 = obj.reindex(["a", "b", "c", "d", "e"])
obj2

a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64

In [150]:
obj3 = pd.Series(["blue", "purple", "yellow"], index=[0, 2, 4])
obj3

0      blue
2    purple
4    yellow
dtype: object

In [168]:
obj3.reindex(np.arange(6), method="ffill") # other method is bfill (fill backward)

0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object

In [170]:
df = pd.DataFrame(np.arange(9).reshape((3, 3)), index=["a", "b", "c"], columns=["Ohio", "Texas", "California"])
df

Unnamed: 0,Ohio,Texas,California
a,0,1,2
b,3,4,5
c,6,7,8


In [172]:
df2 = df.reindex(index=["a", "b", "c", "d"])
df2

Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
b,3.0,4.0,5.0
c,6.0,7.0,8.0
d,,,


In [174]:
states = ["Texas", "Utah", "California"]
df.reindex(columns=states)

Unnamed: 0,Texas,Utah,California
a,1,,2
b,4,,5
c,7,,8


In [176]:
df

Unnamed: 0,Ohio,Texas,California
a,0,1,2
b,3,4,5
c,6,7,8


In [178]:
df.reindex(states, axis="columns")

Unnamed: 0,Texas,Utah,California
a,1,,2
b,4,,5
c,7,,8


In [180]:
df.reindex(states, axis="rows")

Unnamed: 0,Ohio,Texas,California
Texas,,,
Utah,,,
California,,,


In [190]:
df.loc[["c","b","a"], ["California", "Texas", "Ohio"]]

Unnamed: 0,California,Texas,Ohio
c,8,7,6
b,5,4,3
a,2,1,0


### Dropping Entries from an Axis

In [193]:
obj = pd.Series(np.arange(5.), index=["a", "b", "c", "d", "e"])
obj

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64

In [195]:
new_obj = obj.drop("c")
new_obj

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

In [197]:
obj.drop(["d","c"])

a    0.0
b    1.0
e    4.0
dtype: float64

In [199]:
df = pd.DataFrame(data)
df

Unnamed: 0,name,height,weight
0,Doğukan,170,75.4
1,Atakan,170,66.8
2,Hazal Asu,135,23.2
3,Ekinsu,160,52.5
4,Berke Kaan,187,85.0
5,Ece Dilara,160,55.5
6,Yavuzhan,178,88.7
7,Muhammed Ali,179,80.4


In [203]:
df.drop(index=[4, 5, 7])

Unnamed: 0,name,height,weight
0,Doğukan,170,75.4
1,Atakan,170,66.8
2,Hazal Asu,135,23.2
3,Ekinsu,160,52.5
6,Yavuzhan,178,88.7


In [211]:
df.drop([0, 1, 2], axis=0) # you do not need to write axis=0 because it is default.

Unnamed: 0,name,height,weight
3,Ekinsu,160,52.5
4,Berke Kaan,187,85.0
5,Ece Dilara,160,55.5
6,Yavuzhan,178,88.7
7,Muhammed Ali,179,80.4


In [205]:
df.drop(columns=["height"])

Unnamed: 0,name,weight
0,Doğukan,75.4
1,Atakan,66.8
2,Hazal Asu,23.2
3,Ekinsu,52.5
4,Berke Kaan,85.0
5,Ece Dilara,55.5
6,Yavuzhan,88.7
7,Muhammed Ali,80.4


In [213]:
df.drop(["weight"], axis=1)

Unnamed: 0,name,height
0,Doğukan,170
1,Atakan,170
2,Hazal Asu,135
3,Ekinsu,160
4,Berke Kaan,187
5,Ece Dilara,160
6,Yavuzhan,178
7,Muhammed Ali,179


### Indexing, Selection, and Filtering

In [216]:
obj = pd.Series(np.arange(4.), index=["a", "b", "c", "d"])
obj

a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64

In [218]:
obj["b"]

1.0

In [220]:
obj[1]

  obj[1]


1.0

In [224]:
obj[1:3]

b    1.0
c    2.0
dtype: float64

In [226]:
obj[["a","d"]]

a    0.0
d    3.0
dtype: float64

In [228]:
obj[[0, 1]]

  obj[[0, 1]]


a    0.0
b    1.0
dtype: float64

In [230]:
obj[obj < 2]

a    0.0
b    1.0
dtype: float64

In [232]:
obj.loc[["b","a"]] # preferred way to select index values

b    1.0
a    0.0
dtype: float64

In [234]:
obj1 = pd.Series([1, 2, 3], index=[2, 0, 1])
obj2 = pd.Series([1, 2, 3], index=["a", "b", "c"])

In [236]:
obj1

2    1
0    2
1    3
dtype: int64

In [238]:
obj2

a    1
b    2
c    3
dtype: int64

In [252]:
obj1[[0, 1, 2]]

0    2
1    3
2    1
dtype: int64

In [242]:
obj2[[0, 1, 2]]

  obj2[[0, 1, 2]]


a    1
b    2
c    3
dtype: int64

In [246]:
# obj.loc[[0, 1, 2]] # will throw an error instead use obj.iloc[[0,1,2]]
obj.iloc[[0,1,2]]

a    0.0
b    1.0
c    2.0
dtype: float64

In [256]:
obj1.loc[[0, 1, 2]]

0    2
1    3
2    1
dtype: int64

In [258]:
obj2.loc["b":"c"]

b    2
c    3
dtype: int64

In [266]:
# slicing by using loc is differently working than Python built-in slicing, the former works inclusively, latter is not.

In [272]:
obj1.loc[2:0]

2    1
0    2
dtype: int64

In [274]:
obj1[0:1]

2    1
dtype: int64

In [278]:
# modifying a part of series using loc

In [280]:
obj1

2    1
0    2
1    3
dtype: int64

In [282]:
obj1.loc[2:0] = 7
obj1

2    7
0    7
1    3
dtype: int64

In [286]:
# indexing into a data frame

In [288]:
 df

Unnamed: 0,name,height,weight
0,Doğukan,170,75.4
1,Atakan,170,66.8
2,Hazal Asu,135,23.2
3,Ekinsu,160,52.5
4,Berke Kaan,187,85.0
5,Ece Dilara,160,55.5
6,Yavuzhan,178,88.7
7,Muhammed Ali,179,80.4


In [290]:
df["name"]

0         Doğukan
1          Atakan
2       Hazal Asu
3          Ekinsu
4      Berke Kaan
5      Ece Dilara
6        Yavuzhan
7    Muhammed Ali
Name: name, dtype: object

In [294]:
df[["name", "height"]]

Unnamed: 0,name,height
0,Doğukan,170
1,Atakan,170
2,Hazal Asu,135
3,Ekinsu,160
4,Berke Kaan,187
5,Ece Dilara,160
6,Yavuzhan,178
7,Muhammed Ali,179


In [298]:
df[0:3]

Unnamed: 0,name,height,weight
0,Doğukan,170,75.4
1,Atakan,170,66.8
2,Hazal Asu,135,23.2


In [316]:
df[df["height"] > 175]

Unnamed: 0,name,height,weight
4,Berke Kaan,187,85.0
6,Yavuzhan,178,88.7
7,Muhammed Ali,179,80.4


In [318]:
df[(df["height"] >= 170 ) & (df["height"] < 180)]

Unnamed: 0,name,height,weight
0,Doğukan,170,75.4
1,Atakan,170,66.8
6,Yavuzhan,178,88.7
7,Muhammed Ali,179,80.4


In [336]:
df[["height", "weight"]] < 175

Unnamed: 0,height,weight
0,True,True
1,True,True
2,True,True
3,True,True
4,False,True
5,True,True
6,False,True
7,False,True


In [348]:
df2 = pd.DataFrame(data)
df2

Unnamed: 0,name,height,weight
0,Doğukan,170,75.4
1,Atakan,170,66.8
2,Hazal Asu,135,23.2
3,Ekinsu,160,52.5
4,Berke Kaan,187,85.0
5,Ece Dilara,160,55.5
6,Yavuzhan,178,88.7
7,Muhammed Ali,179,80.4


In [350]:
df2.loc[0]

name      Doğukan
height        170
weight       75.4
Name: 0, dtype: object

In [356]:
df2.loc[[0, 3]]

Unnamed: 0,name,height,weight
0,Doğukan,170,75.4
3,Ekinsu,160,52.5


In [360]:
df2.loc[[0, 3], ["name"]] # combining both row and column selection in loc

Unnamed: 0,name
0,Doğukan
3,Ekinsu


In [368]:
df4 = df4.T
df4

Unnamed: 0,2000,2015,2025
İstanbul,10000000,15000000,20000000
Krakow,300000,700000,1800000


In [376]:
df4.loc["İstanbul"]

2000    10000000
2015    15000000
2025    20000000
Name: İstanbul, dtype: int64

In [378]:
df4.iloc[0]

2000    10000000
2015    15000000
2025    20000000
Name: İstanbul, dtype: int64

In [380]:
df4.iloc[[0,1]]

Unnamed: 0,2000,2015,2025
İstanbul,10000000,15000000,20000000
Krakow,300000,700000,1800000


In [396]:
df4.iloc[:, [0, 2]]

Unnamed: 0,2000,2025
İstanbul,10000000,20000000
Krakow,300000,1800000


In [408]:
df4.iloc[:1, :]

Unnamed: 0,2000,2015,2025
İstanbul,10000000,15000000,20000000


In [418]:
df4.iloc[:, 2][df4[2025] > 15000000] # selecting provinces which have populations greater than 15 millions.

İstanbul    20000000
Name: 2025, dtype: int64

In [426]:
df.iloc[-1] # df[-1] will not work, instead it throw an error

name      Muhammed Ali
height             179
weight              :(
Name: 7, dtype: object

### Arithmetic and Data Alignment