- what is a dataframe (pandas)
- create df
- vectorized operations
- access rows, columns, cells (.loc, .iloc,.at)
- change values
- sort values

In [30]:
import pandas as pd

In [31]:
from typing import List
data_list: List[int] = [1, 2, 3, 4, 5]

s1 = pd.Series(data_list)
s1

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [32]:
s1 = pd.Series(data=data_list, index=["a", "b", "c", "d", "e"])
s1

a    1
b    2
c    3
d    4
e    5
dtype: int64

In [33]:
type(s1)

pandas.core.series.Series

In [34]:
# Df = collection of Series objects
#columns must be homogenous and have same amount of data

#dictionary creation
data = {
    "name": ["nate", "rebecca", "edwin", "preston"],
    "age": [39, 40, 11, 7],
    "year": ["senior", "junior", "sophomore", "freshman"]
}
students_df = pd.DataFrame(data = data)
students_df = students_df.set_index("name")
students_df

Unnamed: 0_level_0,age,year
name,Unnamed: 1_level_1,Unnamed: 2_level_1
nate,39,senior
rebecca,40,junior
edwin,11,sophomore
preston,7,freshman


In [35]:
students_df = students_df.reset_index()
students_df

Unnamed: 0,name,age,year
0,nate,39,senior
1,rebecca,40,junior
2,edwin,11,sophomore
3,preston,7,freshman


In [36]:
#row-based creation
import random

names = ["Messi", "Ronaldo", "Neymar", "Pulisc", "Mbappe"]
goals_per_game = [random.random() for _ in names]
games = [round(random.random() * 1000) for _ in names]

cols = ["name", "goals/game", "games"]
stats = [[names[i], goals_per_game[i], games[i]] for i in range(5)]
stats

stats_df = pd.DataFrame(data=stats, columns=cols) #turns list of lists into dataframe
stats_df


Unnamed: 0,name,goals/game,games
0,Messi,0.642968,966
1,Ronaldo,0.784916,315
2,Neymar,0.063639,345
3,Pulisc,0.669259,671
4,Mbappe,0.215689,439


In [37]:
stats_df["new_col"] = 0
stats_df

Unnamed: 0,name,goals/game,games,new_col
0,Messi,0.642968,966,0
1,Ronaldo,0.784916,315,0
2,Neymar,0.063639,345,0
3,Pulisc,0.669259,671,0
4,Mbappe,0.215689,439,0


In [38]:
stats_df = stats_df.drop(columns=["new_col"])
stats_df

Unnamed: 0,name,goals/game,games
0,Messi,0.642968,966
1,Ronaldo,0.784916,315
2,Neymar,0.063639,345
3,Pulisc,0.669259,671
4,Mbappe,0.215689,439


In [39]:
stats_df['goals'] = stats_df["goals/game"] * stats_df["games"]
stats_df

Unnamed: 0,name,goals/game,games,goals
0,Messi,0.642968,966,621.107495
1,Ronaldo,0.784916,315,247.248484
2,Neymar,0.063639,345,21.95539
3,Pulisc,0.669259,671,449.072705
4,Mbappe,0.215689,439,94.68754


In [40]:
stats_df = stats_df.rename(columns={"goals/game": "goals_p_game"})
#stats_df.rename(columns={"goals/game": "goals_p_game"}, inplace=True) #inplace = True means it will change the original df instead of creating a new one
stats_df

Unnamed: 0,name,goals_p_game,games,goals
0,Messi,0.642968,966,621.107495
1,Ronaldo,0.784916,315,247.248484
2,Neymar,0.063639,345,21.95539
3,Pulisc,0.669259,671,449.072705
4,Mbappe,0.215689,439,94.68754


In [41]:
stats_df["name"]

0      Messi
1    Ronaldo
2     Neymar
3     Pulisc
4     Mbappe
Name: name, dtype: object

In [42]:
stats_df["name"] = stats_df["name"].str.lower()
stats_df

Unnamed: 0,name,goals_p_game,games,goals
0,messi,0.642968,966,621.107495
1,ronaldo,0.784916,315,247.248484
2,neymar,0.063639,345,21.95539
3,pulisc,0.669259,671,449.072705
4,mbappe,0.215689,439,94.68754


In [43]:
stats_df[["name", "games"]]

Unnamed: 0,name,games
0,messi,966
1,ronaldo,315
2,neymar,345
3,pulisc,671
4,mbappe,439


In [44]:
#filtering with .loc

stats_df.loc[0, "name"] 

'messi'

In [45]:
stats_df.loc[0]

name                 messi
goals_p_game      0.642968
games                  966
goals           621.107495
Name: 0, dtype: object

In [46]:
stats_df.loc[:, "name"]

0      messi
1    ronaldo
2     neymar
3     pulisc
4     mbappe
Name: name, dtype: object

In [50]:
1 = [1,2,3,4,5]
1[1:2]# in a list the second item is exclusive

SyntaxError: cannot assign to literal here. Maybe you meant '==' instead of '='? (127928173.py, line 1)

In [51]:
stats_df.loc[:, "name": "goals"]

Unnamed: 0,name,goals_p_game,games,goals
0,messi,0.642968,966,621.107495
1,ronaldo,0.784916,315,247.248484
2,neymar,0.063639,345,21.95539
3,pulisc,0.669259,671,449.072705
4,mbappe,0.215689,439,94.68754


In [54]:
# boolean logic with .loc

stats_df.loc[stats_df["goals_p_game"] > 0.7]

Unnamed: 0,name,goals_p_game,games,goals
1,ronaldo,0.784916,315,247.248484


In [55]:
stats_df.loc[stats_df["goals_p_game"] > 0.7, "name"]

1    ronaldo
Name: name, dtype: object

In [56]:
stats_df.loc[stats_df["name"] == "messi", "name"] = "ronaldo"
stats_df.loc[stats_df["goals_p_game"] > 0.7, "name"] = "messi"
stats_df

Unnamed: 0,name,goals_p_game,games,goals
0,ronaldo,0.642968,966,621.107495
1,messi,0.784916,315,247.248484
2,neymar,0.063639,345,21.95539
3,pulisc,0.669259,671,449.072705
4,mbappe,0.215689,439,94.68754


In [None]:
# .iloc, .at, .iat