- what is a dataframe (df)?
- create df
- vectorized operations
- access rows, columns, cells (.loc, .iloc, .at)
- change values
- sort values

In [176]:
import pandas as pd

In [177]:
from typing import List
data_list: List[int] = [1, 2, 3, 4, 5]

s1 = pd.Series(data_list)
print(s1)

0    1
1    2
2    3
3    4
4    5
dtype: int64


In [178]:
s1.index

s1 = pd.Series(data=data_list, index=['a', 'b', 'c', 'd', 'e'])
s1

a    1
b    2
c    3
d    4
e    5
dtype: int64

In [179]:
s1["a"]

np.int64(1)

In [180]:
s1 = s1 + 23
s1

a    24
b    25
c    26
d    27
e    28
dtype: int64

In [181]:
type(s1)

pandas.core.series.Series

In [182]:
# DF = dataframe = collection of Series objects
    # Rules : columns need to be homogenous (cant mix strings and ints in the same column), but rows can be heterogenous (can have different types in different rows)
    # every column must have same number of data points 

# dictionary creation - when we know all the data at once
data = {
    "Name": ["Terry", "Tanja", "Erica", "Erin"],
    "Age": [55, 58, 24, 21],
    "Year": ["associate", "bachelor", "master", "bachelor"]
}

people_df = pd.DataFrame(data=data)
people_df = people_df.set_index("Name")
people_df


Unnamed: 0_level_0,Age,Year
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Terry,55,associate
Tanja,58,bachelor
Erica,24,master
Erin,21,bachelor


In [183]:
people_df = people_df.reset_index()
people_df

Unnamed: 0,Name,Age,Year
0,Terry,55,associate
1,Tanja,58,bachelor
2,Erica,24,master
3,Erin,21,bachelor


In [184]:
# Row-based creation - list of lists or lists of pandas series
    # useful for generating df in a loop/from a loop

import random

names = ["Messi", "Ronaldo", "Neymar", "Mbappe", "Pulisc"]
goals_per_game = [random.random() for _ in names]
games = [round(random.random()* 1000) for _ in names]
games

cols = ["Name", "Goals Per Game", "Games"]
stats = [[names[i], goals_per_game[i], games[i]] for i in range (5)]

stats_df = pd.DataFrame(data=stats, columns=cols)
stats_df

Unnamed: 0,Name,Goals Per Game,Games
0,Messi,0.83796,273
1,Ronaldo,0.851759,816
2,Neymar,0.4243,821
3,Mbappe,0.406063,283
4,Pulisc,0.961243,230


In [185]:
stats_df["new_col"] = 0
stats_df

Unnamed: 0,Name,Goals Per Game,Games,new_col
0,Messi,0.83796,273,0
1,Ronaldo,0.851759,816,0
2,Neymar,0.4243,821,0
3,Mbappe,0.406063,283,0
4,Pulisc,0.961243,230,0


In [186]:
stats_df = stats_df.drop(columns=["new_col"])
stats_df

Unnamed: 0,Name,Goals Per Game,Games
0,Messi,0.83796,273
1,Ronaldo,0.851759,816
2,Neymar,0.4243,821
3,Mbappe,0.406063,283
4,Pulisc,0.961243,230


In [187]:
stats_df["Goals"] = stats_df["Goals Per Game"] * stats_df["Games"]
stats_df

Unnamed: 0,Name,Goals Per Game,Games,Goals
0,Messi,0.83796,273,228.763112
1,Ronaldo,0.851759,816,695.034968
2,Neymar,0.4243,821,348.349936
3,Mbappe,0.406063,283,114.915911
4,Pulisc,0.961243,230,221.085804


In [188]:
# stats_df = stats.df.rename(columns={"Goals Per Game": "Goals per Game"}
# stats_df.rename(columns={"Goals Per Game": "Goals per Game"}, inplace=True)   
# if wanting to rename column and save it to memory, or use inplace

In [189]:
stats_df["Name"]

0      Messi
1    Ronaldo
2     Neymar
3     Mbappe
4     Pulisc
Name: Name, dtype: object

In [190]:
stats_df["Name"] = stats_df["Name"].str.upper()
stats_df

Unnamed: 0,Name,Goals Per Game,Games,Goals
0,MESSI,0.83796,273,228.763112
1,RONALDO,0.851759,816,695.034968
2,NEYMAR,0.4243,821,348.349936
3,MBAPPE,0.406063,283,114.915911
4,PULISC,0.961243,230,221.085804


In [191]:
stats_df[["Name", "Games"]]


Unnamed: 0,Name,Games
0,MESSI,273
1,RONALDO,816
2,NEYMAR,821
3,MBAPPE,283
4,PULISC,230


In [192]:
# filtering with .loc

stats_df.loc[0, "Name"]


'MESSI'

In [193]:
stats_df.loc[0]

Name                   MESSI
Goals Per Game       0.83796
Games                    273
Goals             228.763112
Name: 0, dtype: object

In [194]:
stats_df.loc[:, "Name"]

0      MESSI
1    RONALDO
2     NEYMAR
3     MBAPPE
4     PULISC
Name: Name, dtype: object

In [195]:
stats_df.loc[1:2, "Name"] # in df, second item after colon is inclusive

1    RONALDO
2     NEYMAR
Name: Name, dtype: object

In [196]:
l = [1, 2, 3, 4, 5]
l[1:2] # in a lest the second item is exclusive]]

[2]

In [197]:
stats_df.loc[:, "Name":"Goals"]

Unnamed: 0,Name,Goals Per Game,Games,Goals
0,MESSI,0.83796,273,228.763112
1,RONALDO,0.851759,816,695.034968
2,NEYMAR,0.4243,821,348.349936
3,MBAPPE,0.406063,283,114.915911
4,PULISC,0.961243,230,221.085804


In [198]:
# boolian logic with .loc

stats_df.loc[stats_df["Goals Per Game"] > 0.5]

Unnamed: 0,Name,Goals Per Game,Games,Goals
0,MESSI,0.83796,273,228.763112
1,RONALDO,0.851759,816,695.034968
4,PULISC,0.961243,230,221.085804


In [199]:
stats_df.loc[stats_df["Goals Per Game"] > 0.5, "Name"]

0      MESSI
1    RONALDO
4     PULISC
Name: Name, dtype: object

In [None]:
stats_df.loc[stats_df["Name"] == "MESSI", "Name"] = "RONALDO"
stats_df.loc[stats_df["Goals Per Game"] > 0.5, "Name"] = "MESSI"
stats_df

# not sure why I have three messis now??? 

Unnamed: 0,Name,Goals Per Game,Games,Goals
0,MESSI,0.83796,273,228.763112
1,MESSI,0.851759,816,695.034968
2,NEYMAR,0.4243,821,348.349936
3,MBAPPE,0.406063,283,114.915911
4,MESSI,0.961243,230,221.085804
