In [31]:
import numpy as np
import pandas as pd

# Intro to DataFrames

In [32]:
# Build a small 3x3 DataFrame from a list of rows, naming the columns A, B and C
df = pd.DataFrame(
    data=[
        [1, 2, 3],
        [4, 5, 6],
        [7, 8, 9],
    ],
    columns=list('ABC'),
)
df

Unnamed: 0,A,B,C
0,1,2,3
1,4,5,6
2,7,8,9


In [33]:
# Display the first two rows of the DataFrame (head(n) returns the first n rows)
df.head(2)

Unnamed: 0,A,B,C
0,1,2,3
1,4,5,6


In [34]:
# Display the last two rows of the DataFrame (tail(n) returns the last n rows)
df.tail(2)

Unnamed: 0,A,B,C
1,4,5,6
2,7,8,9


In [35]:
# Print a summary of the DataFrame: index range, column names,
# non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   A       3 non-null      int64
 1   B       3 non-null      int64
 2   C       3 non-null      int64
dtypes: int64(3)
memory usage: 204.0 bytes


In [36]:
# Summary statistics per numeric column: count, mean, std, min, quartiles and max
df.describe()

Unnamed: 0,A,B,C
count,3.0,3.0,3.0
mean,4.0,5.0,6.0
std,3.0,3.0,3.0
min,1.0,2.0,3.0
25%,2.5,3.5,4.5
50%,4.0,5.0,6.0
75%,5.5,6.5,7.5
max,7.0,8.0,9.0


In [37]:
# Get the distinct values in column 'A' (returned as a NumPy array)
df['A'].unique()

array([1, 4, 7])

In [38]:
# Get the shape of the DataFrame as a (rows, columns) tuple
df.shape

(3, 3)

# Warm-up data-set analysis

In [39]:
# Path to the warm-up coffee CSV, relative to the notebook's working directory
coffee_file_path = r'./warm-up_data/coffee.csv'

In [40]:
# Load the coffee CSV into a DataFrame and preview its first 10 rows
coffee_df = pd.read_csv(coffee_file_path)
coffee_df.head(10)

Unnamed: 0,Day,Coffee Type,Units Sold
0,Monday,Espresso,25
1,Monday,Latte,15
2,Tuesday,Espresso,30
3,Tuesday,Latte,20
4,Wednesday,Espresso,35
5,Wednesday,Latte,25
6,Thursday,Espresso,40
7,Thursday,Latte,30
8,Friday,Espresso,45
9,Friday,Latte,35


##  Accessing DataFrame elements using .loc and .iloc

In [41]:
# .loc selects by index *label*. With the default integer index, label 0 is the
# first row; a single label returns that row as a Series.
coffee_df.loc[0] # Row whose index label is 0

Day              Monday
Coffee Type    Espresso
Units Sold           25
Name: 0, dtype: object

In [42]:
coffee_df.loc[0:3] # Label slices include BOTH endpoints, so this returns four rows (0, 1, 2, 3)

Unnamed: 0,Day,Coffee Type,Units Sold
0,Monday,Espresso,25
1,Monday,Latte,15
2,Tuesday,Espresso,30
3,Tuesday,Latte,20


In [43]:
# Rows labelled 0 through 3, restricted to the 'Day' and 'Units Sold' columns
coffee_df.loc[0:3, ['Day', 'Units Sold']]

Unnamed: 0,Day,Units Sold
0,Monday,25
1,Monday,15
2,Tuesday,30
3,Tuesday,20


In [44]:
# A bare ':' selects every row; only the 'Day' and 'Units Sold' columns are kept
coffee_df.loc[:, ['Day', 'Units Sold']]

Unnamed: 0,Day,Units Sold
0,Monday,25
1,Monday,15
2,Tuesday,30
3,Tuesday,20
4,Wednesday,35
5,Wednesday,25
6,Thursday,40
7,Thursday,30
8,Friday,45
9,Friday,35


In [45]:
# .iloc selects by integer *position* rather than by label:
# all rows, and the columns at positions 0 and 2 ('Day' and 'Units Sold')
coffee_df.iloc[:, [0, 2]]

Unnamed: 0,Day,Units Sold
0,Monday,25
1,Monday,15
2,Tuesday,30
3,Tuesday,20
4,Wednesday,35
5,Wednesday,25
6,Thursday,40
7,Thursday,30
8,Friday,45
9,Friday,35


In [46]:
# .loc looks rows up by their index label (whereas .iloc is purely positional).
# So if we replace the default integer index with names, .loc has to be given
# those names instead of numbers. For example, use the 'Day' values as the index:
coffee_df.index = coffee_df['Day']
coffee_df

Unnamed: 0_level_0,Day,Coffee Type,Units Sold
Day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Monday,Monday,Espresso,25
Monday,Monday,Latte,15
Tuesday,Tuesday,Espresso,30
Tuesday,Tuesday,Latte,20
Wednesday,Wednesday,Espresso,35
Wednesday,Wednesday,Latte,25
Thursday,Thursday,Espresso,40
Thursday,Thursday,Latte,30
Friday,Friday,Espresso,45
Friday,Friday,Latte,35


In [50]:
# With 'Day' as the index, .loc slices by day name; both endpoints are included,
# so every row from the Monday rows through the Thursday rows is returned
coffee_df.loc["Monday":"Thursday", ['Day', 'Units Sold']]

Unnamed: 0_level_0,Day,Units Sold
Day,Unnamed: 1_level_1,Unnamed: 2_level_1
Monday,Monday,25
Monday,Monday,15
Tuesday,Tuesday,30
Tuesday,Tuesday,20
Wednesday,Wednesday,35
Wednesday,Wednesday,25
Thursday,Thursday,40
Thursday,Thursday,30


In [53]:
# Restore the default 0..n-1 integer index (replacing the 'Day' labels set earlier).
# reset_index(drop=True) discards the old index instead of inserting it as a column.
coffee_df = coffee_df.reset_index(drop=True)

# Changing a specific cell value: .loc[row label, column name]
coffee_df.loc[2, 'Units Sold'] = 10
coffee_df.head()

Unnamed: 0,Day,Coffee Type,Units Sold
0,Monday,Espresso,25
1,Monday,Latte,15
2,Tuesday,Espresso,10
3,Tuesday,Latte,20
4,Wednesday,Espresso,35


In [55]:
# Sort rows by 'Units Sold', highest first. sort_values returns a new DataFrame;
# coffee_df itself keeps its original row order.
coffee_df.sort_values(by='Units Sold', ascending=False)

Unnamed: 0,Day,Coffee Type,Units Sold
10,Saturday,Espresso,45
8,Friday,Espresso,45
12,Sunday,Espresso,45
6,Thursday,Espresso,40
4,Wednesday,Espresso,35
11,Saturday,Latte,35
13,Sunday,Latte,35
9,Friday,Latte,35
7,Thursday,Latte,30
0,Monday,Espresso,25


In [57]:
# Walk the DataFrame one row at a time: iterrows() yields (index label, row Series) pairs
for index, row in coffee_df.iterrows():
    # Emit the label, the row and a '\n\n' spacer, each terminated by a newline
    print(index, row, '\n\n', sep='\n')

0
Day              Monday
Coffee Type    Espresso
Units Sold           25
Name: 0, dtype: object



1
Day            Monday
Coffee Type     Latte
Units Sold         15
Name: 1, dtype: object



2
Day             Tuesday
Coffee Type    Espresso
Units Sold           10
Name: 2, dtype: object



3
Day            Tuesday
Coffee Type      Latte
Units Sold          20
Name: 3, dtype: object



4
Day            Wednesday
Coffee Type     Espresso
Units Sold            35
Name: 4, dtype: object



5
Day            Wednesday
Coffee Type        Latte
Units Sold            25
Name: 5, dtype: object



6
Day            Thursday
Coffee Type    Espresso
Units Sold           40
Name: 6, dtype: object



7
Day            Thursday
Coffee Type       Latte
Units Sold           30
Name: 7, dtype: object



8
Day              Friday
Coffee Type    Espresso
Units Sold           45
Name: 8, dtype: object



9
Day            Friday
Coffee Type     Latte
Units Sold         35
Name: 9, dtype: object



10
Day   

# Real dataset analysis

## Importing datasets

In [59]:
# Load the three source files from ./data: two CSVs and one Excel workbook.
# NOTE: pd.read_excel needs an optional Excel engine (e.g. openpyxl for .xlsx) installed.
bios_df = pd.read_csv(r'./data/bios.csv')
results_df = pd.read_csv(r'./data/results.csv')
olympics_df = pd.read_excel(r'./data/olympics-data.xlsx')

In [60]:
# Preview the first five rows of the bios table
bios_df.head()

Unnamed: 0,athlete_id,name,born_date,born_city,born_region,born_country,NOC,height_cm,weight_kg,died_date
0,1,Jean-François Blanchy,1886-12-12,Bordeaux,Gironde,FRA,France,,,1960-10-02
1,2,Arnaud Boetsch,1969-04-01,Meulan,Yvelines,FRA,France,183.0,76.0,
2,3,Jean Borotra,1898-08-13,Biarritz,Pyrénées-Atlantiques,FRA,France,183.0,76.0,1994-07-17
3,4,Jacques Brugnon,1895-05-11,Paris VIIIe,Paris,FRA,France,168.0,64.0,1978-03-20
4,5,Albert Canet,1878-04-17,Wandsworth,England,GBR,France,,,1930-07-25


## Filtering Data

In [62]:
# Column dtypes and non-null counts: shows which columns have missing values before filtering
bios_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145500 entries, 0 to 145499
Data columns (total 10 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   athlete_id    145500 non-null  int64  
 1   name          145500 non-null  object 
 2   born_date     143693 non-null  object 
 3   born_city     110908 non-null  object 
 4   born_region   110908 non-null  object 
 5   born_country  110908 non-null  object 
 6   NOC           145499 non-null  object 
 7   height_cm     106651 non-null  float64
 8   weight_kg     102070 non-null  float64
 9   died_date     33940 non-null   object 
dtypes: float64(2), int64(1), object(7)
memory usage: 11.1+ MB


In [None]:
# Filter the tallest athletes born in the USA: height above 215 cm AND born_country == 'USA',
# keeping only the 'name' and 'height_cm' columns.
# Each comparison is parenthesised because & binds more tightly than > and ==.
bios_df.loc[(bios_df['height_cm']>215) & (bios_df['born_country'] == 'USA'), ['name', 'height_cm']]

Unnamed: 0,name,height_cm
5781,Tommy Burleson,223.0
6722,Shaquille O'Neal,216.0
6937,David Robinson,216.0
123850,Tyson Chandler,216.0


In [None]:
# Filter by a substring of the name. str.contains treats the pattern as a regular
# expression by default, and the match is case-sensitive.
bios_df.loc[bios_df['name'].str.contains('Daniel')]

Unnamed: 0,athlete_id,name,born_date,born_city,born_region,born_country,NOC,height_cm,weight_kg,died_date
22,23,Daniel Lawton,1881-11-27,Cantenac,Gironde,FRA,France,,,1979-03-27
184,185,Daniel Tsiokas,1971-06-19,Cluj-Napoca,Cluj,ROU,Greece,180.0,68.0,
370,371,Daniel Desnoyers,1957-02-27,Québec City,Québec,CAN,Canada,187.0,90.0,
716,720,Daniela Gergelcheva,1964-05-20,Momchilgrad,Kardzhali,BUL,Bulgaria,162.0,48.0,
753,757,Daniel Nestor,1972-09-04,Beograd (Belgrade),Beograd,SRB,Canada,191.0,87.0,
...,...,...,...,...,...,...,...,...,...,...
144792,148493,Daniel Grassl,2002-04-04,Merano,Bolzano-Bozen,ITA,Italy,175.0,,
144799,148500,Daniele Bagozza,1995-07-03,Bressanone,Bolzano-Bozen,ITA,Italy,,,
145163,148880,Daniel Andrei Cacina,2001-10-17,Brașov,Brașov,ROU,Romania,190.0,,
145279,149000,Daniel Magnusson,2000-03-08,Karlstad,Värmland,SWE,Sweden,,,


In [69]:
# Case-insensitive search: case=False matches 'Daniel' regardless of upper or lower case
bios_df.loc[bios_df['name'].str.contains('Daniel', case=False)]

Unnamed: 0,athlete_id,name,born_date,born_city,born_region,born_country,NOC,height_cm,weight_kg,died_date
22,23,Daniel Lawton,1881-11-27,Cantenac,Gironde,FRA,France,,,1979-03-27
184,185,Daniel Tsiokas,1971-06-19,Cluj-Napoca,Cluj,ROU,Greece,180.0,68.0,
370,371,Daniel Desnoyers,1957-02-27,Québec City,Québec,CAN,Canada,187.0,90.0,
716,720,Daniela Gergelcheva,1964-05-20,Momchilgrad,Kardzhali,BUL,Bulgaria,162.0,48.0,
753,757,Daniel Nestor,1972-09-04,Beograd (Belgrade),Beograd,SRB,Canada,191.0,87.0,
...,...,...,...,...,...,...,...,...,...,...
144792,148493,Daniel Grassl,2002-04-04,Merano,Bolzano-Bozen,ITA,Italy,175.0,,
144799,148500,Daniele Bagozza,1995-07-03,Bressanone,Bolzano-Bozen,ITA,Italy,,,
145163,148880,Daniel Andrei Cacina,2001-10-17,Brașov,Brașov,ROU,Romania,190.0,,
145279,149000,Daniel Magnusson,2000-03-08,Karlstad,Värmland,SWE,Sweden,,,


In [70]:
# '|' is regex alternation: keep names containing 'daniel' OR 'keith', ignoring case
bios_df.loc[bios_df['name'].str.contains('Daniel|keith', case=False)]

Unnamed: 0,athlete_id,name,born_date,born_city,born_region,born_country,NOC,height_cm,weight_kg,died_date
22,23,Daniel Lawton,1881-11-27,Cantenac,Gironde,FRA,France,,,1979-03-27
184,185,Daniel Tsiokas,1971-06-19,Cluj-Napoca,Cluj,ROU,Greece,180.0,68.0,
370,371,Daniel Desnoyers,1957-02-27,Québec City,Québec,CAN,Canada,187.0,90.0,
716,720,Daniela Gergelcheva,1964-05-20,Momchilgrad,Kardzhali,BUL,Bulgaria,162.0,48.0,
753,757,Daniel Nestor,1972-09-04,Beograd (Belgrade),Beograd,SRB,Canada,191.0,87.0,
...,...,...,...,...,...,...,...,...,...,...
144792,148493,Daniel Grassl,2002-04-04,Merano,Bolzano-Bozen,ITA,Italy,175.0,,
144799,148500,Daniele Bagozza,1995-07-03,Bressanone,Bolzano-Bozen,ITA,Italy,,,
145163,148880,Daniel Andrei Cacina,2001-10-17,Brașov,Brașov,ROU,Romania,190.0,,
145279,149000,Daniel Magnusson,2000-03-08,Karlstad,Värmland,SWE,Sweden,,,


In [81]:
# Combine two conditions with &: born_country is one of 'CUB' or 'FRA' (isin)
# AND the name contains 'Mijaín'
bios_df.loc[(bios_df['born_country'].isin(['CUB', 'FRA'])) & (bios_df['name'].str.contains('Mijaín'))]

Unnamed: 0,athlete_id,name,born_date,born_city,born_region,born_country,NOC,height_cm,weight_kg,died_date
106500,107601,Mijaín López,1982-08-20,Consolación del Sur,Pinar del Río,CUB,Cuba,198.0,130.0,


In [83]:
# Query method: the row filter is written as a string expression.
# Same condition as the earlier .loc example (height above 215 cm, born in the USA),
# but all columns are returned.
bios_df.query("height_cm > 215 and born_country == 'USA'")

Unnamed: 0,athlete_id,name,born_date,born_city,born_region,born_country,NOC,height_cm,weight_kg,died_date
5781,5804,Tommy Burleson,1952-02-24,Crossnore,North Carolina,USA,United States,223.0,102.0,
6722,6755,Shaquille O'Neal,1972-03-06,Newark,New Jersey,USA,United States,216.0,137.0,
6937,6972,David Robinson,1965-08-06,Key West,Florida,USA,United States,216.0,107.0,
123850,126093,Tyson Chandler,1982-10-02,Hanford,California,USA,United States,216.0,107.0,


## Adding/Removing columns

In [84]:
# Back to the small coffee table: preview it before adding and removing columns
coffee_df.head()

Unnamed: 0,Day,Coffee Type,Units Sold
0,Monday,Espresso,25
1,Monday,Latte,15
2,Tuesday,Espresso,10
3,Tuesday,Latte,20
4,Wednesday,Espresso,35


In [86]:
# Assigning a scalar to a new column name creates the column and
# fills every row with that value
coffee_df['price'] = 4.99
coffee_df

Unnamed: 0,Day,Coffee Type,Units Sold,price
0,Monday,Espresso,25,4.99
1,Monday,Latte,15,4.99
2,Tuesday,Espresso,10,4.99
3,Tuesday,Latte,20,4.99
4,Wednesday,Espresso,35,4.99
5,Wednesday,Latte,25,4.99
6,Thursday,Espresso,40,4.99
7,Thursday,Latte,30,4.99
8,Friday,Espresso,45,4.99
9,Friday,Latte,35,4.99


In [87]:
# Conditional column: np.where picks 5.99 where the coffee type is 'Espresso'
# and 4.99 for every other row. (numpy is imported with the other imports at the
# top of the notebook rather than inside this cell.)
coffee_df['new_price'] = np.where(coffee_df["Coffee Type"] == 'Espresso', 5.99, 4.99)
coffee_df

Unnamed: 0,Day,Coffee Type,Units Sold,price,new_price
0,Monday,Espresso,25,4.99,5.99
1,Monday,Latte,15,4.99,4.99
2,Tuesday,Espresso,10,4.99,5.99
3,Tuesday,Latte,20,4.99,4.99
4,Wednesday,Espresso,35,4.99,5.99
5,Wednesday,Latte,25,4.99,4.99
6,Thursday,Espresso,40,4.99,5.99
7,Thursday,Latte,30,4.99,4.99
8,Friday,Espresso,45,4.99,5.99
9,Friday,Latte,35,4.99,4.99


In [88]:
# Remove the flat-rate 'price' column now that 'new_price' holds the per-type prices.
# drop() returns a new DataFrame, so the result is reassigned instead of using inplace=True.
coffee_df = coffee_df.drop(columns=['price'])
coffee_df

Unnamed: 0,Day,Coffee Type,Units Sold,new_price
0,Monday,Espresso,25,5.99
1,Monday,Latte,15,4.99
2,Tuesday,Espresso,10,5.99
3,Tuesday,Latte,20,4.99
4,Wednesday,Espresso,35,5.99
5,Wednesday,Latte,25,4.99
6,Thursday,Espresso,40,5.99
7,Thursday,Latte,30,4.99
8,Friday,Espresso,45,5.99
9,Friday,Latte,35,4.99


In [90]:
# Use .copy() to get an independent DataFrame before making changes
coffee_df_new = coffee_df.copy()
# Plain assignment does NOT copy:
# coffee_df_new = coffee_df
# only binds a second name to the same object, so any in-place change made
# through coffee_df_new would also show up in coffee_df.

In [91]:
# Derived column: element-wise product of units sold and unit price,
# giving one revenue value per row
coffee_df['revenue'] = coffee_df['Units Sold'] * coffee_df['new_price']
coffee_df

Unnamed: 0,Day,Coffee Type,Units Sold,new_price,revenue
0,Monday,Espresso,25,5.99,149.75
1,Monday,Latte,15,4.99,74.85
2,Tuesday,Espresso,10,5.99,59.9
3,Tuesday,Latte,20,4.99,99.8
4,Wednesday,Espresso,35,5.99,209.65
5,Wednesday,Latte,25,4.99,124.75
6,Thursday,Espresso,40,5.99,239.6
7,Thursday,Latte,30,4.99,149.7
8,Friday,Espresso,45,5.99,269.55
9,Friday,Latte,35,4.99,174.65


In [92]:
# Rank rows by revenue, highest first (display only; coffee_df itself is not reordered)
coffee_df.sort_values(by='revenue', ascending=False)

Unnamed: 0,Day,Coffee Type,Units Sold,new_price,revenue
10,Saturday,Espresso,45,5.99,269.55
8,Friday,Espresso,45,5.99,269.55
12,Sunday,Espresso,45,5.99,269.55
6,Thursday,Espresso,40,5.99,239.6
4,Wednesday,Espresso,35,5.99,209.65
11,Saturday,Latte,35,4.99,174.65
13,Sunday,Latte,35,4.99,174.65
9,Friday,Latte,35,4.99,174.65
0,Monday,Espresso,25,5.99,149.75
7,Thursday,Latte,30,4.99,149.7


In [95]:
# Rename 'new_price' to 'price'. rename() returns a new DataFrame, so the result
# is reassigned instead of using inplace=True.
coffee_df = coffee_df.rename(columns={'new_price': 'price'})
coffee_df

Unnamed: 0,Day,Coffee Type,Units Sold,price,revenue
0,Monday,Espresso,25,5.99,149.75
1,Monday,Latte,15,4.99,74.85
2,Tuesday,Espresso,10,5.99,59.9
3,Tuesday,Latte,20,4.99,99.8
4,Wednesday,Espresso,35,5.99,209.65
5,Wednesday,Latte,25,4.99,124.75
6,Thursday,Espresso,40,5.99,239.6
7,Thursday,Latte,30,4.99,149.7
8,Friday,Espresso,45,5.99,269.55
9,Friday,Latte,35,4.99,174.65
