# Basic select #

In [2]:
import pandas as pd
pd.set_option('max_rows', 6)

oo = pd.read_csv('olympics.csv', skiprows=4)
oo.head()

Unnamed: 0,City,Edition,Sport,Discipline,Athlete,NOC,Gender,Event,Event_gender,Medal
0,Athens,1896,Aquatics,Swimming,"HAJOS, Alfred",HUN,Men,100m freestyle,M,Gold
1,Athens,1896,Aquatics,Swimming,"HERSCHMANN, Otto",AUT,Men,100m freestyle,M,Silver
2,Athens,1896,Aquatics,Swimming,"DRIVAS, Dimitrios",GRE,Men,100m freestyle for sailors,M,Bronze
3,Athens,1896,Aquatics,Swimming,"MALOKINIS, Ioannis",GRE,Men,100m freestyle for sailors,M,Gold
4,Athens,1896,Aquatics,Swimming,"CHASAPIS, Spiridon",GRE,Men,100m freestyle for sailors,M,Silver


In [5]:
# One column (also known as Series instead of Dataframe)
oo['City']
# same result: oo.City

0         Athens
1         Athens
2         Athens
3         Athens
4         Athens
          ...   
29211    Beijing
29212    Beijing
29213    Beijing
29214    Beijing
29215    Beijing
Name: City, Length: 29216, dtype: object

In [8]:
# Multiple columns by name
oo[['City', 'Sport', 'Event']]

Unnamed: 0,City,Sport,Event
0,Athens,Aquatics,100m freestyle
1,Athens,Aquatics,100m freestyle
2,Athens,Aquatics,100m freestyle for sailors
3,Athens,Aquatics,100m freestyle for sailors
4,Athens,Aquatics,100m freestyle for sailors
...,...,...,...
29211,Beijing,Wrestling,84 - 96kg
29212,Beijing,Wrestling,96 - 120kg
29213,Beijing,Wrestling,96 - 120kg
29214,Beijing,Wrestling,96 - 120kg


In [31]:
# Add new concatenated column. First parameter is the position (starting from 0)
# Edition is numeric so it has to be casted to string
oo.insert(2, 'City_Edition', oo['City'] + ' ' + oo['Edition'].astype('string'))
oo.head()

Unnamed: 0,City,Edition,City_Edition,Sport,Discipline,Athlete,NOC,Gender,Event,Event_gender,Medal
0,Athens,1896,Athens 1896,Aquatics,Swimming,"HAJOS, Alfred",HUN,Men,100m freestyle,M,Gold
1,Athens,1896,Athens 1896,Aquatics,Swimming,"HERSCHMANN, Otto",AUT,Men,100m freestyle,M,Silver
2,Athens,1896,Athens 1896,Aquatics,Swimming,"DRIVAS, Dimitrios",GRE,Men,100m freestyle for sailors,M,Bronze
3,Athens,1896,Athens 1896,Aquatics,Swimming,"MALOKINIS, Ioannis",GRE,Men,100m freestyle for sailors,M,Gold
4,Athens,1896,Athens 1896,Aquatics,Swimming,"CHASAPIS, Spiridon",GRE,Men,100m freestyle for sailors,M,Silver


In [34]:
# Another way to add a column at the end.
oo['Gold'] = ['Yes' if x =='Gold' else 'No' for x in oo['Medal']]
oo.head()

Unnamed: 0,City,Edition,City_Edition,Sport,Discipline,Athlete,NOC,Gender,Event,Event_gender,Medal,Gold
0,Athens,1896,Athens 1896,Aquatics,Swimming,"HAJOS, Alfred",HUN,Men,100m freestyle,M,Gold,Yes
1,Athens,1896,Athens 1896,Aquatics,Swimming,"HERSCHMANN, Otto",AUT,Men,100m freestyle,M,Silver,No
2,Athens,1896,Athens 1896,Aquatics,Swimming,"DRIVAS, Dimitrios",GRE,Men,100m freestyle for sailors,M,Bronze,No
3,Athens,1896,Athens 1896,Aquatics,Swimming,"MALOKINIS, Ioannis",GRE,Men,100m freestyle for sailors,M,Gold,Yes
4,Athens,1896,Athens 1896,Aquatics,Swimming,"CHASAPIS, Spiridon",GRE,Men,100m freestyle for sailors,M,Silver,No


In [3]:
# Rename column NOC to Country
oo.rename(columns={'NOC':'Country'})

Unnamed: 0,City,Edition,Sport,Discipline,Athlete,Country,Gender,Event,Event_gender,Medal
0,Athens,1896,Aquatics,Swimming,"HAJOS, Alfred",HUN,Men,100m freestyle,M,Gold
1,Athens,1896,Aquatics,Swimming,"HERSCHMANN, Otto",AUT,Men,100m freestyle,M,Silver
2,Athens,1896,Aquatics,Swimming,"DRIVAS, Dimitrios",GRE,Men,100m freestyle for sailors,M,Bronze
...,...,...,...,...,...,...,...,...,...,...
29213,Beijing,2008,Wrestling,Wrestling Gre-R,"PATRIKEEV, Yuri",ARM,Men,96 - 120kg,M,Bronze
29214,Beijing,2008,Wrestling,Wrestling Gre-R,"LOPEZ, Mijain",CUB,Men,96 - 120kg,M,Gold
29215,Beijing,2008,Wrestling,Wrestling Gre-R,"BAROEV, Khasan",RUS,Men,96 - 120kg,M,Silver


In [10]:
# Distinct
oo.Sport.unique()

array(['Aquatics', 'Athletics', 'Cycling', 'Fencing', 'Gymnastics',
       'Shooting', 'Tennis', 'Weightlifting', 'Wrestling', 'Archery',
       'Basque Pelota', 'Cricket', 'Croquet', 'Equestrian', 'Football',
       'Golf', 'Polo', 'Rowing', 'Rugby', 'Sailing', 'Tug of War',
       'Boxing', 'Lacrosse', 'Roque', 'Hockey', 'Jeu de paume', 'Rackets',
       'Skating', 'Water Motorsports', 'Modern Pentathlon', 'Ice Hockey',
       'Basketball', 'Canoe / Kayak', 'Handball', 'Judo', 'Volleyball',
       'Table Tennis', 'Badminton', 'Baseball', 'Softball', 'Taekwondo',
       'Triathlon'], dtype=object)

In [12]:
# Count by column
# Number of medals given each Edition
oo.Edition.value_counts()

2008    2042
2000    2015
2004    1998
1996    1859
1992    1705
1988    1546
1984    1459
1980    1387
1976    1305
1920    1298
1972    1185
1968    1031
1964    1010
1952     889
1912     885
1956     885
1924     884
1960     882
1936     875
1948     814
1908     804
1928     710
1932     615
1900     512
1904     470
1896     151
Name: Edition, dtype: int64

In [21]:
# Sort
oo.sort_values('NOC')
# Change to descending: oo.sort_values('NOC', ascending = False)

Unnamed: 0,City,Edition,Sport,Discipline,Athlete,NOC,Gender,Event,Event_gender,Medal
28965,Beijing,2008,Taekwondo,Taekwondo,"NIKPAI, Rohullah",AFG,Men,- 58 kg,M,Bronze
19323,Seoul,1988,Sailing,Sailing,"BOERSMA, Jan D.",AHO,Men,board (division II),M,Silver
19874,Barcelona,1992,Athletics,Athletics,"BOULMERKA, Hassiba",ALG,Women,1500m,W,Gold
21960,Atlanta,1996,Boxing,Boxing,"BAHARI, Mohamed",ALG,Men,71-75kg,M,Bronze
21610,Atlanta,1996,Athletics,Athletics,"MORCELI, Nourredine",ALG,Men,1500m,M,Gold
...,...,...,...,...,...,...,...,...,...,...
133,Athens,1896,Tennis,Tennis,"ROBERTSON, George Stuart",ZZX,Men,doubles,M,Bronze
493,Paris,1900,Rowing,Rowing,"BRANDT, Francois Antoine",ZZX,Men,pair-oared shell with coxswain (2+),M,Gold
132,Athens,1896,Tennis,Tennis,"FLACK, Edwin",ZZX,Men,doubles,M,Bronze
495,Paris,1900,Rowing,Rowing,"KLEIN, Roelof",ZZX,Men,pair-oared shell with coxswain (2+),M,Gold


## Slicing ##

Basic form allows to cut specific rows.

In [9]:
# Reference
oo.head()

Unnamed: 0,City,Edition,Sport,Discipline,Athlete,NOC,Gender,Event,Event_gender,Medal
0,Athens,1896,Aquatics,Swimming,"HAJOS, Alfred",HUN,Men,100m freestyle,M,Gold
1,Athens,1896,Aquatics,Swimming,"HERSCHMANN, Otto",AUT,Men,100m freestyle,M,Silver
2,Athens,1896,Aquatics,Swimming,"DRIVAS, Dimitrios",GRE,Men,100m freestyle for sailors,M,Bronze
3,Athens,1896,Aquatics,Swimming,"MALOKINIS, Ioannis",GRE,Men,100m freestyle for sailors,M,Gold
4,Athens,1896,Aquatics,Swimming,"CHASAPIS, Spiridon",GRE,Men,100m freestyle for sailors,M,Silver


In [14]:
# select first two rows
oo[:2]

Unnamed: 0,City,Edition,Sport,Discipline,Athlete,NOC,Gender,Event,Event_gender,Medal
0,Athens,1896,Aquatics,Swimming,"HAJOS, Alfred",HUN,Men,100m freestyle,M,Gold
1,Athens,1896,Aquatics,Swimming,"HERSCHMANN, Otto",AUT,Men,100m freestyle,M,Silver


In [16]:
# select rows 3 and 4, not inclusive
oo[3:5]

Unnamed: 0,City,Edition,Sport,Discipline,Athlete,NOC,Gender,Event,Event_gender,Medal
3,Athens,1896,Aquatics,Swimming,"MALOKINIS, Ioannis",GRE,Men,100m freestyle for sailors,M,Gold
4,Athens,1896,Aquatics,Swimming,"CHASAPIS, Spiridon",GRE,Men,100m freestyle for sailors,M,Silver


In [18]:
# slice specific column
# first 10 values
oo.Athlete.values[:10]

array(['HAJOS, Alfred', 'HERSCHMANN, Otto', 'DRIVAS, Dimitrios',
       'MALOKINIS, Ioannis', 'CHASAPIS, Spiridon',
       'CHOROPHAS, Efstathios', 'HAJOS, Alfred', 'ANDREOU, Joannis',
       'CHOROPHAS, Efstathios', 'NEUMANN, Paul'], dtype=object)

In [19]:
# slice full column
oo.Event.values[:]

array(['100m freestyle', '100m freestyle', '100m freestyle for sailors',
       ..., '96 - 120kg', '96 - 120kg', '96 - 120kg'], dtype=object)

## loc and iloc ##

**loc[]** the function selects the data by labels of rows or columns.

**iloc[ ]** is used for selection based on position. It is similar to loc[] indexer but it takes only integer values to make selections.

In [7]:
# slicing with loc, inclusive to the last value
oo.loc[3:5]

Unnamed: 0,City,Edition,Sport,Discipline,Athlete,NOC,Gender,Event,Event_gender,Medal
3,Athens,1896,Aquatics,Swimming,"MALOKINIS, Ioannis",GRE,Men,100m freestyle for sailors,M,Gold
4,Athens,1896,Aquatics,Swimming,"CHASAPIS, Spiridon",GRE,Men,100m freestyle for sailors,M,Silver
5,Athens,1896,Aquatics,Swimming,"CHOROPHAS, Efstathios",GRE,Men,1200m freestyle,M,Bronze


In [8]:
# get specific value based on row (index) and colum
oo.loc[4, 'Athlete']

'CHASAPIS, Spiridon'

In [9]:
# same as previous but with iloc to use numeric notation
oo.iloc[4,4]

'CHASAPIS, Spiridon'

In [20]:
# select specific row with iloc
oo.iloc[3]

City                                Athens
Edition                               1896
Sport                             Aquatics
Discipline                        Swimming
Athlete                 MALOKINIS, Ioannis
NOC                                    GRE
Gender                                 Men
Event           100m freestyle for sailors
Event_gender                             M
Medal                                 Gold
Name: 3, dtype: object

In [21]:
# select all the rows for specific columns
oo.iloc[:, [2, 3, 5]]

Unnamed: 0,Sport,Discipline,NOC
0,Aquatics,Swimming,HUN
1,Aquatics,Swimming,AUT
2,Aquatics,Swimming,GRE
3,Aquatics,Swimming,GRE
4,Aquatics,Swimming,GRE
...,...,...,...
29211,Wrestling,Wrestling Gre-R,GER
29212,Wrestling,Wrestling Gre-R,LTU
29213,Wrestling,Wrestling Gre-R,ARM
29214,Wrestling,Wrestling Gre-R,CUB


## Indexing ##

Like a dict, a DataFrame's index is backed by a hash table. Looking up rows based on index values is like looking up dict values based on a key. Looking up rows based on index values is faster than looking up rows based on column values. 


Note however, it takes time to build the index. So having the index is only advantageous when you have many lookups of this type to perform.

In [7]:
# Default index is a consecutive, bold column in the left
oo.index

RangeIndex(start=0, stop=29216, step=1)

In [10]:
# set new index, recommended to always create a new data frame for changes
oo_by_edition = oo.set_index("Edition")
oo_by_edition.index

# remove new index with reset_index()

Int64Index([1896, 1896, 1896, 1896, 1896, 1896, 1896, 1896, 1896, 1896,
            ...
            2008, 2008, 2008, 2008, 2008, 2008, 2008, 2008, 2008, 2008],
           dtype='int64', name='Edition', length=29216)

In [11]:
# bold column in the left
oo_by_edition.head()

Unnamed: 0_level_0,City,Sport,Discipline,Athlete,NOC,Gender,Event,Event_gender,Medal
Edition,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1896,Athens,Aquatics,Swimming,"HAJOS, Alfred",HUN,Men,100m freestyle,M,Gold
1896,Athens,Aquatics,Swimming,"HERSCHMANN, Otto",AUT,Men,100m freestyle,M,Silver
1896,Athens,Aquatics,Swimming,"DRIVAS, Dimitrios",GRE,Men,100m freestyle for sailors,M,Bronze
1896,Athens,Aquatics,Swimming,"MALOKINIS, Ioannis",GRE,Men,100m freestyle for sailors,M,Gold
1896,Athens,Aquatics,Swimming,"CHASAPIS, Spiridon",GRE,Men,100m freestyle for sailors,M,Silver


In [12]:
# slice with loc
oo_by_edition.loc[1950:2000]  

Unnamed: 0_level_0,City,Sport,Discipline,Athlete,NOC,Gender,Event,Event_gender,Medal
Edition,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1952,Helsinki,Aquatics,Diving,"HAASE, Günther",GER,Men,10m platform,M,Bronze
1952,Helsinki,Aquatics,Diving,"LEE, Samuel",USA,Men,10m platform,M,Gold
1952,Helsinki,Aquatics,Diving,"CAPILLA PEREZ, Joaquin",MEX,Men,10m platform,M,Silver
...,...,...,...,...,...,...,...,...,...
2000,Sydney,Wrestling,Wrestling Gre-R,"DEBELKA, Dmitry",BLR,Men,97 - 130kg,M,Bronze
2000,Sydney,Wrestling,Wrestling Gre-R,"GARDNER, Rulon",USA,Men,97 - 130kg,M,Gold
2000,Sydney,Wrestling,Wrestling Gre-R,"KARELIN, Aleksandr",RUS,Men,97 - 130kg,M,Silver


In [13]:
# slice with iloc
oo_by_edition.iloc[0:10]

Unnamed: 0_level_0,City,Sport,Discipline,Athlete,NOC,Gender,Event,Event_gender,Medal
Edition,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1896,Athens,Aquatics,Swimming,"HAJOS, Alfred",HUN,Men,100m freestyle,M,Gold
1896,Athens,Aquatics,Swimming,"HERSCHMANN, Otto",AUT,Men,100m freestyle,M,Silver
1896,Athens,Aquatics,Swimming,"DRIVAS, Dimitrios",GRE,Men,100m freestyle for sailors,M,Bronze
...,...,...,...,...,...,...,...,...,...
1896,Athens,Aquatics,Swimming,"ANDREOU, Joannis",GRE,Men,1200m freestyle,M,Silver
1896,Athens,Aquatics,Swimming,"CHOROPHAS, Efstathios",GRE,Men,400m freestyle,M,Bronze
1896,Athens,Aquatics,Swimming,"NEUMANN, Paul",AUT,Men,400m freestyle,M,Gold


In [22]:
# sort an index
oo_by_sport = oo.set_index("Sport").sort_index()
oo_by_sport

Unnamed: 0_level_0,City,Edition,Discipline,Athlete,NOC,Gender,Event,Event_gender,Medal
Sport,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Aquatics,Athens,1896,Swimming,"HAJOS, Alfred",HUN,Men,100m freestyle,M,Gold
Aquatics,London,1948,Swimming,"ZIMMERMAN, Suzanne Winona",USA,Women,100m backstroke,W,Silver
Aquatics,London,1948,Swimming,"HARUP, Karen Margrethe",DEN,Women,100m backstroke,W,Gold
...,...,...,...,...,...,...,...,...,...
Wrestling,Los Angeles,1984,Wrestling Gre-R,"JOHANSSON, Kent-Olle",SWE,Men,57 - 62kg (featherweight),M,Silver
Wrestling,Helsinki,1952,Wrestling Gre-R,"GRÖNDAHL, Kelpo Olavi",FIN,Men,79 - 87kg (light-heavyweight),M,Gold
Wrestling,Beijing,2008,Wrestling Gre-R,"BAROEV, Khasan",RUS,Men,96 - 120kg,M,Silver


In [24]:
oo_by_sport.loc['Football']

Unnamed: 0_level_0,City,Edition,Discipline,Athlete,NOC,Gender,Event,Event_gender,Medal
Sport,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Football,Amsterdam,1928,Football,"CANAVESI, Adhemar",URU,Men,football,M,Gold
Football,Amsterdam,1928,Football,"CASTRO, Hector",URU,Men,football,M,Gold
Football,Amsterdam,1928,Football,"SCHIAVIO, Angelo",ITA,Men,football,M,Bronze
...,...,...,...,...,...,...,...,...,...
Football,Montreal,1976,Football,"WIECZOREK, Henryk",POL,Men,football,M,Silver
Football,London,1948,Football,"JOVANOVIC, Miodrag",YUG,Men,football,M,Silver
Football,Montreal,1976,Football,"ZMUDA, Wladyslaw",POL,Men,football,M,Silver


In [27]:
# we can define multiple indexes
oo_multi = oo.set_index(['Edition','NOC'])
oo_multi

Unnamed: 0_level_0,Unnamed: 1_level_0,City,Sport,Discipline,Athlete,Gender,Event,Event_gender,Medal
Edition,NOC,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1896,HUN,Athens,Aquatics,Swimming,"HAJOS, Alfred",Men,100m freestyle,M,Gold
1896,AUT,Athens,Aquatics,Swimming,"HERSCHMANN, Otto",Men,100m freestyle,M,Silver
1896,GRE,Athens,Aquatics,Swimming,"DRIVAS, Dimitrios",Men,100m freestyle for sailors,M,Bronze
...,...,...,...,...,...,...,...,...,...
2008,ARM,Beijing,Wrestling,Wrestling Gre-R,"PATRIKEEV, Yuri",Men,96 - 120kg,M,Bronze
2008,CUB,Beijing,Wrestling,Wrestling Gre-R,"LOPEZ, Mijain",Men,96 - 120kg,M,Gold
2008,RUS,Beijing,Wrestling,Wrestling Gre-R,"BAROEV, Khasan",Men,96 - 120kg,M,Silver
