# Pandas: Basics

### First Steps

In [3]:
import pandas as pd
# Show up to 60 rows before pandas truncates a printed Series/DataFrame.
pd.options.display.max_rows = 60
# None makes the truncated view follow max_rows instead of using its own (smaller) row count.
pd.options.display.min_rows = None

In [7]:
# API reference: https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html#pandas.read_csv
# Load the Titanic passenger data from a CSV file in the working directory.
titanic = pd.read_csv("titanic.csv")
# Alternative call (kept for reference) with an explicit separator, no header row,
# and "other_null" treated as an additional missing-value marker:
# titanic = pd.read_csv("titanic.csv", sep=',', header=None, na_values="other_null")

In [8]:
# print() shows the plain-text repr of the DataFrame rather than the rich table rendering.
print(titanic)

     survived  pclass     sex   age  sibsp  parch      fare embarked deck
0           0       3    male  22.0      1      0    7.2500        S  NaN
1           1       1  female  38.0      1      0   71.2833        C    C
2           1       3  female  26.0      0      0    7.9250        S  NaN
3           1       1  female  35.0      1      0   53.1000        S    C
4           0       3    male  35.0      0      0    8.0500        S  NaN
5           0       3    male   NaN      0      0    8.4583        Q  NaN
6           0       1    male  54.0      0      0   51.8625        S    E
7           0       3    male   2.0      3      1   21.0750        S  NaN
8           1       3  female  27.0      0      2   11.1333        S  NaN
9           1       2  female  14.0      1      0   30.0708        C  NaN
10          1       3  female   4.0      1      1   16.7000        S    G
11          1       1  female  58.0      0      0   26.5500        S    C
12          0       3    male  20.0   

In [None]:
# API reference: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_csv.html
# Export with the default settings.
titanic.to_csv('titanic_from_df.csv')
# Customised export: '|' separator, no header row, no index column, missing values written as "NAN".
# NOTE: this uses the same path as the call above, so it overwrites the file written there.
titanic.to_csv('titanic_from_df.csv', sep='|', header=False, index=False, na_rep="NAN")

In [12]:
# First two rows of the frame.
titanic.head(2)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,deck
0,0,3,male,22.0,1,0,7.25,S,
1,1,1,female,38.0,1,0,71.2833,C,C


In [13]:
# Last two rows of the frame.
titanic.tail(2)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,deck
889,1,1,male,26.0,0,0,30.0,C,C
890,0,3,male,32.0,0,0,7.75,Q,


In [14]:
# Column labels of the frame.
titanic.columns

Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
       'embarked', 'deck'],
      dtype='object')

In [15]:
# Row labels; a default RangeIndex here because read_csv was called without index_col.
titanic.index

RangeIndex(start=0, stop=891, step=1)

In [16]:
# Per-column non-null counts and dtypes, plus overall memory usage.
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   survived  891 non-null    int64  
 1   pclass    891 non-null    int64  
 2   sex       891 non-null    object 
 3   age       714 non-null    float64
 4   sibsp     891 non-null    int64  
 5   parch     891 non-null    int64  
 6   fare      891 non-null    float64
 7   embarked  889 non-null    object 
 8   deck      203 non-null    object 
dtypes: float64(2), int64(4), object(3)
memory usage: 62.8+ KB


In [18]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
titanic.describe()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


### Built-in Functions, Attributes and Methods

In [19]:
# Confirm that read_csv returned a DataFrame.
type(titanic)

pandas.core.frame.DataFrame

#### DataFrame and Built-in Functions

In [20]:
# len() of a DataFrame is its number of rows.
len(titanic)

891

In [21]:
# The built-in round() also works on a DataFrame: numeric values are rounded to
# 0 decimals, while text columns such as "sex" are left unchanged.
round(titanic, 0).head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,deck
0,0,3,male,22.0,1,0,7.0,S,
1,1,1,female,38.0,1,0,71.0,C,C
2,1,3,female,26.0,0,0,8.0,S,
3,1,1,female,35.0,1,0,53.0,S,C
4,0,3,male,35.0,0,0,8.0,S,


In [32]:
# Total number of cells: rows x columns (891 * 9).
titanic.size

8019

In [33]:
# Row labels of the frame (an attribute, so no parentheses).
titanic.index

RangeIndex(start=0, stop=891, step=1)

In [34]:
# Column labels of the frame (an attribute, so no parentheses).
titanic.columns

Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
       'embarked', 'deck'],
      dtype='object')

### Selecting Columns

In [35]:
# With no argument, head() shows the first five rows.
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,deck
0,0,3,male,22.0,1,0,7.25,S,
1,1,1,female,38.0,1,0,71.2833,C,C
2,1,3,female,26.0,0,0,7.925,S,
3,1,1,female,35.0,1,0,53.1,S,C
4,0,3,male,35.0,0,0,8.05,S,


In [40]:
# Single brackets with one column label return that column as a Series.
titanic["age"]

0      22.0
1      38.0
2      26.0
3      35.0
4      35.0
5       NaN
6      54.0
7       2.0
8      27.0
9      14.0
10      4.0
11     58.0
12     20.0
13     39.0
14     14.0
15     55.0
16      2.0
17      NaN
18     31.0
19      NaN
20     35.0
21     34.0
22     15.0
23     28.0
24      8.0
25     38.0
26      NaN
27     19.0
28      NaN
29      NaN
       ... 
861    21.0
862    48.0
863     NaN
864    24.0
865    42.0
866    27.0
867    31.0
868     NaN
869     4.0
870    26.0
871    47.0
872    33.0
873    47.0
874    28.0
875    15.0
876    20.0
877    19.0
878     NaN
879    56.0
880    25.0
881    33.0
882    22.0
883    28.0
884    25.0
885    39.0
886    27.0
887    19.0
888     NaN
889    26.0
890    32.0
Name: age, Length: 891, dtype: float64

In [38]:
# Verify that single-bracket selection yields a Series.
type(titanic["age"])

pandas.core.series.Series

In [39]:
# A list of labels inside the brackets returns a DataFrame, even for a single column.
titanic[["age"]]

Unnamed: 0,age
0,22.0
1,38.0
2,26.0
3,35.0
4,35.0
5,
6,54.0
7,2.0
8,27.0
9,14.0


In [9]:
# Verify that list-based selection yields a DataFrame.
type(titanic[["age"]])

pandas.core.frame.DataFrame

In [11]:
# Intentionally left commented out: a bare pair of labels is passed as one tuple key,
# which is not a column of this frame, so the lookup fails with a KeyError.
# Pass a list of labels instead, e.g. titanic[["age", "sex"]].
# titanic["age", "sex"]

In [12]:
# Select several columns with a list of labels; the result keeps the order given
# in the list ("age" before "sex"), not the order in the original frame.
titanic[["age", "sex"]]

Unnamed: 0,age,sex
0,22.0,male
1,38.0,female
2,26.0,female
3,35.0,female
4,35.0,male
5,,male
6,54.0,male
7,2.0,male
8,27.0,female
9,14.0,female


In [46]:
# Any number of column labels can be listed; the result is a DataFrame.
titanic[["sex", "age", "fare"]]

Unnamed: 0,sex,age,fare
0,male,22.0,7.2500
1,female,38.0,71.2833
2,female,26.0,7.9250
3,female,35.0,53.1000
4,male,35.0,8.0500
5,male,,8.4583
6,male,54.0,51.8625
7,male,2.0,21.0750
8,female,27.0,11.1333
9,female,14.0,30.0708


Series -> a one-dimensional labelled array.

DataFrame -> a two-dimensional labelled data structure with columns of possibly diverse types. 

Each element of a Series is identified by a label in its index; labels are often unique, but they do not have to be. A new Series can be created from a list, an array, a dictionary, or an existing Series object.

In [13]:
# A Series can be built directly from a Python list. No index is supplied here,
# so the elements are labelled 0..4 (left-hand column of the printed output).
data = [1000 * step for step in range(1, 6)]
s = pd.Series(data=data)
print(s)

0    1000
1    2000
2    3000
3    4000
4    5000
dtype: int64


In [14]:
# The same values as a one-column DataFrame: the column gets the label "Column1"
# and the rows get the default 0..4 index.
data = [1000 * step for step in range(1, 6)]
df = pd.DataFrame({'Column1': data})
print(df)

   Column1
0     1000
1     2000
2     3000
3     4000
4     5000


In [47]:
# Attribute-style access to a column; returns the same Series as titanic["age"].
titanic.age

0      22.0
1      38.0
2      26.0
3      35.0
4      35.0
5       NaN
6      54.0
7       2.0
8      27.0
9      14.0
10      4.0
11     58.0
12     20.0
13     39.0
14     14.0
15     55.0
16      2.0
17      NaN
18     31.0
19      NaN
20     35.0
21     34.0
22     15.0
23     28.0
24      8.0
25     38.0
26      NaN
27     19.0
28      NaN
29      NaN
       ... 
861    21.0
862    48.0
863     NaN
864    24.0
865    42.0
866    27.0
867    31.0
868     NaN
869     4.0
870    26.0
871    47.0
872    33.0
873    47.0
874    28.0
875    15.0
876    20.0
877    19.0
878     NaN
879    56.0
880    25.0
881    33.0
882    22.0
883    28.0
884    25.0
885    39.0
886    27.0
887    19.0
888     NaN
889    26.0
890    32.0
Name: age, Length: 891, dtype: float64

In [48]:
# equals() confirms that attribute access and bracket access return the same Series.
titanic.age.equals(titanic["age"])

True

### Selecting Rows with Square Brackets (not advisable)

In [None]:
# Reference view of the first rows before slicing with brackets.
titanic.head()

In [None]:
# A slice inside plain brackets selects rows by position: here only the first row.
titanic[0:1]

In [57]:
# Rows at positions 4 through 7; the stop value 8 is excluded.
titanic[4:8]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,deck
4,0,3,male,35.0,0,0,8.05,S,
5,0,3,male,,0,0,8.4583,Q,
6,0,1,male,54.0,0,0,51.8625,S,E
7,0,3,male,2.0,3,1,21.075,S,


In [None]:
# Omitting the start selects from the beginning: the first ten rows.
titanic[:10]

In [None]:
# A negative start counts from the end: the last ten rows.
titanic[-10:]

### Indexing Operator iloc (integer-position-based indexing)

#### Selecting Rows with iloc

In [49]:
# A single integer returns that row as a Series whose index is the column labels.
titanic.iloc[0]

survived       0
pclass         3
sex         male
age         22.0
sibsp          1
parch          0
fare        7.25
embarked       S
deck         NaN
Name: 0, dtype: object

In [50]:
# Verify that a single row selected with iloc is a Series.
type(titanic.iloc[0])

pandas.core.series.Series

In [52]:
# Negative positions count from the end: -1 is the last row (label 890).
titanic.iloc[-1]

survived       0
pclass         3
sex         male
age         32.0
sibsp          0
parch          0
fare        7.75
embarked       Q
deck         NaN
Name: 890, dtype: object

In [53]:
# Positional slice: the first five rows (positions 0-4).
titanic.iloc[:5]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,deck
0,0,3,male,22.0,1,0,7.25,S,
1,1,1,female,38.0,1,0,71.2833,C,C
2,1,3,female,26.0,0,0,7.925,S,
3,1,1,female,35.0,1,0,53.1,S,C
4,0,3,male,35.0,0,0,8.05,S,


In [54]:
# Positional slice from the end: the last five rows.
titanic.iloc[-5:]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,deck
886,0,2,male,27.0,0,0,13.0,S,
887,1,1,female,19.0,0,0,30.0,S,B
888,0,3,female,,1,2,23.45,S,
889,1,1,male,26.0,0,0,30.0,C,C
890,0,3,male,32.0,0,0,7.75,Q,


In [55]:
# Positional slice in the middle of the frame: rows 456, 457 and 458 (stop is exclusive).
titanic.iloc[456:459]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,deck
456,0,1,male,65.0,0,0,26.55,S,E
457,1,1,female,,1,0,51.8625,S,D
458,1,2,female,50.0,0,0,10.5,S,


In [56]:
# A list of positions picks arbitrary, non-contiguous rows.
titanic.iloc[[2,45,765]]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,deck
2,1,3,female,26.0,0,0,7.925,S,
45,0,3,male,,0,0,8.05,S,
765,1,1,female,51.0,1,0,77.9583,S,D


In [59]:
# Row position first, column positions second: row 0, columns 0 to 2.
titanic.iloc[0,0:3]

survived       0
pclass         3
sex         male
Name: 0, dtype: object

In [60]:
# Row 0 restricted to the columns at positions 0, 2, 6 and 8.
titanic.iloc[0,[0,2,6,8]]

survived       0
sex         male
fare        7.25
deck         NaN
Name: 0, dtype: object

In [62]:
# A row slice (positions 34-38) combined with a list of column positions.
titanic.iloc[34:39,[0,2,6,8]]

Unnamed: 0,survived,sex,fare,deck
34,0,male,82.1708,
35,0,male,52.0,
36,1,male,7.2292,
37,0,male,8.05,
38,0,female,18.0,


#### Selecting Columns with iloc

In [66]:
# ":" keeps all rows; the column at position 0 is the same Series as titanic.survived.
titanic.iloc[:, 0].equals(titanic.survived)

True

In [68]:
# Label-based selection of the same column, for comparison with iloc[:, 0].
titanic["survived"]

0      0
1      1
2      1
3      1
4      0
5      0
6      0
7      0
8      1
9      1
10     1
11     1
12     0
13     0
14     0
15     1
16     0
17     1
18     0
19     1
20     0
21     1
22     1
23     1
24     0
25     1
26     0
27     0
28     1
29     0
      ..
861    0
862    1
863    0
864    0
865    1
866    1
867    0
868    0
869    1
870    0
871    1
872    0
873    0
874    1
875    1
876    0
877    0
878    0
879    1
880    1
881    0
882    0
883    0
884    0
885    0
886    0
887    1
888    0
889    1
890    0
Name: survived, Length: 891, dtype: int64

### Indexing Operator loc (label-based indexing)

In [71]:
# The same file is loaded twice: once with the "Athlete" column used as the row index ...
medals = pd.read_csv("summer.csv", index_col="Athlete")

# ... and once with the default integer index, for comparison.
medals_wo_index = pd.read_csv("summer.csv")

In [72]:
# Default integer index; "Athlete" is an ordinary column in this frame.
medals_wo_index.head()

Unnamed: 0,Year,City,Sport,Discipline,Athlete,Country,Gender,Event,Medal
0,1896,Athens,Aquatics,Swimming,"HAJOS, Alfred",HUN,Men,100M Freestyle,Gold
1,1896,Athens,Aquatics,Swimming,"HERSCHMANN, Otto",AUT,Men,100M Freestyle,Silver
2,1896,Athens,Aquatics,Swimming,"DRIVAS, Dimitrios",GRE,Men,100M Freestyle For Sailors,Bronze
3,1896,Athens,Aquatics,Swimming,"MALOKINIS, Ioannis",GRE,Men,100M Freestyle For Sailors,Gold
4,1896,Athens,Aquatics,Swimming,"CHASAPIS, Spiridon",GRE,Men,100M Freestyle For Sailors,Silver


In [70]:
# Preview the frame that uses "Athlete" as its row index. That frame is bound to
# the name `medals` when it is loaded; `medals_index` is never defined, so the
# previous spelling raised NameError on a fresh kernel.
medals.head()

Unnamed: 0_level_0,Year,City,Sport,Discipline,Country,Gender,Event,Medal
Athlete,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
"HAJOS, Alfred",1896,Athens,Aquatics,Swimming,HUN,Men,100M Freestyle,Gold
"HERSCHMANN, Otto",1896,Athens,Aquatics,Swimming,AUT,Men,100M Freestyle,Silver
"DRIVAS, Dimitrios",1896,Athens,Aquatics,Swimming,GRE,Men,100M Freestyle For Sailors,Bronze
"MALOKINIS, Ioannis",1896,Athens,Aquatics,Swimming,GRE,Men,100M Freestyle For Sailors,Gold
"CHASAPIS, Spiridon",1896,Athens,Aquatics,Swimming,GRE,Men,100M Freestyle For Sailors,Silver


#### Selecting Rows with loc

In [73]:
# loc looks rows up by index label; this label matches one row, returned as a Series.
medals.loc["DRIVAS, Dimitrios"]

Year                                1896
City                              Athens
Sport                           Aquatics
Discipline                      Swimming
Country                              GRE
Gender                               Men
Event         100M Freestyle For Sailors
Medal                             Bronze
Name: DRIVAS, Dimitrios, dtype: object

In [76]:
# This label matches many rows, so loc returns a DataFrame; iloc[0] then takes its first row.
medals.loc["PHELPS, Michael"].iloc[0]

Year                    2004
City                  Athens
Sport               Aquatics
Discipline          Swimming
Country                  USA
Gender                   Men
Event         100M Butterfly
Medal                   Gold
Name: PHELPS, Michael, dtype: object

#### Slicing Rows and Columns with loc

In [77]:
# Row label plus one column label: the "Medal" value for every row with that label.
medals.loc["PHELPS, Michael", "Medal"]

Athlete
PHELPS, Michael      Gold
PHELPS, Michael      Gold
PHELPS, Michael    Bronze
PHELPS, Michael      Gold
PHELPS, Michael      Gold
PHELPS, Michael    Bronze
PHELPS, Michael      Gold
PHELPS, Michael      Gold
PHELPS, Michael      Gold
PHELPS, Michael      Gold
PHELPS, Michael      Gold
PHELPS, Michael      Gold
PHELPS, Michael      Gold
PHELPS, Michael      Gold
PHELPS, Michael      Gold
PHELPS, Michael      Gold
PHELPS, Michael      Gold
PHELPS, Michael    Silver
PHELPS, Michael      Gold
PHELPS, Michael    Silver
PHELPS, Michael      Gold
PHELPS, Michael      Gold
Name: Medal, dtype: object

In [78]:
# Row label plus a list of column labels: a DataFrame restricted to those columns.
medals.loc["PHELPS, Michael", ["Event","Medal"]]

Unnamed: 0_level_0,Event,Medal
Athlete,Unnamed: 1_level_1,Unnamed: 2_level_1
"PHELPS, Michael",100M Butterfly,Gold
"PHELPS, Michael",200M Butterfly,Gold
"PHELPS, Michael",200M Freestyle,Bronze
"PHELPS, Michael",200M Individual Medley,Gold
"PHELPS, Michael",400M Individual Medley,Gold
"PHELPS, Michael",4X100M Freestyle Relay,Bronze
"PHELPS, Michael",4X100M Medley Relay,Gold
"PHELPS, Michael",4X200M Freestyle Relay,Gold
"PHELPS, Michael",100M Butterfly,Gold
"PHELPS, Michael",200M Butterfly,Gold


In [79]:
# Lists on both axes: rows for either athlete, limited to the two listed columns.
medals.loc[["PHELPS, Michael", "LEWIS, Carl"], ["Event","Medal"]]

Unnamed: 0_level_0,Event,Medal
Athlete,Unnamed: 1_level_1,Unnamed: 2_level_1
"PHELPS, Michael",100M Butterfly,Gold
"PHELPS, Michael",200M Butterfly,Gold
"PHELPS, Michael",200M Freestyle,Bronze
"PHELPS, Michael",200M Individual Medley,Gold
"PHELPS, Michael",400M Individual Medley,Gold
"PHELPS, Michael",4X100M Freestyle Relay,Bronze
"PHELPS, Michael",4X100M Medley Relay,Gold
"PHELPS, Michael",4X200M Freestyle Relay,Gold
"PHELPS, Michael",100M Butterfly,Gold
"PHELPS, Michael",200M Butterfly,Gold


In [81]:
# Label slice over rows, from the first label to the second. Unlike positional
# slices, a loc slice includes the end label.
medals.loc["DRIVAS, Dimitrios":"BLAKE, Arthur"]

Unnamed: 0_level_0,Year,City,Sport,Discipline,Country,Gender,Event,Medal
Athlete,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
"DRIVAS, Dimitrios",1896,Athens,Aquatics,Swimming,GRE,Men,100M Freestyle For Sailors,Bronze
"MALOKINIS, Ioannis",1896,Athens,Aquatics,Swimming,GRE,Men,100M Freestyle For Sailors,Gold
"CHASAPIS, Spiridon",1896,Athens,Aquatics,Swimming,GRE,Men,100M Freestyle For Sailors,Silver
"CHOROPHAS, Efstathios",1896,Athens,Aquatics,Swimming,GRE,Men,1200M Freestyle,Bronze
"HAJOS, Alfred",1896,Athens,Aquatics,Swimming,HUN,Men,1200M Freestyle,Gold
"ANDREOU, Joannis",1896,Athens,Aquatics,Swimming,GRE,Men,1200M Freestyle,Silver
"CHOROPHAS, Efstathios",1896,Athens,Aquatics,Swimming,GRE,Men,400M Freestyle,Bronze
"NEUMANN, Paul",1896,Athens,Aquatics,Swimming,AUT,Men,400M Freestyle,Gold
"PEPANOS, Antonios",1896,Athens,Aquatics,Swimming,GRE,Men,400M Freestyle,Silver
"LANE, Francis",1896,Athens,Athletics,Athletics,USA,Men,100M,Bronze


In [82]:
# Label slice over columns: "Year" through "Discipline", with the end label included.
medals.loc["HAJOS, Alfred", "Year":"Discipline"]

Unnamed: 0_level_0,Year,City,Sport,Discipline
Athlete,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"HAJOS, Alfred",1896,Athens,Aquatics,Swimming
"HAJOS, Alfred",1896,Athens,Aquatics,Swimming


In [84]:
# Demonstration: loc raises KeyError when a requested column label ("Age") does
# not exist in the frame. The error is caught and printed so that the notebook
# still runs top to bottom with Restart & Run All.
try:
    medals.loc["PHELPS, Michael", ["Year", "Age"]]
except KeyError as err:
    print(f"{type(err).__name__}: {err}")

KeyError: "['Age'] not in index"

In [85]:
# Demonstration: loc raises KeyError when the requested row label ("Other") is
# not present in the index. The error is caught and printed so that the notebook
# still runs top to bottom with Restart & Run All.
try:
    medals.loc["Other", ["Year", "City"]]
except KeyError as err:
    print(f"{type(err).__name__}: {err}")


KeyError: 'Other'