## Slicing and Indexing DataFrames

#### Explicit indexes

In [21]:
# importing pandas
import pandas as pd

# Load the temperatures dataset (path is relative to the notebook's directory)
temp = pd.read_csv("../datasets/temperatures.csv")
# Preview the first five rows
temp.head()

Unnamed: 0.1,Unnamed: 0,date,city,country,avg_temp_c
0,0,2000-01-01,Abidjan,Côte D'Ivoire,27.293
1,1,2000-02-01,Abidjan,Côte D'Ivoire,27.685
2,2,2000-03-01,Abidjan,Côte D'Ivoire,29.061
3,3,2000-04-01,Abidjan,Côte D'Ivoire,28.162
4,4,2000-05-01,Abidjan,Côte D'Ivoire,27.547


#### .columns and .index

In [22]:
# Column labels of the DataFrame, returned as an Index object
temp.columns

Index(['Unnamed: 0', 'date', 'city', 'country', 'avg_temp_c'], dtype='object')

In [23]:
# Row labels of the DataFrame (the row index object)
temp.index

RangeIndex(start=0, stop=16500, step=1)

#### Setting a column as the index

In [14]:
# Promote the "country" column to the row index; set_index returns a new
# DataFrame, so `temp` keeps its original columns.
temp_ind = temp.set_index(keys="country")
temp_ind

Unnamed: 0_level_0,Unnamed: 0,date,city,avg_temp_c
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Côte D'Ivoire,0,2000-01-01,Abidjan,27.293
Côte D'Ivoire,1,2000-02-01,Abidjan,27.685
Côte D'Ivoire,2,2000-03-01,Abidjan,29.061
Côte D'Ivoire,3,2000-04-01,Abidjan,28.162
Côte D'Ivoire,4,2000-05-01,Abidjan,27.547
...,...,...,...,...
China,16495,2013-05-01,Xian,18.979
China,16496,2013-06-01,Xian,23.522
China,16497,2013-07-01,Xian,25.251
China,16498,2013-08-01,Xian,24.528


#### Removing an index

In [12]:
# Move the "country" index back into a regular column and restore the default integer index
temp_ind.reset_index()

Unnamed: 0.1,country,Unnamed: 0,date,city,avg_temp_c
0,Côte D'Ivoire,0,2000-01-01,Abidjan,27.293
1,Côte D'Ivoire,1,2000-02-01,Abidjan,27.685
2,Côte D'Ivoire,2,2000-03-01,Abidjan,29.061
3,Côte D'Ivoire,3,2000-04-01,Abidjan,28.162
4,Côte D'Ivoire,4,2000-05-01,Abidjan,27.547
...,...,...,...,...,...
16495,China,16495,2013-05-01,Xian,18.979
16496,China,16496,2013-06-01,Xian,23.522
16497,China,16497,2013-07-01,Xian,25.251
16498,China,16498,2013-08-01,Xian,24.528


#### Dropping an index

In [7]:
# drop=True discards the current index entirely instead of turning it back into a column
temp_ind.reset_index(drop=True)

Unnamed: 0.1,Unnamed: 0,date,city,avg_temp_c
0,0,2000-01-01,Abidjan,27.293
1,1,2000-02-01,Abidjan,27.685
2,2,2000-03-01,Abidjan,29.061
3,3,2000-04-01,Abidjan,28.162
4,4,2000-05-01,Abidjan,27.547
...,...,...,...,...
16495,16495,2013-05-01,Xian,18.979
16496,16496,2013-06-01,Xian,23.522
16497,16497,2013-07-01,Xian,25.251
16498,16498,2013-08-01,Xian,24.528


#### Indexes make subsetting simpler

In [9]:
# Without an index: build a boolean mask over the "country" column,
# then keep only the rows where the mask is True.
is_china_or_australia = temp["country"].isin(["China", "Australia"])
temp[is_china_or_australia]

Unnamed: 0.1,Unnamed: 0,date,city,country,avg_temp_c
3135,3135,2000-01-01,Changchun,China,-18.759
3136,3136,2000-02-01,Changchun,China,-13.105
3137,3137,2000-03-01,Changchun,China,-1.089
3138,3138,2000-04-01,Changchun,China,7.297
3139,3139,2000-05-01,Changchun,China,16.970
...,...,...,...,...,...
16495,16495,2013-05-01,Xian,China,18.979
16496,16496,2013-06-01,Xian,China,23.522
16497,16497,2013-07-01,Xian,China,25.251
16498,16498,2013-08-01,Xian,China,24.528


In [15]:
# With "country" as the index, the same subset is a plain label lookup via .loc
temp_ind.loc[["China", "Australia"]]

Unnamed: 0_level_0,Unnamed: 0,date,city,avg_temp_c
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
China,3135,2000-01-01,Changchun,-18.759
China,3136,2000-02-01,Changchun,-13.105
China,3137,2000-03-01,Changchun,-1.089
China,3138,2000-04-01,Changchun,7.297
China,3139,2000-05-01,Changchun,16.970
...,...,...,...,...
Australia,15010,2013-05-01,Sydney,16.947
Australia,15011,2013-06-01,Sydney,15.911
Australia,15012,2013-07-01,Sydney,15.518
Australia,15013,2013-08-01,Sydney,16.126


#### Index values don't need to be unique

In [16]:
# Index by "city": each city appears once per month, so the index labels repeat.
temp_ind2 = temp.set_index(keys="city")
temp_ind2

Unnamed: 0_level_0,Unnamed: 0,date,country,avg_temp_c
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Abidjan,0,2000-01-01,Côte D'Ivoire,27.293
Abidjan,1,2000-02-01,Côte D'Ivoire,27.685
Abidjan,2,2000-03-01,Côte D'Ivoire,29.061
Abidjan,3,2000-04-01,Côte D'Ivoire,28.162
Abidjan,4,2000-05-01,Côte D'Ivoire,27.547
...,...,...,...,...
Xian,16495,2013-05-01,China,18.979
Xian,16496,2013-06-01,China,23.522
Xian,16497,2013-07-01,China,25.251
Xian,16498,2013-08-01,China,24.528


#### Subsetting on duplicated index values

In [17]:
# .loc with a label that occurs many times in the index returns every matching row
temp_ind2.loc["Bangkok"]

Unnamed: 0_level_0,Unnamed: 0,date,country,avg_temp_c
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Bangkok,1320,2000-01-01,Thailand,25.980
Bangkok,1321,2000-02-01,Thailand,26.564
Bangkok,1322,2000-03-01,Thailand,28.626
Bangkok,1323,2000-04-01,Thailand,28.881
Bangkok,1324,2000-05-01,Thailand,28.978
...,...,...,...,...
Bangkok,1480,2013-05-01,Thailand,30.927
Bangkok,1481,2013-06-01,Thailand,28.771
Bangkok,1482,2013-07-01,Thailand,28.155
Bangkok,1483,2013-08-01,Thailand,28.351


#### Multi-level indexes a.k.a. hierarchical indexes

In [27]:
# A list of columns produces a two-level index: "country" is the outer
# level and "city" is the inner level.
index_levels = ["country", "city"]
temp_ind3 = temp.set_index(index_levels)
temp_ind3

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 0,date,avg_temp_c
country,city,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Côte D'Ivoire,Abidjan,0,2000-01-01,27.293
Côte D'Ivoire,Abidjan,1,2000-02-01,27.685
Côte D'Ivoire,Abidjan,2,2000-03-01,29.061
Côte D'Ivoire,Abidjan,3,2000-04-01,28.162
Côte D'Ivoire,Abidjan,4,2000-05-01,27.547
...,...,...,...,...
China,Xian,16495,2013-05-01,18.979
China,Xian,16496,2013-06-01,23.522
China,Xian,16497,2013-07-01,25.251
China,Xian,16498,2013-08-01,24.528


In [28]:
# A list of plain labels subsets on the outer level ("country") of the multi-level index
temp_ind3.loc[["Germany", "China"]]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 0,date,avg_temp_c
country,city,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Germany,Berlin,1650,2000-01-01,1.324
Germany,Berlin,1651,2000-02-01,4.718
Germany,Berlin,1652,2000-03-01,5.806
Germany,Berlin,1653,2000-04-01,11.805
Germany,Berlin,1654,2000-05-01,16.376
...,...,...,...,...
China,Xian,16495,2013-05-01,18.979
China,Xian,16496,2013-06-01,23.522
China,Xian,16497,2013-07-01,25.251
China,Xian,16498,2013-08-01,24.528


#### Subset inner levels with a list of tuples

In [29]:
# Each tuple is one (country, city) pair, i.e. one label per index level.
rows_to_keep = [("Germany", "Berlin"), ("China", "Xian")]
temp_ind3.loc[rows_to_keep]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 0,date,avg_temp_c
country,city,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Germany,Berlin,1650,2000-01-01,1.324
Germany,Berlin,1651,2000-02-01,4.718
Germany,Berlin,1652,2000-03-01,5.806
Germany,Berlin,1653,2000-04-01,11.805
Germany,Berlin,1654,2000-05-01,16.376
...,...,...,...,...
China,Xian,16495,2013-05-01,18.979
China,Xian,16496,2013-06-01,23.522
China,Xian,16497,2013-07-01,25.251
China,Xian,16498,2013-08-01,24.528


#### Sorting by index values

In [30]:
# With no arguments, sort_index sorts every index level, outer to inner, in ascending order
temp_ind3.sort_index()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 0,date,avg_temp_c
country,city,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Afghanistan,Kabul,7260,2000-01-01,3.326
Afghanistan,Kabul,7261,2000-02-01,3.454
Afghanistan,Kabul,7262,2000-03-01,9.612
Afghanistan,Kabul,7263,2000-04-01,17.925
Afghanistan,Kabul,7264,2000-05-01,24.658
...,...,...,...,...
Zimbabwe,Harare,5605,2013-05-01,18.298
Zimbabwe,Harare,5606,2013-06-01,17.020
Zimbabwe,Harare,5607,2013-07-01,16.299
Zimbabwe,Harare,5608,2013-08-01,19.232


#### Controlling sort_index

In [32]:
# Sort countries A-Z, and within each country sort cities Z-A;
# `ascending` is matched to `level` position by position.
temp_ind3.sort_index(
    level=["country", "city"],
    ascending=[True, False],
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 0,date,avg_temp_c
country,city,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Afghanistan,Kabul,7260,2000-01-01,3.326
Afghanistan,Kabul,7261,2000-02-01,3.454
Afghanistan,Kabul,7262,2000-03-01,9.612
Afghanistan,Kabul,7263,2000-04-01,17.925
Afghanistan,Kabul,7264,2000-05-01,24.658
...,...,...,...,...
Zimbabwe,Harare,5605,2013-05-01,18.298
Zimbabwe,Harare,5606,2013-06-01,17.020
Zimbabwe,Harare,5607,2013-07-01,16.299
Zimbabwe,Harare,5608,2013-08-01,19.232


#### Now you have two problems
* Index values are just data
* Indexes violate "tidy data" principles
* You need to learn two syntaxes