# Selecting Rows and Columns

---

### 1. Read Data

In [1]:
import pandas as pd

In [2]:
# index_col=0 turns the first CSV column (the country names) into the row index,
# so rows can later be selected by country label with .loc.
df = pd.read_csv('large_countries_2015.csv', index_col=0)

### 2. Inspect Data

In [3]:
# (number of rows, number of columns)
df.shape

(12, 3)

In [4]:
# First three rows of the frame
df.head(3)

Unnamed: 0,population,fertility,continent
Bangladesh,160995600.0,2.12,Asia
Brazil,207847500.0,1.78,South America
China,1376049000.0,1.57,Asia


In [5]:
# Last three rows of the frame
df.tail(3)

Unnamed: 0,population,fertility,continent
Philippines,100699395.0,2.98,Asia
Russia,143456918.0,1.61,Europe
United States,321773631.0,1.97,North America


### 3. Select Columns

In [6]:
# A single column label inside [] returns that column as a Series
df["population"]

Bangladesh       1.609956e+08
Brazil           2.078475e+08
China            1.376049e+09
India            1.311051e+09
Indonesia        2.575638e+08
Japan            1.265735e+08
Mexico           1.270172e+08
Nigeria          1.822020e+08
Pakistan         1.889249e+08
Philippines      1.006994e+08
Russia           1.434569e+08
United States    3.217736e+08
Name: population, dtype: float64

In [7]:
# A list of column labels inside [] returns a DataFrame with just those columns.
col_names = ['population', 'fertility']
df[col_names].head(3)

Unnamed: 0,population,fertility
Bangladesh,160995600.0,2.12
Brazil,207847500.0,1.78
China,1376049000.0,1.57


### 4. Select Rows

In [8]:
# .loc with a single index label returns that row as a Series
df.loc['Brazil']

population      207847528.0
fertility              1.78
continent     South America
Name: Brazil, dtype: object

In [9]:
# .loc with a list of labels returns a DataFrame; rows come back in the order requested
df.loc[['Japan', 'China', 'Brazil']]

Unnamed: 0,population,fertility,continent
Japan,126573500.0,1.45,Asia
China,1376049000.0,1.57,Asia
Brazil,207847500.0,1.78,South America


In [10]:
# Use the continent as index instead of the country. The label 'Asia' then
# occurs on several rows, so .loc['Asia'] returns all of them as a DataFrame.
cont = df.set_index('continent')
cont.loc['Asia'].head(3)

Unnamed: 0_level_0,population,fertility
continent,Unnamed: 1_level_1,Unnamed: 2_level_1
Asia,160995600.0,2.12
Asia,1376049000.0,1.57
Asia,1311051000.0,2.43


In [11]:
# .iloc selects by 0-based integer position, independent of the index labels
df.iloc[[1, 3, 5]]  # by positions of the rows

Unnamed: 0,population,fertility,continent
Brazil,207847500.0,1.78,South America
India,1311051000.0,2.43,Asia
Japan,126573500.0,1.45,Asia


In [12]:
# Positional slice: start is included, stop is excluded -> positions 1, 2, 3, 4
df.iloc[1:5]        # slice

Unnamed: 0,population,fertility,continent
Brazil,207847500.0,1.78,South America
China,1376049000.0,1.57,Asia
India,1311051000.0,2.43,Asia
Indonesia,257563800.0,2.28,Asia


In [13]:
# Step of 2 with no start/stop: positions 0, 2, 4, ...
df.iloc[::2]  # every second row from original data

Unnamed: 0,population,fertility,continent
Bangladesh,160995600.0,2.12,Asia
China,1376049000.0,1.57,Asia
Indonesia,257563800.0,2.28,Asia
Mexico,127017200.0,2.13,North America
Pakistan,188924900.0,3.04,Asia
Russia,143456900.0,1.61,Europe


### 5. Select Both Rows and Columns

In [14]:
# .loc[row_labels, column_labels]: rows and columns are both selected by label,
# and both come back in the order listed.
df.loc[['Japan', 'China', 'Brazil'], ['continent', 'fertility']]

Unnamed: 0,continent,fertility
Japan,Asia,1.45
China,Asia,1.57
Brazil,South America,1.78


In [15]:
# .iloc[row_positions, column_positions]: rows 1-4 and columns 0-1 (stop positions are excluded)
df.iloc[1:5, 0:2]

Unnamed: 0,population,fertility
Brazil,207847500.0,1.78
China,1376049000.0,1.57
India,1311051000.0,2.43
Indonesia,257563800.0,2.28


### 6. Select by Conditions

In [16]:
# The comparison yields a boolean Series (True where continent is 'Asia').
# Assigning it adds a new column to df in place, so every later cell sees four columns.
df['in_asia'] = df['continent'] == 'Asia'

In [17]:
# Check that the new in_asia column is present
df.head(3)

Unnamed: 0,population,fertility,continent,in_asia
Bangladesh,160995600.0,2.12,Asia,True
Brazil,207847500.0,1.78,South America,False
China,1376049000.0,1.57,Asia,True


In [18]:
# Countries with strictly more than 250 million inhabitants.
is_high_pop = df['population'].gt(250_000_000)
high_pop = df.loc[is_high_pop]
high_pop.shape

(4, 4)

In [19]:
# Countries with 100 to 250 million inhabitants; both bounds are inclusive.
population = df['population']
mid_pop = df[(population >= 100_000_000) & (population <= 250_000_000)]
mid_pop.shape

(8, 4)

In [20]:
# Combine two boolean masks element-wise with & to keep rows that satisfy both conditions.
is_low_fert = df['fertility'] < 1.8
is_asian = df['continent'] == 'Asia'
low_fert_asia = df[is_low_fert & is_asian]
low_fert_asia.head(5)

Unnamed: 0,population,fertility,continent,in_asia
China,1376049000.0,1.57,Asia,True
Japan,126573500.0,1.45,Asia,True


## introduction_to_pandas.html :: Fix 5 Bugs

In [21]:
import pandas as pd

# Names and participant counts are index-aligned: participants[i] belongs to spices[i].
participants = [2, 6, 9, 9, 9, 8, 4]
spices = [
    'One-Hot Chili Peppers',
    'Bayesian Basil',
    'Tensor Thyme',
    'Linear Lavender',
    'Artificial Neural Nutmeg',
    'Polynomial Peppermint',
    'Sigmoid Saffron',
]

# Build the two-column frame from (name, participants) pairs.
df = pd.DataFrame(list(zip(spices, participants)), columns=['name', 'participants'])

# Largest groups first.
ranked = df.sort_values(by='participants', ascending=False)
print(ranked)

total = df['participants'].sum()
print("\ntotal participants:", total)

                       name  participants
2              Tensor Thyme             9
3           Linear Lavender             9
4  Artificial Neural Nutmeg             9
5     Polynomial Peppermint             8
1            Bayesian Basil             6
6           Sigmoid Saffron             4
0     One-Hot Chili Peppers             2

total participants: 47


## introduction_to_pandas.html :: Solve with One-Liners

In [22]:
# Shell escapes: print the kernel's working directory and list the data folder
# to check which CSV files are available before loading them.
!pwd
!ls -al data

/Users/maxim/codebase/python/spiced_projects/random-forest-fennel-encounter-notes/week_01
total 40
drwxr-xr-x   4 maxim  staff    128 Sep 28 15:15 .
drwxr-xr-x  16 maxim  staff    512 Sep 28 15:23 ..
-rw-r--r--@  1 maxim  staff   3597 Sep 28 15:15 continents.csv
-rw-r--r--   1 maxim  staff  12604 Sep 28 13:37 penguins_simple.csv


In [38]:
import pandas as pd

# 1. read the file into a DataFrame
# The path is relative to the notebook's working directory (the folder that
# contains `data/`), so the notebook runs on any machine without editing a
# user-specific absolute path. The file is semicolon-separated, hence sep=";".
df = pd.read_csv("data/continents.csv", sep=";")

In [39]:
# Sanity check: the object returned by read_csv is a DataFrame
type(df)

pandas.core.frame.DataFrame

In [40]:
# 2. display the number of rows and columns
df.shape  # (rows, columns)

(194, 2)

In [41]:
# 3. display the first 5 countries in the alphabet
# NOTE(review): head() returns the first five rows in file order. The rows
# appear to be grouped by continent (see the output), so these are the first
# African countries rather than the alphabetically-first countries overall.
# df.sort_values('country').head() would give the latter - confirm which was intended.
df.head()

Unnamed: 0,continent,country
0,Africa,Algeria
1,Africa,Angola
2,Africa,Benin
3,Africa,Botswana
4,Africa,Burkina


In [42]:
# Column labels of the frame
df.columns

Index(['continent', 'country'], dtype='object')

In [44]:
# 4. which continent has the most countries?
# value_counts() counts the rows per continent and sorts the counts in
# descending order, so the first entry is the answer.
df['continent'].value_counts()

Africa                   54
Europe                   47
Asia                     44
North America            23
Australia and Oceania    14
South America            12
Name: continent, dtype: int64

In [45]:
# 5. find out on which continent Cyprus is
# The boolean mask keeps only rows whose country equals 'Cyprus';
# the continent column of the result holds the answer.
df[df['country'] == 'Cyprus']

Unnamed: 0,continent,country
108,Europe,Cyprus


In [48]:
# 6. define a DataFrame with all African countries
# Row mask and column list go into a single .loc call; the list ['country']
# (rather than the bare label) keeps the result a one-column DataFrame.
is_african = df['continent'] == 'Africa'
africa = df.loc[is_african, ['country']]

In [49]:
# Display the African countries selected above
africa

Unnamed: 0,country
0,Algeria
1,Angola
2,Benin
3,Botswana
4,Burkina
5,Burundi
6,Cameroon
7,Cape Verde
8,Central African Republic
9,Chad


In [56]:
# 7. define a DataFrame with countries 80 through 100
# iloc slices exclude the stop position, so the stop has to be 101 for the
# row at position 100 to be part of the result (21 rows: 80, 81, ..., 100).
subset = df.iloc[80:101]

In [55]:
# Display the selected range of countries
subset

Unnamed: 0,continent,country
80,Asia,Nepal
81,Asia,Oman
82,Asia,Pakistan
83,Asia,Philippines
84,Asia,Qatar
85,Asia,Russian Federation
86,Asia,Saudi Arabia
87,Asia,Singapore
88,Asia,Sri Lanka
89,Asia,Syria


In [58]:
# 8. select every second country
# Positional step slice: takes rows 0, 2, 4, ... whatever the index labels are.
# (Filtering on `df.index % 2 == 0` gives the same rows here, but only works
# while the index is the default 0..n-1 integer range.)
even = df.iloc[::2]

In [59]:
# Display every second country
even

Unnamed: 0,continent,country
0,Africa,Algeria
2,Africa,Benin
4,Africa,Burkina
6,Africa,Cameroon
8,Africa,Central African Republic
...,...,...
184,South America,Brazil
186,South America,Colombia
188,South America,Guyana
190,South America,Peru
