In [1]:
import pandas as pd
import numpy as np

Creating `DataFrame`s manually can be tedious. 99% of the time you'll be pulling the data from a database, a CSV file, or the web. Still, you can create a `DataFrame` by specifying the columns and values:

In [2]:
# Column-oriented input: each key is a column label and each list holds that
# column's values, one entry per row.
country_data = {
    'Population': [35.467, 63.951, 80.94, 60.665, 127.061, 64.511, 318.523],
    'GDP': [1785387, 2833687, 3874437, 2167744, 4602367, 2950039, 17348075],
    'Surface': [9984670, 640679, 357114, 301336, 377930, 242495, 9525067],
    'HDI': [0.913, 0.888, 0.916, 0.873, 0.891, 0.907, 0.915],
    'Continent': ['America', 'Europe', 'Europe', 'Europe', 'Asia', 'Europe', 'America'],
}

# `columns` states the column order explicitly.
column_order = ['Population', 'GDP', 'Surface', 'HDI', 'Continent']
df = pd.DataFrame(country_data, columns=column_order)

df

Unnamed: 0,Population,GDP,Surface,HDI,Continent
0,35.467,1785387,9984670,0.913,America
1,63.951,2833687,640679,0.888,Europe
2,80.94,3874437,357114,0.916,Europe
3,60.665,2167744,301336,0.873,Europe
4,127.061,4602367,377930,0.891,Asia
5,64.511,2950039,242495,0.907,Europe
6,318.523,17348075,9525067,0.915,America


In [3]:
# Replace the default 0..6 integer index with one country label per row,
# listed in the same order as the rows.
df.index = ['Canada', 'France', 'Germany', 'Italy', 'Japan', 'UK', 'US']

In [4]:
# Display the frame again to confirm the new row labels.
df

Unnamed: 0,Population,GDP,Surface,HDI,Continent
Canada,35.467,1785387,9984670,0.913,America
France,63.951,2833687,640679,0.888,Europe
Germany,80.94,3874437,357114,0.916,Europe
Italy,60.665,2167744,301336,0.873,Europe
Japan,127.061,4602367,377930,0.891,Asia
UK,64.511,2950039,242495,0.907,Europe
US,318.523,17348075,9525067,0.915,America


In [5]:
# Column labels, returned as a pandas Index.
df.columns

Index(['Population', 'GDP', 'Surface', 'HDI', 'Continent'], dtype='object')

In [6]:
# Row labels (the country names), also returned as a pandas Index.
df.index

Index(['Canada', 'France', 'Germany', 'Italy', 'Japan', 'UK', 'US'], dtype='object')

In [7]:
# Attribute-style access returns the same Series as df['Population'].
# It only works for labels that are valid Python identifiers; a label such as
# 'GDP per Capita' (added later in this notebook) needs the bracket form.
df.Population

Canada      35.467
France      63.951
Germany     80.940
Italy       60.665
Japan      127.061
UK          64.511
US         318.523
Name: Population, dtype: float64

In [8]:
# An Index supports positional lookup: position 1 is the second row label.
df.index[1]

'France'

In [9]:
# Same positional lookup on the column labels: position 1 is the second column.
df.columns[1]

'GDP'

In [10]:
# Prints a structural summary: index range, non-null count and dtype of each
# column, and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7 entries, Canada to US
Data columns (total 5 columns):
Population    7 non-null float64
GDP           7 non-null int64
Surface       7 non-null int64
HDI           7 non-null float64
Continent     7 non-null object
dtypes: float64(2), int64(2), object(1)
memory usage: 656.0+ bytes


In [11]:
# Total number of elements: rows * columns (7 * 5 = 35).
df.size

35

In [12]:
# Dimensions as a (rows, columns) tuple.
df.shape

(7, 5)

In [13]:
# dtype of every column, as a Series indexed by column label.
df.dtypes

Population    float64
GDP             int64
Surface         int64
HDI           float64
Continent      object
dtype: object

In [14]:
# Summary statistics for the numeric columns; the text column 'Continent'
# is left out of the result.
df.describe()

Unnamed: 0,Population,GDP,Surface,HDI
count,7.0,7.0,7.0,7.0
mean,107.302571,5080248.0,3061327.0,0.900429
std,97.24997,5494020.0,4576187.0,0.016592
min,35.467,1785387.0,242495.0,0.873
25%,62.308,2500716.0,329225.0,0.8895
50%,64.511,2950039.0,377930.0,0.907
75%,104.0005,4238402.0,5082873.0,0.914
max,318.523,17348080.0,9984670.0,0.916


## Indexing, Selection, and Slicing

In [15]:
# Bracket selection with a single label returns that column as a Series.
df['Population']

Canada      35.467
France      63.951
Germany     80.940
Italy       60.665
Japan      127.061
UK          64.511
US         318.523
Name: Population, dtype: float64

Note that the `index` of the returned `Series` is the same as the `DataFrame`'s, and its `name` is the name of the column. If you're working in a notebook and want to see a more `DataFrame`-like format, you can use the `to_frame` method:

In [16]:
# to_frame() wraps the Series in a one-column DataFrame; the Series name
# becomes the column label and the index is kept.
df['Population'].to_frame()

Unnamed: 0,Population
Canada,35.467
France,63.951
Germany,80.94
Italy,60.665
Japan,127.061
UK,64.511
US,318.523


Multiple columns can also be selected by passing a list of labels, similarly to `numpy` arrays and `Series`:

In [17]:
# A list of labels selects several columns and returns a DataFrame.
df[['Population', 'GDP']]

Unnamed: 0,Population,GDP
Canada,35.467,1785387
France,63.951,2833687
Germany,80.94,3874437
Italy,60.665,2167744
Japan,127.061,4602367
UK,64.511,2950039
US,318.523,17348075


Slicing works differently: it acts at the row level, which can be counterintuitive:

In [18]:
df[1:3]  # Positional row slice: selects rows 1 and 2 (the stop position, 3, is excluded).

Unnamed: 0,Population,GDP,Surface,HDI,Continent
France,63.951,2833687,640679,0.888,Europe
Germany,80.94,3874437,357114,0.916,Europe


Select rows with `loc` and `iloc`:

In [19]:
# .loc selects by label: the row labelled 'Canada', returned as a Series
# indexed by column name.
df.loc['Canada']

Population     35.467
GDP           1785387
Surface       9984670
HDI             0.913
Continent     America
Name: Canada, dtype: object

In [20]:
# .iloc selects by integer position: position 1 is the second row (France).
df.iloc[1]

Population     63.951
GDP           2833687
Surface        640679
HDI             0.888
Continent      Europe
Name: France, dtype: object

In [21]:
# A slice returns a DataFrame, even when it covers a single row.
df.iloc[1:2]

Unnamed: 0,Population,GDP,Surface,HDI,Continent
France,63.951,2833687,640679,0.888,Europe


In [22]:
# Negative positions count from the end: -1 is the last row (US).
df.iloc[-1]

Population     318.523
GDP           17348075
Surface        9525067
HDI              0.915
Continent      America
Name: US, dtype: object

In [23]:
# Keep the last row as a Series, then show it as a one-column DataFrame;
# the column is named after the row label ('US').
US = df.iloc[-1]
US.to_frame()

Unnamed: 0,US
Population,318.523
GDP,17348075
Surface,9525067
HDI,0.915
Continent,America


In [24]:
# Slicing from the last position onward gives the same row as a one-row DataFrame.
df.iloc[-1:]

Unnamed: 0,Population,GDP,Surface,HDI,Continent
US,318.523,17348075,9525067,0.915,America


In [25]:
# A list of positions picks arbitrary rows, in the order listed.
df.iloc[[0, 1, -1]]

Unnamed: 0,Population,GDP,Surface,HDI,Continent
Canada,35.467,1785387,9984670,0.913,America
France,63.951,2833687,640679,0.888,Europe
US,318.523,17348075,9525067,0.915,America


In [26]:
df.iloc[1:3, 3]  # Rows at positions 1-2, column at position 3 (HDI); a single column comes back as a Series.

France     0.888
Germany    0.916
Name: HDI, dtype: float64

In [27]:
df.iloc[1:3, [1, 3]]  # Rows at positions 1-2, columns at positions 1 and 3 (GDP and HDI).

Unnamed: 0,GDP,HDI
France,2833687,0.888
Germany,3874437,0.916


In [28]:
# Slices on both axes: rows at positions 1-2 and columns at positions 1-2
# (GDP and Surface); the stop position is excluded on each axis.
df.iloc[1:3, 1:3]

Unnamed: 0,GDP,Surface
France,2833687,640679
Germany,3874437,357114


## Conditional Selection (boolean arrays)
We saw conditional selection applied to `Series` and it'll work the same for `DataFrame`s. After all, a `DataFrame` is a collection of `Series`:

In [29]:
# Show the full frame as a reference for the filters that follow.
df

Unnamed: 0,Population,GDP,Surface,HDI,Continent
Canada,35.467,1785387,9984670,0.913,America
France,63.951,2833687,640679,0.888,Europe
Germany,80.94,3874437,357114,0.916,Europe
Italy,60.665,2167744,301336,0.873,Europe
Japan,127.061,4602367,377930,0.891,Asia
UK,64.511,2950039,242495,0.907,Europe
US,318.523,17348075,9525067,0.915,America


In [30]:
# Element-wise comparison: yields a boolean Series with the same index as df.
df['Population'] > 70

Canada     False
France     False
Germany     True
Italy      False
Japan       True
UK         False
US          True
Name: Population, dtype: bool

In [31]:
# Passing the boolean Series to .loc keeps only the rows where it is True.
df.loc[df['Population'] > 70]

Unnamed: 0,Population,GDP,Surface,HDI,Continent
Germany,80.94,3874437,357114,0.916,Europe
Japan,127.061,4602367,377930,0.891,Asia
US,318.523,17348075,9525067,0.915,America


The boolean matching is done at index level, so you can filter with any boolean `Series`, as long as it contains the right index labels. Column selection still works as expected:

In [32]:
# Boolean row filter combined with a single column label: returns a Series.
df.loc[df['Population'] > 70, 'Population']

Germany     80.940
Japan      127.061
US         318.523
Name: Population, dtype: float64

In [33]:
# Boolean row filter combined with a list of column labels: returns a DataFrame.
df.loc[df['Population'] > 70, ['Population', 'GDP']]

Unnamed: 0,Population,GDP
Germany,80.94,3874437
Japan,127.061,4602367
US,318.523,17348075


## Dropping Stuff
As opposed to selection, we have 'dropping'. Instead of pointing out which values you'd like to *select*, you can point out which ones you'd like to `drop`:

In [34]:
# Returns a new DataFrame without the 'Canada' row; df itself is not modified.
df.drop('Canada')

Unnamed: 0,Population,GDP,Surface,HDI,Continent
France,63.951,2833687,640679,0.888,Europe
Germany,80.94,3874437,357114,0.916,Europe
Italy,60.665,2167744,301336,0.873,Europe
Japan,127.061,4602367,377930,0.891,Asia
UK,64.511,2950039,242495,0.907,Europe
US,318.523,17348075,9525067,0.915,America


In [35]:
# A list of labels drops several rows at once.
df.drop(['Canada', 'Japan'])

Unnamed: 0,Population,GDP,Surface,HDI,Continent
France,63.951,2833687,640679,0.888,Europe
Germany,80.94,3874437,357114,0.916,Europe
Italy,60.665,2167744,301336,0.873,Europe
UK,64.511,2950039,242495,0.907,Europe
US,318.523,17348075,9525067,0.915,America


In [36]:
# The columns= keyword drops by column label instead of row label.
df.drop(columns=['Population', 'HDI'])

Unnamed: 0,GDP,Surface,Continent
Canada,1785387,9984670,America
France,2833687,640679,Europe
Germany,3874437,357114,Europe
Italy,2167744,301336,Europe
Japan,4602367,377930,Asia
UK,2950039,242495,Europe
US,17348075,9525067,America


In [37]:
# axis=0 targets row labels; this is what drop() does when no axis is given.
df.drop(['Italy', 'Canada'], axis=0)

Unnamed: 0,Population,GDP,Surface,HDI,Continent
France,63.951,2833687,640679,0.888,Europe
Germany,80.94,3874437,357114,0.916,Europe
Japan,127.061,4602367,377930,0.891,Asia
UK,64.511,2950039,242495,0.907,Europe
US,318.523,17348075,9525067,0.915,America


In [38]:
# axis=1 targets column labels; same result as drop(columns=[...]).
df.drop(['Population', 'HDI'], axis=1)

Unnamed: 0,GDP,Surface,Continent
Canada,1785387,9984670,America
France,2833687,640679,Europe
Germany,3874437,357114,Europe
Italy,2167744,301336,Europe
Japan,4602367,377930,Asia
UK,2950039,242495,Europe
US,17348075,9525067,America


In [39]:
# axis='columns' is the named equivalent of axis=1.
df.drop(['Population', 'HDI'], axis='columns')

Unnamed: 0,GDP,Surface,Continent
Canada,1785387,9984670,America
France,2833687,640679,Europe
Germany,3874437,357114,Europe
Italy,2167744,301336,Europe
Japan,4602367,377930,Asia
UK,2950039,242495,Europe
US,17348075,9525067,America


In [40]:
# axis='rows' behaves like axis=0 here ('index' is the spelling used in the
# pandas documentation).
df.drop(['Italy', 'Canada'], axis='rows')

Unnamed: 0,Population,GDP,Surface,HDI,Continent
France,63.951,2833687,640679,0.888,Europe
Germany,80.94,3874437,357114,0.916,Europe
Japan,127.061,4602367,377930,0.891,Asia
UK,64.511,2950039,242495,0.907,Europe
US,318.523,17348075,9525067,0.915,America


# Modifying DataFrames
Modifying a `DataFrame` is simple and intuitive: you can add columns or replace the values of a column without issues.
## Adding a new column

In [41]:
# Languages for a subset of the countries only. The Series is indexed by
# country name so it can be aligned with df's row labels when it is assigned
# as a column.
langs = pd.Series(
    ['French', 'German', 'Italian'],
    index=['France', 'Germany', 'Italy'],
    name='Language',  # same spelling as the column it is stored under
)

In [42]:
# Display the Series; it has entries for only three of the seven countries.
langs

France      French
Germany     German
Italy      Italian
Name: Languange, dtype: object

In [43]:
# Assignment aligns on index labels: countries that are missing from langs
# get a missing value (NaN) in the new column.
df['Language'] = langs
df

Unnamed: 0,Population,GDP,Surface,HDI,Continent,Language
Canada,35.467,1785387,9984670,0.913,America,
France,63.951,2833687,640679,0.888,Europe,French
Germany,80.94,3874437,357114,0.916,Europe,German
Italy,60.665,2167744,301336,0.873,Europe,Italian
Japan,127.061,4602367,377930,0.891,Asia,
UK,64.511,2950039,242495,0.907,Europe,
US,318.523,17348075,9525067,0.915,America,


## Replacing Values per Column

In [44]:
# Assigning a scalar broadcasts it to every row, overwriting the existing column.
df['Language'] = 'English'
df

Unnamed: 0,Population,GDP,Surface,HDI,Continent,Language
Canada,35.467,1785387,9984670,0.913,America,English
France,63.951,2833687,640679,0.888,Europe,English
Germany,80.94,3874437,357114,0.916,Europe,English
Italy,60.665,2167744,301336,0.873,Europe,English
Japan,127.061,4602367,377930,0.891,Asia,English
UK,64.511,2950039,242495,0.907,Europe,English
US,318.523,17348075,9525067,0.915,America,English


## Renaming Columns

In [45]:
# rename() returns a new DataFrame with relabelled columns and rows; the
# result is not assigned here, so df keeps its original labels.
df.rename(
    columns={
        # 'Anual Popcorn Consumption' is not a column of df: keys that match
        # no existing label are skipped without raising an error.
        'HDI': 'Human Development Index', 'Anual Popcorn Consumption': 'APC'
    }, index ={'UK': 'United Kingdom', 'US': 'United States'}
)

Unnamed: 0,Population,GDP,Surface,Human Development Index,Continent,Language
Canada,35.467,1785387,9984670,0.913,America,English
France,63.951,2833687,640679,0.888,Europe,English
Germany,80.94,3874437,357114,0.916,Europe,English
Italy,60.665,2167744,301336,0.873,Europe,English
Japan,127.061,4602367,377930,0.891,Asia,English
United Kingdom,64.511,2950039,242495,0.907,Europe,English
United States,318.523,17348075,9525067,0.915,America,English


In [46]:
# A callable is applied to every index label; the result is a new frame and
# df is left unchanged.
df.rename(index = str.upper)

Unnamed: 0,Population,GDP,Surface,HDI,Continent,Language
CANADA,35.467,1785387,9984670,0.913,America,English
FRANCE,63.951,2833687,640679,0.888,Europe,English
GERMANY,80.94,3874437,357114,0.916,Europe,English
ITALY,60.665,2167744,301336,0.873,Europe,English
JAPAN,127.061,4602367,377930,0.891,Asia,English
UK,64.511,2950039,242495,0.907,Europe,English
US,318.523,17348075,9525067,0.915,America,English


In [47]:
# Any callable works as the mapper, including a lambda.
df.rename(index = lambda x: x.lower())

Unnamed: 0,Population,GDP,Surface,HDI,Continent,Language
canada,35.467,1785387,9984670,0.913,America,English
france,63.951,2833687,640679,0.888,Europe,English
germany,80.94,3874437,357114,0.916,Europe,English
italy,60.665,2167744,301336,0.873,Europe,English
japan,127.061,4602367,377930,0.891,Asia,English
uk,64.511,2950039,242495,0.907,Europe,English
us,318.523,17348075,9525067,0.915,America,English


## Dropping Columns

In [48]:
# Remove the demo 'Language' column. Rebinding the result of drop() replaces
# inplace=True: the effect on `df` is the same here, it reads like the
# non-mutating drop() calls shown earlier, and it supports method chaining.
df = df.drop(columns='Language')
df

Unnamed: 0,Population,GDP,Surface,HDI,Continent
Canada,35.467,1785387,9984670,0.913,America
France,63.951,2833687,640679,0.888,Europe
Germany,80.94,3874437,357114,0.916,Europe
Italy,60.665,2167744,301336,0.873,Europe
Japan,127.061,4602367,377930,0.891,Asia
UK,64.511,2950039,242495,0.907,Europe
US,318.523,17348075,9525067,0.915,America


## Creating Columns from Other Columns
Altering a DataFrame often involves combining different columns into a new one. For example, in our countries analysis, we could calculate the 'GDP per capita', which is simply `GDP/Population`.

In [49]:
# The two input columns for the GDP-per-capita calculation.
df[['Population', 'GDP']]

Unnamed: 0,Population,GDP
Canada,35.467,1785387
France,63.951,2833687
Germany,80.94,3874437
Italy,60.665,2167744
Japan,127.061,4602367
UK,64.511,2950039
US,318.523,17348075


In [50]:
# Element-wise division of two columns, matched by index label; the
# resulting Series has no name.
df['GDP']/df['Population']

Canada     50339.385908
France     44310.284437
Germany    47868.013343
Italy      35733.025633
Japan      36221.712406
UK         45729.239975
US         54464.120330
dtype: float64

In [51]:
# Store the computed Series as a new column of df.
df['GDP per Capita'] = df['GDP']/df['Population']
df

Unnamed: 0,Population,GDP,Surface,HDI,Continent,GDP per Capita
Canada,35.467,1785387,9984670,0.913,America,50339.385908
France,63.951,2833687,640679,0.888,Europe,44310.284437
Germany,80.94,3874437,357114,0.916,Europe,47868.013343
Italy,60.665,2167744,301336,0.873,Europe,35733.025633
Japan,127.061,4602367,377930,0.891,Asia,36221.712406
UK,64.511,2950039,242495,0.907,Europe,45729.239975
US,318.523,17348075,9525067,0.915,America,54464.12033


## Statistical Info
You've already seen the `describe` method, which gives you a good 'summary' of the `DataFrame`. Let's explore other methods in more detail:

In [52]:
# First five rows (the default when head() is called without an argument).
df.head()

Unnamed: 0,Population,GDP,Surface,HDI,Continent,GDP per Capita
Canada,35.467,1785387,9984670,0.913,America,50339.385908
France,63.951,2833687,640679,0.888,Europe,44310.284437
Germany,80.94,3874437,357114,0.916,Europe,47868.013343
Italy,60.665,2167744,301336,0.873,Europe,35733.025633
Japan,127.061,4602367,377930,0.891,Asia,36221.712406


In [53]:
# The summary now also covers the new numeric 'GDP per Capita' column.
df.describe()

Unnamed: 0,Population,GDP,Surface,HDI,GDP per Capita
count,7.0,7.0,7.0,7.0,7.0
mean,107.302571,5080248.0,3061327.0,0.900429,44952.254576
std,97.24997,5494020.0,4576187.0,0.016592,6954.983875
min,35.467,1785387.0,242495.0,0.873,35733.025633
25%,62.308,2500716.0,329225.0,0.8895,40265.998421
50%,64.511,2950039.0,377930.0,0.907,45729.239975
75%,104.0005,4238402.0,5082873.0,0.914,49103.699626
max,318.523,17348080.0,9984670.0,0.916,54464.12033


In [54]:
# Keep the 'Population' column as a Series for the single-column statistics below.
population = df['Population']

In [55]:
# Smallest and largest values, displayed together as a tuple.
population.min(), population.max()

(35.467, 318.523)

In [56]:
# Sum of all values in the column.
population.sum()

751.118

In [57]:
# Number of entries in the Series (one per country).
len(population)

7

In [58]:
# Arithmetic mean computed by hand: total divided by the number of entries.
population.sum()/len(population)

107.30257142857144

In [59]:
# Built-in mean; gives the same value as sum()/len() for this column.
population.mean()

107.30257142857144

In [60]:
# Standard deviation of the column (pandas uses the sample formula, ddof=1,
# by default).
population.std()

97.24996987121581

In [61]:
# Median: the same value as the 50% row reported by describe().
population.median()

64.511

In [62]:
# describe() on a Series returns the same statistics for that single column.
population.describe()

count      7.000000
mean     107.302571
std       97.249970
min       35.467000
25%       62.308000
50%       64.511000
75%      104.000500
max      318.523000
Name: Population, dtype: float64

In [63]:
# 25th percentile; matches the 25% row reported by describe().
population.quantile(.25)

62.308

In [64]:
# A list of quantiles returns a Series indexed by the requested quantile;
# quantile 1 is the maximum.
population.quantile([.2, .4, .6, .8, 1])

0.2     61.3222
0.4     64.1750
0.6     74.3684
0.8    117.8368
1.0    318.5230
Name: Population, dtype: float64