In [1]:
# This is a tutorial on using Pandas with DataFrame
# all data are not changing because they are immutable operations

# this is importing all the necessary libaries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Waiting for the activating process in Python Environment

In [2]:
# The dataset to use:
# we can consider dataframe as a combination of series
df = pd.DataFrame({
	'Population': [35.467, 63.951, 80.94, 60.665, 127.061, 64.511, 318.523],
	'GDP': [
		1785387,
		2833687,
		3874437,
		2167744,
		4602367,
		2950039,
		17348075
		],
	'Surface': [
		9984678,
		640679,
		357114,
		301336,
		377930,
		242495,
		9525067
		],
	'HDI': [
		0.913,
		0.888,
		0.916,
		0.873,
		0.891,
		0.907,
		0.915
		],
	'Continent': [
		'America',
		'Europe',
		'Europe',
		'Europe',
		'Asia',
		'Europe',
		'America'
		]
}, columns=['Population', 'GDP', 'Surface Area', 'HDI', 'Continent'])
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
0,35.467,1785387,,0.913,America
1,63.951,2833687,,0.888,Europe
2,80.94,3874437,,0.916,Europe
3,60.665,2167744,,0.873,Europe
4,127.061,4602367,,0.891,Asia
5,64.511,2950039,,0.907,Europe
6,318.523,17348075,,0.915,America


In [3]:
# assigning an index (Countries) into the dataframe
df.index =[
    'Canada',
    'France',
    'England',
    'Spain',
    'Japan',
    'Russia',
    'United States'
]
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
Canada,35.467,1785387,,0.913,America
France,63.951,2833687,,0.888,Europe
England,80.94,3874437,,0.916,Europe
Spain,60.665,2167744,,0.873,Europe
Japan,127.061,4602367,,0.891,Asia
Russia,64.511,2950039,,0.907,Europe
United States,318.523,17348075,,0.915,America


In [14]:
# this is to check the info about the dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7 entries, Canada to United States
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Population    7 non-null      float64
 1   GDP           7 non-null      int64  
 2   Surface Area  0 non-null      object 
 3   HDI           7 non-null      float64
 4   Continent     7 non-null      object 
dtypes: float64(2), int64(1), object(2)
memory usage: 336.0+ bytes


In [10]:
# checking the size of the dataset
df.size

35

In [11]:
#checking the shape
df.shape

(7, 5)

In [12]:
#checking the columns
df.columns

Index(['Population', 'GDP', 'Surface Area', 'HDI', 'Continent'], dtype='object')

In [13]:
#checking the index
df.index

Index(['Canada', 'France', 'England', 'Spain', 'Japan', 'Russia',
       'United States'],
      dtype='object')

In [16]:
# describe give the aggregate as count, mean, std, min, %s and Max
df.describe()

Unnamed: 0,Population,GDP,HDI
count,7.0,7.0,7.0
mean,107.302571,5080248.0,0.900429
std,97.24997,5494020.0,0.016592
min,35.467,1785387.0,0.873
25%,62.308,2500716.0,0.8895
50%,64.511,2950039.0,0.907
75%,104.0005,4238402.0,0.914
max,318.523,17348080.0,0.916


In [17]:
# INDEXING, SELECTION AND SLICING
# This talks about how to be selecting data from the series (in this case the dataset)

# we print out dataframe out given it as df
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
Canada,35.467,1785387,,0.913,America
France,63.951,2833687,,0.888,Europe
England,80.94,3874437,,0.916,Europe
Spain,60.665,2167744,,0.873,Europe
Japan,127.061,4602367,,0.891,Asia
Russia,64.511,2950039,,0.907,Europe
United States,318.523,17348075,,0.915,America


In [19]:
# the .loc attribute will allow to get the data for each row, providing the index, 
df.loc['Canada']

Population       35.467
GDP             1785387
Surface Area        NaN
HDI               0.913
Continent       America
Name: Canada, dtype: object

In [20]:
# while .iloc will let you select by sequential position
df.iloc[-2]

Population       64.511
GDP             2950039
Surface Area        NaN
HDI               0.907
Continent        Europe
Name: Russia, dtype: object

In [22]:
# df[] is giving the extire column stated inside the lock (e.g, list of columns we have)
df['Population']

Canada            35.467
France            63.951
England           80.940
Spain             60.665
Japan            127.061
Russia            64.511
United States    318.523
Name: Population, dtype: float64

In [29]:
df.loc['France' : 'Russia', 'Population']

France      63.951
England     80.940
Spain       60.665
Japan      127.061
Russia      64.511
Name: Population, dtype: float64

In [30]:
df.loc['France' : 'Russia', ['Population', 'HDI']]

Unnamed: 0,Population,HDI
France,63.951,0.888
England,80.94,0.916
Spain,60.665,0.873
Japan,127.061,0.891
Russia,64.511,0.907


In [28]:
df.loc['France' : 'Russia']

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
France,63.951,2833687,,0.888,Europe
England,80.94,3874437,,0.916,Europe
Spain,60.665,2167744,,0.873,Europe
Japan,127.061,4602367,,0.891,Asia
Russia,64.511,2950039,,0.907,Europe


In [31]:
# CONDITIONAL SELECTION (Boolean Array)
# a way to filter data

df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
Canada,35.467,1785387,,0.913,America
France,63.951,2833687,,0.888,Europe
England,80.94,3874437,,0.916,Europe
Spain,60.665,2167744,,0.873,Europe
Japan,127.061,4602367,,0.891,Asia
Russia,64.511,2950039,,0.907,Europe
United States,318.523,17348075,,0.915,America


In [32]:
# to filter a population > 70
df['Population'] > 70

Canada           False
France           False
England           True
Spain            False
Japan             True
Russia           False
United States     True
Name: Population, dtype: bool

In [34]:
# using .loc works at the index level to populate all the dataset from the query given

df.loc[df['Population'] > 70]

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
England,80.94,3874437,,0.916,Europe
Japan,127.061,4602367,,0.891,Asia
United States,318.523,17348075,,0.915,America


In [40]:
# here we are defining the column we want us to give with .loc
df.loc[df['Population'] > 70, ['Population', 'GDP']]

Unnamed: 0,Population,GDP
England,80.94,3874437
Japan,127.061,4602367
United States,318.523,17348075


In [42]:
# Dropping stuff
# opposed to the concept of selection, we have "dropping", 
# instead of pointing out which values you'd like to select you could point which ones you'd like to drop:
df.drop('Canada')

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
France,63.951,2833687,,0.888,Europe
England,80.94,3874437,,0.916,Europe
Spain,60.665,2167744,,0.873,Europe
Japan,127.061,4602367,,0.891,Asia
Russia,64.511,2950039,,0.907,Europe
United States,318.523,17348075,,0.915,America


In [45]:
df.drop(['Canada' ,'France'])

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
England,80.94,3874437,,0.916,Europe
Spain,60.665,2167744,,0.873,Europe
Japan,127.061,4602367,,0.891,Asia
Russia,64.511,2950039,,0.907,Europe
United States,318.523,17348075,,0.915,America


In [46]:
# using .drop() to drop columns
df.drop(columns=['Population'])

Unnamed: 0,GDP,Surface Area,HDI,Continent
Canada,1785387,,0.913,America
France,2833687,,0.888,Europe
England,3874437,,0.916,Europe
Spain,2167744,,0.873,Europe
Japan,4602367,,0.891,Asia
Russia,2950039,,0.907,Europe
United States,17348075,,0.915,America


In [49]:
# OPERATIONS
# operations with series work at a column level, broadcasting down the rows (which can be counter intuitive)
df[['Population', 'HDI', 'GDP']]

Unnamed: 0,Population,HDI,GDP
Canada,35.467,0.913,1785387
France,63.951,0.888,2833687
England,80.94,0.916,3874437
Spain,60.665,0.873,2167744
Japan,127.061,0.891,4602367
Russia,64.511,0.907,2950039
United States,318.523,0.915,17348075


In [54]:
df[['Population', 'HDI', 'GDP']] / 100

Unnamed: 0,Population,HDI,GDP
Canada,0.35467,0.00913,17853.87
France,0.63951,0.00888,28336.87
England,0.8094,0.00916,38744.37
Spain,0.60665,0.00873,21677.44
Japan,1.27061,0.00891,46023.67
Russia,0.64511,0.00907,29500.39
United States,3.18523,0.00915,173480.75


In [55]:
df[['GDP','HDI']]

Unnamed: 0,GDP,HDI
Canada,1785387,0.913
France,2833687,0.888
England,3874437,0.916
Spain,2167744,0.873
Japan,4602367,0.891
Russia,2950039,0.907
United States,17348075,0.915


In [56]:
# creating a variable to add value to the column stated in the index
crisis = pd.Series([-1_000_000, -0.3], index=['GDP', 'HDI'])
crisis

GDP   -1000000.0
HDI         -0.3
dtype: float64

In [57]:
df[['GDP','HDI']] + crisis

Unnamed: 0,GDP,HDI
Canada,785387.0,0.613
France,1833687.0,0.588
England,2874437.0,0.616
Spain,1167744.0,0.573
Japan,3602367.0,0.591
Russia,1950039.0,0.607
United States,16348075.0,0.615


In [58]:
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
Canada,35.467,1785387,,0.913,America
France,63.951,2833687,,0.888,Europe
England,80.94,3874437,,0.916,Europe
Spain,60.665,2167744,,0.873,Europe
Japan,127.061,4602367,,0.891,Asia
Russia,64.511,2950039,,0.907,Europe
United States,318.523,17348075,,0.915,America


In [59]:
# all data are not changing because they are immutable operations
# Modifying Dataframes
# all data are immutable, there are ways to change it.
# it is simple and intuitive, you can add columns, or replace values for columns without issues;

df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
Canada,35.467,1785387,,0.913,America
France,63.951,2833687,,0.888,Europe
England,80.94,3874437,,0.916,Europe
Spain,60.665,2167744,,0.873,Europe
Japan,127.061,4602367,,0.891,Asia
Russia,64.511,2950039,,0.907,Europe
United States,318.523,17348075,,0.915,America


In [65]:
# adding a new Column to existing table
langs = pd.Series(
	['French', 'English', 'English', 'Japanese', 'Russian', 'Spanish', 'English'],
	index=['France', 'England', 'Canada', 'Japan', 'Russia', 'Spain', 'United States'],
	name='Language'
)
df['Language'] = langs
langs

France             French
England           English
Canada            English
Japan            Japanese
Russia            Russian
Spain             Spanish
United States     English
Name: Language, dtype: object

In [64]:
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent,Language
Canada,35.467,1785387,,0.913,America,English
France,63.951,2833687,,0.888,Europe,French
England,80.94,3874437,,0.916,Europe,English
Spain,60.665,2167744,,0.873,Europe,Spanish
Japan,127.061,4602367,,0.891,Asia,Japanese
Russia,64.511,2950039,,0.907,Europe,Russian
United States,318.523,17348075,,0.915,America,English


In [68]:
# Replacing values per column
df['Language'] = 'English'
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent,Language
Canada,35.467,1785387,,0.913,America,English
France,63.951,2833687,,0.888,Europe,English
England,80.94,3874437,,0.916,Europe,English
Spain,60.665,2167744,,0.873,Europe,English
Japan,127.061,4602367,,0.891,Asia,English
Russia,64.511,2950039,,0.907,Europe,English
United States,318.523,17348075,,0.915,America,English


In [72]:
df['Language'] = 'English'
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent,Language
Canada,35.467,1785387,,0.913,America,English
France,63.951,2833687,,0.888,Europe,English
England,80.94,3874437,,0.916,Europe,English
Spain,60.665,2167744,,0.873,Europe,English
Japan,127.061,4602367,,0.891,Asia,English
Russia,64.511,2950039,,0.907,Europe,English
United States,318.523,17348075,,0.915,America,English


In [73]:
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent,Language
Canada,35.467,1785387,,0.913,America,English
France,63.951,2833687,,0.888,Europe,English
England,80.94,3874437,,0.916,Europe,English
Spain,60.665,2167744,,0.873,Europe,English
Japan,127.061,4602367,,0.891,Asia,English
Russia,64.511,2950039,,0.907,Europe,English
United States,318.523,17348075,,0.915,America,English


In [77]:
# renaming column with df.rename()
df.rename(
columns={
	'HDI': 'Human Development Index',
	'GDP': 'Gross Domestic Product'
    },
index={'United States': 'US'})

Unnamed: 0,Population,Gross Domestic Product,Surface Area,Human Development Index,Continent,Language
Canada,35.467,1785387,,0.913,America,English
France,63.951,2833687,,0.888,Europe,English
England,80.94,3874437,,0.916,Europe,English
Spain,60.665,2167744,,0.873,Europe,English
Japan,127.061,4602367,,0.891,Asia,English
Russia,64.511,2950039,,0.907,Europe,English
US,318.523,17348075,,0.915,America,English


In [78]:
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent,Language
Canada,35.467,1785387,,0.913,America,English
France,63.951,2833687,,0.888,Europe,English
England,80.94,3874437,,0.916,Europe,English
Spain,60.665,2167744,,0.873,Europe,English
Japan,127.061,4602367,,0.891,Asia,English
Russia,64.511,2950039,,0.907,Europe,English
United States,318.523,17348075,,0.915,America,English


In [83]:
# Creating columns from other Columns
df[['Population', 'GDP']]


Unnamed: 0,Population,GDP
Canada,35.467,1785387
France,63.951,2833687
England,80.94,3874437
Spain,60.665,2167744
Japan,127.061,4602367
Russia,64.511,2950039
United States,318.523,17348075


In [84]:
df['GDP per Capita'] = df['Population']/df['GDP']
df['GDP per Capita']

Canada           0.000020
France           0.000023
England          0.000021
Spain            0.000028
Japan            0.000028
Russia           0.000022
United States    0.000018
Name: GDP per Capita, dtype: float64

In [85]:
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent,Language,GDP per Capita
Canada,35.467,1785387,,0.913,America,English,2e-05
France,63.951,2833687,,0.888,Europe,English,2.3e-05
England,80.94,3874437,,0.916,Europe,English,2.1e-05
Spain,60.665,2167744,,0.873,Europe,English,2.8e-05
Japan,127.061,4602367,,0.891,Asia,English,2.8e-05
Russia,64.511,2950039,,0.907,Europe,English,2.2e-05
United States,318.523,17348075,,0.915,America,English,1.8e-05


In [86]:
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent,Language,GDP per Capita
Canada,35.467,1785387,,0.913,America,English,2e-05
France,63.951,2833687,,0.888,Europe,English,2.3e-05
England,80.94,3874437,,0.916,Europe,English,2.1e-05
Spain,60.665,2167744,,0.873,Europe,English,2.8e-05
Japan,127.061,4602367,,0.891,Asia,English,2.8e-05
Russia,64.511,2950039,,0.907,Europe,English,2.2e-05
United States,318.523,17348075,,0.915,America,English,1.8e-05


In [90]:
# Statistical Info
# .describe(), population = df['Population'], population.sum(), 
#population.sum() / len(population), population.mean(), 
#population.std(), population.median(), population.desccribe(), population.quanatile(.25)

df.describe()

Unnamed: 0,Population,GDP,HDI,GDP per Capita,HDI per Population
count,7.0,7.0,7.0,7.0,7.0
mean,107.302571,5080248.0,0.900429,2.3e-05,97.06326
std,97.24997,5494020.0,0.016592,4e-06,89.264515
min,35.467,1785387.0,0.873,1.8e-05,32.381371
25%,62.308,2500716.0,0.8895,2e-05,54.874516
50%,64.511,2950039.0,0.907,2.2e-05,58.511477
75%,104.0005,4238402.0,0.914,2.5e-05,93.676196
max,318.523,17348080.0,0.916,2.8e-05,291.448545


In [91]:
population = df['Population']
population

Canada            35.467
France            63.951
England           80.940
Spain             60.665
Japan            127.061
Russia            64.511
United States    318.523
Name: Population, dtype: float64

In [92]:
population.mean()

107.30257142857144

In [93]:
population.sum() / len(population)

107.30257142857144

In [94]:
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent,Language,GDP per Capita,HDI per Population
Canada,35.467,1785387,,0.913,America,English,2e-05,32.381371
France,63.951,2833687,,0.888,Europe,English,2.3e-05,56.788488
England,80.94,3874437,,0.916,Europe,English,2.1e-05,74.14104
Spain,60.665,2167744,,0.873,Europe,English,2.8e-05,52.960545
Japan,127.061,4602367,,0.891,Asia,English,2.8e-05,113.211351
Russia,64.511,2950039,,0.907,Europe,English,2.2e-05,58.511477
United States,318.523,17348075,,0.915,America,English,1.8e-05,291.448545
