In [320]:
import pandas as pd
import numpy as np

Creating a series out of a list

In [321]:
x = pd.Series([10, 20, 30, 40, 50])
x

0    10
1    20
2    30
3    40
4    50
dtype: int64

Allows us to assign the Series our own index

In [322]:
x.index = ['A', 'B', 'C', 'D', 'E']
x

A    10
B    20
C    30
D    40
E    50
dtype: int64

Creating a Series with label-based index

In [323]:
data = [450, 650, 870]
Sales = pd.Series(data, index=['Don', 'Mike', 'Edwin'])
Sales

Don      450
Mike     650
Edwin    870
dtype: int64

Accessing the Index

In [324]:
print(Sales.index)

Index(['Don', 'Mike', 'Edwin'], dtype='object')


Accessing the Values

In [325]:
print(f'Values are {Sales.values}')
print(f'Data type stored in Series is {Sales.dtype}')

Values are [450 650 870]
Data type stored in Series is int64


Accessing specific information

In [326]:
Sales[1]

650

In [327]:
Sales['Mike']

650

In [328]:
Sales > 500

Don      False
Mike      True
Edwin     True
dtype: bool

In [329]:
Sales[Sales > 500]

Mike     650
Edwin    870
dtype: int64

Lets you turn a series into a Dictionary, useful for json

In [330]:
sales_dict = Sales.to_dict()
sales_dict

{'Don': 450, 'Mike': 650, 'Edwin': 870}

In [331]:
sales_ser = pd.Series(sales_dict)
sales_ser

Don      450
Mike     650
Edwin    870
dtype: int64

Creating a new Series from an already existing series. NaN is 'Not a Number'

In [332]:
new_sales = pd.Series(Sales, index=['Don', 'Mike', 'Sally', 'Edwin', 'Lucy'])
new_sales

Don      450.0
Mike     650.0
Sally      NaN
Edwin    870.0
Lucy       NaN
dtype: float64

Checking NaN using numpy. Specifically seeing if 'Sally' has a NaN value

In [333]:
np.isnan(new_sales['Sally'])

True

Checking NaN using pandas. Looking at the entire Series, checking which have NaN values

In [334]:
pd.isna(new_sales)

Don      False
Mike     False
Sally     True
Edwin    False
Lucy      True
dtype: bool

Identifying which keys don't have null values using pandas

In [335]:
new_sales[pd.notna(new_sales)]

Don      450.0
Mike     650.0
Edwin    870.0
dtype: float64

### Pandas DataFrames
Creating a dataframe from a dictionary\
Pandas will assign automatic row labels (index)

In [336]:
new_dict = {
    'Name': ['Tom', 'Jane', 'Steve', 'Lucy'],
    'Sales': [200, 500, 350, 400],
    'Date': [2022, 2020, 2021, 2022]
}

df = pd.DataFrame(new_dict)
df

Unnamed: 0,Name,Sales,Date
0,Tom,200,2022
1,Jane,500,2020
2,Steve,350,2021
3,Lucy,400,2022


Adding a custom index

In [337]:
df_index = ['rank1', 'rank2', 'rank3', 'rank4']

df = pd.DataFrame(new_dict, df_index)
df

Unnamed: 0,Name,Sales,Date
rank1,Tom,200,2022
rank2,Jane,500,2020
rank3,Steve,350,2021
rank4,Lucy,400,2022


Naming the index column in the DataFrame\
A DataFrame can have multiple index

In [338]:
df.index.name = 'Rank'
df

Unnamed: 0_level_0,Name,Sales,Date
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
rank1,Tom,200,2022
rank2,Jane,500,2020
rank3,Steve,350,2021
rank4,Lucy,400,2022


The index can also be reset\
The previous custom index now becomes a normal column\
This allows you to retain the index when transferring the DataFrame to other SQL servers\
This reset hasn't been assigned until it has been redefined using either '.reset_index('inplace=True') or assign it to a new variable like 'df2 = df.reset_index()'

In [339]:
df.reset_index()

Unnamed: 0,Rank,Name,Sales,Date
0,rank1,Tom,200,2022
1,rank2,Jane,500,2020
2,rank3,Steve,350,2021
3,rank4,Lucy,400,2022


In [340]:
df_noindex = df.reset_index()
df_noindex

Unnamed: 0,Rank,Name,Sales,Date
0,rank1,Tom,200,2022
1,rank2,Jane,500,2020
2,rank3,Steve,350,2021
3,rank4,Lucy,400,2022


In [341]:
df.columns # Shows the name of the columns
df.index # Shows the the index's and information about them
df.values # Shows the values of the data, in an array

array([['Tom', 200, 2022],
       ['Jane', 500, 2020],
       ['Steve', 350, 2021],
       ['Lucy', 400, 2022]], dtype=object)

In [342]:
new_dict_v2 = {
    'Name':['Tom', 'Jane', 'Steve', 'Lucy'],
    'Sales':[250, 500, 350, 400],
    'Date': [2022, 2020, 2021, 2022],
    'Rank': ['rank1', 'rank2', 'rank3', 'rank4']
    }


df2 = pd.DataFrame(new_dict_v2)

Assign a existing column as the index\
Doesn't overwrite the current DataFrame until re-defined or using argument 'inplace=True'

In [343]:
df2.set_index('Rank')

Unnamed: 0_level_0,Name,Sales,Date
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
rank1,Tom,250,2022
rank2,Jane,500,2020
rank3,Steve,350,2021
rank4,Lucy,400,2022


Multi Level indexes (hierachical indexes)

In [344]:
df2.set_index(['Rank', 'Name'], inplace=True)
df2

Unnamed: 0_level_0,Unnamed: 1_level_0,Sales,Date
Rank,Name,Unnamed: 2_level_1,Unnamed: 3_level_1
rank1,Tom,250,2022
rank2,Jane,500,2020
rank3,Steve,350,2021
rank4,Lucy,400,2022


Allows you to reset a specific index using 'level=""' argument

In [345]:
df2.reset_index(level='Name')

Unnamed: 0_level_0,Name,Sales,Date
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
rank1,Tom,250,2022
rank2,Jane,500,2020
rank3,Steve,350,2021
rank4,Lucy,400,2022


Sorting\
Name by descending and Rank by ascending

In [346]:
df2.sort_index(level=['Name', 'Rank'], ascending=[False, True])

Unnamed: 0_level_0,Unnamed: 1_level_0,Sales,Date
Rank,Name,Unnamed: 2_level_1,Unnamed: 3_level_1
rank1,Tom,250,2022
rank3,Steve,350,2021
rank4,Lucy,400,2022
rank2,Jane,500,2020


### Subsetting DataFrame
Having indexes makes subsetting simpler\
Column names can be access using 'DataFrame_name.Column_name'\
You can access columns, rows and single elements in your DataFrame a variety of ways
* Square Brackets
* loc
* iloc


In [347]:
df

Unnamed: 0_level_0,Name,Sales,Date
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
rank1,Tom,200,2022
rank2,Jane,500,2020
rank3,Steve,350,2021
rank4,Lucy,400,2022


Square Brackets\
Double brackets allows you to return the column as a DataFrame instead of a series

In [348]:
df[['Name']]

Unnamed: 0_level_0,Name
Rank,Unnamed: 1_level_1
rank1,Tom
rank2,Jane
rank3,Steve
rank4,Lucy


In [349]:
df[1:3]

Unnamed: 0_level_0,Name,Sales,Date
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
rank2,Jane,500,2020
rank3,Steve,350,2021


Slicing with multiple conditions

In [350]:
df[(df.Sales > 300) & (df.Date > 2020)]

Unnamed: 0_level_0,Name,Sales,Date
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
rank3,Steve,350,2021
rank4,Lucy,400,2022


isin() lets you check if the arguements you have given are in a specific column

In [351]:
df[df.Date.isin([2020, 2022])]

Unnamed: 0_level_0,Name,Sales,Date
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
rank1,Tom,200,2022
rank2,Jane,500,2020
rank4,Lucy,400,2022


## loc - Label based access
loc[row_lable, column_label]

In [352]:
df

Unnamed: 0_level_0,Name,Sales,Date
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
rank1,Tom,200,2022
rank2,Jane,500,2020
rank3,Steve,350,2021
rank4,Lucy,400,2022


In [353]:
df.loc[['rank2'], ['Name', 'Sales']] # Data for rank 2

Unnamed: 0_level_0,Name,Sales
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1
rank2,Jane,500


In [354]:
df.loc[:, ['Name', 'Sales']] # All rows, but only Name and Sales columns

Unnamed: 0_level_0,Name,Sales
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1
rank1,Tom,200
rank2,Jane,500
rank3,Steve,350
rank4,Lucy,400


In [355]:
df.loc['rank1':'rank3', ['Name', 'Sales']] # Rows rank 1 to rank3 and columns Name & Sales

Unnamed: 0_level_0,Name,Sales
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1
rank1,Tom,200
rank2,Jane,500
rank3,Steve,350


In [356]:
df.loc[df.Sales > 300, ['Name', 'Sales']] # Filters the rows and selects specific columns

Unnamed: 0_level_0,Name,Sales
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1
rank2,Jane,500
rank3,Steve,350
rank4,Lucy,400


## iloc - Index based access
iloc[row_position, column_position]

In [357]:
df = pd.read_csv('gapminder.csv')
df

Unnamed: 0.1,Unnamed: 0,country,year,population,cont,life_exp,gdp_cap
0,11,Afghanistan,2007,31889923.0,Asia,43.828,974.580338
1,23,Albania,2007,3600523.0,Europe,76.423,5937.029526
2,35,Algeria,2007,33333216.0,Africa,72.301,6223.367465
3,47,Angola,2007,12420476.0,Africa,42.731,4797.231267
4,59,Argentina,2007,40301927.0,Americas,75.320,12779.379640
...,...,...,...,...,...,...,...
137,1655,Vietnam,2007,85262356.0,Asia,74.249,2441.576404
138,1667,West Bank and Gaza,2007,4018332.0,Asia,73.422,3025.349798
139,1679,"Yemen, Rep.",2007,22211743.0,Asia,62.698,2280.769906
140,1691,Zambia,2007,11746035.0,Africa,42.384,1271.211593


In [358]:
df['population'].min() # Perform aggregates on columns

199579.0

Display the where information for an aggregate

In [359]:
df[df.population == df.population.min()] # Display the row where the population is the minimum

Unnamed: 0.1,Unnamed: 0,country,year,population,cont,life_exp,gdp_cap
108,1307,Sao Tome and Principe,2007,199579.0,Africa,65.528,1598.435089


Group by a column and display the aggrgate for one or more columns

In [360]:
df.groupby('cont')['gdp_cap', 'population'].sum()

  df.groupby('cont')['gdp_cap', 'population'].sum()


Unnamed: 0_level_0,gdp_cap,population
cont,Unnamed: 1_level_1,Unnamed: 2_level_1
Africa,160629.695446,929539700.0
Americas,275075.790634,898871200.0
Asia,411609.886714,3811954000.0
Europe,751634.449078,586098500.0
Oceania,59620.37655,24549950.0


In [361]:
df['cont'].value_counts(sort=True, normalize=True) # Normalize - Percentage of distribution

Africa      0.366197
Asia        0.232394
Europe      0.211268
Americas    0.176056
Oceania     0.014085
Name: cont, dtype: float64

## Assignment:

In [362]:
# Pre-defined lists
country = ['United States', 'Australia', 'Japan', 'India', 'Russia', 'Morocco', 'Egypt']
drives_right =  [True, False, False, False, True, True, True]
cars_per_cap = [809, 731, 588, 18, 200, 70, 45]
row_labels = ['US', 'AUS', 'JPN', 'IN', 'RU', 'MOR', 'EG']

##### step1: Create dictionary my_dict with three key:value pairs: 

In [363]:
my_dict = {
    'country': country,
    'drives_right': drives_right,
    'cars_per_cap': cars_per_cap
}

##### step2 Build a DataFrame cars from my_dict: 

In [364]:
cars = pd.DataFrame(my_dict)

##### step3 print cars 

In [365]:
cars

Unnamed: 0,country,drives_right,cars_per_cap
0,United States,True,809
1,Australia,False,731
2,Japan,False,588
3,India,False,18
4,Russia,True,200
5,Morocco,True,70
6,Egypt,True,45


##### step4 specify the row labels of cars 

In [366]:
cars = pd.DataFrame(my_dict, row_labels)

##### step5 print cars again 

In [367]:
cars

Unnamed: 0,country,drives_right,cars_per_cap
US,United States,True,809
AUS,Australia,False,731
JPN,Japan,False,588
IN,India,False,18
RU,Russia,True,200
MOR,Morocco,True,70
EG,Egypt,True,45


##### step6 Print out country column as Pandas Series 

In [368]:
cars.country
# print(type(cars.country))

US     United States
AUS        Australia
JPN            Japan
IN             India
RU            Russia
MOR          Morocco
EG             Egypt
Name: country, dtype: object

##### step7 Print out country column as Pandas DataFrame 

In [369]:
cars[['country']]

Unnamed: 0,country
US,United States
AUS,Australia
JPN,Japan
IN,India
RU,Russia
MOR,Morocco
EG,Egypt


##### step8 Print out DataFrame with country and drives_right columns 

In [370]:
cars[['country', 'drives_right']]

Unnamed: 0,country,drives_right
US,United States,True
AUS,Australia,False
JPN,Japan,False
IN,India,False
RU,Russia,True
MOR,Morocco,True
EG,Egypt,True


##### step9 Print out first 3 observations 

In [371]:
cars[:3]

Unnamed: 0,country,drives_right,cars_per_cap
US,United States,True,809
AUS,Australia,False,731
JPN,Japan,False,588


##### step10 Print out fourth, fifth and sixth observation 

In [372]:
cars[3:6]

Unnamed: 0,country,drives_right,cars_per_cap
IN,India,False,18
RU,Russia,True,200
MOR,Morocco,True,70


##### step11 Print out observation for Japan 

In [373]:
cars.loc[cars.country=='Japan']

Unnamed: 0,country,drives_right,cars_per_cap
JPN,Japan,False,588


##### step12 Print out observations for Australia and Egypt 

In [374]:
cars[cars.country.isin(['Australia', 'Egypt'])]

cars.loc[['AUS', 'EG']]

Unnamed: 0,country,drives_right,cars_per_cap
AUS,Australia,False,731
EG,Egypt,True,45


##### step13 Print out drives_right value of Morocco

In [375]:
# cars.loc[cars.country == 'Morocco', ['drives_right']]

cars.loc[['MOR'], ['drives_right']]


Unnamed: 0,drives_right
MOR,True


In [376]:
# cars.rename_axis('CountryCode')
cars.index.name = 'CountryCode' # Name index after its been created
cars

Unnamed: 0_level_0,country,drives_right,cars_per_cap
CountryCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
US,United States,True,809
AUS,Australia,False,731
JPN,Japan,False,588
IN,India,False,18
RU,Russia,True,200
MOR,Morocco,True,70
EG,Egypt,True,45


## Tools

info()\
number of non nulls per column and the data types of each

In [377]:
df.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 142 entries, 0 to 141
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  142 non-null    int64  
 1   country     142 non-null    object 
 2   year        142 non-null    int64  
 3   population  142 non-null    float64
 4   cont        142 non-null    object 
 5   life_exp    142 non-null    float64
 6   gdp_cap     142 non-null    float64
dtypes: float64(3), int64(2), object(2)
memory usage: 7.9+ KB


describe()\
Gives aggregates for each of the columns

In [378]:
df.describe()

Unnamed: 0.1,Unnamed: 0,year,population,life_exp,gdp_cap
count,142.0,142.0,142.0,142.0,142.0
mean,857.0,2007.0,44021220.0,67.007423,11680.07182
std,493.631441,0.0,147621400.0,12.073021,12859.937337
min,11.0,2007.0,199579.0,39.613,277.551859
25%,434.0,2007.0,4508034.0,57.16025,1624.842248
50%,857.0,2007.0,10517530.0,71.9355,6124.371108
75%,1280.0,2007.0,31210040.0,76.41325,18008.83564
max,1703.0,2007.0,1318683000.0,82.603,49357.19017


shape\
The dimensions of the table\
example has 142 rows and 7 columns

In [379]:
df.shape

(142, 7)

In [380]:
# If we have series we want to put into a DataFrame, we can easily combine them together
# If we wanted a DataFrame from a single series, we can do that by passing in the single series 
east = pd.Series([1000,1200,3400],index=['Q1','Q2','Q3'])
west = pd.Series([1100,1300,2400,3500],index=['Q1','Q2','Q3','Q4'])
df_region = pd.DataFrame({'East':east,'West':west})
df_region

Unnamed: 0,East,West
Q1,1000.0,1100
Q2,1200.0,1300
Q3,3400.0,2400
Q4,,3500


Add new columns with their data

In [381]:
# Once we have a DataFrame, we can easily add Series on
df_region['North'] = [2000,3000,2500,4000] # if you want to add a NaN value, you can use 'None'
df_region['South'] = [1500,2000,1500,4000]
df_region

Unnamed: 0,East,West,North,South
Q1,1000.0,1100,2000,1500
Q2,1200.0,1300,3000,2000
Q3,3400.0,2400,2500,1500
Q4,,3500,4000,4000


In [382]:
years = ['2016','2017','2018','2019']
df_region['years'] = years
df_region

Unnamed: 0,East,West,North,South,years
Q1,1000.0,1100,2000,1500,2016
Q2,1200.0,1300,3000,2000,2017
Q3,3400.0,2400,2500,1500,2018
Q4,,3500,4000,4000,2019


In [383]:
# We can use set_index to set the index to a different column in the DataFrame
df_region = df_region.set_index('years')
df_region

Unnamed: 0_level_0,East,West,North,South
years,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2016,1000.0,1100,2000,1500
2017,1200.0,1300,3000,2000
2018,3400.0,2400,2500,1500
2019,,3500,4000,4000


In [384]:
# Let's say we want to see different index values, we can use reindex
# reindex will shift our index
new_df = df_region.reindex(['2017','2018','2019','2020','2021']) # reorder index
new_df = df_region.reindex(columns=['North', 'East', 'South', 'West']) # reorder columns
new_df

Unnamed: 0_level_0,North,East,South,West
years,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2016,2000,1000.0,1500,1100
2017,3000,1200.0,2000,1300
2018,2500,3400.0,1500,2400
2019,4000,,4000,3500


Handling missing data

fillna()\
Replaces all null values with the data passed into it

In [385]:
new_df.fillna(0)

Unnamed: 0_level_0,North,East,South,West
years,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2016,2000,1000.0,1500,1100
2017,3000,1200.0,2000,1300
2018,2500,3400.0,1500,2400
2019,4000,0.0,4000,3500
