# Guide to Working with Pandas Dataframes

Start by importing the `pandas` module. `numpy` and the standard-library `statistics` module are imported as well, because the summary-statistics examples later in this guide use them.

In [2]:
import pandas as pd
import numpy as np
import statistics as stats

## Importing and Creating Dataframes

The dataset used here was retrieved from https://www.kaggle.com/anandhuh/covid19-in-world-countrieslatest-data

In [6]:
# Load the COVID-19 dataset from a CSV file (path is relative to the working directory)
df = pd.read_csv(filepath_or_buffer='worldwide covid data.csv')

# Alternative 1: start with an empty dataframe that only has column names
# my_df = pd.DataFrame(columns=['col1', 'col2', 'col3', 'col4'])

# Alternative 2: build a dataframe from a dictionary mapping column name -> values
# my_dict = {'col1': [1, 2, 3], 'col2': [4, 5, 6], 'col3': [7, 8, 9], 'col4': [10, 11, 12]}
# my_df = pd.DataFrame.from_dict(my_dict)

## Inspecting Data

In [6]:
# Preview the first five rows (five is head's default row count)
df.head(n=5)
# The last rows can be previewed the same way:
# df.tail()

Unnamed: 0,Country,Total Cases,Total Deaths,Total Recovered,Active Cases,Total Cases/1M population,Deaths/1M population,Total Tests,Tests/1M population,Population
0,Afghanistan,156307,7281,128791.0,20235.0,3898,182,774655.0,19319.0,40097200
1,Albania,186222,2937,175451.0,7834.0,64805,1022,1311540.0,456411.0,2873596
2,Algeria,206649,5927,141811.0,58911.0,4602,132,230861.0,5141.0,44907419
3,Andorra,15516,130,15242.0,144.0,200387,1679,193595.0,2500258.0,77430
4,Angola,64487,1713,53376.0,9398.0,1884,50,1092363.0,31915.0,34227629


In [7]:
# Show the column labels
column_labels = df.columns
print(column_labels)

# Print an overview of the whole dataset: index range, per-column non-null
# counts and dtypes, and memory usage
df.info()

# Display the data type of every column as a Series
df.dtypes

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 196 entries, 0 to 195
Data columns (total 10 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Country                    196 non-null    object 
 1   Total Cases                196 non-null    int64  
 2   Total Deaths               196 non-null    int64  
 3   Total Recovered            194 non-null    float64
 4   Active Cases               194 non-null    float64
 5   Total Cases/1M population  196 non-null    int64  
 6   Deaths/1M population       196 non-null    int64  
 7   Total Tests                192 non-null    float64
 8   Tests/1M population        192 non-null    float64
 9   Population                 196 non-null    int64  
dtypes: float64(4), int64(5), object(1)
memory usage: 15.4+ KB


Country                       object
Total Cases                    int64
Total Deaths                   int64
Total Recovered              float64
Active Cases                 float64
Total Cases/1M population      int64
Deaths/1M population           int64
Total Tests                  float64
Tests/1M population          float64
Population                     int64
dtype: object

## Summarizing Data

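*Note: pandas and numpy return slightly different values for their respective `var` and `std` functions. numpy defaults to the population formula (`ddof=0`, divide by N), while pandas defaults to the sample formula (`ddof=1`, divide by N - 1). Pass `ddof` explicitly to make them agree.*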

In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

Unnamed: 0,Total Cases,Total Deaths,Total Recovered,Active Cases,Total Cases/1M population,Deaths/1M population,Total Tests,Tests/1M population,Population
count,196.0,196.0,194.0,194.0,196.0,196.0,192.0,192.0,196.0
mean,1266763.0,25658.688776,1150839.0,92607.71,56113.617347,914.311224,20941070.0,1060358.0,40085760.0
std,4593222.0,84846.606103,4046744.0,674427.5,51997.542468,955.591341,75125030.0,1876692.0,148214000.0
min,2621.0,3.0,55.0,2.0,68.0,3.0,14689.0,3287.0,34026.0
25%,22376.75,333.5,15901.75,669.5,5636.0,112.5,354052.0,105155.0,1987078.0
50%,177887.5,2434.5,140230.0,7220.0,46790.0,628.0,1978244.0,463271.5,8907116.0
75%,660505.2,11778.5,608550.8,34432.75,91629.75,1560.75,11124830.0,1280447.0,28938740.0
max,46999770.0,768847.0,37009990.0,9220934.0,231328.0,5964.0,703435500.0,14814710.0,1439324000.0


In [24]:
# Pull the column out once so every call below operates on the same Series.
# Only the value of the final expression in the cell is displayed.
total_cases = df['Total Cases']

# Mean: two numpy spellings, then the pandas method
np.mean(total_cases)
np.average(total_cases)
total_cases.mean()

# Variance and standard deviation: numpy functions first, pandas methods second.
# numpy defaults to ddof=0 and pandas to ddof=1, so the results differ slightly.
np.var(total_cases)
np.std(total_cases)
total_cases.var()
total_cases.std()

# Median via numpy, then mode and median via the standard-library statistics module
np.median(total_cases)
stats.mode(total_cases)
stats.median(total_cases)

177887.5

In [None]:

# Working with categorical variables.
#
# The COVID dataset loaded earlier has no low-cardinality categorical column,
# so this cell uses a small self-contained example frame. That keeps the cell
# runnable on a fresh kernel and leaves `df` untouched.
cars = pd.DataFrame({
    'buying_cost': ['low', 'med', 'high', 'vhigh', 'med', 'low', 'med', 'high'],
    'doors': [2, 4, 4, 2, 4, 2, 4, 2],
})

# Aggregate a numeric column within each group of a categorical column
doors_by_cost = cars.groupby('buying_cost')['doors'].sum()

# Get the distinct values of a categorical variable (in order of first appearance)
cost_values = cars['buying_cost'].unique()

# Get a table of proportions: a Series indexed by category, sorted most frequent first
cost_proportions = cars['buying_cost'].value_counts(normalize=True)

# The first index label of that table is the most frequent (modal) category
most_common_cost = cost_proportions.index[0]

# Convert a variable to an ordered categorical. `assign` returns a new frame,
# so re-running this cell never changes `cars` itself.
buying_cost_categories = ['low', 'med', 'high', 'vhigh']
cars_ordered = cars.assign(
    buying_cost=pd.Categorical(
        cars['buying_cost'], categories=buying_cost_categories, ordered=True
    )
)

# Create numeric codes from categories (position of each value in the category list)
cost_codes = cars_ordered['buying_cost'].cat.codes

# With ordered codes, an "average" category can be summarised by the median code
median_cost_code = np.median(cost_codes)

# Create dummy variables from a categorical variable (drop_first argument is optional;
# it omits the first category's column)
cost_dummies = pd.get_dummies(cars_ordered['buying_cost'], drop_first=True)
cost_dummies