# Guide to Working with Pandas Dataframes

Start by importing the required `pandas` module. Importing `numpy` is also helpful when working with numeric data.

In [2]:
import numpy as np
import pandas as pd
from scipy import stats
from scipy.stats import pearsonr

## Importing and Creating Dataframes

The dataset used here was retrieved from https://www.kaggle.com/anandhuh/covid19-in-world-countrieslatest-data

In [5]:
# Load an existing CSV file into a DataFrame
df = pd.read_csv(filepath_or_buffer='worldwide covid data.csv')

# Alternatively, build an empty DataFrame with named columns from scratch:
# my_df = pd.DataFrame(columns = ['col1', 'col2', 'col3', 'col4'])

## Inspecting Data

In [6]:
# Preview the first five rows (the default for head); swap in tail for the last rows
df.head(n=5)
# df.tail(n=5)

Unnamed: 0,Country,Total Cases,Total Deaths,Total Recovered,Active Cases,Total Cases/1M population,Deaths/1M population,Total Tests,Tests/1M population,Population
0,Afghanistan,156307,7281,128791.0,20235.0,3898,182,774655.0,19319.0,40097200
1,Albania,186222,2937,175451.0,7834.0,64805,1022,1311540.0,456411.0,2873596
2,Algeria,206649,5927,141811.0,58911.0,4602,132,230861.0,5141.0,44907419
3,Andorra,15516,130,15242.0,144.0,200387,1679,193595.0,2500258.0,77430
4,Angola,64487,1713,53376.0,9398.0,1884,50,1092363.0,31915.0,34227629


In [7]:
# Print a summary of the dataset as a whole: index range, column names,
# non-null counts per column, dtypes and memory usage
df.info()

# Show the dtype of each column as a Series; it is displayed because it is
# the last expression in the cell
df.dtypes

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 196 entries, 0 to 195
Data columns (total 10 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Country                    196 non-null    object 
 1   Total Cases                196 non-null    int64  
 2   Total Deaths               196 non-null    int64  
 3   Total Recovered            194 non-null    float64
 4   Active Cases               194 non-null    float64
 5   Total Cases/1M population  196 non-null    int64  
 6   Deaths/1M population       196 non-null    int64  
 7   Total Tests                192 non-null    float64
 8   Tests/1M population        192 non-null    float64
 9   Population                 196 non-null    int64  
dtypes: float64(4), int64(5), object(1)
memory usage: 15.4+ KB


Country                       object
Total Cases                    int64
Total Deaths                   int64
Total Recovered              float64
Active Cases                 float64
Total Cases/1M population      int64
Deaths/1M population           int64
Total Tests                  float64
Tests/1M population          float64
Population                     int64
dtype: object

## Summarizing Data

In [None]:
# NOTE: this cell is a reference template. 'col', 'col1', 'col2' and 'col3' are
# placeholders; substitute real column names from your DataFrame before running.
# Only the last expression of a cell is displayed automatically, so run the
# lines one at a time (or wrap them in display(...)) to see each result.

# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()

# Central tendency of a single column
np.mean(df['col1'])
np.average(df['col1'])  # same as the mean unless `weights=` is passed
np.median(df['col1'])
stats.mode(df['col1'])  # requires `from scipy import stats` in the imports cell

# Keep column means around for later calculations
var1_mean = df['col1'].mean()
var2_mean = df['col2'].mean()

# Sum the other columns within each group of 'col3'
df.groupby('col3').sum()

# Get the distinct values of a categorical variable
df['col'].unique()

# Get a table of proportions (share of rows per category, largest first)
df['col'].value_counts(normalize=True)

# Get the most frequent category (first label of the sorted proportions table)
df['col'].value_counts(normalize=True).index[0]

# Convert a variable to an ordered categorical; list the levels from lowest to
# highest (the levels below are an example - replace them with your own)
buying_cost_categories = ['low','med','high','vhigh']
df['col']=pd.Categorical(df['col'], buying_cost_categories, ordered = True)

# Numeric codes are each value's position in the category ordering (-1 marks
# missing or unlisted values), so the median code points at the median category
np.median(df['col'].cat.codes)

# Create dummy variables from a categorical variable (drop_first argument is optional)
pd.get_dummies(df['col'], drop_first=True)

## Analyzing Data

In [None]:
# NOTE: 'col1' and 'col2' are placeholders for real column names.

# Pairwise Pearson correlations between the numeric columns. Non-numeric
# columns (e.g. strings) are excluded explicitly: pandas >= 2.0 no longer
# drops them silently and raises on them instead.
df.select_dtypes(include='number').corr()

# Pearson correlation coefficient and two-sided p-value for one pair of columns
# (requires `from scipy.stats import pearsonr` in the imports cell).
# The coefficient gets its own name so the crosstab below does not overwrite it.
r, p = pearsonr(df['col1'], df['col2'])

# Contingency table: counts of rows for each (col1, col2) combination
var = pd.crosstab(df['col1'], df['col2'])