# Guide to Working with Pandas Dataframes

Start by importing the required `pandas` module. Importing `numpy` is also helpful when working with numeric data, and `scipy.stats` supplies the `mode` and `pearsonr` helpers used in later sections.

In [2]:
import numpy as np
import pandas as pd
from scipy import stats
from scipy.stats import pearsonr

## Importing and Creating Dataframes

In [5]:
# Load an existing dataset from a CSV file into a dataframe
df = pd.read_csv(filepath_or_buffer='amazon-orders.csv')

# Alternative: build an empty dataframe from scratch with predefined column names
# my_df = pd.DataFrame(columns = ['col1', 'col2', 'col3', 'col4'])

## Inspecting Data

In [8]:
# Preview the first five rows (five is head's default row count)
df.head(n=5)
# Swap in tail() to preview the last rows instead
# df.tail()

Unnamed: 0,Order Date,Order ID,Payment Instrument Type,Website,Purchase Order Number,Ordering Customer Email,Shipment Date,Shipping Address Name,Shipping Address Street 1,Shipping Address Street 2,...,Order Status,Carrier Name & Tracking Number,Subtotal,Shipping Charge,Tax Before Promotions,Total Promotions,Tax Charged,Total Charged,Buyer Name,Group Name
0,10/01/21,111-3360468-5029056,Discover1185,Amazon.com,,enl9076@gmail.com,10/02/21,Emily Lasko,1004 SHAW CT,,...,Shipped,USPS(9361289703006253206961),$25.94,$0.00,$1.37,$0.00,$1.37,$27.31,Emily Lasko,
1,10/03/21,111-3603208-6216250,Discover1185,Amazon.com,,enl9076@gmail.com,10/05/21,Emily Lasko,1004 SHAW CT,,...,Shipped,DHL eCommerce(9274899999898298408575),$19.99,$0.00,$1.06,$0.00,$1.06,$21.05,Emily Lasko,
2,10/15/21,112-8245754-2692225,Discover1185,Amazon.com,,enl9076@gmail.com,10/15/21,Emily Lasko,1004 SHAW CT,,...,Shipped,USPS(9374889703006474162448),$32.78,$0.00,$1.74,$0.00,$1.74,$34.52,Emily Lasko,
3,10/20/21,112-0223917-7836246,Visa - 3556,Amazon.com,,enl9076@gmail.com,10/20/21,Emily Lasko,1004 SHAW CT,,...,Shipped,USPS(9374889703006571898394),$18.47,$0.00,$0.68,$5.60,$0.68,$13.55,Emily Lasko,
4,10/23/21,113-7485654-6613823,Discover1185,Amazon.com,,enl9076@gmail.com,10/24/21,Emily Lasko,1004 SHAW CT,,...,Shipped,USPS(9361289703006715554173),$38.99,$0.00,$1.80,$5.00,$1.80,$35.79,Emily Lasko,


In [9]:
# Get information about the dataset as a whole:
# prints the index range plus each column's non-null count and dtype
df.info()

# Determine the variable types in the dataset (one dtype per column);
# as the last expression in the cell, this is the value the notebook displays
df.dtypes

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 23 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Order Date                      6 non-null      object 
 1   Order ID                        6 non-null      object 
 2   Payment Instrument Type         6 non-null      object 
 3   Website                         6 non-null      object 
 4   Purchase Order Number           0 non-null      float64
 5   Ordering Customer Email         6 non-null      object 
 6   Shipment Date                   5 non-null      object 
 7   Shipping Address Name           6 non-null      object 
 8   Shipping Address Street 1       6 non-null      object 
 9   Shipping Address Street 2       0 non-null      float64
 10  Shipping Address City           6 non-null      object 
 11  Shipping Address State          6 non-null      object 
 12  Shipping Address Zip            6 non-nu

Order Date                         object
Order ID                           object
Payment Instrument Type            object
Website                            object
Purchase Order Number             float64
Ordering Customer Email            object
Shipment Date                      object
Shipping Address Name              object
Shipping Address Street 1          object
Shipping Address Street 2         float64
Shipping Address City              object
Shipping Address State             object
Shipping Address Zip               object
Order Status                       object
Carrier Name & Tracking Number     object
Subtotal                           object
Shipping Charge                    object
Tax Before Promotions              object
Total Promotions                   object
Tax Charged                        object
Total Charged                      object
Buyer Name                         object
Group Name                        float64
dtype: object

## Summarizing Data

In [None]:
# Template cell: 'col', 'col1', 'col2' and 'col3' are placeholder column names;
# substitute columns from your own dataframe before running.

# Descriptive statistics for the dataframe
df.describe()

# Central tendency of a single column using numpy ...
np.mean(df['col1'])
np.average(df['col1'])
np.median(df['col1'])
# ... the most frequent value using scipy.stats ...
stats.mode(df['col1'])
# ... or the mean using the pandas Series method directly
var1_mean = df['col1'].mean()
var2_mean = df['col2'].mean()

# Sum the other columns within each group of 'col3'
df.groupby('col3').sum()

# Get values of categorical variable
df['col'].unique()

# Get a table of proportions (share of rows per category)
df['col'].value_counts(normalize=True)

# Get the most frequent category (the first label of that table)
df['col'].value_counts(normalize=True).index[0]

# Convert a variable to an ordered categorical
buying_cost_categories = ['low','med','high','vhigh']
df['col']=pd.Categorical(df['col'], buying_cost_categories, ordered = True)

# Median of the numeric codes that back the categories
np.median(df['col'].cat.codes)

# Create dummy variables from a categorical variable (drop_first argument is optional)
pd.get_dummies(df['col'], drop_first=True)

## Analyzing Data

In [None]:
# Template cell: 'col1' and 'col2' are placeholder column names;
# substitute columns from your own dataframe before running.

# Calculating correlations
# Pairwise correlation matrix. Keep only numeric columns first: from pandas 2.0 on,
# DataFrame.corr() no longer drops non-numeric columns and raises on string data.
df.select_dtypes(include='number').corr()

# Pearson correlation coefficient and its p-value for two columns
corr_coef, p = pearsonr(df['col1'], df['col2'])

# Cross-tabulation (table of counts) of two columns
var = pd.crosstab(df['col1'], df['col2'])