# Jupyter Notebook Example using Pandas

This is an early notebook with far less Markdown commentary than later notebooks; the intention is to show the use of Pandas through examples. Read the comments in the code itself for guidance.

**References:**

+ https://www.dataschool.io/python-pandas-tips-and-tricks/

+ https://realpython.com/python-pandas-tricks/


## Additional Techniques

In [None]:
# Import key libraries necessary to support dynamic installation of additional libraries
import sys
# Use subprocess to support running operating system commands from the program, using the "bang" (!)
# symbology is supported, however that does not translate to an actual python script, this is a more
# agnostic approach.
import subprocess
import importlib.util

# Identify the libraries you'd like to add to this Runtime environment.
# Entries are pip requirement strings and may carry extras (e.g. "polars[all]").
libraries=["rich", "rich[jupyter]", "unidecode", "icecream",
           "polars[all]", "dask[complete]", "xarray",
           "tqdm", ]

# pip distribution names whose importable module is named differently.
import_names = {"Pillow": "PIL"}

# Loop through each library and test for existence, if not present install quietly
for library in libraries:
    if "[" in library:
        # find_spec() only understands module names, and the presence of the base
        # module says nothing about whether the extras are installed, so every
        # requirement with extras is handed to pip (a no-op when already satisfied).
        spec = None
    else:
        spec = importlib.util.find_spec(import_names.get(library, library))
    if spec is None:
        print("Installing library " + library)
        # Run pip through the interpreter backing this kernel so the package lands in
        # the environment the notebook imports from, not whichever "pip" is on PATH.
        subprocess.run([sys.executable, "-m", "pip", "install", library, "--quiet"], check=True)
    else:
        print("Library " + library + " already installed.")

# Let the import system of the running kernel notice anything that was just installed.
importlib.invalidate_caches()

Library rich already installed.
Installing library rich[jupyter]
Library unidecode already installed.
Library icecream already installed.
Installing library polars[all]
Installing library dask[complete]
Library xarray already installed.
Library tqdm already installed.


In [None]:
#Data Science
import numpy as np
import pandas as pd
import polars as pl
import dask as da
import xarray as xr

#Pretty Print
from rich import print as rprint
from icecream import ic
from tqdm.notebook import trange, tqdm

#Generally useful / common libraries
import os
import subprocess

In [None]:
# Example library configuration: pandas display/mode options plus the numpy analogue.

options = {
    'display': {
        'max_columns': None,
        'max_colwidth': 25,
        'expand_frame_repr': False,  # Don't wrap to multiple pages
        'max_rows': 14,
        'max_seq_items': 50,         # Max length of printed sequence
        'precision': 4,
        'show_dimensions': False
    },
    'mode': {
        'chained_assignment': None   # Controls SettingWithCopyWarning
    }
}

# Walk the nested mapping and register each entry under its dotted pandas
# option name, e.g. ('display', 'max_rows') -> 'display.max_rows'.
for section, settings in options.items():
    for key, setting in settings.items():
        pd.set_option('.'.join((section, key)), setting)

# numpy equivalent
np.set_printoptions(precision=4)

In [None]:
#Just execute, setting the stage
# A DataFrame is a 2-dimensional labeled data structure with columns of potentially
# different types. You can think of it like a spreadsheet or SQL table, or a dict of
# Series objects. It is generally the most commonly used pandas object.

rprint("Define a dictionary of Series: one of integers, one of floats and one of strings.")
rprint("################################################################################################################")
# array_string deliberately has one more entry (10) than the two numeric Series (9).
my_dictionary = {'array_one': pd.Series([1,2,3,4,5,6,7,8,9]),
                 'array_two': pd.Series([1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0]),
                 'array_string': pd.Series(["one","two","three","four","five","six","seven","eight","nine","ten"]),
                 }
rprint("")
rprint("Transform that dictionary of Series into a DataFrame")
my_dataframe=pd.DataFrame(my_dictionary)

rprint("################################################################################################################")
rprint("Simply calling print on the data frame shows the contents as three columns, one for each dictionary element")
rprint(my_dataframe)
rprint("")
rprint("Note that if the Series are not the same size, they are unioned and 'NaN' padded for missing values.")

# Pro-tips for Pandas Use

In [None]:
# Examples of extracting structural facts from the DataFrame.
# Each printed label is paired with the attribute it reports, then emitted in order.
frame_facts = [
    ("Data has:", my_dataframe.columns),
    ("    size:", my_dataframe.size),
    ("    shape:", my_dataframe.shape),
    ("    ndim:", my_dataframe.ndim),
    ("    column size:", my_dataframe.columns.size),
]

for label, fact in frame_facts:
    rprint(f"{label}{fact!s}")


In [None]:
#with icecream
# Same attributes as the rprint cell above, but ic() prints each expression's own
# source text next to its value (see the "ic| ..." output), so no labels are written by hand.
ic()  # called without arguments: the output line reports where the call happened and when
ic(my_dataframe.columns)
ic(my_dataframe.size)
ic(my_dataframe.shape)
ic(my_dataframe.ndim)
ic(my_dataframe.columns.size)  # last expression of the cell: its value is also echoed as the cell result (the bare 3)

ic| <ipython-input-6-0c269d11aeef>:2 in <cell line: 2>() at 20:56:55.361
ic| my_dataframe.columns: Index(['array_one', 'array_two', 'array_string'], dtype='object')
ic| my_dataframe.size: 30
ic| my_dataframe.shape: (10, 3)
ic| my_dataframe.ndim: 2
ic| my_dataframe.columns.size: 3


3

# A data-science-style description of your Pandas DataFrame

In [None]:
# DataFrame.info() writes its summary straight to stdout and returns None, so it is
# called on its own; wrapping it in rprint() would only print an extra "None".
my_dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   array_one     9 non-null      float64
 1   array_two     9 non-null      float64
 2   array_string  10 non-null     object 
dtypes: float64(2), object(1)
memory usage: 368.0+ bytes


# Just show the top records

In [None]:
# Preview only the first two rows rather than the whole frame.
my_dataframe.head(n=2)

Unnamed: 0,array_one,array_two,array_string
0,1.0,1.0,one
1,2.0,2.0,two


# Or you can get an array of column names and a generic type returned

In [None]:
#show column header names.
# print() emits the plain-text repr of the Index object that holds the column labels.
print(my_dataframe.columns)

Index(['array_one', 'array_two', 'array_string'], dtype='object')


# Want Stats...Pandas has them

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) per column; in the output
# below only array_one and array_two appear - the string column is left out.
my_dataframe.describe()

Unnamed: 0,array_one,array_two
count,9.0,9.0
mean,5.0,5.0
std,2.7386,2.7386
min,1.0,1.0
25%,3.0,3.0
50%,5.0,5.0
75%,7.0,7.0
max,9.0,9.0


# Column name changes are a joke

## Just remember that the old column remains in place, so you'll want to remove whichever one you no longer need

In [None]:
# Assigning to a label that does not exist yet appends a new column; here it is
# array_one scaled by ten. The source column array_one is left untouched.
my_dataframe['A_new_column'] = 10 * my_dataframe['array_one']

In [None]:
# Re-run the summary: A_new_column now shows up next to the original numeric columns.
my_dataframe.describe()

Unnamed: 0,array_one,array_two,A_new_column
count,9.0,9.0,9.0
mean,5.0,5.0,50.0
std,2.7386,2.7386,27.3861
min,1.0,1.0,10.0
25%,3.0,3.0,30.0
50%,5.0,5.0,50.0
75%,7.0,7.0,70.0
max,9.0,9.0,90.0


# Do we need to remove a column?

In [None]:
# Rebind the name instead of using inplace=True, and ignore a label that is already
# gone, so this cell can be re-run without raising KeyError on the second execution.
my_dataframe = my_dataframe.drop(columns=['A_new_column'], errors='ignore')

In [None]:
# Confirm the removal: the summary is back to array_one and array_two only.
my_dataframe.describe()

Unnamed: 0,array_one,array_two
count,9.0,9.0
mean,5.0,5.0
std,2.7386,2.7386
min,1.0,1.0
25%,3.0,3.0
50%,5.0,5.0
75%,7.0,7.0
max,9.0,9.0


# Example Lambda Expression and Bulk Modification

In [None]:
# Row-wise lambda: with axis=1, apply() hands each row to the function. Subtracting a
# value from itself gives 0 for every row that has a value in array_one.
my_dataframe['array_one_ZERO'] = my_dataframe.apply(
    lambda record: record['array_one'] - record['array_one'],
    axis=1,
)

# Bulk modification: plain column arithmetic acts on every row at once, no lambda needed.
array_one_values = my_dataframe['array_one']
my_dataframe['array_one_MINUS_10_PERCENT'] = array_one_values - (array_one_values * 0.10)

In [None]:
# The two derived columns (array_one_ZERO, array_one_MINUS_10_PERCENT) now appear in the summary.
my_dataframe.describe()

Unnamed: 0,array_one,array_two,array_one_ZERO,array_one_MINUS_10_PERCENT
count,9.0,9.0,9.0,9.0
mean,5.0,5.0,0.0,4.5
std,2.7386,2.7386,0.0,2.4648
min,1.0,1.0,0.0,0.9
25%,3.0,3.0,0.0,2.7
50%,5.0,5.0,0.0,4.5
75%,7.0,7.0,0.0,6.3
max,9.0,9.0,0.0,8.1


### Take Advantage of Accessor Methods

Perhaps you’ve heard of the term accessor, which is somewhat like a getter (although getters and setters are used infrequently in Python). For our purposes here, you can think of a pandas accessor as a property that serves as an interface to additional methods.

In [None]:
# Selecting a single column yields a Series; its .str attribute is the accessor that
# exposes string methods applied to each element.
my_series=my_dataframe['array_string']
my_series.str.upper()  # result is only displayed; my_series itself is not reassigned

Unnamed: 0,array_string
0,ONE
1,TWO
2,THREE
3,FOUR
4,FIVE
5,SIX
6,SEVEN
7,EIGHT
8,NINE
9,TEN


### Show Memory Usage

Make adjustments to improve memory utilization.

In [None]:
# info() lists every column's dtype and non-null count and ends with a "memory usage" line,
# which is the figure this section sets out to reduce.
my_dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 5 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   array_one                   9 non-null      float64
 1   array_two                   9 non-null      float64
 2   array_string                10 non-null     object 
 3   array_one_ZERO              9 non-null      float64
 4   array_one_MINUS_10_PERCENT  9 non-null      float64
dtypes: float64(4), object(1)
memory usage: 528.0+ bytes


In [None]:
# Per-column footprint in bytes, requested with deep=True.
memory_usage_per_column = my_dataframe.memory_usage(index=True, deep=True)

# Overall footprint in bytes: sum of the default (non-deep) per-column figures.
shallow_usage = my_dataframe.memory_usage(index=True, deep=False)
total_memory_usage = shallow_usage.sum()
rprint(total_memory_usage)

In [None]:
# array_one and array_two each hold a missing value (row 9), so they are float64 and
# downcast='integer' would leave them unchanged - an integer dtype cannot represent NaN.
# downcast='float' shrinks them to the smallest float dtype that can hold the values.
my_dataframe['array_one'] = pd.to_numeric(my_dataframe['array_one'], downcast='float')
my_dataframe['array_two'] = pd.to_numeric(my_dataframe['array_two'], downcast='float')

#For columns that have a lot of repeats consider a category instead.
#my_dataframe['array_string'] = my_dataframe['array_string'].astype('category')

# Get memory usage of each column in bytes
memory_usage_per_column = my_dataframe.memory_usage(deep=True)

# Get total memory usage of the DataFrame in bytes (same non-deep measure as the
# previous cell, so the two totals can be compared directly)
total_memory_usage = my_dataframe.memory_usage().sum()
rprint(total_memory_usage)

In [None]:
# Per-column format strings, keyed by column name.
format_dict = {'array_one':'{:10,}', 'array_two':'${:.2f}', 'array_string':'{:3}'}

# Only the last expression of a cell is rendered, so a single Styler is built here:
# the formats first, then min/max highlights on array_one and a gradient on array_two.
(my_dataframe.style.format(format_dict)
 .highlight_min(subset='array_one', color='red')
 .highlight_max(subset='array_one', color='lightgreen')
 .background_gradient(subset='array_two', cmap='Blues')
)

Unnamed: 0,array_one,array_two,array_string,array_one_ZERO,array_one_MINUS_10_PERCENT
0,1.0,$1.00,one,0.0,0.9
1,2.0,$2.00,two,0.0,1.8
2,3.0,$3.00,three,0.0,2.7
3,4.0,$4.00,four,0.0,3.6
4,5.0,$5.00,five,0.0,4.5
5,6.0,$6.00,six,0.0,5.4
6,7.0,$7.00,seven,0.0,6.3
7,8.0,$8.00,eight,0.0,7.2
8,9.0,$9.00,nine,0.0,8.1
9,,$nan,ten,,


In [None]:
# Reuse the column formats, draw an in-cell bar for array_one aligned on zero,
# and put a caption on the rendered table.
formatted_frame = my_dataframe.style.format(format_dict)
formatted_frame.bar(subset='array_one', color='lightblue', align='zero').set_caption('Example caption')

Unnamed: 0,array_one,array_two,array_string,array_one_ZERO,array_one_MINUS_10_PERCENT
0,1.0,$1.00,one,0.0,0.9
1,2.0,$2.00,two,0.0,1.8
2,3.0,$3.00,three,0.0,2.7
3,4.0,$4.00,four,0.0,3.6
4,5.0,$5.00,five,0.0,4.5
5,6.0,$6.00,six,0.0,5.4
6,7.0,$7.00,seven,0.0,6.3
7,8.0,$8.00,eight,0.0,7.2
8,9.0,$9.00,nine,0.0,8.1
9,,$nan,ten,,


### Bonus Input (Profiling)

In [None]:
# Install the profiler through the interpreter backing this kernel so it lands in the
# environment the notebook imports from, not whichever "pip" happens to be on PATH.
subprocess.run([sys.executable, "-m", "pip", "install", "ydata-profiling", "--quiet"], check=True)
# The ydata-profiling distribution is imported as `ydata_profiling`; `pandas_profiling`
# is its deprecated former name. The old name is kept as an alias so code written
# against it keeps working.
import ydata_profiling as pandas_profiling

  import pandas_profiling


In [None]:
# Build the profiling report for the DataFrame; as the cell's last expression it is
# what the cell displays (the progress bars in the output come from this step).
pandas_profiling.ProfileReport(my_dataframe)

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

