# Jupyter Notebook showing Numpy/Pandas use with data

## Pandas Memory Reduction


In [1]:
# Let's define some variables (information holders) for our project overall
# If using GCP or a similar Cloud Service Provider (CSP), you might be required
# to provide project details.

global PROJECT_ID, BUCKET_NAME, LOCATION
BUCKET_NAME =""
PROJECT_ID  =""
LOCATION    =""

BOLD_START="\033[1m"
BOLD_END="\033[0m"

## Import Base Libraries

In [2]:
# Import key libraries necessary to support dynamic installation of additional libraries
# Use subprocess to support running operating system commands from the program, using the "bang" (!)
# symbology is supported, however that does not translate to an actual python script, this is a more
# agnostic approach.
###########################################
#- Minimal imports to start
###########################################
try:
    import sys
    import subprocess
    import importlib.util
    import atexit
    import os
except ImportError as e:
    print("There was a problem importing the most basic libraries necessary for this code.")
    print(repr(e))
    raise SystemExit("Stop right there!")

###########################################
#- Final Exit Routine
###########################################
@atexit.register
def goodbye():
    print("GOODBYE")

## Load Minimally Required Libraries

In [3]:
libraries=["numpy", "pandas", "scipy", "sklearn", "matplotlib", "seaborn",
           "rich", "rich[jupyter]", "unidecode", "icecream",
           "polars[all]", "dask[complete]", "xarray",
           "tqdm", "watermark",
           "Pillow",]

import importlib.util

for library in libraries:
    if library == "Pillow":
      spec = importlib.util.find_spec("PIL")
    else:
      spec = importlib.util.find_spec(library)
    if spec is None:
      print("Installing library " + library)
      subprocess.run(["pip", "install" , library, "--quiet"])
    else:
      print("Library " + library + " already installed.")

Library numpy already installed.
Library pandas already installed.
Library scipy already installed.
Library sklearn already installed.
Library matplotlib already installed.
Library seaborn already installed.
Library rich already installed.
Installing library rich[jupyter]
Library unidecode already installed.
Library icecream already installed.
Installing library polars[all]
Installing library dask[complete]
Library xarray already installed.
Library tqdm already installed.
Library watermark already installed.
Library Pillow already installed.


## Environment Check

In [4]:
#- Environment Check
import datetime
import platform

RunningInCOLAB = False
RunningInCOLAB = 'google.colab' in str(get_ipython())
current_time   = datetime.datetime.now()
operating_system=platform.system()

if RunningInCOLAB:
    python_environment=" Google Colab "
    from IPython.core.interactiveshell import InteractiveShell
    InteractiveShell.ast_node_interactivity = "all"
else:
    python_environment=" Python command-line "

print(f"You are running this notebook in {python_environment} at {BOLD_START}{current_time}{BOLD_END} on {BOLD_START}{operating_system}{BOLD_END} in the {BOLD_START}{PROJECT_ID}{BOLD_END} lab.")

You are running this notebook in  Google Colab  at [1m2025-02-05 19:51:15.049674[0m on [1mLinux[0m in the [1m[0m lab.


## Includes and Libraries

In [5]:
############################################
#INCLUDES
############################################
import csv
from datetime import datetime, timedelta
import fnmatch
import statistics
import pickle

#Generally useful / common libraries
import os
import subprocess
import datetime
import sys
import gc
import getopt
import inspect
import math
import warnings
from pathlib import Path

#Data Science
import numpy as np
import pandas as pd
import polars as pl
import dask as da
import xarray as xr

#Pretty Print
from rich import print as rprint
from icecream import ic
from tqdm.notebook import trange, tqdm
from watermark import watermark as the_watermark

#libraries specific to this example
import scipy as sp
from sklearn.linear_model import LinearRegression

#- Graphics
import matplotlib.pyplot as plt
import matplotlib as matplt
import matplotlib
from matplotlib.cbook import get_sample_data
from matplotlib.offsetbox import (AnnotationBbox, DrawingArea, OffsetImage,
                                  TextArea)
from matplotlib.pyplot import imshow
from matplotlib.patches import Circle
from PIL import Image as PIL_Image
import PIL.ImageOps
import seaborn as sns

#- Image meta-data for Section 508 compliance
import piexif
from piexif.helper import UserComment

#- Additional libraries for this work
import math
from base64 import b64decode
#from IPython.display import Image, Markdown
from IPython.display import Image, Markdown
import pandas, IPython.display as display, io, jinja2, base64
import requests
import unidecode

## Functions

In [6]:
# Functions are like legos that do one thing, this function outputs library version history of effort.
def lib_diagnostics() -> None:

    import pkg_resources

    package_name_length=20
    package_version_length=10

    data_version_release="-".join([str(VERSION_NAME),str(VERSION_MAJOR), str(VERSION_MINOR), str(VERSION_RELEASE)])
    rprint(f"Software Version Information: {data_version_release}")
    rprint("\n")
    # Show notebook details
    #%watermark?
    #%watermark --github_username christophergwood --email christopher.g.wood@gmail.com --date --time --iso8601 --updated --python --conda --hostname --machine --githash --gitrepo --gitbranch --iversions --gpu
    # Watermark
    rprint(the_watermark(author=f"{AUTHOR_NAME}", github_username=f"GITHUB_USERNAME", email=f"{AUTHOR_EMAIL}",iso8601=True, datename=True, current_time=True, python=True, updated=True, hostname=True, machine=True, gitrepo=True, gitbranch=True, githash=True))


    print(f"{BOLD_START}Packages:{BOLD_END}")
    print("")
    # Get installed packages
    the_packages=["nltk", "numpy", "os", "pandas", "seaborn"]
    installed = {pkg.key: pkg.version for pkg in pkg_resources.working_set}
    for package_idx, package_name in enumerate(installed):
         if package_name in the_packages:
             installed_version = installed[package_name]
             rprint(f"{package_name:<40}#: {str(pkg_resources.parse_version(installed_version)):<20}")

    try:
        rprint(f"{'TensorFlow version':<40}#: {str(tf.__version__):<20}")
        rprint(f"{'     gpu.count:':<40}#: {str(len(tf.config.experimental.list_physical_devices('GPU')))}")
        rprint(f"{'     cpu.count:':<40}#: {str(len(tf.config.experimental.list_physical_devices('CPU')))}")
    except Exception as e:
        pass

    try:
        rprint(f"{'Torch version':<40}#: {str(torch.__version__):<20}")
        rprint(f"{'     GPUs available?':<40}#: {torch.cuda.is_available()}")
        rprint(f"{'     count':<40}#: {torch.cuda.device_count()}")
        rprint(f"{'     current':<40}#: {torch.cuda.current_device()}")
    except Exception as e:
        pass


    try:
      print(f"{'OpenAI Azure Version':<40}#: {str(the_openai_version):<20}")
    except Exception as e:
      pass

    return

In [7]:
# Routines designed to support adding ALT text to an image generated through Matplotlib.

def capture(figure):
   buffer = io.BytesIO()
   figure.savefig(buffer)
   #return F"data:image/png;base64,{base64.b64encode(buffer.getvalue()).decode()}"
   return F"data:image/jpg;base64,{base64.b64encode(buffer.getvalue()).decode()}"

def make_accessible(figure, template, **kwargs):
   return display.Markdown(F"""![]({capture(figure)} "{template.render(**globals(), **kwargs)}")""")


# requires JPG's or TIFFs
def add_alt_text(image_path, alt_text):
    try:
        if os.path.isfile(image_path):
          img = PIL_Image.open(image_path)
          if "exif" in img.info:
              exif_dict = piexif.load(img.info["exif"])
          else:
              exif_dict={}

          w, h = img.size
          if "0th" not in exif_dict:
            exif_dict["0th"]={}
          exif_dict["0th"][piexif.ImageIFD.XResolution] = (w, 1)
          exif_dict["0th"][piexif.ImageIFD.YResolution] = (h, 1)

          software_version=" ".join(["STEM-001 with Python v", str(sys.version).split(" ")[0]])
          exif_dict["0th"][piexif.ImageIFD.Software]=software_version.encode("utf-8")

          if "Exif" not in exif_dict:
            exif_dict["Exif"]={}
          exif_dict["Exif"][piexif.ExifIFD.UserComment] = UserComment.dump(alt_text, encoding="unicode")

          exif_bytes = piexif.dump(exif_dict)
          img.save(image_path, "jpeg", exif=exif_bytes)
        else:
          rprint(f"Cound not fine {image_path} for ALT text modification, please check your paths.")

    except (FileExistsError, FileNotFoundError, Exception) as e:
        process_exception(e)

# Appears to solve a problem associated with GPU use on Colab, see: https://github.com/explosion/spaCy/issues/11909
def getpreferredencoding(do_setlocale = True):
    return "UTF-8"


In [8]:
# this function displays the stack trace on errors from a central location making adjustments to the display on an error easier to manage
# functions perform useful solutions for highly repetitive code
def process_exception(inc_exception: Exception) -> None:
  try:
    if DEBUG_STACKTRACE==1:
      traceback.print_exc()
      console.print_exception(show_locals=True)
    else:
      rprint(repr(inc_exception))
  except Exception as e:
    rprint(repr(inc_exception))

In [9]:
#library configurations examples using Pandas
def setup_libary_configuration() -> None:
    options = {
        'display': {
            'max_columns': None,
            'max_colwidth': 25,
            'expand_frame_repr': False,  # Don't wrap to multiple pages
            'max_rows': 14,
            'max_seq_items': 50,         # Max length of printed sequence
            'precision': 4,
            'show_dimensions': False
        },
        'mode': {
            'chained_assignment': None   # Controls SettingWithCopyWarning
        }
    }

    for category, option in options.items():
        for op, value in option.items():
            pd.set_option(f'{category}.{op}', value)  # Python 3.6+

    #nump equivalent
    np.set_printoptions(precision=4)

In [33]:
def quick_df_stats(inc_df:pd.DataFrame,
                   inc_header_count: int,
                   ) -> None:
    '''
    Load the coarse data (ACS) and return as a pd.DataFrame.

            Parameters:
                   inc_df (pd.DataFrame): Dataframe to be inspected, displayed
                   inc_header_count (int): Anticipated number of columns to read in (validation check)

            Returns:
                    Printed output
    '''
    if DEBUG:
      rprint("Coarse Resolution data has: " + str(inc_df.columns))
      rprint("\n")
      rprint(f"""{"size":20} : {inc_df.size:15,} """)
      rprint(f"""{"shape":20} : {str(inc_df.shape):15} """)
      rprint(f"""{"ndim":20} : {inc_df.ndim:15,} """)
      rprint(f"""{"column size":20} : {inc_df.columns.size:15,} """)

      #index added so you get an extra column
      rprint(f"""{"Read":20} : {inc_df.columns.size-1:15,} """)
      rprint(f"""{"Expected":20} : {inc_header_count:15,} """)
      if ( (inc_df.columns.size-1) == inc_header_count):
          print(f"{BOLD_START}Expectations met{BOLD_END}.")
      else:
          print(f"Expectations {BOLD_START}not met{BOLD_END}, check your datafile, columns don't match.")
      rprint("\n")
      #rprint(str(inc_df.describe()))



## Variable declaration

In [24]:
############################################
# GLOBAL VARIABLES
############################################
DEBUG = 1
DEBUG_DATA = 0
# used to fully display the error stack, set to 1 if you want to see a ridiculous amount of debugging information
DEBUG_STACKTRACE=0

VERSION_NAME    = "InSitu-Flow-Thru"
VERSION_MAJOR   = 0
VERSION_MINOR   = 0
VERSION_RELEASE = 1

# API Parameters for things like WordCloud, variables help hold information for later use
# The "constants" represent variables that we don't anticipate changing over the course of the program.
IMG_BACKGROUND="black"     #options are black, white, another color or None
IMG_FONT_SIZE_MIN=10
IMG_WIDTH=1024
IMG_HEIGHT=768
IMG_INTERP="bilinear"
IMG_ALPHA=0.8
IMG_ASPECT="equal"
FIGURE_WIDTH=11
FIGURE_HEIGHT=8.5
WORD_FREQ=10

# specify how image formats will be saved
IMG_EXT=".jpg"

# location of our working files
WORKING_FOLDER="/content/folderOnColab"
SRC_DIR=WORKING_FOLDER

# Notebook Author details
AUTHOR_NAME="Christopher G Wood"
GITHUB_USERNAME="christophergarthwood"
AUTHOR_EMAIL="christopher.g.wood@gmail.com"

# Encoding
ENCODING  ="utf-8"
os.environ['PYTHONIOENCODING']=ENCODING

############################################
#APPLICATION VARIABLES
############################################
#define ACS data setup
ACS_FILE="ACS.txt"
ACS_ID="12L8VRY6J1Sj-B1vIf-ODh4kjHWHqIzm8"

## Library Invocation

In [25]:
rprint("Library Diagnostics")
setup_libary_configuration()
lib_diagnostics()

[1mPackages:[0m



## Data Read

Using pandas read in a data file and establish a log for output.

In [26]:
target_folder=WORKING_FOLDER

target_files=["12L8VRY6J1Sj-B1vIf-ODh4kjHWHqIzm8"]
target_filenames=["ACS.txt"]

rprint(f"Creating a folder ({target_folder}) to store project data.")

try:
  if os.path.isfile(target_folder):
    raise OSError("Cannot create your folder a file of the same name already exists there, work with your instructor or remove it yourself.")
  elif os.path.isdir(target_folder):
    rprint(f"The folder named ({target_folder}) {BOLD_START}already exists{BOLD_END}, we won't try to create a new folder.")
  else:
    subprocess.run(["mkdir", "-p" , target_folder], check=True)
except (subprocess.CalledProcessError, Exception) as e:
  process_exception(e)

for idx, the_name in enumerate(target_files):
  try:
    rprint(f"...downloading {target_filenames[idx]}.")
    subprocess.run(["gdown", f"{the_name}", "--no-check-certificate",  "--continue", "-O", f"{target_folder}{os.sep}{target_filenames[idx]}"], check=True)
  except (subprocess.CalledProcessError, Exception) as e:
    process_exception(e)
    raise SystemError

rprint("\n")
rprint("Files downloaded:")
cmd=["ls", "-alfR", f"{target_folder}/",]
completed_process=subprocess.run(cmd, check=True, shell=True, capture_output=True, text=True)
if (completed_process.returncode==0):
  rprint(completed_process.stdout)
else:
  rprint(f"Command failed with error code of: {completed_process.returncode}")

CompletedProcess(args=['gdown', '12L8VRY6J1Sj-B1vIf-ODh4kjHWHqIzm8', '--no-check-certificate', '--continue', '-O', '/content/folderOnColab/ACS.txt'], returncode=0)

In [45]:
target_filename=f"{target_folder}{os.sep}{target_filenames[idx]}"
target_columns=175

df = pd.read_csv(target_filename, sep="\t", index_col=False)
df.reset_index(inplace=True)

quick_df_stats(df,target_columns)

[1mExpectations met[0m.


In [46]:
df.head()

Unnamed: 0,index,Year,Month,Day,Hour,Minute,Second(UTC),Longitude(deg),Latitude(deg),Pressure(dbar),C400,C404.1,C407.1,C410.5,C413.9,C417.4,C421.5,C426,C430.1,C433.8,C437.9,C442.1,C446.3,C451.1,C455.7,C459.9,C464.1,C468.5,C473.3,C478,C483.1,C487.6,C491.8,C496,C500.1,C504.8,C509.5,C514.4,C519.2,C523.9,C528.2,C532.3,C536.6,C540.9,C545.4,C549.9,C554.3,C558.7,C563.2,C567.6,C571.5,C575.4,C579.3,C582.9,C587.6,C591.7,C596,C600.6,C605,C609.3,C613.9,C618.4,C622.5,C626.5,C630.9,C635,C639.3,C643.6,C647.9,C652.7,C657,C661.4,C665.9,C670.3,C674.7,C678.8,C683.2,C687.1,C690.8,C694.7,C698.9,C702.3,C706.3,C710,C714.2,C717.6,C721.4,C725.6,C729.2,C732.7,C735.9,C739.4,C743.7,A398.4,A402.1,A405.7,A408.9,A412.3,A415.8,A419.8,A424,A428.3,A432.2,A436.1,A440.3,A444.8,A449.5,A454.1,A458.3,A462.4,A466.8,A471.5,A476.4,A481.3,A486,A490.2,A494.4,A498.8,A503.1,A507.8,A512.6,A517.7,A522.5,A526.8,A531.1,A535.3,A539.7,A544.2,A548.7,A553.1,A557.5,A562.2,A566.6,A570.8,A574.6,A578.6,A582,A585.8,A589.6,A594,A598.7,A602.9,A607.6,A611.8,A616.5,A620.9,A625.2,A629.4,A633.7,A637.7,A642.1,A646.4,A650.9,A655.4,A660,A664.7,A669,A673.6,A677.8,A682,A685.9,A690,A693.7,A697.9,A701.7,A705.4,A709.2,A713.4,A717.1,A720.8,A724.6,A728.6,A732.1,A735.6,A738.9,A742.7
0,0,2018.0,7.0,18.0,3.0,54.0,55.25,82.5004,6.3881,56.6742,0.3842,0.379,0.3553,0.3232,0.3223,0.2925,0.2877,0.3031,0.294,0.2948,0.2888,0.2803,0.2727,0.2692,0.2671,0.2655,0.2583,0.2552,0.2468,0.234,0.2345,0.2263,0.2215,0.2181,0.2092,0.2071,0.2018,0.1926,0.1926,0.1882,0.1847,0.1871,0.1792,0.1792,0.1763,0.1703,0.1757,0.1734,0.1733,0.1712,0.1659,0.1637,0.1606,0.1613,0.1585,0.159,0.1596,0.1548,0.1522,0.1518,0.1494,0.1483,0.1476,0.1435,0.1426,0.1389,0.1384,0.137,0.1337,0.1373,0.1336,0.1315,0.1305,0.1318,0.1266,0.1291,0.1293,0.1231,0.1253,0.1257,0.122,0.1207,0.1191,0.1256,0.1272,0.1259,0.1269,0.1198,0.1277,0.1299,0.1249,0.1307,0.1292,0.1362,0.159,0.1813,0.1946,0.1911,0.1882,0.1713,0.161,0.1432,0.1377,0.1333,0.1258,0.1179,0.1109,0.101,0.0939,0.0906,0.0887,0.0822,0.0789,0.0753,0.0702,0.0658,0.063,0.0614,0.0571,0.0538,0.0508,0.0472,0.0443,0.0433,0.0417,0.0391,0.0382,0.037,0.0363,0.0358,0.0355,0.0353,0.0344,0.0335,0.0336,0.0333,0.0319,0.026,0.0287,0.0296,0.0294,0.0278,0.0267,0.0279,0.0309,0.0335,0.0342,0.0341,0.0325,0.0314,0.0304,0.0275,0.0249,0.0229,0.0208,0.0174,0.0155,0.014,0.0133,0.0131,0.0131,0.0107,0.0084,0.0097,0.0112,0.0122,0.0146,0.0181,0.0206,0.0237,0.0262,0.0273,0.0265,0.0275,0.0308,0.0328
1,1,2018.0,7.0,18.0,3.0,54.0,56.25,82.5004,6.3881,56.7638,0.3727,0.3607,0.3435,0.3376,0.3235,0.308,0.304,0.2988,0.2821,0.2783,0.2716,0.2614,0.2568,0.2446,0.2465,0.2445,0.2315,0.2342,0.2278,0.2146,0.2153,0.209,0.2034,0.2023,0.1925,0.1944,0.1872,0.1792,0.1795,0.1737,0.168,0.1699,0.1626,0.1615,0.1606,0.1542,0.1575,0.1543,0.1512,0.1519,0.1469,0.1454,0.1434,0.1432,0.1397,0.141,0.1404,0.1362,0.1361,0.1376,0.1364,0.1349,0.1342,0.1303,0.1294,0.1266,0.125,0.125,0.1213,0.124,0.1202,0.1171,0.1168,0.1192,0.113,0.1136,0.1123,0.1042,0.1082,0.108,0.1039,0.1073,0.1027,0.111,0.1108,0.109,0.1114,0.107,0.1121,0.1147,0.1029,0.1072,0.105,0.1121,0.122,0.1183,0.1167,0.1178,0.1209,0.1392,0.1612,0.1823,0.1807,0.1645,0.1444,0.1297,0.1218,0.116,0.1028,0.0914,0.0811,0.0746,0.0662,0.0644,0.0563,0.0505,0.0487,0.0461,0.0439,0.0427,0.0439,0.0432,0.0445,0.0447,0.0439,0.0437,0.0447,0.0439,0.0427,0.0423,0.0414,0.0385,0.0357,0.0339,0.0311,0.0291,0.0262,0.0251,0.0241,0.024,0.0239,0.0241,0.0246,0.0246,0.025,0.0248,0.0253,0.0259,0.0257,0.0254,0.025,0.0251,0.0247,0.0239,0.0231,0.0223,0.0204,0.0171,0.0147,0.0124,0.0076,0.0057,0.0051,0.005,0.0058,0.0055,0.0076,0.0097,0.0157,0.0194,0.0232,0.0283,0.0309,0.031,0.0321,0.0303
2,2,2018.0,7.0,18.0,3.0,54.0,57.249,82.5004,6.388,56.8605,0.2946,0.3047,0.3036,0.3045,0.3032,0.2952,0.2842,0.2891,0.2787,0.277,0.2826,0.2597,0.2477,0.2387,0.2262,0.2252,0.215,0.2116,0.2073,0.1959,0.19,0.1871,0.1786,0.1838,0.1798,0.1792,0.1743,0.1685,0.1681,0.1672,0.1608,0.1646,0.159,0.1558,0.1551,0.1477,0.1485,0.1474,0.1452,0.1443,0.1395,0.1362,0.1333,0.1331,0.1303,0.1317,0.1314,0.1258,0.1231,0.1231,0.1207,0.1199,0.1197,0.1168,0.1155,0.1141,0.112,0.1129,0.109,0.1126,0.1108,0.1083,0.1087,0.1088,0.1035,0.1063,0.1056,0.0971,0.1013,0.1012,0.096,0.0967,0.0894,0.0955,0.0985,0.0939,0.1009,0.0944,0.1,0.1055,0.0921,0.1,0.102,0.3145,0.3066,0.289,0.2712,0.2482,0.2222,0.1867,0.1572,0.1294,0.1114,0.0924,0.0758,0.0792,0.0766,0.0737,0.0692,0.0699,0.0676,0.0718,0.0754,0.0758,0.0721,0.0685,0.0655,0.0642,0.0635,0.0621,0.0576,0.0525,0.0474,0.0441,0.0391,0.0358,0.0351,0.0338,0.0338,0.0328,0.0311,0.0295,0.0286,0.0294,0.0298,0.0297,0.0287,0.0311,0.0332,0.0337,0.0328,0.032,0.0307,0.0285,0.0264,0.0239,0.0224,0.0217,0.021,0.0193,0.0188,0.0181,0.0183,0.0182,0.017,0.0166,0.0173,0.0201,0.0214,0.0215,0.021,0.0194,0.018,0.0186,0.0184,0.0184,0.0181,0.0162,0.016,0.011,0.0071,0.0076,0.0059,0.0073,0.0073,0.0047
3,3,2018.0,7.0,18.0,3.0,54.0,58.25,82.5004,6.388,56.9583,0.3509,0.3119,0.3212,0.2961,0.2877,0.279,0.2781,0.2723,0.267,0.2634,0.2597,0.2473,0.2432,0.2355,0.235,0.2313,0.2272,0.2199,0.2153,0.2086,0.203,0.1981,0.1907,0.1887,0.1809,0.1779,0.1736,0.1675,0.1658,0.163,0.1578,0.1595,0.1558,0.1531,0.151,0.1471,0.1477,0.1471,0.1461,0.145,0.1403,0.1379,0.1345,0.1347,0.1353,0.1356,0.1373,0.1309,0.1304,0.1297,0.1275,0.1265,0.1263,0.1224,0.1208,0.1175,0.1162,0.1158,0.1122,0.1148,0.1116,0.1085,0.1095,0.1098,0.1068,0.1103,0.1091,0.1032,0.105,0.1062,0.1011,0.1014,0.0983,0.1052,0.1077,0.1055,0.1082,0.1019,0.1082,0.1092,0.1026,0.1048,0.11,0.0881,0.0854,0.1071,0.1329,0.1692,0.1887,0.1865,0.1699,0.154,0.1474,0.1469,0.1448,0.1389,0.1266,0.1101,0.0985,0.0899,0.0797,0.0726,0.0681,0.0653,0.0607,0.0583,0.0557,0.0542,0.0525,0.0507,0.0483,0.0451,0.0417,0.0394,0.0392,0.0383,0.0374,0.036,0.0363,0.0356,0.0356,0.0354,0.0346,0.0336,0.033,0.0328,0.0315,0.0228,0.0235,0.0241,0.0249,0.0256,0.0266,0.0274,0.0285,0.0292,0.0285,0.028,0.027,0.0261,0.0248,0.023,0.0206,0.0187,0.0176,0.017,0.0145,0.0147,0.0135,0.0126,0.0103,0.0076,0.0072,0.0098,0.0129,0.015,0.0181,0.0195,0.021,0.0222,0.0236,0.0315,0.0339,0.0283,0.0262,0.0233
4,4,2018.0,7.0,18.0,3.0,54.0,59.249,82.5004,6.388,57.0568,0.3575,0.3344,0.3338,0.3253,0.3347,0.3223,0.3042,0.3143,0.294,0.2888,0.2929,0.2794,0.2806,0.2675,0.2582,0.2614,0.247,0.2433,0.2385,0.2293,0.2281,0.2209,0.2121,0.2145,0.2064,0.207,0.2044,0.196,0.1982,0.1949,0.1895,0.1923,0.1885,0.1863,0.186,0.1777,0.1794,0.1769,0.1737,0.1721,0.1673,0.1639,0.1617,0.161,0.1607,0.1614,0.1606,0.1553,0.1525,0.1524,0.1495,0.1472,0.1462,0.1424,0.1404,0.1371,0.1363,0.1352,0.1319,0.1345,0.1311,0.1281,0.1287,0.1284,0.1228,0.1269,0.1242,0.1168,0.1189,0.1185,0.1164,0.1147,0.112,0.1178,0.1178,0.1188,0.1232,0.1181,0.1255,0.1277,0.1171,0.1217,0.1188,0.2125,0.2198,0.2129,0.2158,0.2016,0.1883,0.1771,0.1627,0.1482,0.1331,0.1254,0.1156,0.1089,0.1041,0.1031,0.1027,0.1013,0.097,0.0913,0.0854,0.0819,0.0778,0.0723,0.065,0.0602,0.0554,0.0497,0.0466,0.0433,0.0407,0.04,0.0401,0.0399,0.04,0.0396,0.0403,0.0398,0.0396,0.0396,0.038,0.0369,0.0355,0.0338,0.0315,0.0322,0.0326,0.0323,0.0321,0.0314,0.0311,0.0299,0.029,0.0277,0.0259,0.0244,0.0225,0.0218,0.0202,0.0207,0.0207,0.0217,0.022,0.0225,0.0235,0.0241,0.0244,0.0234,0.0217,0.0198,0.0186,0.0172,0.0161,0.0129,0.0125,0.013,0.0127,0.0109,0.0107,0.0121,0.014,0.0156,0.0147,0.0199


In [47]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53842 entries, 0 to 53841
Columns: 176 entries, index to A742.7
dtypes: float64(175), int64(1)
memory usage: 72.3 MB


In [48]:
# Get memory usage of each column in bytes
memory_usage_per_column = df.memory_usage(deep=True)
# Get total memory usage of the DataFrame in bytes
total_memory_usage = df.memory_usage().sum()
rprint(f"Original Dataframe memory use: {total_memory_usage:20,}")

In [51]:
def reduce_mem_usage(df:pd.DataFrame) -> None:

    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

In [52]:
reduce_mem_usage(df)

Memory usage of dataframe is 18.18 MB
Memory usage after optimization is: 18.18 MB
Decreased by 0.0%


In [53]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53842 entries, 0 to 53841
Columns: 176 entries, index to A742.7
dtypes: float16(175), int32(1)
memory usage: 18.2 MB


In [55]:
# Get memory usage of each column in bytes
memory_usage_per_column = df_new.memory_usage(deep=True)
# Get total memory usage of the DataFrame in bytes
total_memory_usage = df_new.memory_usage().sum()
rprint(f"Transformed Dataframe memory use: {total_memory_usage:15,}")