# Jupyter Notebook showing NumPy/pandas use with In-situ Data
## Memory reduction technique
## AC9 Flow-thru data

Includes references to plotting using Matplotlib and related tools.

In [None]:
# Cloud storage / project settings. BUCKET_NAME is read by the gsutil copy
# further down in the notebook.
BUCKET_NAME = "ai-training-2024-08-09-bucket"
PROJECT_ID = "ai-training-2024-08-09"
LOCATION = "us-central1"

# Pieces of the secret's resource path. NOTE(review): project_id here differs
# from PROJECT_ID above -- presumably intentional, confirm.
secret_name = "ai-training-key-secret"
secret_version = "latest"
project_id = "usfs-tf-admin"
resource_name = "/".join(
    ["projects", project_id, "secrets", secret_name, "versions", secret_version]
)

In [None]:
# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
#- Google Colab Check
# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# get_ipython() only exists inside an IPython kernel. Falling back to None
# keeps this cell from raising NameError when the notebook is exported and
# run as a plain Python script.
try:
    _ipython_shell = get_ipython()
except NameError:
    _ipython_shell = None

# The check looks for the substring 'google.colab' in the shell's string form.
RunningInCOLAB = 'google.colab' in str(_ipython_shell)

if RunningInCOLAB:
    print("You are running this notebook in Google Colab.")
else:
    print("You are running this notebook with Jupyter iPython runtime.")
    print("Assumption is you have the required libraries to execute this notebook.")

In [None]:
import sys
import subprocess
import importlib.util

In [None]:
import importlib.util
import subprocess
import sys

# Names to check; entries may be either an import name or a PyPI name.
libraries=["numpy", "pandas", "scipy", "sklearn", "matplotlib"]

# Some packages are imported under one name but installed under another.
# find_spec() needs the import name, pip needs the PyPI distribution name.
import_names = {"Pillow": "PIL", "scikit-learn": "sklearn"}
pip_names = {"sklearn": "scikit-learn", "PIL": "Pillow"}

for library in libraries:
    spec = importlib.util.find_spec(import_names.get(library, library))
    if spec is None:
        print("Installing library " + library)
        # Run pip through the kernel's own interpreter so the package lands in
        # the environment this notebook is actually executing in.
        result = subprocess.run(
            [sys.executable, "-m", "pip", "install", pip_names.get(library, library), "--quiet"]
        )
        if result.returncode != 0:
            # Report the failure instead of continuing silently; later import
            # cells would otherwise fail with a less obvious error.
            print("ERROR: pip could not install " + library + " (exit code " + str(result.returncode) + ").")
    else:
        print("Library " + library + " already installed.")

# Libraries / Imports

In [None]:
############################################
# INCLUDES
############################################
#libraries specific to this example
import numpy as np
import pandas as pd
import scipy as sp
from sklearn.linear_model import LinearRegression
import matplotlib as matplt
import matplotlib.pyplot as plt

#a set of libraries that perhaps should always be in Python source
import os
import datetime
import sys
import gc
import getopt
import inspect
import math
import warnings

from pydoc import help                          # can type in the python console `help(name of function)` to get the documentation

#a darn useful library for creating paths and one I recommend you load to your environment
from pathlib import Path

warnings.filterwarnings('ignore')               # don't print out warnings


# Pull in Support Tools

In [None]:
#!rm -rf ./folderOnColab && echo "Ok, removed." || { echo "No folder to remove."; exit 1; }
#!mkdir -p ./folderOnColab && echo "Folder created." || { echo "Failed to create folder, it might already exist.";  }
#!gsutil -m cp -r gs://usfs-gcp-rand-test-data-usc1/public_source/jbooks/ANewHope.txt ./folderOnColab

target_folder="./folderOnColab"
target_repo="https://raw.githubusercontent.com//christophergarthwood/jbooks/main"
target_files=["support_debug.ipynb", "support_functions.ipynb"]
print(f"Creating a folder ({target_folder}) to store project data.")
subprocess.run(["mkdir", "-p" , target_folder])
if os.path.isdir(target_folder):
  print("Performing wget on:")
  for idx, filename in enumerate(target_files):
    print(f"...{filename} to target folder: {target_folder}")
    try:
      subprocess.run(["wget", f"--directory-prefix={target_folder}", f"{target_repo}/{filename}"])
    except Exception as e:
      print("")
      print(f"ERROR: There was a problem performing wget on the target file ({filename}), see Exception: {str(e)}")
      print("...talk to the instructor.")
    if os.path.isfile(target_folder+os.sep+filename):
      print("...verified copy.")
      print("...importing code.")
      target_filename=f"{target_folder+os.sep+filename}"
      os.environ["target_filename"]=target_filename
      %run $target_filename
    else:
      print(f"...copy NOT verified, check the {target_folder} for the existence of {filename}")
else:
    print("ERROR: Local folder not found/created.  Check the output to ensure your folder is created.")
    print(f"...target folder: {target_folder}")
    print("...if you can't find the problem contact the instructor.")


In [None]:
# msg_info and set_library_configuration are not defined in this notebook;
# presumably they come from the support notebooks executed with %run -- confirm.
msg_info("Setting Library Configuration")
set_library_configuration()

# Variable declaration

In [None]:
############################################
# GLOBAL VARIABLES
############################################
# Debug switches.
DEBUG, DEBUG_DATA = 1, 0

# CODE CONSTRAINTS -- name and version triple of this notebook.
VERSION_NAME = "AC9_Flow-thru"
VERSION_MAJOR, VERSION_MINOR, VERSION_RELEASE = 0, 0, 1

# Encoding to use for values outside standard ASCII.
ENCODING = "utf-8"

############################################
# GLOBAL CONSTANTS
############################################
# (none defined)

############################################
# APPLICATION VARIABLES
############################################
# (none defined)

############################################
# GLOBAL CONFIGURATION
############################################
# Export the encoding through the PYTHONIOENCODING environment variable.
os.environ.update(PYTHONIOENCODING=ENCODING)


In [None]:
def reduce_mem_usage(df, use_float16=True):
    """Downcast the columns of a DataFrame to smaller dtypes to reduce memory usage.

    The frame is modified in place and the same object is returned.

    Rules applied per column:
      * object columns are converted to ``category``;
      * signed / unsigned integer columns are narrowed to the smallest integer
        type of the same signedness whose range contains the column's min and
        max (bounds inclusive);
      * float columns are narrowed to float16 (if ``use_float16``) or float32
        when the min and max fit that type's range;
      * everything else (bool, datetime, timedelta, complex, and pandas
        extension dtypes such as category or nullable integers) is left as is.

    A column is never widened, and a column whose min/max cannot be compared
    (empty or all-NaN) is left unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink; mutated in place.
    use_float16 : bool, default True
        Allow float16 as a target. The fit test only looks at the value
        *range*, so narrowing floats can lose precision; pass False to stop at
        float32.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after conversion.
    """
    # deep=True counts the Python strings held by object columns, so the
    # before/after numbers reflect the real effect of the category conversion.
    start_mem = df.memory_usage(deep=True).sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    # Candidate targets, smallest first. 64-bit types are omitted because the
    # loop below never converts to a type that is not strictly narrower.
    signed_ints = (np.int8, np.int16, np.int32)
    unsigned_ints = (np.uint8, np.uint16, np.uint32)
    floats = (np.float16, np.float32) if use_float16 else (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # pandas extension dtypes are not numpy dtypes; min()/astype() on them
        # may fail or change semantics, so leave those columns alone.
        if not isinstance(col_type, np.dtype):
            continue

        if col_type.kind == 'i':
            candidates, limits = signed_ints, np.iinfo
        elif col_type.kind == 'u':
            candidates, limits = unsigned_ints, np.iinfo
        elif col_type.kind == 'f':
            candidates, limits = floats, np.finfo
        else:
            # bool, datetime, timedelta, complex, ...: nothing safe to narrow.
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        for candidate in candidates:
            if np.dtype(candidate).itemsize >= col_type.itemsize:
                # Remaining candidates are as wide as the current dtype or wider.
                break
            bounds = limits(candidate)
            # NaN min/max (empty or all-NaN column) makes both comparisons
            # False, so such a column is never converted.
            if c_min >= bounds.min and c_max <= bounds.max:
                df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage(deep=True).sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    if start_mem > 0:
        print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df


# Library Invocation
### Note: running this diagnostic is also useful because it gives you a record of the library versions in use, so you know which versions produced your results.

In [None]:
# msg_info and lib_diagnostics are not defined in this notebook; presumably
# they come from the support notebooks executed with %run -- confirm.
msg_info("Library Diagnostics")
lib_diagnostics()

# Data Read
### Using pandas, read in a data file and establish a log for output.

In [None]:
# Copy the data file(s) from the Cloud Storage bucket into a local folder.
target_folder="./folderOnColab"
target_files=["All_flo-thru.dat"]
print(f"Creating a folder ({target_folder}) to store project data.")
try:
  # Pure-Python replacement for `mkdir -p`; works without a POSIX shell.
  os.makedirs(target_folder, exist_ok=True)
except OSError as e:
  print(f"...could not create {target_folder}: {str(e)}")
if os.path.isdir(target_folder):
  for filename in target_files:
    print(f"Copying {filename} to target folder: {target_folder}")
    # check=True raises CalledProcessError when gsutil exits non-zero, so a
    # failed copy stops the cell instead of surfacing later at read time.
    subprocess.run(["gsutil", "-m" , "cp", "-r", f"gs://{BUCKET_NAME}/public_source/jbooks/{filename}",  target_folder], check=True)
else:
    print("ERROR: Local folder not found/created.  Check the output to ensure your folder is created.")
    print(f"...target folder: {target_folder}")
    print("...if you can't find the problem contact the instructor.")


In [None]:
# Path of the flow-thru data file inside the download folder.
filename = f"{target_folder}{os.sep}All_flo-thru.dat"

# read_csv has a very large number of options, but well structured data
# usually needs only a few of them.
# Reference:http://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html#pandas.read_csv
# The file is the WAP output with its \t delimiter changed to ^, which made
# parsing the columns far easier.
ac9flothru = pd.read_csv(filename, sep='^')



In [None]:
# Preview the first rows of the frame as read from the file.
ac9flothru.head()

In [None]:
# Show the column header names parsed from the file.
print(ac9flothru.columns)

In [None]:
# reduce_mem_usage converts the columns of the frame it is given and returns
# that same object, so after this call ac9flowthru ("flow") and ac9flothru
# ("flo") are two names for one, already-converted DataFrame.
ac9flowthru = reduce_mem_usage(ac9flothru)

In [None]:
# Preview the first rows after the dtype conversion.
ac9flowthru.head()