# Jupyter Notebook showing Numpy/Pandas use with In-situ Data
## Memory reduction technique
## AC9 Flow-thru data

Includes references to plotting using Matplotlib and related tools.

In [1]:
############################################
# INCLUDES
############################################
#libraries specific to this example
import numpy as np
import pandas as pd
import scipy as sp
from sklearn.linear_model import LinearRegression
import matplotlib as matplt
import matplotlib.pyplot as plt

#a set of libraries that perhaps should always be in Python source
import os 
import datetime
import sys
import gc
import getopt
import inspect
import math
import warnings

from pydoc import help                          # can type in the python console `help(name of function)` to get the documentation

#a darn useful library for creating paths and one I recommend you load to your environment
from pathlib import Path

#Import a custom library, in this case a fairly useful logging framework
if os.environ.get('LIB_LOCATION') is not None:
    debug_lib_location = Path(os.getenv('LIB_LOCATION'))
else:
    debug_lib_location = Path("./")
                              
if os.environ.get('DATA_LOCATION') is not None:
    root_location = os.getenv('DATA_LOCATION')
else:
    root_location=".." + os.sep + "data";                              
sys.path.append(str(debug_lib_location))

import debug

warnings.filterwarnings('ignore')               # don't print out warnings


In [9]:
############################################
#JUPYTER NOTEBOOK OUTPUT CONTROL / FORMATTING
############################################
#set floating point to 4 places to things don't run loose
pd.options.display.float_format = '{:,.4f}'.format
np.set_printoptions(precision=4)

# Variable declaration

In [10]:
############################################
# GLOBAL VARIABLES
############################################
DEBUG = 1
DEBUG_DATA = 0

# CODE CONSTRAINTS
VERSION_NAME    = "AC9_Flow-thru"
VERSION_MAJOR   = 0
VERSION_MINOR   = 0
VERSION_RELEASE = 1

#used for values outside standard ASCII, just do it, you'll need it
ENCODING  ="utf-8"

############################################
# GLOBAL CONSTANTS
############################################


############################################
# APPLICATION VARIABLES
############################################

############################################
# GLOBAL CONFIGURATION
############################################
os.environ['PYTHONIOENCODING']=ENCODING


# Example of Defining a Function

In [11]:
def lib_diagnostics():
    debug.msg_debug("System version    #:{:>12}".format(sys.version))
    debug.msg_debug("Matplotlib version#:{:>12}".format(matplt.__version__))
    debug.msg_debug("Numpy version     #:{:>12}".format(np.__version__))
    debug.msg_debug("Pandas version    #:{:>12}".format(pd.__version__))
    debug.msg_debug("SciPy version     #:{:>12}".format(sp.__version__))

    return

In [12]:
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df


# Library Invocation
### Note that it's also useful to use this code so that you carry around a list of version dependencies and know how you did something (version)

In [13]:
lib_diagnostics()

[2022-12-14 18:23:17 UTC]   DEBUG: System version    #:3.9.15 (main, Nov  4 2022, 16:13:54) 
[GCC 11.2.0] 
[2022-12-14 18:23:17 UTC]   DEBUG: Matplotlib version#:       3.5.3 
[2022-12-14 18:23:17 UTC]   DEBUG: Numpy version     #:      1.23.4 
[2022-12-14 18:23:17 UTC]   DEBUG: Pandas version    #:       1.5.1 
[2022-12-14 18:23:17 UTC]   DEBUG: SciPy version     #:       1.9.3 


# Data Read
### Using pandas read in a data file and establish a log for output.

In [14]:
filename=root_location + os.sep + "All_flo-thru.dat"

#there are an insane number of options, but generally if you have well structured data it's 'magic'
#Reference:http://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html#pandas.read_csv
#changed \t delimiter to ^ which made parsing the columns far easier

#straight from the WAP output with the delimiter modified
ac9flothru=pd.read_csv(filename, delimiter='^')



In [15]:
ac9flothru.head()

Unnamed: 0,Time(ms),Latitude,Longitude,Bottom_Depth(m),UTC/GMT_Time,a650__,a676__,a715__,c510__,c532__,...,c650__,c676__,c715__,a510__,a532__,a555__,c412__,c440__,c488__,Temperature(C)
0,1000.0,30.3225,88.8931,0.0,0.6713,0.6869,0.7193,0.5224,4.5003,4.1485,...,3.0008,2.8428,2.5976,1.3964,1.1843,1.0358,7.2849,6.0743,4.9176,14.44
1,2000.0,30.3225,88.893,0.0,0.6713,0.6901,0.723,0.5237,4.5042,4.1503,...,3.0029,2.8432,2.5969,1.396,1.1816,1.0312,7.2719,6.0647,4.9079,14.44
2,2000.0,30.3225,88.893,0.0,0.6713,0.6838,0.7157,0.5178,4.4947,4.1395,...,2.9922,2.834,2.5881,1.3922,1.1809,1.032,7.2677,6.0596,4.9026,14.44
3,2000.0,30.3225,88.893,0.0,0.6713,0.6864,0.7204,0.5217,4.49,4.1368,...,2.9948,2.84,2.5931,1.4021,1.1878,1.036,7.2665,6.0578,4.8996,14.44
4,2000.0,30.3225,88.893,0.0,0.6713,0.6865,0.7197,0.5226,4.4867,4.1345,...,2.9924,2.8363,2.5932,1.3998,1.1818,1.0269,7.265,6.0572,4.8975,14.44


In [16]:
#show column header names.
print(ac9flothru.columns)

Index(['Time(ms)', 'Latitude', 'Longitude', 'Bottom_Depth(m)', 'UTC/GMT_Time',
       'a650__', 'a676__', 'a715__', 'c510__', 'c532__', 'c555__', 'a412__',
       'a440__', 'a488__', 'c650__', 'c676__', 'c715__', 'a510__', 'a532__',
       'a555__', 'c412__', 'c440__', 'c488__', 'Temperature(C)'],
      dtype='object')


In [17]:
ac9flowthru = reduce_mem_usage(ac9flothru)

Memory usage of dataframe is 8.79 MB
Memory usage after optimization is: 2.29 MB
Decreased by 74.0%
