# Jupyter Notebook with SkLearn and Linear Regression
## This notebook covers the "basics" of Data Science work

In [None]:
# Deployment settings. The names look like cloud storage / project / region
# identifiers -- presumably Google Cloud; confirm against the environment.
BUCKET_NAME = "ai-training-2024-08-09-bucket"
PROJECT_ID = "ai-training-2024-08-09"
LOCATION = "us-central1"

# Pieces of the secret's resource path, assembled below.
# NOTE(review): project_id differs from PROJECT_ID above -- confirm this is intentional.
secret_name = "ai-training-key-secret"
secret_version = "latest"
project_id = "usfs-tf-admin"
resource_name = "/".join(
    ["projects", project_id, "secrets", secret_name, "versions", secret_version]
)

In [None]:
# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
#- Google Colab Check
# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# Start from "not Colab", then overwrite with the real answer: the check looks
# for 'google.colab' in the string form of the active IPython shell object.
RunningInCOLAB = False
RunningInCOLAB = 'google.colab' in str(get_ipython())

if not RunningInCOLAB:
    print("You are running this notebook with Jupyter iPython runtime.")
    print("Assumption is you have the required libraries to execute this notebook.")
else:
    print("You are running this notebook in Google Colab.")

In [None]:
import sys
import subprocess
import importlib.util

In [None]:
# Names to check for; anything that cannot be found is installed with pip.
libraries=["numpy", "pandas", "scipy", "sklearn", "matplotlib"]
import importlib.util

# Listed name -> module name to probe, for entries whose import name differs.
IMPORT_NAME_OVERRIDES = {"Pillow": "PIL"}
# Listed name -> distribution name to hand to pip, for entries whose PyPI name
# differs from the import name ("sklearn" is imported, "scikit-learn" is installed).
PIP_NAME_OVERRIDES = {"sklearn": "scikit-learn"}

for library in libraries:
    module_name = IMPORT_NAME_OVERRIDES.get(library, library)
    package_name = PIP_NAME_OVERRIDES.get(library, library)

    spec = importlib.util.find_spec(module_name)
    if spec is None:
        print("Installing library " + library)
        # Run pip through the kernel's own interpreter so the package lands in
        # the environment this notebook is actually executing in.
        result = subprocess.run(
            [sys.executable, "-m", "pip", "install", package_name, "--quiet"]
        )
        if result.returncode != 0:
            # Surface the failure instead of silently continuing to a later ImportError.
            print("Installation of library " + library + " failed (pip exit code "
                  + str(result.returncode) + ").")
    else:
        print("Library " + library + " already installed.")

# Pull in Debugging Library

In [None]:
library_list = ['./support_debug.ipynb', './support_functions.ipynb']

for library in library_list:
    %run $library

# Includes and Libraries

In [None]:
############################################
# INCLUDES
############################################
#libraries specific to this example
import numpy as np
import pandas as pd
import scipy as sp
from sklearn.linear_model import LinearRegression
import matplotlib as matplt
import matplotlib.pyplot as plt

#a set of libraries that perhaps should always be in Python source
import os
import datetime
import sys
import gc
import getopt
import inspect
import math
import warnings

#a darn useful library for creating paths and one I recommend you load to your environment
from pathlib import Path

In [None]:
# Log the step, then apply the library configuration.
# NOTE(review): msg_info and set_library_configuration are not defined in this
# notebook -- presumably supplied by the support notebooks run earlier; confirm.
msg_info("Setting Library Configuration")
set_library_configuration()

# Variable declaration

In [None]:
############################################
# GLOBAL VARIABLES
############################################
# Debug switches; not read by any cell visible in this notebook --
# presumably consumed by the support notebooks (confirm).
DEBUG, DEBUG_DATA = 1, 0

# CODE CONSTRAINTS
VERSION_NAME = "SKLearn"
VERSION_MAJOR, VERSION_MINOR, VERSION_RELEASE = 0, 0, 1

# used for values outside standard ASCII, just do it, you'll need it
ENCODING = "utf-8"

############################################
# GLOBAL CONSTANTS
############################################
# Column labels used to pull the two series out of the data file.
TEMPERATURE = "Temp(C)"
SALINITY = "Sal(PSU)"

############################################
# APPLICATION VARIABLES
############################################

############################################
# GLOBAL CONFIGURATION
############################################
# Export the encoding through the process environment.
os.environ["PYTHONIOENCODING"] = ENCODING


# Library Invocation
### Note: it is also useful to run this code so that you keep a record of the library versions you depended on and can later tell which versions produced a given result.

In [None]:
# Log the step, then run the library diagnostics helper.
# NOTE(review): msg_info and lib_diagnostics are not defined in this notebook --
# presumably supplied by the support notebooks run earlier; confirm what is reported.
msg_info("Library Diagnostics")
lib_diagnostics()

# Data Read
### Using pandas, read in a data file and establish a log file for output.

In [None]:
#READ DATA IN
# NOTE(review): root_location is not defined in this notebook's visible cells --
# presumably provided by the support notebooks; confirm.
filename = os.path.join(root_location, "mooring_data.txt")

if os.path.isfile(filename):
    try:
        # the input file is '^'-delimited
        data=pd.read_csv(filename, delimiter='^', engine='python')
        log_file = os.path.join(".", "data", "feature_1_batch_1.log")
    except OSError as e:
        # OSError exposes the offending path as e.filename (it can be None), so
        # fall back to the path we tried to open. The previous code referenced an
        # undefined name here, which raised NameError instead of logging.
        failed_name = e.filename or filename
        debug.msg_error(f"Exception found: {e}")
        debug.msg_error(f"Unable to filename ({failed_name}) in code at {__name__}.{inspect.stack()[0][3]}.")
        debug.msg_error("Please check your paths and restart.")
else:
    # The file is not there at all: report it; `data` stays undefined, so the
    # cells below will not run until the path is fixed.
    debug.msg_error(f"Unable to filename ({filename}) in code at {__name__}.{inspect.stack()[0][3]}.")
    debug.msg_error("Please check your paths and restart.")


In [None]:
#show the loaded DataFrame as this cell's output
data

In [None]:
#what does the data domain look like? print the column labels of the DataFrame
print(data.columns)

In [None]:
#learn pandas: info() prints a summary of the DataFrame
data.info()

In [None]:
#learn MORE pandas: describe() returns descriptive statistics for the DataFrame
data.describe()

# Prep the data

In [None]:
#PURGE DATA
debug.msg_info("Data scrub and prep")
#load temperature (referenced by name) for all values (y, x); (all rows, column=Temp)
#into a single dimensional array that is Numpy based (means you got lots of free powerful tools)

#NOTE: the IndexingError handlers below use pd.errors.IndexingError; the bare name
#IndexingError was never imported, so reaching that clause raised NameError.
try:
    xs=data.loc[:,TEMPERATURE].values
    debug.msg_debug("XS array from " + str(TEMPERATURE) + " is " + str(xs.size))
except KeyError as e:#Missing values check
    debug.msg_error(f"Exception found: {e}")
    debug.msg_error("'TEMPERATURE' key values were not found, please check your file")
except pd.errors.IndexingError as e:#Indexing misalignment check
    debug.msg_error(f"Exception found: {e}")
    debug.msg_error("'TEMPERATURE' index values appear to be incorrect, please check your file")

try:
    ys=data.loc[:,SALINITY].values
    debug.msg_debug("YS array from " + SALINITY + " is " + str(ys.size))
except KeyError as e:#Missing values check
    debug.msg_error(f"Exception found: {e}")
    debug.msg_error("'SALINITY' key values were not found, please check your file")
except pd.errors.IndexingError as e:#Indexing misalignment check
    debug.msg_error(f"Exception found: {e}")
    debug.msg_error("'SALINITY' index values appear to be incorrect, please check your file")

#machine learning doesn't like negative numbers, there are multiple ways of doing this
#pd.DataFrame().query() or np.where() are some options
#LEARN PANDAS!!!!
#NOTE(review): the mask below selects every row with salinity < 1 (not only negatives),
#so the rows that remain have salinity >= 1 although the debug labels say "> 1" -- confirm the threshold.
debug.msg_debug("")
debug.msg_debug("Performing Y mask cleaning")
y_mask=np.where(ys < 1)
debug.msg_debug("")

#clear out the values that should be scrubbed.  Again, any technique to clean the data is viable.
#the same positions are removed from both arrays so x/y pairs stay aligned
all_xs = np.delete(xs, y_mask)
debug.msg_debug("all_xs (> 1) array from " + TEMPERATURE + " is " + str(all_xs.size))
all_ys = np.delete(ys, y_mask)
debug.msg_debug("all_ys (> 1) array from " + SALINITY + " is " + str(all_ys.size))

In [None]:
#example of all_xs output (temperature) after the scrub, before it is reshaped
all_xs

In [None]:
#turn each 1-D array into a single column of shape (n, 1), the API expects it
#reshape(-1, 1) leaves an array that is already a column unchanged, so running
#this cell more than once does not stack on extra dimensions
all_xs = np.reshape(all_xs, (-1, 1))
debug.msg_debug("all_xs (transposed) array from " +  TEMPERATURE + " is " + str(all_xs.size))

all_ys = np.reshape(all_ys, (-1, 1))
debug.msg_debug("all_ys (transposed) array from " +  SALINITY + " is " + str(all_ys.size))


In [None]:
#example of all_xs output now (after transpose)
all_xs

# Statistics
### Lots of statistics at your fingertips with Numpy
### Also useful as it helps with quick operations for other functions (plotting, data smoothing, etc.)

In [None]:
#STATISTICS
#Aggregate over the whole column. Series.apply(np.mean) / apply(np.std) call the
#function once per element, which returns the values themselves as the "mean"
#and 0 for every "std".
#ddof=0 keeps the population standard deviation that np.std computes by default.
x_mean=data[TEMPERATURE].mean()
x_std=data[TEMPERATURE].std(ddof=0)
x_mean_max=np.max(x_mean)
x_std_max=np.max(x_std)
#upper bound: mean plus five standard deviations
x_range_limit=x_mean_max + (x_std_max * 5)

y_mean=data[SALINITY].mean()
y_std=data[SALINITY].std(ddof=0)
y_mean_max=np.max(y_mean)
y_std_max=np.max(y_std)
#upper bound: mean plus five standard deviations
y_range_limit=y_mean_max + (y_std_max * 5)

# Plots

In [None]:
#show the image inline
%matplotlib inline
PLT_X_SIZE=20
PLT_Y_SIZE=10
PLT_LGD_X_OFFSET=1
PLT_LGD_Y_OFFSET=0.5

#establish the initial figure dimensions and configuration (purposely made large 20x10 inches)
fig = plt.figure(1,figsize=(PLT_X_SIZE, PLT_Y_SIZE))

#this is where you could have multiple plots side by side
axes = plt.subplot(111)
axes.set_title('X, Y Plot')
axes.set_ylabel(SALINITY)
axes.set_xlabel(TEMPERATURE)

labels=axes.get_xticklabels()
plt.setp(labels,rotation=45,horizontalalignment='right')

axes.scatter(all_xs, all_ys, label='x vs. y')
axes.legend(loc='center left', bbox_to_anchor=(PLT_LGD_X_OFFSET,PLT_LGD_Y_OFFSET))


# Linear Regression (SKlearn)

In [None]:
#instantiate the model and fit it with the clean data in one step
#(fit returns the fitted estimator itself)
lm = LinearRegression().fit(all_xs, all_ys)

#run a prediction over the same inputs the model was fitted on (yeah...that easy)
new_y = lm.predict(all_xs)

In [None]:
#report the fitted intercept and how many coefficients the model holds
debug.msg_debug("             Intercept:" + str(lm.intercept_))
#coef_ is 2-D when the model is fitted on a 2-D y, so len() would count rows
#(targets) rather than coefficients; .size counts every coefficient
debug.msg_debug("Number of Coefficients:" + str(lm.coef_.size))

In [None]:
#PLOT DATA (QUICK LOOK)
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
PLT_X_SIZE=20
PLT_Y_SIZE=10
PLT_LGD_X_OFFSET=1
PLT_LGD_Y_OFFSET=0.5

fig = plt.figure(1,figsize=(PLT_X_SIZE, PLT_Y_SIZE))

axes = plt.subplot(111)
axes.set_title('Y vs New Y Plot')
axes.set_ylabel(SALINITY)
axes.set_xlabel('Time')

labels=axes.get_xticklabels()
plt.setp(labels,rotation=45,horizontalalignment='right')

axes.scatter(range(0,all_ys.size),all_ys, label='old Y')
axes.scatter(range(0,new_y.size),new_y, label='new Y')
axes.legend(loc='center left', bbox_to_anchor=(PLT_LGD_X_OFFSET,PLT_LGD_Y_OFFSET))


In [None]:
#new_y ("new Y") is the model's prediction for each all_xs value; it is plotted alongside the observed all_ys ("old Y") values.