<a href="https://colab.research.google.com/github/gilmore-h2o/GRIME2_data_processing/blob/main/GRIME2_data_processing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import
import pandas as pd
import glob
import os
import plotly.express as px

In [None]:
# prompt: load data from multiple csv files to create a single dataframe

# CSV files that this notebook writes into the working directory itself.
# They must not be read back in as input: a later cell exports
# "invalid_timestamps.csv", and without this exclusion a re-run would append
# the previously exported rows to the raw data again.
GENERATED_CSV_NAMES = ("invalid_timestamps.csv",)


def find_input_csvs(directory, exclude_names=GENERATED_CSV_NAMES):
    """Return the input CSV paths found directly inside ``directory``.

    Parameters
    ----------
    directory : str
        Folder to search (not recursive).
    exclude_names : iterable of str, optional
        File names (base name only, exact match) to leave out. Defaults to the
        files this notebook generates.

    Returns
    -------
    list of str
        Matching paths in sorted order. glob returns files in arbitrary
        order, so sorting keeps the row order of the combined frame the same
        from run to run.
    """
    excluded = set(exclude_names)
    return sorted(
        path
        for path in glob.glob(os.path.join(directory, "*.csv"))
        if os.path.basename(path) not in excluded
    )


# Get current working directory
current_directory = os.getcwd()

# Find all input CSV files in the current directory
csv_files = find_input_csvs(current_directory)

# Read each file; files that are empty or cannot be parsed are reported and skipped
dfs = []
for file in csv_files:
    try:
        dfs.append(pd.read_csv(file))
        print(f"Successfully loaded: {file}")
    except pd.errors.EmptyDataError:
        print(f"Warning: {file} is empty and will be skipped.")
    except pd.errors.ParserError:
        print(f"Warning: {file} could not be parsed and will be skipped.")

# Concatenate all dataframes in the list into a single dataframe
if dfs:
    combined_df = pd.concat(dfs, ignore_index=True)
    print("Combined dataframe shape:", combined_df.shape)
else:
    print("No valid CSV files found or all files were empty/corrupted.")
    combined_df = pd.DataFrame()  # empty frame so later references to combined_df still resolve

# Display the first few rows (only the last expression of a cell is rendered;
# the shape is already printed above)
combined_df.head(15)


Successfully loaded: /content/20220808_20220812.csv
Successfully loaded: /content/20220820.csv
Successfully loaded: /content/20220718_20220729.csv
Successfully loaded: /content/20221101_20221115.csv
Successfully loaded: /content/20220729_20220808.csv
Successfully loaded: /content/20220911_20221006.csv
Successfully loaded: /content/20220825_20220911.csv
Successfully loaded: /content/20220715_20220718.csv
Successfully loaded: /content/invalid_timestamps.csv
Successfully loaded: /content/20220812_20220820.csv
Successfully loaded: /content/20221015_20221101.csv
Successfully loaded: /content/20221006_20221015.csv
Combined dataframe shape: (10973, 85)


Unnamed: 0,imgPath,findSuccess,timestamp,illum_state,waterLevel,waterLevelAdjusted,xRMSE,yRMSE,EuclidDistRMSE,waterLine-octagon-angle-diff,...,foundPts[6]-y,foundPts[7]-x,foundPts[7]-y,foundPts[8]-x,foundPts[8]-y,foundPts[9]-x,foundPts[9]-y,....1,plot_DateTime,plot_waterLevel
0,G:/2021_KOLA/UNK Images and Field Data/UNK Ima...,True,2022-08-08T15:30:00,Off,0.742,0.742,0.008,-0.052,0.052,-0.097,...,931.507,1119.0,920.031,1103.0,931.508,1087.0,932.014,,,
1,G:/2021_KOLA/UNK Images and Field Data/UNK Ima...,True,2022-08-08T15:45:00,Off,0.725,0.725,0.005,-0.003,0.005,0.368,...,931.5,1119.0,931.021,1103.0,930.505,1087.0,931.011,,,
2,G:/2021_KOLA/UNK Images and Field Data/UNK Ima...,True,2022-08-08T16:00:00,Off,0.724,0.724,-0.007,-0.06,0.06,-1.563,...,927.5,,,,,,,,,
3,G:/2021_KOLA/UNK Images and Field Data/UNK Ima...,True,2022-08-08T16:15:00,Off,0.726,0.726,-0.001,-0.144,0.144,-1.112,...,931.504,1119.0,932.0,1103.0,931.501,1087.0,922.011,,,
4,G:/2021_KOLA/UNK Images and Field Data/UNK Ima...,True,2022-08-08T16:30:00,Off,0.732,0.732,-0.012,-0.004,0.012,-2.797,...,924.667,1119.0,917.035,1103.0,916.7,1087.0,929.054,,,
5,G:/2021_KOLA/UNK Images and Field Data/UNK Ima...,True,2022-08-08T16:45:00,Off,0.737,0.737,-0.011,-0.017,0.02,1.191,...,928.51,1119.0,929.011,1103.0,928.508,1087.0,929.002,,,
6,G:/2021_KOLA/UNK Images and Field Data/UNK Ima...,True,2022-08-08T17:00:00,Off,0.697,0.697,0.015,0.269,0.269,0.827,...,929.517,1119.0,930.048,1103.0,928.513,1087.0,929.031,,,
7,G:/2021_KOLA/UNK Images and Field Data/UNK Ima...,True,2022-08-08T17:15:00,Off,0.723,0.723,0.027,-0.021,0.034,-0.587,...,926.5,,,,,,,,,
8,G:/2021_KOLA/UNK Images and Field Data/UNK Ima...,False,2022-08-08T17:30:00,Off,-1.0,-9999999.9,-9999999.0,-9999999.0,-9999999.0,0.0,...,,,,,,,,,,
9,G:/2021_KOLA/UNK Images and Field Data/UNK Ima...,True,2022-08-08T17:45:00,Off,0.776,0.776,-0.042,0.089,0.098,1.926,...,928.508,1119.0,930.015,1103.0,928.525,1087.0,929.028,,,


In [None]:
# prompt: starting with combined_df, use the timestamp column to create a new column with date and time in date/time format appropriate for plotting with plotly. Call the new column plot_DateTime. Then take the waterLevel column and do two things. First, create a plot_waterLevel column and populate with waterLevel values, but replace any values of -1 with 0. Then, create an interactive plotly time series plot using plot_waterLevel and plot_DateTime.

# Add both plotting columns in a single step:
#   plot_DateTime   - the 'timestamp' column parsed into pandas datetimes
#   plot_waterLevel - the 'waterLevel' column with every -1 swapped for 0
combined_df = combined_df.assign(
    plot_DateTime=lambda frame: pd.to_datetime(frame['timestamp']),
    plot_waterLevel=lambda frame: frame['waterLevel'].replace(-1, 0),
)


In [None]:
# prompt: explore combined_df as follows:
# 1. list any timestamps that are prior to year 2022
# 2. print a list that contains the last 35 characters of the imgPath column value
# 3. export a csv file called "invalid timestamps" that contains the list
# 4. remove entries with invalid timestamps from combined_df to create a new dataframe called plot_df
# 5. sort plot_df by 'plot_DateTime' column

# Rows stamped before this date are treated as invalid.
MIN_VALID_TIMESTAMP = '2022-01-01'

# Build the validity mask once and derive both frames from it. A missing
# timestamp (NaT) compares False here, so such rows are reported as invalid
# instead of silently dropping out of both invalid_timestamps and plot_df.
is_valid_timestamp = combined_df['plot_DateTime'] >= MIN_VALID_TIMESTAMP

# Rows with timestamps prior to 2022 (or with no timestamp at all)
invalid_timestamps = combined_df[~is_valid_timestamp]

# Last 35 characters of every imgPath. The .str accessor passes a missing
# path through as NaN, whereas slicing each value directly raises TypeError
# on NaN.
last_35_chars = combined_df['imgPath'].str[-35:].tolist()

# Export invalid timestamps to CSV
invalid_timestamps.to_csv("invalid_timestamps.csv", index=False)

# Keep only rows with valid timestamps, ordered by time
plot_df = combined_df[is_valid_timestamp].sort_values(by='plot_DateTime')

# Make the row filtering visible rather than silent
print(f"{len(invalid_timestamps)} rows with invalid timestamps exported to invalid_timestamps.csv; "
      f"{len(plot_df)} rows kept in plot_df")

Timestamps prior to 2022:
 110    1955-09-24 12:05:00
788    1955-09-24 12:05:00
2808   1955-09-24 12:05:00
2903   1955-09-24 12:05:00
2904   1955-09-24 12:05:00
2999   1955-09-24 12:05:00
3095   1955-09-24 12:05:00
3287   1955-09-24 12:05:00
7795   1955-09-24 12:05:00
8082   1955-09-24 12:05:00
8765   1955-09-24 12:05:00
Name: plot_DateTime, dtype: datetime64[ns]

Last 35 characters of imgPath:
 0        _20220812/20220808_1530_GC_KOLA.JPG
1        _20220812/20220808_1545_GC_KOLA.JPG
2        _20220812/20220808_1600_GC_KOLA.JPG
3        _20220812/20220808_1615_GC_KOLA.JPG
4        _20220812/20220808_1630_GC_KOLA.JPG
                        ...                 
10957    _20221101/20221015_1015_GC_KOLA.JPG
10958    _20221101/20221015_1345_GC_KOLA.JPG
10959    _20221101/20221015_1330_GC_KOLA.JPG
10960    _20221101/20221015_1330_GC_KOLA.JPG
10961    _20221101/20221015_1315_GC_KOLA.JPG
Name: imgPath, Length: 10962, dtype: object

Exported invalid timestamps to invalid_timestamps.csv

Shape

In [None]:
# prompt: use plotly to create the following plots:
# 1. create a histogram of all plot_waterLevel values, and use illum_state to group color on this plot
# 2. create a bar plot that has one bar showing count of all plot_waterLevel values above 1.1 or below 0.45, then a second bar showing the count of all remaining values.

import plotly.express as px

# --- Plot 1: distribution of water levels, one colour per illumination state ---
histogram_labels = {"plot_waterLevel": "Water Level", "illum_state": "Illumination State"}
fig1 = px.histogram(
    plot_df,
    x="plot_waterLevel",
    color="illum_state",
    title="Distribution of Water Levels by Illumination State",
    labels=histogram_labels,
)
fig1.show()


# --- Plot 2: how many readings fall outside vs. within the expected range ---
# Range limits
upper_threshold = 1.1
lower_threshold = 0.45

# A reading is outside the range when it is strictly above the upper limit or
# strictly below the lower limit; everything else (including readings exactly
# on a limit) is labelled as within range.
water_levels = plot_df['plot_waterLevel']
is_outside_range = (water_levels > upper_threshold) | (water_levels < lower_threshold)
plot_df['waterLevelCategory'] = 'Outside Range'
plot_df.loc[~is_outside_range, 'waterLevelCategory'] = 'Within Range'

# Tally each category into a two-column frame: Category, Count
water_level_counts = (
    plot_df['waterLevelCategory']
    .value_counts()
    .rename_axis('Category')
    .reset_index(name='Count')
)

# One bar per category
bar_labels = {'Category': 'Water Level Range', 'Count': 'Number of Occurrences'}
fig2 = px.bar(
    water_level_counts,
    x='Category',
    y='Count',
    title='Water Level Outside/Within Range',
    labels=bar_labels,
)
fig2.show()

In [None]:
# prompt: use plot_df to create plotly scatterplot
# group data by findSuccess column and include legend for groups

# Human-readable names shown on the axes and in the legend
scatter_labels = {
    'plot_DateTime': 'Date and Time',
    'plot_waterLevel': 'Water Level',
    'findSuccess': 'Success',
}

# Interactive time series of water level:
#   marker symbol -> value of the findSuccess column
#   marker colour -> value of the illum_state column
#   hover tooltip -> additionally shows imgPath
scatter_options = dict(
    x='plot_DateTime',
    y='plot_waterLevel',
    symbol='findSuccess',
    color='illum_state',
    title='Water Level Over Time',
    labels=scatter_labels,
    hover_data=['imgPath'],
)
fig = px.scatter(plot_df, **scatter_options)

fig.show()