<a href="https://colab.research.google.com/github/gilmore-h2o/GRIME2_data_processing/blob/main/GRIME2_data_processing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Import
import pandas as pd
import glob
import os
import plotly.express as px

In [3]:
# prompt: load data from multiple csv files to create a single dataframe

# Get current working directory
current_directory = os.getcwd()

# Use glob to find all CSV files in the current directory
csv_files = glob.glob(os.path.join(current_directory, "*.csv"))

# Create an empty list to store dataframes
dfs = []

# Loop through the CSV files and append each to the list
for file in csv_files:
    try:
        df = pd.read_csv(file)
        dfs.append(df)
        print(f"Successfully loaded: {file}")
    except pd.errors.EmptyDataError:
        print(f"Warning: {file} is empty and will be skipped.")
    except pd.errors.ParserError:
        print(f"Warning: {file} could not be parsed and will be skipped.")

# Concatenate all dataframes in the list into a single dataframe
if dfs:
    combined_df = pd.concat(dfs, ignore_index=True)
    print("Combined dataframe shape:", combined_df.shape)
else:
    print("No valid CSV files found or all files were empty/corrupted.")
    combined_df = pd.DataFrame() # return an empty dataframe if no valid files were found

# Now you can work with the 'combined_df' DataFrame
# For example, display the first few rows:
combined_df.head()


Successfully loaded: /content/20220808_20220812.csv
Successfully loaded: /content/20220820.csv
Successfully loaded: /content/20220718_20220729.csv
Successfully loaded: /content/20221101_20221115.csv
Successfully loaded: /content/20220729_20220808.csv
Successfully loaded: /content/20220911_20221006.csv
Successfully loaded: /content/20220825_20220911.csv
Successfully loaded: /content/20220715_20220718.csv
Successfully loaded: /content/20220812_20220820.csv
Successfully loaded: /content/20221015_20221101.csv
Successfully loaded: /content/20221006_20221015.csv
Combined dataframe shape: (10962, 83)


Unnamed: 0,imgPath,findSuccess,timestamp,illum_state,waterLevel,waterLevelAdjusted,xRMSE,yRMSE,EuclidDistRMSE,waterLine-octagon-angle-diff,...,foundPts[5]-y,foundPts[6]-x,foundPts[6]-y,foundPts[7]-x,foundPts[7]-y,foundPts[8]-x,foundPts[8]-y,foundPts[9]-x,foundPts[9]-y,....1
0,G:/2021_KOLA/UNK Images and Field Data/UNK Ima...,True,2022-08-08T15:30:00,Off,0.742,0.742,0.008,-0.052,0.052,-0.097,...,932.021,1135.0,931.507,1119.0,920.031,1103.0,931.508,1087.0,932.014,
1,G:/2021_KOLA/UNK Images and Field Data/UNK Ima...,True,2022-08-08T15:45:00,Off,0.725,0.725,0.005,-0.003,0.005,0.368,...,932.01,1135.0,931.5,1119.0,931.021,1103.0,930.505,1087.0,931.011,
2,G:/2021_KOLA/UNK Images and Field Data/UNK Ima...,True,2022-08-08T16:00:00,Off,0.724,0.724,-0.007,-0.06,0.06,-1.563,...,931.0,1135.0,927.5,,,,,,,
3,G:/2021_KOLA/UNK Images and Field Data/UNK Ima...,True,2022-08-08T16:15:00,Off,0.726,0.726,-0.001,-0.144,0.144,-1.112,...,932.025,1135.0,931.504,1119.0,932.0,1103.0,931.501,1087.0,922.011,
4,G:/2021_KOLA/UNK Images and Field Data/UNK Ima...,True,2022-08-08T16:30:00,Off,0.732,0.732,-0.012,-0.004,0.012,-2.797,...,932.002,1135.0,924.667,1119.0,917.035,1103.0,916.7,1087.0,929.054,


In [5]:
# prompt: starting with combined_df, use the timestamp column to create a new column with date and time in date/time format appropriate for plotting with plotly. Call the new column plot_DateTime. Then take the waterLevel column and do two things. First, create a plot_waterLevel column and populate with waterLevel values, but replace and values of -1 with 0. Then, create an interactive plotly time series plot using plot_waterLevel and plot_DateTime.

# Convert the timestamp column to datetime objects
combined_df['plot_DateTime'] = pd.to_datetime(combined_df['timestamp'])

# Create plot_waterLevel column, replacing -1 with 0
combined_df['plot_waterLevel'] = combined_df['waterLevel'].replace(-1, 0)


In [7]:
# prompt: explore combined_df as follows:
# 1. list any timestamps that are prior to year 2022

# Assuming 'plot_DateTime' column exists and is of datetime type.
# If not, you may need to convert it first using pd.to_datetime()

if 'plot_DateTime' in combined_df.columns:
    combined_df['plot_DateTime'] = pd.to_datetime(combined_df['plot_DateTime'])
    timestamps_before_2022 = combined_df[combined_df['plot_DateTime'].dt.year < 2022]['plot_DateTime']
    print(timestamps_before_2022)
else:
    print("Error: 'plot_DateTime' column not found in the DataFrame.")

110    1955-09-24 12:05:00
788    1955-09-24 12:05:00
2808   1955-09-24 12:05:00
2903   1955-09-24 12:05:00
2904   1955-09-24 12:05:00
2999   1955-09-24 12:05:00
3095   1955-09-24 12:05:00
3287   1955-09-24 12:05:00
7795   1955-09-24 12:05:00
8082   1955-09-24 12:05:00
8765   1955-09-24 12:05:00
Name: plot_DateTime, dtype: datetime64[ns]


In [None]:
# Create the interactive Plotly time series plot
fig = px.line(combined_df, x='plot_DateTime', y='plot_waterLevel',
              title='Water Level Over Time',
              labels={'plot_DateTime':'Date and Time', 'plot_waterLevel':'Water Level'})

fig.show()