In [None]:
# Load the CRRL (Carroll County) station export and build a subset that keeps
# only rows where every target variable (TAIR, VT90, SM02, PRES) is present.
# The first row of the export holds unit labels (e.g. 'Pressure (mb)'); it is
# not NaN in the target columns, so it survives the dropna below.
import pandas as pd

CRRL_file = '../CRRL.csv'

CRRL_df = pd.read_csv(CRRL_file)

print(CRRL_df.head())

target_vars = ['TAIR','VT90','SM02','PRES']

# Row-wise mask: True where at least one target variable is missing.
# Indexing with the 2-D isna() frame itself would mask individual cells
# instead of selecting rows, so reduce it with any(axis=1) first.
nan_rows = CRRL_df[CRRL_df[target_vars].isna().any(axis=1)]

print(nan_rows.head())

# Keep a row only when none of the target variables is NaN
CRRL_subset = CRRL_df.dropna(subset=target_vars)

In [None]:
# Display CRRL_subset: the rows of CRRL_df with no NaN in any of target_vars.
CRRL_subset

In [None]:
# Narrow the NaN-free subset down to just the target-variable columns, then display it.
CRRL_subset_only_targets = CRRL_subset.loc[:, target_vars]
CRRL_subset_only_targets

In [None]:
# Count the missing values per target column in the target-only frame.
missing_per_target = CRRL_subset_only_targets[target_vars].isna().sum()
print(missing_per_target)

In [None]:
# Report the time range covered by the NaN-free CRRL subset.
print("CRRL subsetted data:")
# The raw export carries a unit-label row whose timestamp cell is the literal
# 'Timestamp'. Remove it by value instead of assuming it sits at position 0, so
# the first real reading is reported whether or not that row is still present
# (it is gone once CRRL_subset has been rebuilt after numeric coercion).
crrl_timestamps = CRRL_subset['UTCTimestampCollected']
crrl_timestamps = crrl_timestamps[crrl_timestamps != 'Timestamp']
print("\nStart: ", crrl_timestamps.iloc[0])
print("\nEnd: ", crrl_timestamps.iloc[-1])
# check to make sure no nans
CRRL_subset[target_vars].isna().sum()

In [8]:
# Load the RFSM (Breathitt County) export and derive two NaN-free subsets:
# one that requires every target variable and one that ignores SM02.
RFSM_file = '../RFSM.csv'
RFSM_df = pd.read_csv(RFSM_file)
print(RFSM_df.head())

target_vars = ['TAIR','VT90','SM02','PRES']
# Same list with the soil-moisture column left out
target_vars_subset = [var for var in target_vars if var != 'SM02']

# dropna keeps a row only when none of the listed columns is NaN
RFSM_subset, RFSM_subset_2 = (
    RFSM_df.dropna(subset=required_cols)
    for required_cols in (target_vars, target_vars_subset)
)

# Per-column count of missing values across the full (unfiltered) frame
rfsm_missing_counts = RFSM_df[target_vars].isna().sum()
print(rfsm_missing_counts)


  RFSM_df = pd.read_csv(RFSM_file)


  NetSiteAbbrev          County UTCTimestampCollected                  TAIR  \
0    Station ID  Station County             Timestamp  Air Temperature (°C)   
1          RFSM       Breathitt   2019-12-10 08:25:00               12.5592   
2          RFSM       Breathitt   2019-12-10 08:30:00               12.4164   
3          RFSM       Breathitt   2019-12-10 08:35:00               12.3232   
4          RFSM       Breathitt   2019-12-10 08:40:00               12.2973   

            DWPT                PRCP           PRES                   RELH  \
0  Dewpoint (°C)  Precipitation (mm)  Pressure (mb)  Relative Humidity (%)   
1        11.6742                 0.0        962.048                   94.3   
2        11.8031                 0.0        962.208                   96.0   
3        11.7733                 0.0        962.406                   96.4   
4        11.8885                 0.0        962.527                   97.3   

                     SRAD                      WDIR  ...

In [None]:
# Preview of RFSM_subset, the frame that requires all four target variables.
# Author's note: this subset of all vars is empty.
RFSM_subset.head()

In [None]:
# Preview of RFSM_subset_2, the second subset built to retain more data:
# SM02 is left out of the required columns because it has too many missing values.
RFSM_subset_2.head()

In [None]:
# Time range and NaN check for the RFSM subset built WITHOUT SM02
# (with SM02 required, the dataset is empty).
print("RFSM subsetted data:")
# The raw export carries a unit-label row whose timestamp cell is the literal
# 'Timestamp'; drop it by value rather than assuming it sits at position 0.
rfsm_timestamps = RFSM_subset_2['UTCTimestampCollected']
rfsm_timestamps = rfsm_timestamps[rfsm_timestamps != 'Timestamp']
print("Start: ", rfsm_timestamps.iloc[0])
print("End: ", rfsm_timestamps.iloc[-1])
# Check the same frame that is being reported on (RFSM_subset_2, not RFSM_subset)
print(RFSM_subset_2[target_vars_subset].isna().sum())

In [None]:
# Write both RFSM subsets to the current working directory.
# to_csv is called with defaults, so the row index is written as the first column.
for frame, out_path in (
    (RFSM_subset_2, './RFSM_subset_NO_SM02.csv'),
    (RFSM_subset, './RFSM_subset.csv'),
):
    frame.to_csv(out_path)


In [None]:
# Write the CRRL subset to the current working directory.
# to_csv is called with defaults, so the row index is written as the first column.
CRRL_subset.to_csv('./CRRL_subset.csv')

In [None]:
# Coerce the CRRL target variables to numeric, log every value that ends up
# missing, and save the rows that are complete across all target variables.
#
# Relies on `CRRL_df` and `target_vars` from earlier cells and converts the
# target columns of `CRRL_df` in place. Re-running the cell on the already
# converted frame therefore classifies every gap as 'Already_missing', because
# the original non-numeric text is gone by then.
import pandas as pd

# First, let's examine what non-numeric values exist in these columns
print("Checking for non-numeric values in target columns:")
for col in target_vars:
    if col in CRRL_df.columns:
        # Cast to string so text labels and 'nan' are listed next to the numbers
        unique_values = CRRL_df[col].astype(str).unique()
        print(f"\nColumn '{col}' unique string values (first 20):")
        print(unique_values[:20])

# Label used for each logged row: its timestamp, or "Row_<index>" as a fallback
datetime_col = 'UTCTimestampCollected'
if datetime_col in CRRL_df.columns:
    row_labels = CRRL_df[datetime_col]
else:
    row_labels = pd.Series([f"Row_{idx}" for idx in CRRL_df.index], index=CRRL_df.index)

# One log frame per target column; built with boolean masks instead of a
# per-row Python loop, which is what made the original very slow on large gaps.
missing_value_records = []

for col in target_vars:
    if col not in CRRL_df.columns:
        continue

    # Keep the pre-conversion values so the log can show what was replaced
    original_values = CRRL_df[col].copy()

    # Convert to numeric (non-numeric values become NaN)
    CRRL_df[col] = pd.to_numeric(original_values, errors='coerce')

    all_missing = CRRL_df[col].isna()
    if not all_missing.any():
        continue

    # NaN after conversion but not before: the value was non-numeric text
    was_non_numeric = all_missing & original_values.notna()

    missing_value_records.append(pd.DataFrame({
        'DateTime': row_labels[all_missing].to_numpy(),
        'Column': col,
        'Row_Index': CRRL_df.index[all_missing].to_numpy(),
        'Original_Value': original_values[all_missing].to_numpy(),
        'Missing_Type': was_non_numeric[all_missing].map(
            {True: 'Non-numeric_converted', False: 'Already_missing'}
        ).to_numpy(),
    }))

# Create DataFrame to track missing values (empty but with headers if nothing was missing)
if missing_value_records:
    missing_values_df = pd.concat(missing_value_records, ignore_index=True)
else:
    missing_values_df = pd.DataFrame(
        columns=['DateTime', 'Column', 'Row_Index', 'Original_Value', 'Missing_Type']
    )

# Save missing values tracking to CSV
missing_values_df.to_csv('missing_values_log.csv', index=False)
print(f"\nMissing values log saved to 'missing_values_log.csv'")
print(f"Total missing values tracked: {len(missing_values_df)}")

# Display summary of missing values by column
if len(missing_values_df) > 0:
    print("\nMissing values summary by column:")
    summary = missing_values_df.groupby('Column').size().reset_index(name='Count')
    print(summary)

    print("\nSample of missing values log:")
    print(missing_values_df.head(10))

# Now check for NaN rows in the cleaned data
nan_rows = CRRL_df[CRRL_df[target_vars].isna().any(axis=1)]
print(f"\nRows with NaN values after numeric conversion: {len(nan_rows)}")

# Create subset without NaN values
CRRL_subset = CRRL_df.dropna(subset=target_vars)
print(f"\nOriginal dataset size: {len(CRRL_df)}")
print(f"Subset size after removing NaN: {len(CRRL_subset)}")

# Save the cleaned subset
CRRL_subset.to_csv('CRRL_subset_cleaned.csv', index=False)
print("Cleaned subset saved to 'CRRL_subset_cleaned.csv'")

In [2]:
# Reload the raw CRRL export, coerce every measurement column to numeric, log
# the gaps in the target variables, and save three CSVs: the NaN-free target
# subset, the fully converted frame, and the converted original.
import pandas as pd
import numpy as np

CRRL_file = '../CRRL.csv'
CRRL_df = pd.read_csv(CRRL_file)

target_vars = ['TAIR','VT90','SM02','PRES']

# First, let's examine what non-numeric values exist in these columns
print("Checking for non-numeric values in target columns:")
for col in target_vars:
    if col in CRRL_df.columns:
        # Cast to string so text labels and 'nan' are listed next to the numbers
        unique_values = CRRL_df[col].astype(str).unique()
        print(f"\nColumn '{col}' unique string values (first 20):")
        print(unique_values[:20])

# Create a complete copy of the original dataframe for full cleaning
CRRL_df_complete_cleaned = CRRL_df.copy()

# Columns that must NOT be coerced to numeric.
datetime_cols = ['UTCTimestampCollected']  # Add your datetime column names here
# Station id and county are text; coercing them would blank them out to NaN.
text_cols = ['NetSiteAbbrev', 'County']  # Add any text columns you want to keep as-is

# Identify columns that should be converted to numeric
all_cols = CRRL_df.columns.tolist()
numeric_cols = [col for col in all_cols if col not in datetime_cols + text_cols]

print(f"\nColumns to be processed for numeric conversion: {len(numeric_cols)}")
print(f"Columns to be kept as-is: {datetime_cols + text_cols}")

# Label used for each logged row: its timestamp, or "Row_<index>" as a fallback
datetime_col = 'UTCTimestampCollected'  # Change this to your actual datetime column
if datetime_col in CRRL_df.columns:
    row_labels = CRRL_df[datetime_col]
else:
    row_labels = pd.Series([f"Row_{idx}" for idx in CRRL_df.index], index=CRRL_df.index)

# One log frame per target column (built from boolean masks, not a per-row loop)
missing_value_records = []

# Convert ALL numeric columns to numeric and track missing values for target vars
for col in numeric_cols:
    # Keep the pre-conversion values so the log can show what was replaced
    original_values = CRRL_df[col].copy()

    # Convert once (non-numeric values become NaN) and store in both frames
    converted = pd.to_numeric(original_values, errors='coerce')
    CRRL_df[col] = converted
    CRRL_df_complete_cleaned[col] = converted.copy()

    # Only track missing values for target variables (to avoid overwhelming log)
    if col in target_vars:
        all_missing = converted.isna()
        if all_missing.any():
            # NaN after conversion but not before: the value was non-numeric text
            was_non_numeric = all_missing & original_values.notna()

            missing_value_records.append(pd.DataFrame({
                'DateTime': row_labels[all_missing].to_numpy(),
                'Column': col,
                'Row_Index': CRRL_df.index[all_missing].to_numpy(),
                'Original_Value': original_values[all_missing].to_numpy(),
                'Missing_Type': np.where(
                    was_non_numeric[all_missing],
                    'Non-numeric_converted',
                    'Already_missing',
                ),
            }))

# Create DataFrame to track missing values (empty but with headers if nothing was missing)
if missing_value_records:
    missing_values_df = pd.concat(missing_value_records, ignore_index=True)
else:
    missing_values_df = pd.DataFrame(
        columns=['DateTime', 'Column', 'Row_Index', 'Original_Value', 'Missing_Type']
    )

# Save missing values tracking to CSV
missing_values_df.to_csv('missing_values_log.csv', index=False)
print(f"\nMissing values log saved to 'missing_values_log.csv'")
print(f"Total missing values tracked: {len(missing_values_df)}")

# Display summary of missing values by column
if len(missing_values_df) > 0:
    print("\nMissing values summary by column:")
    summary = missing_values_df.groupby('Column').size().reset_index(name='Count')
    print(summary)

    print("\nSample of missing values log:")
    print(missing_values_df.head(10))

# Now check for NaN rows in the cleaned data
nan_rows = CRRL_df[CRRL_df[target_vars].isna().any(axis=1)]
print(f"\nRows with NaN values in target columns after numeric conversion: {len(nan_rows)}")

# Create subset without NaN values (target variables only)
CRRL_subset = CRRL_df.dropna(subset=target_vars)
print(f"\nOriginal dataset size: {len(CRRL_df)}")
print(f"Target variables subset size after removing NaN: {len(CRRL_subset)}")

# Check missing values in the complete cleaned dataframe
print(f"\nComplete cleaned dataframe missing values summary:")
complete_missing_summary = CRRL_df_complete_cleaned.isna().sum()
complete_missing_summary = complete_missing_summary[complete_missing_summary > 0].sort_values(ascending=False)
print(complete_missing_summary)

# Save all the dataframes
CRRL_subset.to_csv('CRRL_subset_cleaned.csv', index=False)
print("Target variables subset saved to 'CRRL_subset_cleaned.csv'")

CRRL_df_complete_cleaned.to_csv('CRRL_complete_cleaned.csv', index=False)
print("Complete cleaned dataframe saved to 'CRRL_complete_cleaned.csv'")

# Also save the original dataframe with missing values converted to NaN (but no rows removed)
CRRL_df.to_csv('CRRL_with_NaN_converted.csv', index=False)
print("Original dataframe with NaN conversions saved to 'CRRL_with_NaN_converted.csv'")

print(f"\nSummary:")
print(f"- Original dataset: {len(CRRL_df)} rows")
print(f"- Target subset (no NaN in target vars): {len(CRRL_subset)} rows")
print(f"- Complete cleaned dataset: {len(CRRL_df_complete_cleaned)} rows (same as original, just cleaned)")

  CRRL_df = pd.read_csv(CRRL_file)


Checking for non-numeric values in target columns:

Column 'TAIR' unique string values (first 20):
['Air Temperature (°C)' '20.4219' '20.2758' '20.2288' '19.1393' '18.1505'
 '17.4197' '16.8907' '16.3261' '16.313' '15.8912' '14.7009' '14.5218'
 '13.8902' '13.5987' '13.1205' '13.1114' '13.0171' '12.6939' '12.5963']

Column 'VT90' unique string values (first 20):
['Air Temperature at 9.0 meters (°C)' 'nan' '18.92' '18.81' '19.16'
 '18.93' '18.85' '19.2' '19.44' '19.63' '19.66' '20.07' '19.88' '19.9'
 '19.95' '20.19' '20.43' '20.32' '20.24' '20.4']

Column 'SM02' unique string values (first 20):
['Soil Moisture at 2 in. (%)' '0.316' 'nan' '0.315' '0.314' '0.317'
 '0.318' '0.312' '0.308' '0.307' '0.305' '0.303' '0.301' '0.297' '0.295'
 '0.296' '0.293' '0.294' '0.299' '0.298']

Column 'PRES' unique string values (first 20):
['Pressure (mb)' 'nan' '469.875' '998.881' '998.859' '998.88' '998.94'
 '998.894' '998.81' '998.756' '998.642' '998.588' '998.502' '998.356'
 '998.332' '998.344' '998.325

In [4]:
# Write a text report of the contiguous row ranges ("chunks") in which every
# key variable has data, plus the gaps between those chunks.
# Relies on `CRRL_df_complete_cleaned` produced by the cleaning cell.
import pandas as pd

# Define the variables we're interested in
key_variables = ['VT90', 'SM02', 'TAIR', 'PRES']
datetime_col = 'UTCTimestampCollected'
gap_report_path = '../unsorted data/data_gaps_analysis.txt'

# Open a text file for writing. The encoding is explicit because the report can
# contain a non-ASCII marker, which a platform-default codec may fail to encode.
with open(gap_report_path, 'w', encoding='utf-8') as f:

    f.write(f"Looking for all chunks where all variables have data: {key_variables}\n")
    f.write("-" * 60 + "\n")

    # Rows are scanned in their existing order; nothing is re-sorted here, so the
    # frame is assumed to already be chronological.
    df_sorted = CRRL_df_complete_cleaned

    # Find rows where ALL variables are present (not NaN)
    all_present = df_sorted[key_variables].notna().all(axis=1)

    # Find chunks of continuous complete data as (first_row, last_row) positions
    chunks = []
    in_chunk = False
    chunk_start = None

    for i, has_all_data in enumerate(all_present):
        if has_all_data and not in_chunk:
            # Start of a new chunk
            chunk_start = i
            in_chunk = True
        elif not has_all_data and in_chunk:
            # End of current chunk
            chunk_end = i - 1
            chunks.append((chunk_start, chunk_end))
            in_chunk = False

    # Handle case where dataset ends with a complete chunk
    if in_chunk:
        chunks.append((chunk_start, len(df_sorted) - 1))

    # Write chunk information to file
    if chunks:
        f.write(f"Found {len(chunks)} chunks with complete data for all variables:\n\n")

        total_complete_rows = 0

        for i, (start_idx, end_idx) in enumerate(chunks, 1):
            start_date = df_sorted.iloc[start_idx][datetime_col]
            end_date = df_sorted.iloc[end_idx][datetime_col]
            chunk_duration = end_idx - start_idx + 1
            total_complete_rows += chunk_duration

            f.write(f"Chunk {i}:\n")
            f.write(f"  Start: {start_date} (row {start_idx})\n")
            f.write(f"  End:   {end_date} (row {end_idx})\n")
            f.write(f"  Duration: {chunk_duration} rows\n")

            # Calculate time duration. The column holds text, so parse both ends
            # first; subtracting the raw strings raises and no span gets written.
            time_duration = (
                pd.to_datetime(end_date, errors='coerce')
                - pd.to_datetime(start_date, errors='coerce')
            )
            # Unparseable endpoints give NaT; skip the line rather than print it
            if pd.notna(time_duration):
                f.write(f"  Time span: {time_duration}\n")
            f.write("\n")

        # Summary
        total_rows = len(df_sorted)
        complete_percentage = (total_complete_rows / total_rows) * 100

        f.write(f"Summary:\n")
        f.write(f"  Total rows in dataset: {total_rows}\n")
        f.write(f"  Rows with all variables present: {total_complete_rows} ({complete_percentage:.1f}%)\n")
        f.write(f"  Rows with missing data: {total_rows - total_complete_rows} ({100 - complete_percentage:.1f}%)\n")
        f.write(f"  Number of complete data chunks: {len(chunks)}\n")

        if len(chunks) > 1:
            f.write(f"\nGaps between chunks:\n")
            for i in range(len(chunks) - 1):
                gap_start = chunks[i][1] + 1  # End of current chunk + 1
                gap_end = chunks[i+1][0] - 1  # Start of next chunk - 1
                gap_start_date = df_sorted.iloc[gap_start][datetime_col]
                gap_end_date = df_sorted.iloc[gap_end][datetime_col]
                gap_duration = gap_end - gap_start + 1

                f.write(f"  Gap {i+1}: {gap_start_date} to {gap_end_date} ({gap_duration} rows)\n")

    else:
        f.write("❌ No chunks found where all variables have data simultaneously\n")

        # Show individual variable availability
        f.write("\nIndividual variable availability:\n")
        for var in key_variables:
            # Timestamps of the rows where this variable has a value
            available_dates = df_sorted.loc[df_sorted[var].notna(), datetime_col]
            if len(available_dates) > 0:
                first_date = available_dates.iloc[0]
                last_date = available_dates.iloc[-1]
                total_available = len(available_dates)
                f.write(f"  {var}: {first_date} to {last_date} ({total_available} rows)\n")
            else:
                f.write(f"  {var}: No data available\n")

# Report the path that was actually written
print(f"Gap analysis saved to '{gap_report_path}'")

Gap analysis saved to 'data_gaps_analysis.txt'


In [6]:
# Reload the raw CRRL export and save a copy without the soil moisture and
# soil temperature columns (SM02, ST02, SM04, ST04).
# Note: this rebinds CRRL_df to the freshly read, unconverted data.
import pandas as pd

CRRL_file = '../CRRL.csv'

CRRL_df = pd.read_csv(CRRL_file)

# Kept under its own name so the `target_vars` list that the other cells
# iterate over is not overwritten with the soil columns.
soil_vars = ['SM02', 'ST02', 'SM04', 'ST04']

CRRL_subset_NO_SOIL = CRRL_df.drop(columns=soil_vars)

CRRL_subset_NO_SOIL.to_csv("CRRL_subset_NO_SOIL.csv", index=False)

  CRRL_df = pd.read_csv(CRRL_file)


  NetSiteAbbrev          County UTCTimestampCollected                  TAIR  \
0    Station ID  Station County             Timestamp  Air Temperature (°C)   
1          CRRL         Carroll   2018-05-01 00:00:00               20.4219   
2          CRRL         Carroll   2018-05-01 00:05:00               20.2758   
3          CRRL         Carroll   2018-05-01 00:10:00               20.2288   
4          CRRL         Carroll   2018-05-01 00:15:00               19.1393   

            DWPT                PRCP           PRES                   RELH  \
0  Dewpoint (°C)  Precipitation (mm)  Pressure (mb)  Relative Humidity (%)   
1      -0.958412                 0.0            NaN                  23.79   
2        -1.1164                 0.0            NaN                  23.73   
3      -0.774945                 0.0            NaN                   24.4   
4        1.91157                 0.0            NaN                  31.69   

                     SRAD                      WDIR  ...

In [10]:
# Coerce the RFSM target variables to numeric, log every value that ends up
# missing, and save the rows that are complete across the usable target variables.
# Relies on `target_vars` from an earlier cell.
import pandas as pd

# NOTE(review): absolute, machine-specific path; consider a path relative to
# the notebook once the intended location of this file is confirmed.
file = '/Users/cylis/Work/mes_summer25/RFSM_subset_NO_SM02.csv'

RFSM_df = pd.read_csv(file)
# First, let's examine what non-numeric values exist in these columns
print("Checking for non-numeric values in target columns:")
for col in target_vars:
    if col in RFSM_df.columns:
        # Cast to string so text labels and 'nan' are listed next to the numbers
        unique_values = RFSM_df[col].astype(str).unique()
        print(f"\nColumn '{col}' unique string values (first 20):")
        print(unique_values[:20])

# Target variables that actually exist in this file
present_vars = [col for col in target_vars if col in RFSM_df.columns]

# Label used for each logged row: its timestamp, or "Row_<index>" as a fallback
datetime_col = 'UTCTimestampCollected'
if datetime_col in RFSM_df.columns:
    row_labels = RFSM_df[datetime_col]
else:
    row_labels = pd.Series([f"Row_{idx}" for idx in RFSM_df.index], index=RFSM_df.index)

# One log frame per target column; built with boolean masks instead of a
# per-row Python loop, which does not scale to hundreds of thousands of gaps.
missing_value_records = []

# Convert columns to numeric and track what gets converted to NaN
for col in present_vars:
    # Keep the pre-conversion values so the log can show what was replaced
    original_values = RFSM_df[col].copy()

    # Convert to numeric (non-numeric values become NaN)
    RFSM_df[col] = pd.to_numeric(original_values, errors='coerce')

    all_missing = RFSM_df[col].isna()
    if not all_missing.any():
        continue

    # NaN after conversion but not before: the value was non-numeric text
    was_non_numeric = all_missing & original_values.notna()

    missing_value_records.append(pd.DataFrame({
        'DateTime': row_labels[all_missing].to_numpy(),
        'Column': col,
        'Row_Index': RFSM_df.index[all_missing].to_numpy(),
        'Original_Value': original_values[all_missing].to_numpy(),
        'Missing_Type': was_non_numeric[all_missing].map(
            {True: 'Non-numeric_converted', False: 'Already_missing'}
        ).to_numpy(),
    }))

# Create DataFrame to track missing values (empty but with headers if nothing was missing)
if missing_value_records:
    missing_values_df = pd.concat(missing_value_records, ignore_index=True)
else:
    missing_values_df = pd.DataFrame(
        columns=['DateTime', 'Column', 'Row_Index', 'Original_Value', 'Missing_Type']
    )

# Save missing values tracking to CSV
missing_values_df.to_csv('RFSM_missing_values_log.csv', index=False)
print(f"\nMissing values log saved to 'RFSM_missing_values_log.csv'")
print(f"Total missing values tracked: {len(missing_values_df)}")

# Display summary of missing values by column
if len(missing_values_df) > 0:
    print("\nMissing values summary by column:")
    summary = missing_values_df.groupby('Column').size().reset_index(name='Count')
    print(summary)

    print("\nSample of missing values log:")
    print(missing_values_df.head(10))

# A target variable with no numeric value at all (SM02 in this file) would make
# every row count as incomplete and leave the saved subset empty, so only
# variables with at least one usable value take part in the row filter.
usable_vars = [col for col in present_vars if RFSM_df[col].notna().any()]
skipped_vars = [col for col in target_vars if col not in usable_vars]
if skipped_vars:
    print(f"\nTarget variables with no usable values (ignored when subsetting): {skipped_vars}")

# Now check for NaN rows in the cleaned data
nan_rows = RFSM_df[RFSM_df[usable_vars].isna().any(axis=1)]
print(f"\nRows with NaN values after numeric conversion: {len(nan_rows)}")

# Create subset without NaN values
RFSM_subset = RFSM_df.dropna(subset=usable_vars)
print(f"\nOriginal dataset size: {len(RFSM_df)}")
print(f"Subset size after removing NaN: {len(RFSM_subset)}")

# Save the cleaned subset
RFSM_subset.to_csv('RFSM_subset_cleaned.csv', index=False)
print("Cleaned subset saved to 'RFSM_subset_cleaned.csv'")

  RFSM_df = pd.read_csv(file)


Checking for non-numeric values in target columns:

Column 'TAIR' unique string values (first 20):
['Air Temperature (°C)' '4.04204' '4.08667' '4.07621' '4.11729' '4.09035'
 '4.07849' '4.06616' '4.05743' '4.01165' '3.99285' '3.96894' '3.94577'
 '3.95389' '3.95062' '3.91289' '3.97761' '3.93206' '3.95066' '3.96472']

Column 'VT90' unique string values (first 20):
['Air Temperature at 9.0 meters (°C)' '4.272' '4.273' '4.298' '4.283'
 '4.275' '4.271' '4.238' '4.221' '4.224' '4.184' '4.141' '4.144' '4.145'
 '4.142' '4.139' '4.138' '4.135' '4.087' '4.073']

Column 'SM02' unique string values (first 20):
['Soil Moisture at 2 in. (%)' 'nan']

Column 'PRES' unique string values (first 20):
['Pressure (mb)' '964.091' '964.207' '964.287' '964.243' '964.363'
 '964.406' '964.559' '964.523' '964.603' '964.521' '964.56' '964.602'
 '964.562' '964.604' '964.442' '964.403' '964.44' '964.367' '964.289']

Missing values log saved to 'RFSM_missing_values_log.csv'
Total missing values tracked: 563579

Missi

In [14]:
import pandas as pd

# Location of the untouched RFSM export (absolute, machine-specific path)
file = '/Users/cylis/Work/mes_summer25/original/RFSM.csv'

# Read the CSV with default parsing and preview the first five rows
df = pd.read_csv(filepath_or_buffer=file)
df.head(n=5)

  df = pd.read_csv(file)


Unnamed: 0,NetSiteAbbrev,County,UTCTimestampCollected,TAIR,DWPT,PRCP,PRES,RELH,SRAD,WDIR,...,SM02,SM04,ST02,ST04,VT05,VT20,VT90,VR05,VR20,VR90
0,Station ID,Station County,Timestamp,Air Temperature (°C),Dewpoint (°C),Precipitation (mm),Pressure (mb),Relative Humidity (%),Solar Radiation (W/m²),Wind Direction (degrees),...,Soil Moisture at 2 in. (%),Soil Moisture at 4 in. (%),Soil Temperature at 2 in. (°C),Soil Temperature at 4 in. (°C),Air Temperature at 0.5 meters (°C),Air Temperature at 2.0 meters (°C),Air Temperature at 9.0 meters (°C),Relative Humidity at 0.5 meters (%),Relative Humidity at 2.0 meters (%),Relative Humidity At 9.0 meters (%)
1,RFSM,Breathitt,2019-12-10 08:25:00,12.5592,11.6742,0.0,962.048,94.3,0.165165,228.3,...,,,,,,,,,,
2,RFSM,Breathitt,2019-12-10 08:30:00,12.4164,11.8031,0.0,962.208,96.0,0.0,234.5,...,,,,,,,,,,
3,RFSM,Breathitt,2019-12-10 08:35:00,12.3232,11.7733,0.0,962.406,96.4,0.165164,240.9,...,,,,,,,,,,
4,RFSM,Breathitt,2019-12-10 08:40:00,12.2973,11.8885,0.0,962.527,97.3,0.165165,240.4,...,,,,,,,,,,


In [15]:
# Drop the unit-label row (index label 0). errors='ignore' keeps the cell
# re-runnable: on a second run label 0 is already gone and df is left
# unchanged instead of raising KeyError.
df = df.drop(index=0, errors='ignore')
df.head()

Unnamed: 0,NetSiteAbbrev,County,UTCTimestampCollected,TAIR,DWPT,PRCP,PRES,RELH,SRAD,WDIR,...,SM02,SM04,ST02,ST04,VT05,VT20,VT90,VR05,VR20,VR90
1,RFSM,Breathitt,2019-12-10 08:25:00,12.5592,11.6742,0.0,962.048,94.3,0.165165,228.3,...,,,,,,,,,,
2,RFSM,Breathitt,2019-12-10 08:30:00,12.4164,11.8031,0.0,962.208,96.0,0.0,234.5,...,,,,,,,,,,
3,RFSM,Breathitt,2019-12-10 08:35:00,12.3232,11.7733,0.0,962.406,96.4,0.165164,240.9,...,,,,,,,,,,
4,RFSM,Breathitt,2019-12-10 08:40:00,12.2973,11.8885,0.0,962.527,97.3,0.165165,240.4,...,,,,,,,,,,
5,RFSM,Breathitt,2019-12-10 08:45:00,12.3016,11.9395,0.0,962.568,97.6,0.165166,230.0,...,,,,,,,,,,
