In [13]:
import requests
import json
import pandas as pd
#from tqdm import tqdm  
from datetime import date,timedelta, datetime
import numpy as np
from sqlalchemy.orm import Session
from sqlalchemy.engine import reflection
from sqlalchemy import inspect
from sqlalchemy import create_engine, Column, Integer, String, TIMESTAMP, FLOAT, MetaData, Table, text, DateTime, Float
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker
from sqlalchemy.exc import IntegrityError, SQLAlchemyError, InterfaceError
from sqlalchemy.ext.declarative import declarative_base
from data_loading import load_data_from_db, preprocess_sensor_data
from os import path
import pandas as pd
from utils import preprocess_sensor_data
from sqlalchemy.sql import select
from sqlalchemy import Table, MetaData
from sqlalchemy import inspect
from sqlalchemy import create_engine
from model import get_session

from pathlib import Path


In [14]:
def get_db_connection(db_user, db_pass, db_ip, db_port, db_name):
    try:
        connection_url = f'leanxcale://{db_user}:{db_pass}@{db_ip}:{db_port}/{db_name}?autocommit=False&parallel=True?txn_mode=NO_CONFLICTS_NO_LOGGING'
        eng = create_engine(connection_url)
        return eng
    except Exception as e:
        st.error(f"Error connecting to the database: {e}")
        return None


def get_table_names(db_connection):
    try:
        if db_connection is None:
            raise ValueError("No database connection available.")
        return inspect(db_connection).get_table_names()
    except Exception as e:
        st.error(f"Error fetching table names: {e}")
        return []


def load_data_from_db(table_name, engine):
    metadata = MetaData(bind=engine)
    table = Table(table_name, metadata, autoload=True)
    query = select([table])
    with engine.connect() as connection:
        result = connection.execute(query)
        df = pd.DataFrame(result.fetchall(), columns=result.keys())
        df.columns = df.columns.str.lower()
        df.set_index('timestamp', inplace=True)
    return df



# Connection
DB_USER = 'app'
DB_PASS = 'app'
DB_IP = '0.0.0.0'
DB_PORT = '1529'
DB_NAME = 'MOH'

In [3]:
engine = get_db_connection(DB_USER, DB_PASS, DB_IP, DB_PORT, DB_NAME)


In [5]:
table_names = get_table_names(engine)
table_names = [col for col in table_names if "hours" in col.lower()]


In [8]:
selected_table = table_names[0]

In [15]:
readings = load_data_from_db(selected_table, engine)

pk_list: []


In [17]:
readings.to_csv('readings.csv')

In [60]:
readings.columns

Index(['min_col33vi603', 'max_col33vi603', 'sum_col33vi603',
       'count_col33vi603', 'count_col33vi603_isvalid', 'min_col33vi601',
       'max_col33vi601', 'sum_col33vi601', 'count_col33vi601',
       'count_col33vi601_isvalid', 'min_col33vi602', 'max_col33vi602',
       'sum_col33vi602', 'count_col33vi602', 'count_col33vi602_isvalid',
       'min_col33vi604', 'max_col33vi604', 'sum_col33vi604',
       'count_col33vi604', 'count_col33vi604_isvalid', 'min_col33si501a',
       'max_col33si501a', 'sum_col33si501a', 'count_col33si501a',
       'count_col33si501a_isvalid', 'min_col33pi222', 'max_col33pi222',
       'sum_col33pi222', 'count_col33pi222', 'count_col33pi222_isvalid',
       'min_col33pi601', 'max_col33pi601', 'sum_col33pi601',
       'count_col33pi601', 'count_col33pi601_isvalid'],
      dtype='object')

In [56]:
def calculate_invalid_readings(data, original_freq_sec=10, agg_interval_sec=3600):
    """
    Calculate the number and percentage of invalid readings for each sensor per hour.

    Parameters:
    data (pd.DataFrame): The dataframe containing sensor readings.
    original_freq_sec (int): The frequency of the original data in seconds (default is 10 seconds).
    agg_interval_sec (int): The aggregation interval in seconds (default is 3600 seconds for 1 hour).

    Returns:
    pd.DataFrame: A dataframe with the number and percentage of invalid readings for each sensor per hour.
    """
    expected_readings_per_hour = agg_interval_sec // original_freq_sec
    sensor_columns = [col for col in data.columns if 'count_' in col and '_isvalid' not in col]
    print(sensor_columns)
    print(expected_readings_per_hour)

    invalid_readings = {}
    invalid_percentages = {}

    for sensor in sensor_columns:
        valid_col = f"{sensor}_isvalid"
        invalid_col = f"{sensor}_invalid"
        percentage_col = f"{sensor}_invalid_percentage"

        data[invalid_col] = data[sensor] - data[valid_col]
        data[percentage_col] = (data[invalid_col] / expected_readings_per_hour) * 100

        invalid_readings[invalid_col] = data[invalid_col]
        invalid_percentages[percentage_col] = data[percentage_col]

    invalid_readings_df = pd.DataFrame(invalid_readings)
    invalid_percentages_df = pd.DataFrame(invalid_percentages)
    return invalid_readings_df, invalid_percentages_df

In [57]:
# Load the data
file_path = 'readings.csv'
data = pd.read_csv(file_path)

In [58]:
data

Unnamed: 0,timestamp,min_col33vi603,max_col33vi603,sum_col33vi603,count_col33vi603,count_col33vi603_isvalid,min_col33vi601,max_col33vi601,sum_col33vi601,count_col33vi601,...,min_col33pi222,max_col33pi222,sum_col33pi222,count_col33pi222,count_col33pi222_isvalid,min_col33pi601,max_col33pi601,sum_col33pi601,count_col33pi601,count_col33pi601_isvalid
0,2021-05-15 00:00:00,0.093979,2.950925,855.527964,360,360,8.051951,47.872948,5418.180742,360,...,-0.179779,2.224877,648.390362,360,360,9.448253,54.812092,6112.817972,360,360
1,2021-05-15 01:00:00,0.087731,2.980732,856.942858,360,360,7.821442,48.559711,5380.531765,360,...,-0.181610,2.227451,647.480360,360,360,9.492846,54.082409,6076.850348,360,360
2,2021-05-15 02:00:00,0.090974,3.052270,883.753914,360,360,8.338897,50.290916,5711.346509,360,...,-0.181610,2.144166,612.114070,360,360,9.357123,55.913773,6110.726194,360,360
3,2021-05-15 03:00:00,0.098235,3.042732,875.203113,360,360,8.329462,50.476917,5746.177293,360,...,-0.185639,2.052914,600.417815,360,360,9.302748,56.185616,6132.398027,360,360
4,2021-05-15 04:00:00,0.107190,2.973475,865.277831,360,360,8.183337,50.448303,5675.205743,360,...,-0.187715,2.057338,602.522817,360,360,9.301237,55.227009,6102.456418,360,360
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21378,2023-10-22 20:00:00,2.432278,2.718428,925.219375,360,360,6.152232,7.392217,2419.019455,360,...,2.179270,2.218265,791.810840,360,360,9.668272,9.829887,3513.755909,360,360
21379,2023-10-22 21:00:00,2.479970,2.813811,943.342240,360,360,6.199924,7.439909,2421.404039,360,...,2.189394,2.233262,797.471002,360,360,9.695459,9.828378,3515.986797,360,360
21380,2023-10-22 22:00:00,2.527661,2.861503,961.035867,360,360,6.056849,7.344525,2401.611989,360,...,2.207016,2.238137,798.845922,360,360,9.718116,9.822335,3517.616538,360,360
21381,2023-10-22 23:00:00,2.527661,2.813811,957.554371,360,360,6.152232,7.726059,2435.234647,360,...,2.211141,2.241511,801.286060,360,360,9.736241,9.870669,3529.615422,360,360


In [59]:
x,y = calculate_invalid_readings(data, original_freq_sec=10, agg_interval_sec=3600)

['count_col33vi603', 'count_col33vi601', 'count_col33vi602', 'count_col33vi604', 'count_col33si501a', 'count_col33pi222', 'count_col33pi601']
360


In [23]:

# Load the data
file_path = 'readings.csv'
data = pd.read_csv(file_path)

# Calculate the number of invalid readings for each sensor per hour
# Assuming that invalid readings are the difference between total readings and valid readings
sensor_columns = [col for col in data.columns if 'count_' in col and '_isvalid' not in col]

for sensor in sensor_columns:
    valid_col = f"{sensor}_isvalid"
    invalid_col = f"{sensor}_invalid"
    data[invalid_col] = data[sensor] - data[valid_col]

# Annotate each hour with a feature indicating the quality of the data based on the number of invalid readings
# Here we sum up all invalid readings to get a general quality indicator per hour
data['total_invalid_readings'] = data[[f"{sensor}_invalid" for sensor in sensor_columns]].sum(axis=1)

# Optionally, you can create a categorical label based on the number of invalid readings
# For example, if total_invalid_readings > threshold, label as 'poor', otherwise 'good'
threshold = 10  # Set your own threshold
data['data_quality'] = data['total_invalid_readings'].apply(lambda x: 'poor' if x > threshold else 'good')

# Prepare the dataset for machine learning by dropping unnecessary columns
# Keep timestamp, quality indicator, and/or labels, and sensor features
features = ['timestamp', 'total_invalid_readings', 'data_quality'] + [col for col in data.columns if 'min_' in col or 'max_' in col or 'sum_' in col]
ml_data = data[features]

# Encode categorical labels if needed (e.g., for supervised learning)
ml_data['data_quality'] = ml_data['data_quality'].map({'good': 0, 'poor': 1})

# Save or return the prepared dataset
ml_data.to_csv('prepared_ml_data.csv', index=False)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ml_data['data_quality'] = ml_data['data_quality'].map({'good': 0, 'poor': 1})


In [22]:
!pip install ace_tools


Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
[31mERROR: Could not find a version that satisfies the requirement ace_tools (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for ace_tools[0m[31m
[0m

In [29]:
ml_data.total_invalid_readings.unique()

array([0])

In [27]:
def calculate_total_invalid_per_sensor(data):
    """
    Calculate the total number of invalid readings for each sensor.

    Parameters:
    data (pd.DataFrame): The dataframe containing sensor readings.

    Returns:
    pd.Series: A series with sensor names as index and total invalid readings as values.
    """
    sensor_columns = [col for col in data.columns if 'count_' in col and '_isvalid' not in col]
    print(sensor_columns)
    total_invalid_per_sensor = {}

    for sensor in sensor_columns:
        valid_col = f"{sensor}_isvalid"
        invalid_col = f"{sensor}_invalid"
        data[invalid_col] = data[sensor] - data[valid_col]
        total_invalid_per_sensor[sensor] = data[invalid_col].sum()

    return pd.Series(total_invalid_per_sensor)

# Load the data
file_path = 'readings.csv'
data = pd.read_csv(file_path)

# Calculate the total invalid readings per sensor
total_invalid_readings = calculate_total_invalid_per_sensor(data)
total_invalid_readings


['count_col33vi603', 'count_col33vi601', 'count_col33vi602', 'count_col33vi604', 'count_col33si501a', 'count_col33pi222', 'count_col33pi601']


count_col33vi603     0
count_col33vi601     0
count_col33vi602     0
count_col33vi604     0
count_col33si501a    0
count_col33pi222     0
count_col33pi601     0
dtype: int64