### Interpolation

In numerical analysis, **interpolation** is a type of estimation that constructs new data points within the range of a discrete set of known data points. Linear interpolation (**LERP**) is the most basic interpolation method: it estimates the values between known samples using a linear function.

In [1]:
%%writefile interpolate-requirements.txt
pandas==2.2.1
numpy==1.26.4
pika==1.3.2

Overwriting interpolate-requirements.txt


In [2]:
!pip install -r interpolate-requirements.txt



In [3]:
import pandas as pd
import numpy as np
import pika
import json
import sql_operations

In [4]:
# Temporary for in operation
import psycopg2
import sqls

In [5]:
# Open a blocking RabbitMQ connection and declare the two queues used below:
# 'new_data' is consumed by message_consumer() and 'new_interpolarate' is the
# routing key send_message() publishes to.
# NOTE(review): the broker address is hard-coded. 'new_interpolarate' looks
# like a misspelling, but it is a runtime queue name that other services
# presumably share - do not rename it without coordinating with them.
mq_connection = pika.BlockingConnection(
    pika.ConnectionParameters('172.16.238.20'))
channel = mq_connection.channel()
channel.queue_declare(queue='new_data')
channel.queue_declare(queue='new_interpolarate')

<METHOD(['channel_number=1', 'frame_type=1', "method=<Queue.DeclareOk(['consumer_count=0', 'message_count=0', 'queue=new_interpolarate'])>"])>

In [6]:
# Open the PostgreSQL connection and the module-level cursor that
# get_raw_data_to_process() executes its query on.
# NOTE(review): host, user and password are hard-coded in the notebook;
# consider loading them from the environment or a secrets store.
connection = psycopg2.connect(database='postgres_db',
                              user='postgres',
                              password='p.postgres',
                              host='172.16.238.10',
                              port=5432)
cursor = connection.cursor()

In [7]:
def get_raw_data_to_process(ids_to_fetch):
    """Fetch raw samples by id and return them as a list of dicts.

    Executes ``sqls.get_raw_data_to_process`` on the module-level ``cursor``
    with ``ids_to_fetch`` bound as the query's single parameter.

    Args:
        ids_to_fetch: Ids of the raw rows to load. It is passed to the driver
            as one parameter (the caller in this file passes a tuple).

    Returns:
        One ``{'id', 'timestamp', 'value'}`` dict per fetched row, taken from
        the first three columns of the row, in the order the query returned
        them.
    """
    cursor.execute(sqls.get_raw_data_to_process, (ids_to_fetch,))
    return [
        {'id': row[0], 'timestamp': row[1], 'value': row[2]}
        for row in cursor.fetchall()
    ]

In [8]:
def interpolate_data(timestamps, values):
    """Resample samples onto a one-minute grid and fill the gaps linearly.

    Args:
        timestamps: Sample times, used as the series index. They must yield a
            datetime-like index, because ``resample`` requires one.
        values: Sample values aligned with ``timestamps``. A value of ``-1``
            is treated as "missing" and is interpolated over.

    Returns:
        A ``pandas.Series`` indexed at one-minute frequency whose missing
        entries were filled with time-weighted linear interpolation.

    NOTE(review): the grid comes from ``resample('min')``; samples that do
    not fall on a whole-minute boundary may not contribute to the result,
    depending on the pandas version - verify against real data.
    """
    series = pd.Series(values, index=timestamps)
    # mask() builds a new series with NaN where the sentinel was (upcasting
    # to float when needed). Assigning NaN in place into an integer series is
    # deprecated by pandas and could also write through to the caller's data.
    series = series.mask(series == -1)
    return series.resample('min').interpolate(method='time')

In [9]:
def get_previous_sum(batch_id, tag_id):
    """Return ``sql_operations.get_the_sum_of_production`` for a batch/tag.

    Thin wrapper: both arguments are forwarded unchanged and the helper's
    result is returned as-is.
    NOTE(review): ``interpolate_tag`` compares the result with ``> 0``, so it
    is presumably numeric - confirm against ``sql_operations``.
    """
    previous_sum = sql_operations.get_the_sum_of_production(batch_id, tag_id)
    return previous_sum

In [10]:
def get_latest_interpolated_data(batch_id, tag_id):
    """Return ``sql_operations.get_latest_interpolated_data`` for a batch/tag.

    Thin wrapper: both arguments are forwarded unchanged and the helper's
    result is returned as-is. ``interpolate_tag`` reads the ``'timestamp'``
    and ``'value'`` keys of that result.
    """
    latest_row = sql_operations.get_latest_interpolated_data(batch_id, tag_id)
    return latest_row

In [11]:
def find_duplicate_timestamps(data_frame):
    """Remove rows until no timestamp occurs more than once.

    Despite the name, this function filters rather than just finds. Each
    pass:

    1. selects the rows whose ``'timestamp'`` is duplicated and sorts them by
       ``('timestamp', 'value')``;
    2. keeps the last (highest-valued) of those rows per timestamp;
    3. removes from the frame every row whose ``(timestamp, value)`` pair
       matches one of the kept rows (outer merge + ``_merge != 'both'``);
    4. recurses on the remainder.

    Args:
        data_frame: Frame with at least ``'timestamp'`` and ``'value'``
            columns.

    Returns:
        ``data_frame`` itself when it has no duplicated timestamps, otherwise
        a new, re-indexed frame produced by the merge.

    NOTE(review): step 3 matches on value as well as timestamp, so rows that
    tie with the highest value are removed together; if every row of a
    timestamp has the same value, that timestamp vanishes from the result.
    Confirm this is intended.
    NOTE(review): columns present in both merge inputs other than the join
    keys (e.g. ``'id'``) come back with pandas' ``_x``/``_y`` suffixes.
    """
    df = data_frame.copy()
    ts = df['timestamp']
    # Rows whose timestamp appears more than once, ordered so that the
    # highest value of each timestamp comes last.
    xy = df[ts.isin(ts[ts.duplicated()])].sort_values(['timestamp', 'value'])
    if xy.empty:
        # Base case of the recursion: nothing left to de-duplicate.
        return data_frame
    xy.drop_duplicates(subset='timestamp', keep='last', inplace=True)
    # Anti-join: keep only the rows of df that did NOT match a row in xy.
    a = pd.merge(df, xy, on=['timestamp', 'value'], how='outer', indicator=True).query("_merge != 'both'").drop('_merge', axis=1).reset_index(drop=True)
    return find_duplicate_timestamps(a)

In [12]:
def find_duplicate_values(data_frame):
    """Remove rows until no value occurs more than once.

    Despite the name, this function filters rather than just finds. Each
    pass:

    1. selects the rows whose ``'value'`` is duplicated and sorts them by
       ``('timestamp', 'value')``;
    2. drops duplicates among those rows *by timestamp*, keeping the last;
    3. removes from the frame every row whose ``(timestamp, value)`` pair
       matches one of the remaining selected rows (outer merge +
       ``_merge != 'both'``);
    4. recurses on the remainder.

    Args:
        data_frame: Frame with at least ``'timestamp'`` and ``'value'``
            columns.

    Returns:
        ``data_frame`` itself when it has no duplicated values, otherwise a
        new, re-indexed frame produced by the merge.

    NOTE(review): step 2 de-duplicates on ``'timestamp'`` although the rows
    were selected by value. When timestamps are already unique it keeps every
    selected row, so step 3 removes *all* rows that share a value, including
    the first occurrence. This looks copied from ``find_duplicate_timestamps``;
    confirm whether ``subset='value'`` was intended.
    NOTE(review): columns present in both merge inputs other than the join
    keys (e.g. ``'id'``) come back with pandas' ``_x``/``_y`` suffixes.
    """
    df = data_frame.copy()
    # Despite its name, ``ts`` holds the value column here.
    ts = df['value']
    # Rows whose value appears more than once.
    xy = df[ts.isin(ts[ts.duplicated()])].sort_values(['timestamp', 'value'])
    if xy.empty:
        # Base case of the recursion: nothing left to de-duplicate.
        return data_frame
    xy.drop_duplicates(subset='timestamp', keep='last', inplace=True)
    # Anti-join: keep only the rows of data_frame that did NOT match xy.
    a = pd.merge(data_frame, xy, on=['timestamp', 'value'], how='outer', indicator=True).query("_merge != 'both'").drop('_merge', axis=1).reset_index(drop=True)
    return find_duplicate_values(a)

In [13]:
def interpolate_tag(batch_id, tag_id, ids_to_interpolate):
    """Interpolate one tag's raw samples per minute and persist the deltas.

    The raw rows are fetched, filtered through ``find_duplicate_timestamps``
    and ``find_duplicate_values``, interpolated onto a one-minute grid with
    ``interpolate_data``, rounded, and turned into minute-to-minute
    differences. The first row keeps the first rounded value instead of a
    difference. Every resulting row is inserted through ``sql_operations``
    and linked to the batch and the tag; the inserts are committed once at
    the end.

    Args:
        batch_id: Batch the interpolated rows are linked to.
        tag_id: Tag the interpolated rows are linked to.
        ids_to_interpolate: Iterable of raw-row ids to load.

    Returns:
        Ids returned by ``sql_operations.insert_interpolated_data``, in
        insertion order. An empty list when no raw rows were fetched or when
        the duplicate filters removed every row (nothing is inserted then).
    """
    sum_of_previous = get_previous_sum(batch_id, tag_id)
    raw_data_to_process = get_raw_data_to_process(tuple(ids_to_interpolate))
    if not raw_data_to_process:
        # An empty frame would have no 'timestamp' column to work with.
        return []

    # Timestamps that exist in the source data; any other minute on the grid
    # is labelled INTERPOLATED when stored.
    time_set = {sample['timestamp'] for sample in raw_data_to_process}
    # if interpolated before get the latest interpolated data to continue
    if sum_of_previous > 0:
        latest_interpolated = get_latest_interpolated_data(batch_id, tag_id)
        # NOTE(review): the previous point only joins the ORIGINAL-label set;
        # it is not part of the series that gets interpolated, so the first
        # stored row is the first sample's absolute value. Confirm intended.
        time_set.add(latest_interpolated['timestamp'])

    raw_data_df = find_duplicate_timestamps(pd.DataFrame(raw_data_to_process))
    raw_data_df = find_duplicate_values(raw_data_df)
    if raw_data_df.empty:
        # The filters can remove every row; resampling an empty series fails.
        return []

    interpolated = interpolate_data(
        raw_data_df['timestamp'].tolist(),
        raw_data_df['value'].tolist(),
    ).to_frame(name='value')
    interpolated['value'] = interpolated['value'].round()
    first_value = interpolated['value'].iloc[0]
    # diff() leaves NaN in the first row; fillna() puts the first value there.
    # No extra assignment is needed: writing to label 0 of a datetime index
    # would append a stray row instead of touching the first one.
    deltas = interpolated.diff().fillna(first_value)

    inserted_ids = []
    for timestamp, row in deltas.iterrows():
        source = 'ORIGINAL' if timestamp in time_set else 'INTERPOLATED'
        idx = sql_operations.insert_interpolated_data(
            pd.to_datetime(timestamp), row['value'], source)
        sql_operations.insert_interpolated_batch(idx, batch_id)
        sql_operations.insert_interpolated_tag(idx, tag_id)
        inserted_ids.append(idx)
    sql_operations.commit()
    return inserted_ids

In [14]:
def send_message(message):
    """Publish ``message`` to the 'new_interpolarate' queue.

    Uses the module-level ``channel`` and the default exchange (``''``), with
    the queue name as routing key.

    Args:
        message: Message body handed to ``basic_publish`` unchanged.
    """
    channel.basic_publish(
        exchange='',
        routing_key='new_interpolarate',
        body=message,
    )

In [15]:
def process_message(queue_message):
    """Interpolate both tags of a queue message and build the JSON reply.

    Args:
        queue_message: Decoded message with ``'ConfigId'``, ``'BatchId'`` and
            the ``'MasterTag'`` / ``'ErrorTag'`` entries, each holding
            ``'id'``, ``'name'`` and ``'values'``.

    Returns:
        JSON string carrying the config id, the batch id and, per tag, its
        id, name and the ids returned by ``interpolate_tag``.
    """
    tag_keys = ('MasterTag', 'ErrorTag')
    batch_id = queue_message['BatchId']

    # Run both interpolations first, master tag before error tag.
    interpolated_ids = {}
    for tag_key in tag_keys:
        tag = queue_message[tag_key]
        interpolated_ids[tag_key] = interpolate_tag(
            batch_id, tag['id'], tag['values'])

    message_to_send = {
        'ConfigId': queue_message['ConfigId'],
        'BatchId': queue_message['BatchId'],
    }
    for tag_key in tag_keys:
        tag = queue_message[tag_key]
        message_to_send[tag_key] = {
            'id': tag['id'],
            'name': tag['name'],
            'interpolatedIds': interpolated_ids[tag_key],
        }
    return json.dumps(message_to_send)

In [16]:
def message_consumer():
    """Consume 'new_data' messages and publish one reply per message.

    Registers a callback on the module-level ``channel`` with
    ``auto_ack=True`` and then calls ``start_consuming()``. Each message body
    is JSON-decoded, run through ``process_message`` and the result is passed
    to ``send_message``.
    """
    def handle_delivery(_channel, _method, _properties, body):
        # Only the body is used; the other delivery arguments are ignored.
        reply = process_message(json.loads(body))
        send_message(reply)

    channel.basic_consume(
        queue='new_data',
        on_message_callback=handle_delivery,
        auto_ack=True,
    )
    channel.start_consuming()

In [17]:
# Start consuming from 'new_data'. This call hands control to
# channel.start_consuming(); the recorded run of this cell ended with a
# KeyboardInterrupt.
message_consumer()

KeyboardInterrupt: 

In [18]:
# Tear down: close the RabbitMQ connection and also the PostgreSQL cursor and
# connection opened earlier in the notebook, so neither is left open.
try:
    mq_connection.close()
finally:
    # Runs even if closing the broker connection raises.
    cursor.close()
    connection.close()