In [6]:
import pandas as pd
import numpy as np
import random
from datetime import timedelta

In [None]:
df = pd.read_pickle('anomaly_operator_logs.pkl')

anomalies = df[df['is_anomaly'] == True]

In [None]:
def generate_sensor_insight(sensor_type, value, equipment_id,operator_log):
    sensor_type = sensor_type.strip()
    t = thresholds.get(sensor_type)
    
    if not t:
        return f"No rules defined for {sensor_type}."

    low_warn, high_warn = t['warning']
    low_anom, high_anom = t['anomaly']

    level = None
    if value >= high_anom:
        level = "ANOMALY"
        message = f"{sensor_type} reading of {value} is too high, could be Anomaly in the senor. Ideal Temperature should not go above:  (> {high_anom})."
    elif value >= high_warn:
        level = "WARNING"
        message = f"{sensor_type} reading of {value} is above normal. Check the Equipment. Ideal Temperature should not go above:  (> {high_warn})."
    elif value <= low_anom:
        level = "ANOMALY"
        message = f"{sensor_type} reading of {value} too low, could be Anomaly in the senor. Ideal Temperature should not go below:  (< {low_anom})."
    elif value <= low_warn:
        level = "WARNING"
        message = f"{sensor_type} reading of {value} is lower than expected.  Check the Equipment. Ideal Temperature should not go below:  (< {low_warn})."
    else:
        level = "Normal"
        message = f"{sensor_type} reading of {value} is within normal range."

    return f"""
Status: {level}  
Operator Note: {operator_log}
Insight: {message}
""".strip()


In [None]:
from rapidfuzz import fuzz

def fuzzy_match_logs(anomaly_df, log_df, time_window=30, score_threshold=80):
    matched_logs = []

    for _, anomaly_row in anomaly_df.iterrows():
        a_time = anomaly_row['Timestamp']
        a_equip = anomaly_row['Equipment_ID']

        # Filter logs within time window
        nearby_logs = log_df[
            (log_df['Timestamp'] >= a_time - timedelta(minutes=time_window)) &
            (log_df['Timestamp'] <= a_time + timedelta(minutes=time_window))
        ]

        # Fuzzy match equipment ID to log text
        best_score = 0
        best_log = None
        for _, log_row in nearby_logs.iterrows():
            score = fuzz.partial_ratio(a_equip.lower(), log_row['Operator_Log'].lower())
            if score > best_score and score >= score_threshold:
                best_score = score
                best_log = log_row['Operator_Log']

        matched_logs.append(best_log)

    anomaly_df = anomaly_df.copy()
    anomaly_df['Matched_Log'] = matched_logs
    return anomaly_df

TypeError: 'Index' object is not callable

In [None]:
anomaly_df = fuzzy_match_logs(df_anomalies,df_operator_logs[df_operator_logs['Equipment_ID'].str.contains('Mud_Pump')==True])

'Status: ANOMALY  \nOperator Note: 103413    Alert: Temperature deviation from baseline in ...\nName: Matched_Log, dtype: object\nInsight: Temperature reading of 100 is too high, could be Anomaly in the senor. Ideal Temperature should not go above:  (> 100).'

In [120]:
df[df['Matched_Log'].isnull() == False]['Matched_Log'][:1]

103413    Alert: Temperature deviation from baseline in ...
Name: Matched_Log, dtype: object

In [99]:
def generate_insights_df(df):
    messages = []

    for _, row in df.iterrows():
        for sensor_type in thresholds.keys():
            if sensor_type in row:
                prompt = generate_sensor_prompt(
                    sensor_type=sensor_type,
                    value=row[sensor_type],
                    equipment_id=row['Equipment_ID'],
                )
                messages.append(prompt+ ' '+ df['Matched_Log'])
    
    return messages

generate_insights_df(df[df['Matched_Log'].isnull() == False])

[103413    Status: ANOMALY  \nInsight: Temperature readin...
 103546    Status: ANOMALY  \nInsight: Temperature readin...
 103770    Status: ANOMALY  \nInsight: Temperature readin...
 103868    Status: ANOMALY  \nInsight: Temperature readin...
 190507    Status: ANOMALY  \nInsight: Temperature readin...
                                 ...                        
 851671    Status: ANOMALY  \nInsight: Temperature readin...
 851714    Status: ANOMALY  \nInsight: Temperature readin...
 851784    Status: ANOMALY  \nInsight: Temperature readin...
 851812    Status: ANOMALY  \nInsight: Temperature readin...
 851826    Status: ANOMALY  \nInsight: Temperature readin...
 Name: Matched_Log, Length: 71, dtype: object,
 103413    Status: ANOMALY  \nInsight: Pressure reading o...
 103546    Status: ANOMALY  \nInsight: Pressure reading o...
 103770    Status: ANOMALY  \nInsight: Pressure reading o...
 103868    Status: ANOMALY  \nInsight: Pressure reading o...
 190507    Status: ANOMALY  \nInsight:

In [None]:
df.to_pickle("RAG_INPUT.pkl")

In [227]:
import pandas as pd
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
from ollama import Client

# Load CSV and chunk it
def load_chunks(df, chunk_size=5):
    chunks = []
    for i in range(0, len(df), chunk_size):
        chunk = df.iloc[i:i+chunk_size].to_markdown(index=False)
        chunks.append(chunk)
    return chunks

# Build FAISS index
def build_index(chunks, embed_model):
    embeddings = embed_model.encode(chunks)
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(np.array(embeddings))
    return index, embeddings, chunks

# Retrieve top chunks
def retrieve_chunks(query, index, chunks, embed_model, top_k=3):
    q_vec = embed_model.encode([query])
    _, I = index.search(q_vec, top_k)
    return [chunks[i] for i in I[0]]

# Ask question using Ollama
def ask_ollama(context, question, model='mistral'):
    client = Client()
    prompt = f"""You are a helpful analyst. Use the following oil rig sensor data to answer the question.

Sensor Data:
{context}

Question: {question}

Answer:"""
    response = client.chat(model=model, messages=[{"role": "user", "content": prompt}])
    return response['message']['content']

# Main pipeline
def rag_answer(df, query, model='phi'):
    embed_model = SentenceTransformer('all-MiniLM-L6-v2')
    chunks = load_chunks(df)
    index, _, chunks = build_index(chunks, embed_model)
    relevant = retrieve_chunks(query, index, chunks, embed_model)
    context = "\n\n".join(relevant)
    return ask_ollama(context, query, model=model)

# Example usage
if __name__ == "__main__":
    query = input("Ask a question about oil rig sensor data: ")
    answer = rag_answer(df, query, model="mistral")  # or "mistral"
    print("\nAnswer:\n", answer)



Answer:
  From the data provided, it appears that there are several logs indicating issues with the Mud_Pump-2 equipment. The logs suggest that the pump's current reading has been consistently high, exceeding the normal range multiple times since 1:30. For instance, at 1:41, 1:47, 1:51, 1:56, and 1:57, the current reading for Mud_Pump-2 was 50A, 61.5A, 70.87A, 65.88A, and 75.14A respectively, which are all significantly higher than the normal range.

It's also worth noting that there have been multiple messages suggesting that someone should check the Mud_Pump-2 equipment at various time points, such as 1:30, 1:41, and so on, up to 1:57. This could indicate a recurring issue with this specific piece of equipment that needs attention.

Based on these observations, it seems that there is an ongoing problem with the Mud_Pump-2 that requires investigation or maintenance.


In [228]:
# Example usage
if __name__ == "__main__":
    query = input("Most anomalous Equipment ")
    answer = rag_answer(df, query, model="mistral")  # or "mistral"
    print("\nAnswer:\n", answer)


Answer:
  The data in the table shows readings from two mud pumps (Mud_Pump-1 and Mud_Pump-2) at regular intervals, over a period of time (minutes). Each row provides details about the pump's current reading, including the timestamp, the pump ID, the number of minutes that have elapsed since the start of the measurement period (Time), the pump status (On or Off), and various measurements such as Current, Pressure, Temperature, RPM (Revolutions Per Minute), Flow Rate, Torque, and Power Consumption.

The table also includes derived metrics, like Average Speed, Power Factor, and Efficiency for each reading. Additionally, it shows the total power consumed by each pump over time, as well as some diagnostic metrics such as AE_anomaly (Acoustic Emission anomaly) and Matched_Log (indicating whether the reading matches a known log entry).

The question you've provided doesn't require an answer in this context since it's more about summarizing the nature of the data rather than answering a spec

In [129]:
from transformers import GPTNeoForCausalLM, GPT2Tokenizer

model_name = "EleutherAI/gpt-neo-1.3B"  # 1.3B parameters (requires ~5GB RAM)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPTNeoForCausalLM.from_pretrained(model_name)

prompt = "Explain: 'Status: ANOMALY  \nInsight: Temperature reading of 42.06910562218704 too low, could be Anomaly in the senor. Ideal Temperature should not go below:  (< 80). Big spike in temperature on 28th Feb. Happened around 4PM. Not normal.'"
input_ids = tokenizer.encode(prompt, return_tensors="pt")
output = model.generate(input_ids, max_length=100, temperature=0.8)
print(tokenizer.decode(output[0], skip_special_tokens=True))


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Error while downloading from https://huggingface.co/EleutherAI/gpt-neo-1.3B/resolve/main/model.safetensors: HTTPSConnectionPool(host='cas-bridge.xethub.hf.co', port=443): Read timed out.
Trying to resume download...
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Explain: 'Status: ANOMALY  
Insight: Temperature reading of 42.06910562218704 too low, could be Anomaly in the senor. Ideal Temperature should not go below:  (< 80). Big spike in temperature on 28th Feb. Happened around 4PM. Not normal.'

A:

I think you are looking for the following:

The temperature is too low.
The temperature is too high.




In [132]:
df_operator_logs = pd.read_pickle('Final_Operator_Log.pkl')
print("read pickles 1")

df_anomalies = pd.read_pickle('Final_Anomaly_Data.pkl')

read pickles 1


In [163]:
df_operator_logs[df_operator_logs['Equipment_ID'].str.contains('Mud_Pump')==True]

Unnamed: 0,Timestamp,Equipment_ID,Operator_Log
3,2025-03-01 01:30:00,Mud_Pump-2,Need someone to take a look at Current reading...
4,2025-03-01 02:00:00,Mud_Pump-2,Reading jumped unexpectedly — Current on Mud_P...
7,2025-03-01 03:30:00,Mud_Pump-1,Flagged weird Current behavior. Equipment: Mud...
8,2025-03-01 04:00:00,Mud_Pump-2,Alert: Current deviation from baseline in Mud_...
10,2025-03-01 05:00:00,Mud_Pump-2,Unusual Current metrics observed in Mud_Pump-2...
...,...,...,...
12826,2025-05-25 03:24:00,Mud_Pump-4,That Current reading for Mud_Pump-4 doesn’t lo...
12827,2025-05-25 03:25:00,Mud_Pump-4,Current on Mud_Pump-4 looks off. [2025-05-25 0...
12828,2025-05-25 03:26:00,Mud_Pump-4,Looks like something's off with Current on Mud...
12829,2025-05-25 03:27:00,Mud_Pump-4,Flagged weird Current behavior. Equipment: Mud...
