In [None]:
# Colab-only step: upload the input workbook into the runtime before the
# analysis cells read it. `uploaded` keeps whatever files.upload() returns
# (per the Colab docs, a mapping keyed by uploaded filename — confirm if used).
from google.colab import files
uploaded = files.upload()

Saving PdM_threshold_exced.xlsx to PdM_threshold_exced.xlsx


In [None]:
import pandas as pd
import numpy as np
from openai import OpenAI

import os

# Prefer a key supplied through the environment so a real credential never has
# to be pasted into the notebook; the literal fallback is the original placeholder.
client = OpenAI(
    api_key=os.environ.get("DEEPSEEK_API_KEY", "DeepSeek API Key"),
    base_url="https://api.deepseek.com",
)

excel_file = "/content/PdM_threshold_exced.xlsx"
results_path = "/content/exceedance.csv"

# Open the workbook once and reuse the handle for every sheet instead of
# re-opening the file on each loop iteration.
xls = pd.ExcelFile(excel_file)
sheet_names = xls.sheet_names

# sheet name -> {"Threshold", "Python Count", "DeepSeek Count", "Match"}
all_comparisons = {}

for sheet_name in sheet_names:
    try:
        df = pd.read_excel(xls, sheet_name=sheet_name)

        # The series under test is the last column; use its first 100 non-null values.
        target_column = df.columns[-1]
        values = df[target_column].dropna().tolist()[:100]

        if len(values) < 100 or not all(isinstance(v, (int, float, np.integer, np.floating)) for v in values):
            print(f"❌ Skipping {sheet_name}: insufficient or invalid numeric data.")
            continue

        # Reference answer: how many values lie strictly above the 95th percentile.
        threshold_95 = float(np.percentile(values, 95))
        python_count = sum(1 for v in values if v > threshold_95)

        prompt = (
            f"I have a list of 100 numbers and a threshold.\n"
            f"Threshold: {threshold_95:.4f}\n"
            f"List: {values}\n\n"
            f"How many values in the list are strictly greater than the threshold?\n"
            f"Just return the count as a number, nothing else."
        )

        response = client.chat.completions.create(
            model="deepseek-reasoner",
            messages=[
                {"role": "system", "content": "You are a data scientist."},
                {"role": "user", "content": prompt},
            ],
            stream=False
        )

        deepseek_count_str = response.choices[0].message.content.strip()
        try:
            deepseek_count = int(deepseek_count_str)
        except ValueError:
            # A reply that is not a bare integer is recorded as None, which is
            # reported as a mismatch below.
            deepseek_count = None

        status = "MATCH ✅" if deepseek_count == python_count else "MISMATCH ❌"

        all_comparisons[sheet_name] = {
            "Threshold": threshold_95,
            "Python Count": python_count,
            "DeepSeek Count": deepseek_count,
            "Match": status
        }

        # Checkpoint after every sheet: the full loop is long-running, so a
        # dropped session should not discard the sheets already processed.
        pd.DataFrame.from_dict(all_comparisons, orient='index').to_csv(results_path)

        print(f"{sheet_name} → Python: {python_count}, DeepSeek: {deepseek_count} → {status}")

    except Exception as e:
        # Best-effort: report the failing sheet and carry on with the rest.
        print(f"❌ Error processing sheet '{sheet_name}': {e}")

results_df = pd.DataFrame.from_dict(all_comparisons, orient='index')
results_df.to_csv(results_path) # running time: 101 min

Machine_1 → Python: 5, DeepSeek: 5 → MATCH ✅
Machine_2 → Python: 5, DeepSeek: 5 → MATCH ✅
Machine_3 → Python: 5, DeepSeek: 5 → MATCH ✅
Machine_4 → Python: 5, DeepSeek: 5 → MATCH ✅
Machine_5 → Python: 5, DeepSeek: 5 → MATCH ✅
Machine_6 → Python: 5, DeepSeek: 5 → MATCH ✅
Machine_7 → Python: 5, DeepSeek: 5 → MATCH ✅
Machine_8 → Python: 5, DeepSeek: 5 → MATCH ✅
Machine_9 → Python: 5, DeepSeek: 5 → MATCH ✅
Machine_10 → Python: 5, DeepSeek: 5 → MATCH ✅
Machine_11 → Python: 5, DeepSeek: 5 → MATCH ✅
Machine_12 → Python: 5, DeepSeek: 5 → MATCH ✅
Machine_13 → Python: 5, DeepSeek: 5 → MATCH ✅
Machine_14 → Python: 5, DeepSeek: 5 → MATCH ✅
Machine_15 → Python: 5, DeepSeek: 5 → MATCH ✅
Machine_16 → Python: 5, DeepSeek: 5 → MATCH ✅
Machine_17 → Python: 5, DeepSeek: 5 → MATCH ✅
Machine_18 → Python: 5, DeepSeek: 5 → MATCH ✅
Machine_19 → Python: 5, DeepSeek: 5 → MATCH ✅
Machine_20 → Python: 5, DeepSeek: 5 → MATCH ✅
Machine_21 → Python: 5, DeepSeek: 5 → MATCH ✅
Machine_22 → Python: 5, DeepSeek: 5 → MATCH

In [None]:
from sklearn.metrics import mean_squared_error

import os

# Prefer a key supplied through the environment so a real credential never has
# to be pasted into the notebook; the literal fallback is the original placeholder.
client = OpenAI(
    api_key=os.environ.get("DEEPSEEK_API_KEY", "DeepSeek API Key"),
    base_url="https://api.deepseek.com",
)

excel_file = "/content/PdM_modified_slope_calc.xlsx"
results_path = "/content/slope.csv"

# Open the workbook once and reuse the handle for every sheet.
xls = pd.ExcelFile(excel_file)
sheet_names = xls.sheet_names

# Parallel lists of parsed slopes (same order), used for the MSE at the end.
deepseek_slopes = []
python_slopes = []
# sheet name -> {"DeepSeek Slope", "Python Slope", "Difference"}
sheet_results = {}

for sheet_name in sheet_names:
    try:
        df = pd.read_excel(xls, sheet_name=sheet_name)

        # The series under test is the last column; use its first 100 non-null values.
        target_column = df.columns[-1]
        y = df[target_column].dropna().values[:100]

        if len(y) < 2 or not np.issubdtype(y.dtype, np.number):
            print(f"❌ Skipping {sheet_name}: not enough numeric data")
            continue

        x = np.arange(len(y))

        # .tolist() yields plain Python numbers. list(array) keeps NumPy scalars,
        # whose repr under NumPy >= 2 is "np.int64(0)" / "np.float64(1.5)" and
        # would end up verbatim in the prompt.
        data_string = f"x = {x.tolist()}\ny = {y.tolist()}\n"

        prompt = (
            f"Given the following dataset:\n{data_string}\n"
            f"Compute the slope of the best-fit line using linear regression. "
            f"Just return the slope as a number, nothing else."
        )

        response = client.chat.completions.create(
            model="deepseek-reasoner",
            messages=[
                {"role": "system", "content": "You are a data scientist"},
                {"role": "user", "content": prompt},
            ],
            stream=False
        )

        deepseek_response = response.choices[0].message.content.strip()
        try:
            deepseek_slope = float(deepseek_response)
        except ValueError:
            # Replies that are not a bare number are skipped (and rerun later).
            print(f"❌ Could not parse slope from DeepSeek for {sheet_name}")
            continue

        # Reference answer: degree-1 least-squares fit; first coefficient is the slope.
        slope, intercept = np.polyfit(x, y, 1)

        deepseek_slopes.append(deepseek_slope)
        python_slopes.append(slope)

        sheet_results[sheet_name] = {
            "DeepSeek Slope": deepseek_slope,
            "Python Slope": slope,
            "Difference": deepseek_slope - slope
        }

        # Checkpoint after every sheet so an interrupted session keeps what it has.
        pd.DataFrame.from_dict(sheet_results, orient='index').to_csv(results_path)

        print(f"{sheet_name} → DeepSeek: {deepseek_slope:.4f}, Python: {slope:.4f}")

    except Exception as e:
        # Best-effort: report the failing sheet and carry on with the rest.
        print(f"❌ Error processing {sheet_name}: {e}")

# Persist results before computing the metric, so a failure in the metric step
# cannot discard them.
results_df = pd.DataFrame.from_dict(sheet_results, orient='index')
results_df.to_csv(results_path)

if python_slopes:
    mse = mean_squared_error(python_slopes, deepseek_slopes)
    print(f"\n✅ Final MSE between DeepSeek and Python slopes: {mse:.6f}")
else:
    # mean_squared_error rejects empty inputs; report instead of raising.
    mse = None
    print("\n❌ No slopes could be parsed; MSE not computed.")

Machine_1 → DeepSeek: 2.1380, Python: 2.0707
Machine_2 → DeepSeek: 2.0867, Python: 1.9459
Machine_3 → DeepSeek: 2.2380, Python: 2.2384
Machine_4 → DeepSeek: 2.2299, Python: 2.2179
Machine_5 → DeepSeek: 2.0106, Python: 2.0118
Machine_6 → DeepSeek: 2.3000, Python: 1.9878
Machine_7 → DeepSeek: 1.9848, Python: 1.9836
Machine_8 → DeepSeek: 2.1132, Python: 2.1132
❌ Could not parse slope from DeepSeek for Machine_9
Machine_10 → DeepSeek: 2.1570, Python: 1.9718
Machine_11 → DeepSeek: 1.9655, Python: 2.0519
Machine_12 → DeepSeek: 2.0320, Python: 2.0313
Machine_13 → DeepSeek: 2.0000, Python: 1.9951
Machine_14 → DeepSeek: 2.0072, Python: 1.9264
❌ Could not parse slope from DeepSeek for Machine_15
Machine_16 → DeepSeek: 1.9690, Python: 1.9686
Machine_17 → DeepSeek: 2.0141, Python: 2.0129
Machine_18 → DeepSeek: 2.0880, Python: 1.9428
Machine_19 → DeepSeek: 3.1670, Python: 1.9793
Machine_20 → DeepSeek: 2.0012, Python: 2.0149
❌ Could not parse slope from DeepSeek for Machine_21
Machine_22 → DeepSeek:

In [None]:
# Rerun of the slope comparison for the sheets that did not yield a usable
# result in the first pass. Relies on `client` created in an earlier cell.
excel_file = "/content/PdM_modified_slope_calc.xlsx"
xls = pd.ExcelFile(excel_file)
sheet_names = xls.sheet_names

# 1-based machine numbers, mapped to sheets by position in the workbook.
target_machine_ids = [9, 15, 21, 40, 49, 52, 66, 85, 86, 87, 88, 89]
target_sheets = [sheet_names[i - 1] for i in target_machine_ids]

deepseek_slopes = []
python_slopes = []
# sheet name -> {"DeepSeek Slope", "Python Slope", "Difference"}
sheet_results = {}

for sheet_name in target_sheets:
    try:
        # Read through the already-open workbook handle instead of re-opening the file.
        df = pd.read_excel(xls, sheet_name=sheet_name)
        target_column = df.columns[-1]
        y = df[target_column].dropna().values[:100]

        if len(y) < 2 or not np.issubdtype(y.dtype, np.number):
            print(f"❌ Skipping {sheet_name}: insufficient numeric data.")
            continue

        x = np.arange(len(y))

        # .tolist() yields plain Python numbers. list(array) keeps NumPy scalars,
        # whose repr under NumPy >= 2 is "np.int64(0)" / "np.float64(1.5)" and
        # would end up verbatim in the prompt.
        data_string = f"x = {x.tolist()}\ny = {y.tolist()}\n"
        prompt = (
            f"Given the following dataset:\n{data_string}\n"
            f"Compute the slope of the best-fit line using linear regression. "
            f"Just return the slope as a number, nothing else."
        )

        response = client.chat.completions.create(
            model="deepseek-reasoner",
            messages=[
                {"role": "system", "content": "You are a data scientist"},
                {"role": "user", "content": prompt},
            ],
            stream=False
        )

        deepseek_response = response.choices[0].message.content.strip()
        try:
            deepseek_slope = float(deepseek_response)
        except ValueError:
            print(f"❌ Failed to parse DeepSeek slope for {sheet_name}")
            continue

        # Reference answer: degree-1 least-squares fit; first coefficient is the slope.
        slope, _ = np.polyfit(x, y, 1)

        deepseek_slopes.append(deepseek_slope)
        python_slopes.append(slope)

        # Same columns as the first-pass slope.csv so the files concatenate cleanly.
        sheet_results[sheet_name] = {
            "DeepSeek Slope": deepseek_slope,
            "Python Slope": slope,
            "Difference": deepseek_slope - slope
        }

        print(f"✅ {sheet_name}: DeepSeek = {deepseek_slope:.4f}, Python = {slope:.4f}")

    except Exception as e:
        # Best-effort: report the failing sheet and carry on with the rest.
        print(f"❌ Error processing {sheet_name}: {e}")

pd.DataFrame.from_dict(sheet_results, orient='index').to_csv("/content/rerun_slope.csv")

❌ Failed to parse DeepSeek slope for Machine_9
✅ Machine_15: DeepSeek = 1.7230, Python = 1.9807
❌ Failed to parse DeepSeek slope for Machine_21
✅ Machine_40: DeepSeek = 2.5560, Python = 2.1528
❌ Error processing Machine_49: Connection error.
❌ Error processing Machine_52: Connection error.
❌ Error processing Machine_66: Connection error.
✅ Machine_85: DeepSeek = 2.0220, Python = 2.0275
✅ Machine_86: DeepSeek = 2.0000, Python = 1.9981
✅ Machine_87: DeepSeek = 2.5780, Python = 2.0046
✅ Machine_88: DeepSeek = 2.0859, Python = 1.9903
✅ Machine_89: DeepSeek = 1.9803, Python = 1.9803


In [None]:
# Second rerun of the slope comparison, for the sheets still missing a result
# after the previous rerun. Relies on `client` created in an earlier cell.
excel_file = "/content/PdM_modified_slope_calc.xlsx"
xls = pd.ExcelFile(excel_file)
sheet_names = xls.sheet_names

# 1-based machine numbers, mapped to sheets by position in the workbook.
target_machine_ids = [9, 21, 49, 52, 66]
target_sheets = [sheet_names[i - 1] for i in target_machine_ids]

deepseek_slopes = []
python_slopes = []
# sheet name -> {"DeepSeek Slope", "Python Slope", "Difference"}
sheet_results = {}

for sheet_name in target_sheets:
    try:
        # Read through the already-open workbook handle instead of re-opening the file.
        df = pd.read_excel(xls, sheet_name=sheet_name)
        target_column = df.columns[-1]
        y = df[target_column].dropna().values[:100]

        if len(y) < 2 or not np.issubdtype(y.dtype, np.number):
            print(f"❌ Skipping {sheet_name}: insufficient numeric data.")
            continue

        x = np.arange(len(y))

        # .tolist() yields plain Python numbers. list(array) keeps NumPy scalars,
        # whose repr under NumPy >= 2 is "np.int64(0)" / "np.float64(1.5)" and
        # would end up verbatim in the prompt.
        data_string = f"x = {x.tolist()}\ny = {y.tolist()}\n"
        prompt = (
            f"Given the following dataset:\n{data_string}\n"
            f"Compute the slope of the best-fit line using linear regression. "
            f"Just return the slope as a number, nothing else."
        )

        response = client.chat.completions.create(
            model="deepseek-reasoner",
            messages=[
                {"role": "system", "content": "You are a data scientist"},
                {"role": "user", "content": prompt},
            ],
            stream=False
        )

        deepseek_response = response.choices[0].message.content.strip()
        try:
            deepseek_slope = float(deepseek_response)
        except ValueError:
            print(f"❌ Failed to parse DeepSeek slope for {sheet_name}")
            continue

        # Reference answer: degree-1 least-squares fit; first coefficient is the slope.
        slope, _ = np.polyfit(x, y, 1)

        deepseek_slopes.append(deepseek_slope)
        python_slopes.append(slope)

        # Same columns as the first-pass slope.csv so the files concatenate cleanly.
        sheet_results[sheet_name] = {
            "DeepSeek Slope": deepseek_slope,
            "Python Slope": slope,
            "Difference": deepseek_slope - slope
        }

        print(f"✅ {sheet_name}: DeepSeek = {deepseek_slope:.4f}, Python = {slope:.4f}")

    except Exception as e:
        # Best-effort: report the failing sheet and carry on with the rest.
        print(f"❌ Error processing {sheet_name}: {e}")

pd.DataFrame.from_dict(sheet_results, orient='index').to_csv("/content/rerun_slope_2.csv")

✅ Machine_9: DeepSeek = 1.9498, Python = 1.9516
✅ Machine_21: DeepSeek = 2.0360, Python = 2.0364
✅ Machine_49: DeepSeek = 2.3620, Python = 2.3584
✅ Machine_52: DeepSeek = 1.9881, Python = 1.9881
✅ Machine_66: DeepSeek = 2.0721, Python = 1.9660


In [None]:
# Merge the first-pass results with both reruns and score the full set.
# The files are read from the same /content paths the earlier cells wrote to,
# so the cell does not depend on the current working directory.
original_df = pd.read_csv("/content/slope.csv", index_col=0)
rerun1_df = pd.read_csv("/content/rerun_slope.csv", index_col=0)
rerun2_df = pd.read_csv("/content/rerun_slope_2.csv", index_col=0)

combined_df = pd.concat([original_df, rerun1_df, rerun2_df])

# If a sheet appears in more than one file, keep only its most recent run so it
# is not counted twice in the metric (frames are concatenated oldest first).
combined_df = combined_df[~combined_df.index.duplicated(keep="last")]

print(f"Total datasets combined: {len(combined_df)}")

# mean_squared_error comes from the sklearn import in an earlier cell.
final_mse = mean_squared_error(combined_df["Python Slope"], combined_df["DeepSeek Slope"])
print(f"✅ Final Mean Squared Error (MSE) across all datasets: {final_mse:.6f}")

Total datasets combined: 100
✅ Final Mean Squared Error (MSE) across all datasets: 0.041993


In [2]:
# Re-import required modules after reset
import pandas as pd
import numpy as np

# Slopes transcribed by hand from the printed run logs: one entry per dataset,
# with the two lists aligned position by position.
deepseek = [
    2.1380, 2.0867, 2.2380, 2.2299, 2.0106, 2.3000, 1.9848, 2.1132, 1.9498, 2.1570,
    1.9655, 2.0320, 2.0000, 2.0072, 1.7230, 1.9690, 2.0141, 2.0880, 3.1670, 2.0012,
    2.0360, 2.1600, 2.0000, 2.0780, 2.1245, 2.2256, 2.3440, 2.1009, 2.1500, 1.9460,
    1.4780, 1.7190, 1.7840, 2.0696, 1.8463, 2.5360, 2.4643, 2.2804, 2.1629, 2.5560,
    2.2633, 2.3190, 1.8255, 1.8464, 2.3800, 2.5360, 2.2130, 2.0130, 2.3620, 1.7568,
    2.0714, 1.9881, 2.0733, 2.0158, 2.0305, 2.0000, 2.5570, 2.0833, 2.0768, 1.8233,
    2.0430, 2.0196, 2.0088, 2.0800, 2.0250, 2.0721, 2.0726, 1.8235, 1.9576, 2.1080,
    2.0718, 1.8410, 1.9714, 2.0275, 1.9333, 2.0111, 1.9300, 1.9314, 2.0000, 2.0000,
    2.0183, 2.0702, 1.9710, 1.9394, 2.0220, 2.0000, 2.5780, 2.0859, 1.9803, 2.0000,
    1.9330, 2.0157, 2.0336, 2.0273, 1.9050, 2.0190, 1.9234, 1.9690, 2.0081, 2.0352
]

python = [
    2.0707, 1.9459, 2.2384, 2.2179, 2.0118, 1.9878, 1.9836, 2.1132, 1.9516, 1.9718,
    2.0519, 2.0313, 1.9951, 1.9264, 1.9807, 1.9686, 2.0129, 1.9428, 1.9793, 2.0149,
    2.0364, 1.9991, 1.9720, 2.0780, 2.1245, 2.0197, 1.8845, 2.1057, 2.1476, 1.9227,
    1.4648, 1.7070, 1.7839, 2.0600, 1.9224, 1.8154, 2.0651, 2.1229, 2.1035, 2.1528,
    2.2633, 1.9103, 1.8301, 1.9608, 1.9736, 2.1789, 2.2142, 1.8027, 2.3584, 2.0261,
    1.9780, 1.9881, 1.9929, 2.0158, 1.9691, 1.9543, 2.0460, 2.0773, 1.9690, 1.9976,
    2.0431, 2.0202, 1.9968, 2.0094, 2.0241, 1.9660, 1.9251, 1.9594, 1.9462, 1.9518,
    2.0578, 1.8421, 2.0001, 2.0261, 1.9859, 2.0111, 1.9765, 1.9990, 1.9693, 1.9987,
    2.0337, 2.0147, 1.9835, 1.9975, 2.0275, 1.9981, 2.0046, 1.9903, 1.9803, 1.9999,
    1.9881, 1.9670, 2.0259, 2.0300, 2.0220, 2.0074, 1.9646, 1.9835, 2.0148, 1.9939
]

# One row per dataset. The "MSE" and "MAE" columns hold that row's squared and
# absolute difference; their means in the summary are the aggregate MSE and MAE.
df = pd.DataFrame({"DeepSeek": deepseek, "Python": python})
slope_gap = df["DeepSeek"].sub(df["Python"])
df["MSE"] = slope_gap.pow(2)
df["MAE"] = slope_gap.abs()

# Descriptive statistics, transposed so each metric is a row.
summary = df.loc[:, ["MSE", "MAE"]].describe().transpose()

summary

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
MSE,100.0,0.041993,0.157537,0.0,4e-06,0.000804,0.013238,1.410631
MAE,100.0,0.101422,0.17896,0.0,0.001875,0.02835,0.11505,1.1877
