In [69]:
import pandas as pd
import re

In [70]:
# Legacy parameter table that the rest of the notebook cleans
df = pd.read_excel(io="../DATA OLD/PARAMETERS_OLD.xlsx")
# Current sales table, used further down to keep only items with sales
df_sales = pd.read_excel(io="../DATA CURRENT/SALES.xlsx")

# Numeric and Categorical Parameters

# Change once Andrea gets back!!!!

In [71]:
# Show the dtype pandas inferred for every column of the parameter table
df.dtypes

Item Code              object
Layout                 object
Sensing Element        object
Case Material          object
Cable Material         object
Cable Length          float64
Terminal               object
Material               object
R0                     object
Class                  object
TC Type                object
R25                    object
B25/85                float64
Case Diameter          object
Case Length            object
Case HEX              float64
Case Thread            object
Cable Wires Number    float64
dtype: object

**Case Diameter and Case Length are not being read as numeric variables -> they need to be converted**

**Case Length**: some values have the form `40/F20+20` or `40/F30+10` (only the part before the `/` is kept), and commas are used instead of decimal points

In [72]:
# Case Length: convert the text column to float
df['Case Length'] = (
    df['Case Length']
    .astype(str)
    .str.split('/')          # e.g. "40/F20+20" -> ["40", "F20+20"]
    .str[0]                  # keep only the part before the first '/'
    .str.replace(',', '.')   # decimal comma -> decimal point
    .astype(float)
)

**Case Diameter**: commas are used instead of decimal points, and only the leading numeric part of each value is kept

In [73]:
# Function to extract the leading number (integer or decimal) of a value
def extract_numeric(value):
    """Return the leading number of ``value`` as a string.

    The match is anchored at the start of the string and accepts an integer
    part optionally followed by a '.' and more digits, so "6.5mm" yields
    "6.5" rather than being truncated to "6".

    Parameters
    ----------
    value : str
        Text to inspect. Decimal commas are expected to have been replaced
        by points before this function is called.

    Returns
    -------
    str
        The leading numeric text, or ``value`` unchanged when the string
        does not start with a digit (for example "nan").
    """
    # (?:\.\d+)? keeps the fractional part; a bare trailing '.' is not captured
    match = re.match(r"(\d+(?:\.\d+)?)", value)
    return match.group(1) if match else value

# Case Diameter: normalise the text, pass every value through
# extract_numeric, then convert the column to float
df['Case Diameter'] = (
    df['Case Diameter']
    .astype(str)
    .str.replace(',', '.')   # decimal comma -> decimal point
    .apply(extract_numeric)
    .astype(float)
)

# Removing Case Thread

In [74]:
# Drop the 'Case Thread' column from the parameter table
df = df.drop(columns=['Case Thread'])

# Missing Parameters 

In [75]:
# Columns that must be filled for every item: the Level 1 parameters
# followed by the columns applicable to all sensors.
required_everywhere = [
    'Layout', 'Sensing Element', 'Case Material', 'Cable Material',
    'Cable Length', 'Terminal',
    'Case Length', 'Case Diameter', "Cable Wires Number",
]
df_cleaned = df.dropna(subset=required_everywhere)

# Conditional requirements, one per rule:
# (column to test, value that triggers the rule, columns that must be filled)
conditional_rules = [
    ('Layout', 'Threaded', ['Case HEX']),
    ('Sensing Element', 'NTC', ['R25', 'B25/85']),
    ('Sensing Element', 'PTC', ['Class']),
    ('Sensing Element', 'RTD', ['Material', 'R0', 'Class']),
    ('Sensing Element', 'MGO', ['TC Type']),
    ('Sensing Element', 'TCx', ['TC Type']),
]

# Apply the rules in order; a row is removed when the rule applies to it
# and at least one of the required columns is empty.
for key_column, key_value, needed_columns in conditional_rules:
    rule_applies = df_cleaned[key_column] == key_value
    is_incomplete = df_cleaned[needed_columns].isna().any(axis=1)
    df_cleaned = df_cleaned[~(rule_applies & is_incomplete)]

# Every row of df whose index label did not survive the rules above
deleted_rows = df[~df.index.isin(df_cleaned.index)]

# Keep a record of the removed rows in a separate Excel file
deleted_rows.to_excel('../DELETED ITEMS/MISSING_PARAMETERS_LVL2.xlsx', index=False)

# Missing Sales

In [76]:
# True for rows of df_cleaned whose Item Code also appears in df_sales
has_sales = df_cleaned['Item Code'].isin(df_sales['Item Code'])

# Items without any matching sales record
missing_items = df_cleaned[~has_sales]

# Keep only the items that do have a sales record
df_cleaned = df_cleaned[has_sales]

# Keep a record of the items removed for lacking sales
missing_items.to_excel('../DELETED ITEMS/MISSING_SALES_LVL2.xlsx', index=False)

# Download Cleaned Parameters

In [77]:
# Write the cleaned parameter table to Excel, without the index column
df_cleaned.to_excel(excel_writer='../DATA CURRENT/L2-PARAMETERS.xlsx', index=False)