In [44]:
import pandas as pd
import random

# Define valid defect patterns for 10 causes (3D array with multiple patterns)
cause_defect_patterns = [
    
    # Cause 1: Minor collision during assembly line transfer (Multiple patterns)
    [
        [[0], [], []],
        [[0], [0], []],
        [[0], [], [0]],
        [[0], [0], [0]]
    ],
    # Cause 2: Improper welding pressure (Expanded patterns)
    [
        [[1], [0], []],
        [[1], [], [1]],
        [[1], [0], [0]],
        [[1], [1], [1]],
        [[0], [0], [1]]
    ],
    # Cause 3: Damaged tool head in cutting machine (Expanded patterns)
    [
        [[0, 1], [0], [1]],
        [[0], [1], [0]],
        [[1], [0], [1]],
        [[0], [0, 1], [1]],
        [[0, 1], [0, 2], [1]]
    ],
    # Cause 4: Over-tightened wheel fixture (Expanded patterns)
    [
        [[0], [1], [0]],
        [[0], [0], [1]],
        [[0, 1], [0], [0]],
        [[0], [1], [1]],
        [[1], [0], [0]]
    ],
    # Cause 5: Body panel alignment issue (Expanded patterns)
    [
        [[0], [1], []],
        [[0], [0], [1]],
        [[1], [0], []],
        [[0, 1], [0], []],
        [[0], [1], [0]]
    ],
    # Cause 6: Paint machine nozzle malfunction (Expanded patterns)
    [
        [[2], [2], []],
        [[2], [0], [2]],
        [[2], [1], [2]],
        [[1], [2], [0]],
        [[0], [2], [0]]
    ],
    # Cause 7: Corroded sheet metal batch (Expanded patterns)
    [
        [[1], [1], [1]],
        [[0], [1], [1]],
        [[1], [0], [1]],
        [[1], [1], [0]],
        [[0], [0], [1]]
    ],
    # Cause 8: Paint contamination and poor surface preparation (Expanded patterns)
    [
        [[0, 2], [0, 1], [2]],
        [[0, 1], [0, 2], [2]],
        [[1], [0, 2], [1]],
        [[2], [0, 1], [0]],
        [[0], [1], [2]]
    ],
    # Cause 9: Multiple system failures in paint and welding units (Expanded patterns)
    [
        [[0, 1, 2], [1], [0, 2]],
        [[1], [0, 2], [0]],
        [[0, 1], [2], [1]],
        [[2], [1], [0]],
        [[0], [1], [2]]
    ],
    # Cause 10: Conveyor belt sensor failure causing defect positioning errors (Unique patterns)
    [
        [[0], [2], [1]],
        [[1, 2], [0], []],
        [[2], [0], [1]],
        [[0], [1, 2], []],
        [[1], [2], [0, 2]]
    ]
]

# Cause dictionary
cause_dict = {
    0: "Assembly transfer arm malfunction (minor collision)",
    1: "Welding machine pressure control failure",
    2: "Damaged cutting tool head",
    3: "Faulty torque limiter in wheel fixture unit",
    4: "Misaligned body panel due to positioning robot error",
    5: "Paint nozzle malfunction in spray system",
    6: "Corrosion from defective metal sheet batch",
    7: "Surface contamination due to inadequate surface prep or paint filtration failure",
    8: "Multiple failures in paint and welding systems",
    9: "Conveyor belt sensor failure causing defect positioning errors"
}

# Generate 200 rows by randomly selecting patterns from cause_defect_patterns
rows = []
for _ in range(2000):
    cause = random.randint(0, 9)  # Randomly select a cause (0-9)
    pattern = random.choice(cause_defect_patterns[cause])  # Randomly select a pattern for the selected cause
    
    row = {
        "defects_on_body": pattern[0],
        "defects_on_wheels": pattern[1],
        "defects_on_backbody": pattern[2],
        "cause": cause
    }
    rows.append(row)

# Create DataFrame and save it to CSV
df = pd.DataFrame(rows)
df.to_csv("defect_data.csv", index=False)

print(df.head())  # Check the first few rows of the generated data


  defects_on_body defects_on_wheels defects_on_backbody  cause
0       [0, 1, 2]               [1]              [0, 2]      8
1             [1]               [1]                 [1]      1
2             [0]            [1, 2]                  []      9
3             [2]               [2]                  []      5
4             [0]            [1, 2]                  []      9


In [1]:
%pip install scikit-learn pandas

Collecting scikit-learn
  Using cached scikit_learn-1.6.1-cp312-cp312-win_amd64.whl.metadata (15 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Downloading joblib-1.5.0-py3-none-any.whl.metadata (5.6 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Using cached scikit_learn-1.6.1-cp312-cp312-win_amd64.whl (11.1 MB)
Downloading joblib-1.5.0-py3-none-any.whl (307 kB)
Downloading threadpoolctl-3.6.0-py3-none-any.whl (18 kB)
Installing collected packages: threadpoolctl, joblib, scikit-learn
Successfully installed joblib-1.5.0 scikit-learn-1.6.1 threadpoolctl-3.6.0
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
import pandas as pd

defect_dict = {
    "scratches": 0,
    "hole": 1,
    "discoloration": 2,
}
cause_dict = {
    0: "Assembly transfer arm malfunction (minor collision)",
    1: "Welding machine pressure control failure",
    2: "Damaged cutting tool head",
    3: "Faulty torque limiter in wheel fixture unit",
    4: "Misaligned body panel due to positioning robot error",
    5: "Paint nozzle malfunction in spray system",
    6: "Corrosion from defective metal sheet batch",
    7: "Surface contamination due to inadequate surface prep or paint filtration failure",
    8: "Multiple failures in paint and welding systems",
    9: "Conveyor belt sensor failure causing defect positioning errors"
}

data = pd.read_csv("defect_data.csv")
data

Unnamed: 0,defects_on_body,defects_on_wheels,defects_on_backbody,cause
0,"[0, 1, 2]",[1],"[0, 2]",8
1,[1],[1],[1],1
2,[0],"[1, 2]",[],9
3,[2],[2],[],5
4,[0],"[1, 2]",[],9
...,...,...,...,...
1995,[2],"[0, 1]",[0],7
1996,[2],[0],[1],9
1997,[1],[0],[1],6
1998,"[0, 1]",[2],[1],8


In [32]:
import ast
def parse_list_column(column):
    return column.apply(ast.literal_eval)
data['defects_on_body'] = parse_list_column(data['defects_on_body'])
data['defects_on_wheels'] = parse_list_column(data['defects_on_wheels'])
data['defects_on_backbody'] = parse_list_column(data['defects_on_backbody'])

In [33]:
data

Unnamed: 0,defects_on_body,defects_on_wheels,defects_on_backbody,cause
0,"[0, 1, 2]",[1],"[0, 2]",8
1,[1],[1],[1],1
2,[0],"[1, 2]",[],9
3,[2],[2],[],5
4,[0],"[1, 2]",[],9
...,...,...,...,...
1995,[2],"[0, 1]",[0],7
1996,[2],[0],[1],9
1997,[1],[0],[1],6
1998,"[0, 1]",[2],[1],8


In [35]:
from sklearn.preprocessing import MultiLabelBinarizer
mlbs = {}
def encode_defect_column(column, prefix):
    mlb = MultiLabelBinarizer()
    encoded = mlb.fit_transform(column)
    mlbs[prefix] = mlb
    return pd.DataFrame(encoded, columns=[f'{prefix}_{cls}' for cls in mlb.classes_])

In [36]:
body_encoded = encode_defect_column(data['defects_on_body'], 'body')
wheels_encoded = encode_defect_column(data['defects_on_wheels'], 'wheels')
backbody_encoded = encode_defect_column(data['defects_on_backbody'], 'backbody')

In [37]:
X = pd.concat([body_encoded, wheels_encoded, backbody_encoded], axis=1)
y = data['cause']
X

Unnamed: 0,body_0,body_1,body_2,wheels_0,wheels_1,wheels_2,backbody_0,backbody_1,backbody_2
0,1,1,1,0,1,0,1,0,1
1,0,1,0,0,1,0,0,1,0
2,1,0,0,0,1,1,0,0,0
3,0,0,1,0,0,1,0,0,0
4,1,0,0,0,1,1,0,0,0
...,...,...,...,...,...,...,...,...,...
1995,0,0,1,1,1,0,1,0,0
1996,0,0,1,1,0,0,0,1,0
1997,0,1,0,1,0,0,0,1,0
1998,1,1,0,0,0,1,0,1,0


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, precision_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
import joblib

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 1. SVM
from sklearn.svm import SVC
svm_model = make_pipeline(StandardScaler(), SVC(kernel='rbf', C=1, gamma='scale', probability=True))
svm_model.fit(X_train, y_train)
svm_pred = svm_model.predict(X_test)
joblib.dump(svm_model, "./defect_models/svm_model.pkl")
print(f'\nðŸ”¹ SVM\nAccuracy: {accuracy_score(y_test, svm_pred) * 100:.2f}%\nPrecision: {precision_score(y_test, svm_pred,average='weighted')}')
print(classification_report(y_test, svm_pred))

# 2. Logistic Regression
from sklearn.linear_model import LogisticRegression
log_model = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))
log_model.fit(X_train, y_train)
log_pred = log_model.predict(X_test)
joblib.dump(log_model, "./defect_models/logistic_model.pkl")
print(f'\nðŸ”¹ Logistic Regression\nAccuracy: {accuracy_score(y_test, log_pred) * 100:.2f}%\nPrecision: {precision_score(y_test, log_pred,average='weighted')}')
print(classification_report(y_test, log_pred))

# 3. Linear Regression (not recommended for classification)
from sklearn.linear_model import LinearRegression
import numpy as np

lin_model = make_pipeline(StandardScaler(), LinearRegression())
lin_model.fit(X_train, y_train)
lin_pred = lin_model.predict(X_test)
# Round predictions to nearest integer (naive classification)
lin_pred_class = np.rint(lin_pred).astype(int)
# Clip to valid class range (0 to 9)
lin_pred_class = np.clip(lin_pred_class, 0, 9)
joblib.dump(lin_model, "./defect_models/linear_model.pkl")
print(f'\nðŸ”¹ Linear Regression (Rounded)\nAccuracy: {accuracy_score(y_test, lin_pred_class) * 100:.2f}%\nPrecision: {precision_score(y_test, lin_pred_class,average='weighted')}')
print(classification_report(y_test, lin_pred_class))

# 4. Random Forest
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)
joblib.dump(rf_model, "./defect_models/random_forest.pkl")
print(f'\nðŸ”¹ Random Forest\nAccuracy: {accuracy_score(y_test, rf_pred) * 100:.2f}%\nPrecision: {precision_score(y_test, rf_pred,average='weighted')}')
print(classification_report(y_test, rf_pred))

joblib.dump(mlbs, "mlbs.pkl")


ðŸ”¹ SVM
Accuracy: 81.25%
F1 Score: 0.8326601535884625
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        48
           1       0.38      0.47      0.42        30
           2       0.64      1.00      0.78        36
           3       0.76      0.35      0.48        37
           4       0.88      0.67      0.76        42
           5       1.00      1.00      1.00        41
           6       0.69      0.73      0.71        45
           7       0.82      1.00      0.90        42
           8       1.00      0.76      0.86        37
           9       1.00      1.00      1.00        42

    accuracy                           0.81       400
   macro avg       0.82      0.80      0.79       400
weighted avg       0.83      0.81      0.81       400


ðŸ”¹ Logistic Regression
Accuracy: 81.00%
F1 Score: 0.8737545744104094
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        48
    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


['mlbs.pkl']