In [51]:
#importing the important libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Load the raw records from the CSV file and preview the first five rows.
df = pd.read_csv(filepath_or_buffer='Task_1.csv')
df.head(n=5)

Unnamed: 0,Primary Key,Order Date,Product Category,Complaint,Cause,Correction,Root Cause,Symptom Condition 1,Symptom Component 1,Symptom Condition 2,Symptom Component 2,Symptom Condition 3,Symptom Component 3,Fix Condition 1,Fix Component 1,Fix Condition 2,Fix Component 2,Fix Condition 3,Fix Component 3
0,SO0026296-1,3/8/2023,SPRAYS,VISIBLY NOTICE fasteners under cab on P clips ...,Not tighten at factory.,"GO THROUGH AND RE-TIGHTEN ALL P CLIPS, NUTS, A...",Not Tightened,Loose,Cab P Clip,Loose,Left-Air Duct,Loose,Bulkhead Connector,Retightened,Cab P Clip,Retightened,Left Air Duct,Retightened,Bulkhead Connector
1,SO0026385-1,3/8/2023,SPRAYS,Fuel door will not stay open,GAS STRUT NOT INSTALLED OR ANYWHERE ON MACHINE,FOUND GAS STRUT NOT INSTALLED OR ANYWHERE ON M...,Not Installed,Won't stay open,Fuel Door,,,,,Installed,Gas Strut,,,,
2,SO0026385-11,3/8/2023,SPRAYS,"Compressor pressure line, braided steel, crushed","Compressor pressure line, braided steel, crush...",DRAIN AIR FROM SYSTEM.REMOVE ASSOCIATED P CLIP...,,,,,,,,,,,,,
3,SO0028352-1,3/8/2023,SPRAYS,Oil running from bottom of machine,OIL RETURN UNDER MACHINE SWIVEL FITTING LEFT L...,OIL RETURN UNDER MACHINE SWIVEL FITTING LEFT L...,,,,,,,,,,,,,
4,SO0028770-1,3/8/2023,SPRAYS,MISSING VECTOR & INTRIP UNLOCKS.,MISSING VECTOR & INTRIP UNLOCKS WERE NOT INSTA...,INSTALLED MISSING UNLOCKS RAN AND TESTED.,,,,,,,,,,,,,


In [52]:
# Dimensions of the loaded frame as (rows, columns)
df.shape

(20, 19)

In [53]:
# Per-column summary: dtype and non-null count for every column
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 19 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Primary Key          20 non-null     object
 1   Order Date           20 non-null     object
 2   Product Category     20 non-null     object
 3   Complaint            20 non-null     object
 4   Cause                20 non-null     object
 5   Correction           20 non-null     object
 6   Root Cause           2 non-null      object
 7   Symptom Condition 1  2 non-null      object
 8   Symptom Component 1  2 non-null      object
 9   Symptom Condition 2  1 non-null      object
 10  Symptom Component 2  1 non-null      object
 11  Symptom Condition 3  1 non-null      object
 12  Symptom Component 3  1 non-null      object
 13  Fix Condition 1      2 non-null      object
 14  Fix Component 1      2 non-null      object
 15  Fix Condition 2      1 non-null      object
 16  Fix Compon

In [54]:
# Count the missing values in each column
df.isna().sum()

Unnamed: 0,0
Primary Key,0
Order Date,0
Product Category,0
Complaint,0
Cause,0
Correction,0
Root Cause,18
Symptom Condition 1,18
Symptom Component 1,18
Symptom Condition 2,19


In [55]:
# Number of rows flagged as duplicates by DataFrame.duplicated() (default arguments)
df.duplicated().sum()

0

In [56]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.corpus import stopwords
import re

# Download the NLTK stopword corpus; preprocess_text reads it through
# stopwords.words('english').
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [57]:
# Creating the taxonomy and the opposite-terms lookup
# Component vocabulary common to the "Symptom_Component" and "Fix_Component"
# categories; each category appends its own extra terms below.
_SHARED_COMPONENTS = [
    "bracket", "condenser", "transducer", "elbow", "supply module", "sensors", "hose", "hydraulik",
    "hood", "fuel sender", "rinse tank", "sight glass tube", "ncv harness",
    "bulk head", "oring", "adapter", "hydraulic reservoir", "brackets", "bolts", "coupler",
    "intrip", "vector", "swivel fitting", "compressor pressure line", "braided steel", "fuel door",
    "left air duct", "bulkhead connectors", "p clips",
]

# Maps each tagging category to the list of candidate terms that free text
# is compared against.
taxonomy = {
    "Root Cause": [
        "faulty bolts", "external issue", "internal issue", "faulty autoboom",
        "drip down", "out of range", "no oring", "oring", "failed sending",
        "poor material", "out of fitting", "not included", "leaking", "loose",
        "crushed", "not tighten", "not installed", "misalignment",
        "overheating", "wear and tear",
    ],
    "Symptom_Condition": [
        "won't work", "will not work", "error codes", "fault", "uneven", "open",
        "broke", "components missing", "unlocks", "crushed", "loose", "leak",
        "oil dripping", "Will not stay open", "won't stay open", "oil running",
        "Noise", "Vibration", "Overheating",
    ],
    "Symptom_Component": _SHARED_COMPONENTS + ["engine", "gearbox", "brakes"],
    "Fix_Condition": [
        "working", "fixed codes", "fixed", "aligned", "locked", "replaced",
        "installed", "tighten", "repaired", "lubrication", "alignment",
        "replacement",
    ],
    "Fix_Component": _SHARED_COMPONENTS + ["screws", "bearings", "pipes"],
}

# Maps a problem term to the wording of its corresponding fix.
# find_fix_condition consults this table before falling back to the taxonomy.
opposite_terms = {
    "faulty bolts": "replaced bolts",
    "external issue": "resolved externally",
    "internal issue": "resolved internally",
    "faulty autoboom": "autoboom fixed",
    "drip down": "sealed properly",
    "out of range": "within range",
    "no oring": "oring installed",
    "oring": "oring fixed",
    "failed sending": "successful sending",
    "poor material": "quality material",
    "out of fitting": "fitted properly",
    "not included": "included",
    "leaking": "sealed",
    # "loose" used to be listed twice ("tighten", then "tightened"). A dict
    # literal keeps only the last value for a repeated key, so the effective
    # mapping was always "tightened"; the shadowed entry has been removed.
    "loose": "tightened",
    "leak": "sealed",
    "oil dripping": "no leaks",
    "crushed": "replaced",
    "not tighten": "tightened",
    "not installed": "installed",
    "misalignment": "aligned",
    "overheating": "cooled",
    "wear and tear": "restored"
}

In [58]:
# Preprocessing function
def preprocess_text(text):
    """Normalise free text before it is compared against the taxonomy.

    Punctuation (anything that is neither a word character nor whitespace)
    is removed, the text is lower-cased, and English stopwords are dropped.

    Parameters
    ----------
    text : Any
        Value to clean. Non-string values are converted with ``str``.
        ``None`` and float NaN are treated as missing.

    Returns
    -------
    str
        The remaining tokens joined by single spaces; ``''`` for missing
        input.
    """
    # Missing cells would otherwise be turned into the literal tokens
    # "none" / "nan" by str() and then matched against the taxonomy.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    # Build the stopword set once per call; the list was previously re-read
    # for every single token, and list membership is a linear scan.
    english_stopwords = set(stopwords.words('english'))
    cleaned = re.sub(r'[^\w\s]', '', str(text)).lower()
    return ' '.join(word for word in cleaned.split() if word not in english_stopwords)

# Preprocess dataset columns
# Clean the three free-text columns in place with preprocess_text.
for text_column in ('Complaint', 'Cause', 'Correction'):
    df[text_column] = df[text_column].apply(preprocess_text)

# Function to find the best match for a taxonomy category
def find_best_match(text, category):
    """Return the term of ``taxonomy[category]`` most similar to ``text``.

    A TF-IDF model is fitted on the category's terms plus ``text``, and the
    text is compared with every term by cosine similarity.

    Parameters
    ----------
    text : str
        Text to classify; a falsy value short-circuits to "Unclassified".
    category : str
        Key into the module-level ``taxonomy`` dict.

    Returns
    -------
    str
        The highest-scoring term when its similarity exceeds 0.1,
        otherwise "Unclassified".
    """
    if not text:
        return "Unclassified"
    candidates = taxonomy[category]
    vectors = TfidfVectorizer().fit_transform(candidates + [text])
    # The last row is the query text; all preceding rows are taxonomy terms.
    scores = cosine_similarity(vectors[-1], vectors[:-1])
    if scores.max() > 0.1:
        return candidates[scores.argmax()]
    return "Unclassified"

# Vectorizer and similarity computation for a taxonomy category
def find_matches(text, category):
    """Return up to three terms of ``taxonomy[category]`` similar to ``text``.

    Terms are ranked by TF-IDF cosine similarity to ``text``, highest first,
    and only those scoring above 0.1 are kept.

    Parameters
    ----------
    text : str
        Text to classify; a falsy value yields an empty list.
    category : str
        Key into the module-level ``taxonomy`` dict.

    Returns
    -------
    list of str
        At most three matching terms, best match first.
    """
    if not text:
        return []
    terms = taxonomy[category]
    tfidf = TfidfVectorizer().fit_transform(terms + [text])
    # The last row is the query text; all preceding rows are taxonomy terms.
    scores = cosine_similarity(tfidf[-1], tfidf[:-1]).flatten()
    ranked = scores.argsort()[::-1]
    return [terms[idx] for idx in ranked if scores[idx] > 0.1][:3]

# Apply taxonomy tagging
def assign_symptom_components(text, category, columns):
    """Overwrite ``columns`` with the ranked matches found for ``text``.

    Slot ``i`` of ``columns`` receives the ``i``-th result of
    ``find_matches(text, category)``; slots with no (or an empty) match are
    set to "".

    Parameters
    ----------
    text : str
        Text to classify.
    category : str
        Taxonomy category passed through to ``find_matches``.
    columns : list
        Mutated in place; only its length is used as input.

    Returns
    -------
    list
        The same ``columns`` object, now holding the match values.
    """
    matches = find_matches(text, category)
    for slot, _ in enumerate(columns):
        has_match = slot < len(matches) and matches[slot]
        columns[slot] = matches[slot] if has_match else ""
    return columns

# Function to find the fix condition based on opposite terms and taxonomy
def find_fix_condition(text, category):
    """Return a fix term for ``text``.

    The whole lower-cased text is first looked up in ``opposite_terms``.
    Only if it is not a key there is the text compared with
    ``taxonomy[category]`` by TF-IDF cosine similarity.

    Parameters
    ----------
    text : str
        Text to classify; a falsy value yields "-".
    category : str
        Key into the module-level ``taxonomy`` dict.

    Returns
    -------
    str
        The mapped opposite term, or the best taxonomy term when its
        similarity exceeds 0.1, otherwise "-".
    """
    if not text:
        return "-"

    # An exact hit in the opposite-terms table wins over similarity matching.
    lowered = text.lower()
    if lowered in opposite_terms:
        return opposite_terms[lowered]

    candidates = taxonomy[category]
    vectors = TfidfVectorizer().fit_transform(candidates + [text])
    # The last row is the query text; all preceding rows are taxonomy terms.
    scores = cosine_similarity(vectors[-1], vectors[:-1])
    if scores.max() > 0.1:
        return candidates[scores.argmax()]
    return "-"

# Tagging columns
df['Root Cause'] = df['Cause'].apply(lambda x: find_best_match(x, 'Root Cause'))


def _ranked_conditions(text, category, best, placeholder, slots=3):
    """Return ``slots`` tags for one row: ``best`` first, then runners-up.

    Parameters
    ----------
    text : str
        Row text passed to ``find_matches``.
    category : str
        Taxonomy category to rank against.
    best : str
        Value for the first slot (already computed by the caller).
    placeholder : str
        Filler for slots that have no further distinct match.
    slots : int
        Number of tags to return.

    Returns
    -------
    list of str
        Exactly ``slots`` values.
    """
    # Drop the slot-1 value so the following slots hold different terms.
    runners_up = [term for term in find_matches(text, category) if term != best]
    tags = [best] + runners_up[:slots - 1]
    return tags + [placeholder] * (slots - len(tags))


# Columns 1-3 of each family used to be filled by three identical calls, so
# they always carried the same value. Slot 1 keeps its previous value; slots
# 2 and 3 now receive the next-best distinct matches, or the family's
# existing placeholder ("Unclassified" / "-") when there are none.
symptom_tags = df['Complaint'].apply(
    lambda x: _ranked_conditions(
        x, 'Symptom_Condition', find_best_match(x, 'Symptom_Condition'), "Unclassified"))
fix_tags = df['Correction'].apply(
    lambda x: _ranked_conditions(
        x, 'Fix_Condition', find_fix_condition(x, 'Fix_Condition'), "-"))

symptom_condition_columns = ['Symptom Condition 1', 'Symptom Condition 2', 'Symptom Condition 3']
fix_condition_columns = ['Fix Condition 1', 'Fix Condition 2', 'Fix Condition 3']
for slot in range(3):
    # .str[slot] picks element `slot` out of each row's list of tags.
    df[symptom_condition_columns[slot]] = symptom_tags.str[slot]
    df[fix_condition_columns[slot]] = fix_tags.str[slot]

# Add Symptom and Fix columns dynamically
# Each entry: (taxonomy category, text column to read, output columns to fill).
component_plan = [
    ("Symptom_Component", 'Complaint',
     ["Symptom Component 1", "Symptom Component 2", "Symptom Component 3"]),
    ("Fix_Component", 'Correction',
     ["Fix Component 1", "Fix Component 2", "Fix Component 3"]),
]
for category, source_column, columns in component_plan:
    # Reset the output columns, then fill them one row at a time.
    df[columns] = ""
    for index in df.index:
        # Pass a copy: assign_symptom_components overwrites the list it is given.
        updated_columns = assign_symptom_components(
            df.loc[index, source_column], category, columns.copy())
        df.loc[index, columns] = updated_columns

# Save the tagged dataset
# Write the tagged frame to CSV without the row index.
output_path = "tagged_dataset.csv"
df.to_csv(path_or_buf=output_path, index=False)

In [59]:
# Display the path the tagged dataset was written to
output_path

'tagged_dataset.csv'