In [1]:
import pandas as pd

# Load the raw intent/response pairs
file_path = "extracted_intents.csv"
df = pd.read_csv(file_path)

# Trim whitespace from intent and response columns *before* de-duplicating,
# so rows that differ only by stray leading/trailing spaces collapse into one.
df['intent'] = df['intent'].str.strip()
df['response'] = df['response'].str.strip()

# Remove duplicates (exact row matches after trimming)
df = df.drop_duplicates()

# Display dataset info after cleaning
print(f"Dataset after duplicate removal: {df.shape}")
df.head()


Dataset after duplicate removal: (139, 2)


Unnamed: 0,intent,response
0,acknowledge,Thank you for visiting our medibot! I’m here t...
1,acknowledge,"Thank you for your query,hope you get well soon!"
2,BP,Let me know which type of services do you want...
3,BP_Ayurvedic Remedies,Add garlic to your diet to help reduce high bl...
4,BP_Diet,BP Diet Information


In [2]:
import re

def find_special_chars(text):
    """Return every character in ``text`` that is not considered ordinary.

    Ordinary characters are ASCII letters, digits, whitespace and the
    punctuation marks ``. , ? ! ' -``. Anything else (underscores, brackets,
    double quotes, typographic apostrophes, ...) is reported.

    Args:
        text: The cell value to inspect. Non-string values (for example the
            NaN that pandas uses for an empty CSV field) are accepted.

    Returns:
        A list with one entry per offending character, in order of
        appearance and including repeats; empty when nothing is found or
        when ``text`` is not a string.
    """
    # re.findall raises TypeError on non-strings, so treat missing or
    # non-text cells as containing no special characters.
    if not isinstance(text, str):
        return []
    return re.findall(r"[^a-zA-Z0-9\s.,?!'-]", text)  # Keeping common punctuations

# Scan both text columns and keep the findings next to the data
intent_hits = df["intent"].apply(find_special_chars)
response_hits = df["response"].apply(find_special_chars)
df["intent_special_chars"] = intent_hits
df["response_special_chars"] = response_hits

# A row is of interest when either column produced at least one hit
intent_flagged = df["intent_special_chars"].apply(len) > 0
response_flagged = df["response_special_chars"].apply(len) > 0
special_chars_df = df[intent_flagged | response_flagged]

print(f"Rows with special characters: {len(special_chars_df)}")
special_chars_df[["intent", "intent_special_chars", "response", "response_special_chars"]]


Rows with special characters: 111


Unnamed: 0,intent,intent_special_chars,response,response_special_chars
0,acknowledge,[],Thank you for visiting our medibot! I’m here t...,"[’, ""]"
3,BP_Ayurvedic Remedies,[_],Add garlic to your diet to help reduce high bl...,[]
4,BP_Diet,[_],BP Diet Information,[]
5,BP_Home Remedies,[_],"eat pomegranates which contains Polyphenols, I...",[]
6,BP_medication,[_],CARDIGLOW-H40(Telmisartan and Hydrochlorothiaz...,"[(, ), (, )]"
...,...,...,...,...
134,Vomitings_Medication,[_],Domperidone tablets I.P. Dose-10mg Cause-Dom...,"[(, )]"
135,Vomitings_Naturopathy Remedies,[_],"""Take warm water mixed with salt and drink it ...","["", ""]"
136,Vomitings_Remedies,[_],Please let me know which type of remedies do y...,[]
137,Vomitings_Symptom_Check,"[_, _]",please check the vomitings symptoms,[]


In [3]:
# Normalise both text columns in one step:
#   intent   -> underscores become spaces
#   response -> double quotes are dropped
df = df.assign(
    intent=df["intent"].str.replace("_", " "),
    response=df["response"].str.replace('"', ""),
)

print("Cleaning complete. Here’s a preview:")
df.head()


Cleaning complete. Here’s a preview:


Unnamed: 0,intent,response,intent_special_chars,response_special_chars
0,acknowledge,Thank you for visiting our medibot! I’m here t...,[],"[’, ""]"
1,acknowledge,"Thank you for your query,hope you get well soon!",[],[]
2,BP,Let me know which type of services do you want...,[],[]
3,BP Ayurvedic Remedies,Add garlic to your diet to help reduce high bl...,[_],[]
4,BP Diet,BP Diet Information,[_],[]


In [4]:
# Sanity check: confirm that no intent still contains an underscore.
# regex=False makes the match a plain substring test, and na=False treats a
# missing intent as "no underscore" so the mask stays strictly boolean
# (a NaN in the mask would make the row selection raise).
remaining_underscores = df[df["intent"].str.contains("_", regex=False, na=False)]
print(f"Rows still containing underscores: {len(remaining_underscores)}")
remaining_underscores


Rows still containing underscores: 0


Unnamed: 0,intent,response,intent_special_chars,response_special_chars


In [5]:
# Save the cleaned dataset.
# The *_special_chars columns are diagnostics computed before cleaning, so
# they are stale and do not belong in the exported file. errors="ignore"
# keeps this cell working even if the detection cell was not run.
df.drop(
    columns=["intent_special_chars", "response_special_chars"],
    errors="ignore",
).to_csv("cleaned_intents.csv", index=False)

print("Cleaned dataset saved successfully as 'cleaned_intents.csv'.")


Cleaned dataset saved successfully as 'cleaned_intents.csv'.
