With dataset: works very well on the dataset, nice.

In [None]:
import numpy as np
import nltk
import string
import random
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem import WordNetLemmatizer

# Register /root/nltk_data as an NLTK search path, then fetch the
# resources this notebook needs (tokenizer data and WordNet).
nltk.data.path.append('/root/nltk_data')
for resource in ('punkt', 'wordnet', 'punkt_tab'):
    nltk.download(resource)


# Load the FAQ dataset from its fixed location.
file_path = "/content/Mental_Health_FAQ.csv"
df = pd.read_csv(file_path)

# Echo the header so a schema mismatch is easy to spot.
available_columns = df.columns.tolist()
print("CSV Columns:", available_columns)

# Fail fast on the first required column that is absent.
expected_columns = ['Questions', 'Answers']
first_missing = next(
    (name for name in expected_columns if name not in df.columns), None
)
if first_missing is not None:
    raise KeyError(f"The CSV file must contain a '{first_missing}' column.")

# Keep only rows that have both a question and an answer, and renumber them,
# so that position i in sent_tokens always refers to row i of df.  Dropping
# NaN questions from the token list alone would shift every later position,
# and response() looks answers up by position (df['Answers'].iloc[idx]).
df = df.dropna(subset=['Questions', 'Answers']).reset_index(drop=True)

# Convert text to lowercase and preprocess
sent_tokens = df['Questions'].astype(str).str.lower().tolist()
word_tokens = nltk.word_tokenize(" ".join(sent_tokens))

lemmer = WordNetLemmatizer()
# Translation table for str.translate() that deletes ASCII punctuation.
remove_punct_dict = {ord(punct): None for punct in string.punctuation}

def LemTokens(tokens):
    """Return the lemma of every token, preserving the input order."""
    return list(map(lemmer.lemmatize, tokens))

def LemNormalize(text):
    """Lowercase *text*, delete punctuation, tokenize, and lemmatize.

    Passed to TfidfVectorizer as its ``tokenizer`` callable.
    """
    cleaned = text.lower().translate(remove_punct_dict)
    raw_tokens = nltk.word_tokenize(cleaned)
    return LemTokens(raw_tokens)

GREET_INPUTS = ("hello", "hi", "greetings", "sup", "what's up", "hey")
GREET_RESPONSES = ["hi", "hey", "*nods*", "hi there", "hello", "I am glad! You are talking to me"]

def greet(sentence):
    """Return a random greeting if *sentence* contains a known greeting.

    Matching is case-insensitive and works on whole words, so "this" does
    not trigger on "hi".  Punctuation around a word is ignored ("Hi!"), and
    multi-word entries of GREET_INPUTS such as "what's up" are matched as
    phrases.

    Returns:
        A string picked from GREET_RESPONSES, or None when no greeting is
        present.
    """
    # Strip punctuation from the edges of each word only, so the apostrophe
    # inside "what's" survives.
    words = (w.strip(string.punctuation) for w in sentence.lower().split())
    # Pad with spaces so a phrase must line up with word boundaries.
    normalized = " " + " ".join(w for w in words if w) + " "
    if any(f" {phrase} " in normalized for phrase in GREET_INPUTS):
        return random.choice(GREET_RESPONSES)
    return None

def response(user_response):
    """Return the stored answer whose question best matches *user_response*.

    The questions in ``sent_tokens`` and the query are vectorized together
    with TF-IDF, and the query is then compared against the questions only
    using cosine similarity.

    Args:
        user_response: The user's message (the chat loop passes it already
            lowercased).

    Returns:
        The ``Answers`` value of the best-matching row of ``df``, or a fixed
        apology string when there are no questions or nothing overlaps.
    """
    fallback = "I am sorry! I don't understand you."
    if not sent_tokens:
        return fallback

    # Fit on questions + query so both share one vocabulary, but build a new
    # list rather than appending to the global sent_tokens: the global stays
    # intact even if vectorization raises.
    corpus = sent_tokens + [user_response]
    TfidfVec = TfidfVectorizer(tokenizer=LemNormalize, stop_words='english')
    tfidf = TfidfVec.fit_transform(corpus)

    # Score the query (last row) against the questions only.  Including the
    # query itself and taking the "second best" breaks when a stored question
    # is identical to the query: the tie can select the query's own index,
    # which is out of range for df.
    scores = cosine_similarity(tfidf[-1], tfidf[:-1]).ravel()
    idx = int(scores.argmax())

    if scores[idx] <= 0:
        return fallback
    # NOTE(review): assumes sent_tokens is positionally aligned with df rows
    # (i.e. no rows were dropped when building it) -- confirm.
    return df['Answers'].iloc[idx]

# Chatbot interaction
print("BOT: My name is Stark. Let's have a conversation! Type 'Bye' to exit.")
flag = True

while flag:
    try:
        # strip() so that "bye " or " thanks" are still recognised.
        user_response = input("You: ").strip().lower()
    except EOFError:
        # Input stream closed: end the session as if the user said bye.
        user_response = 'bye'

    if user_response == 'bye':
        print("BOT: Goodbye! Take care <3")
        flag = False
    elif user_response in ("thanks", "thank you"):
        # Thanking the bot also ends the conversation.
        print("BOT: You are welcome!")
        flag = False
    else:
        # Call greet() once and reuse the result; calling it a second time
        # repeats the work and draws a second random reply.
        greeting = greet(user_response)
        if greeting:
            print("BOT: " + greeting)
        else:
            print("BOT:", response(user_response))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


CSV Columns: ['Question_ID', 'Questions', 'Answers']
BOT: My name is Stark. Let's have a conversation! Type 'Bye' to exit.
You: hi
BOT: *nods*
You: i feel sad




BOT: A lot of people are alone right now, but we don't have to be lonely. We're all in this together. 
 While you may be physically separated from friends, family members, and other loved ones, it has never been more important to maintain those social connections. Social connections are an opportunity to seek and share support, talk through difficult feelings, share a laugh, keep up-to-date with loved ones, and help each other cope. This pandemic is a lot for one person to deal with on their own. While measures like physical distancing and self-isolation are necessary to slow the spread of the virus, the physical separation can amplify a lot of challenging emotions like loneliness and fear. 
 Think about the different ways to connect that are most meaningful for you. For example, you might prefer a video chat over a phone call, or you might prefer to text throughout the day rather than one set time for a video call. Then, work with your social networks to make a plan. You might video c

With web crawling: doesn't work :((((

In [None]:
import numpy as np
import nltk
import string
import random
import pandas as pd
import requests
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem import WordNetLemmatizer

# Ensure necessary NLTK resources are available.
# 'punkt_tab' is requested as well: recent NLTK releases load the tokenizer
# tables from it when nltk.word_tokenize() runs, so on a fresh runtime
# 'punkt' alone is not enough.
for resource in ('punkt', 'wordnet', 'punkt_tab'):
    nltk.download(resource)

# Function to scrape Q&A data from a given webpage
def scrape_qa_data(url, question_selector='.question-selector',
                   answer_selector='.answer-selector', timeout=10):
    """Scrape question/answer pairs from a web page into a DataFrame.

    Args:
        url: Address of the page to download.
        question_selector: CSS selector matching the question elements.
        answer_selector: CSS selector matching the answer elements.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A DataFrame with 'Questions' and 'Answers' columns, paired by
        position in the page.

    Raises:
        Exception: If the server does not answer with HTTP 200.
        ValueError: If the numbers of questions and answers differ.
    """
    import warnings  # local import keeps this block self-contained

    # NOTE(review): the default selectors look like template placeholders;
    # pass selectors that match the target page's real markup -- confirm
    # against the page's HTML.

    # Without a timeout a stalled server would block the notebook forever.
    page = requests.get(url, timeout=timeout)
    if page.status_code != 200:
        raise Exception(f"Failed to fetch data from {url}")

    soup = BeautifulSoup(page.text, 'html.parser')
    questions = [q.get_text(strip=True) for q in soup.select(question_selector)]
    answers = [a.get_text(strip=True) for a in soup.select(answer_selector)]

    if len(questions) != len(answers):
        raise ValueError("Mismatch between number of questions and answers")

    if not questions:
        # An empty result leaves the chatbot with nothing to match against,
        # so make the cause visible instead of failing silently later.
        warnings.warn(
            f"No elements matched {question_selector!r} / {answer_selector!r} "
            f"on {url}; the returned DataFrame is empty."
        )

    return pd.DataFrame({'Questions': questions, 'Answers': answers})

# Scrape the source page and keep a CSV copy of the result.
url = "https://medlineplus.gov/howtoimprovementalhealth.html"
df = scrape_qa_data(url)
df.to_csv("chatbot_data.csv", index=False)

# Lowercased questions form the corpus the user's input is matched against.
question_series = df['Questions'].dropna().astype(str)
sent_tokens = question_series.str.lower().tolist()
joined_questions = " ".join(sent_tokens)
word_tokens = nltk.word_tokenize(joined_questions)

# Lemmatizer and punctuation-deleting translation table for LemNormalize.
lemmer = WordNetLemmatizer()
remove_punct_dict = dict.fromkeys(map(ord, string.punctuation))

def LemTokens(tokens):
    """Return the lemma of every token, preserving the input order."""
    return list(map(lemmer.lemmatize, tokens))

def LemNormalize(text):
    """Lowercase *text*, delete punctuation, tokenize, and lemmatize.

    Passed to TfidfVectorizer as its ``tokenizer`` callable.
    """
    cleaned = text.lower().translate(remove_punct_dict)
    raw_tokens = nltk.word_tokenize(cleaned)
    return LemTokens(raw_tokens)

# Greeting inputs and responses
GREET_INPUTS = ("hello", "hi", "greetings", "sup", "what's up", "hey")
GREET_RESPONSES = ["hi", "hey", "*nods*", "hi there", "hello", "I am glad! You are talking to me"]

def greet(sentence):
    """Return a random greeting if *sentence* contains a known greeting.

    Matching is case-insensitive and works on whole words, so "this" does
    not trigger on "hi".  Punctuation around a word is ignored ("Hi!"), and
    multi-word entries of GREET_INPUTS such as "what's up" are matched as
    phrases.

    Returns:
        A string picked from GREET_RESPONSES, or None when no greeting is
        present.
    """
    # Strip punctuation from the edges of each word only, so the apostrophe
    # inside "what's" survives.
    words = (w.strip(string.punctuation) for w in sentence.lower().split())
    # Pad with spaces so a phrase must line up with word boundaries.
    normalized = " " + " ".join(w for w in words if w) + " "
    if any(f" {phrase} " in normalized for phrase in GREET_INPUTS):
        return random.choice(GREET_RESPONSES)
    return None

def response(user_response):
    """Return the scraped answer whose question best matches *user_response*.

    The questions in ``sent_tokens`` and the query are vectorized together
    with TF-IDF, and the query is then compared against the questions only
    using cosine similarity.

    Args:
        user_response: The user's message (the chat loop passes it already
            lowercased).

    Returns:
        The ``Answers`` value of the best-matching row of ``df``, or a fixed
        apology string when there are no questions or nothing overlaps.
    """
    fallback = "I am sorry! I don't understand you."
    if not sent_tokens:
        # Nothing was scraped, so there is nothing to compare against.
        return fallback

    # Fit on questions + query so both share one vocabulary, but build a new
    # list rather than appending to the global sent_tokens: the global stays
    # intact even if vectorization raises.
    corpus = sent_tokens + [user_response]
    TfidfVec = TfidfVectorizer(tokenizer=LemNormalize, stop_words='english')
    tfidf = TfidfVec.fit_transform(corpus)

    # Score the query (last row) against the questions only, so the query
    # can never be selected as its own best match.
    scores = cosine_similarity(tfidf[-1], tfidf[:-1]).ravel()
    idx = int(scores.argmax())

    if scores[idx] <= 0:
        return fallback
    # Reply with the answer paired with the matched question; returning
    # sent_tokens[idx] would only echo the question back to the user.
    # NOTE(review): assumes sent_tokens is positionally aligned with df rows
    # (i.e. no rows were dropped when building it) -- confirm.
    return df['Answers'].iloc[idx]

# Chatbot interaction
print("BOT: My name is Stark. Let's have a conversation! Type 'Bye' to exit.")

flag = True
while flag:
    try:
        # strip() so that "bye " or " thanks" are still recognised.
        user_response = input("You: ").strip().lower()
    except EOFError:
        # Input stream closed: end the session as if the user said bye.
        user_response = 'bye'

    if user_response == 'bye':
        print("BOT: Goodbye! Take care <3")
        flag = False
    elif user_response in ("thanks", "thank you"):
        # Thanking the bot also ends the conversation.
        print("BOT: You are welcome!")
        flag = False
    else:
        # Call greet() once and reuse the result; calling it a second time
        # repeats the work and draws a second random reply.
        greeting = greet(user_response)
        if greeting:
            print("BOT: " + greeting)
        else:
            print("BOT:", response(user_response))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


BOT: My name is Stark. Let's have a conversation! Type 'Bye' to exit.
You: hi
BOT: hey
You: what is mental health




BOT: I am sorry! I don't understand you.
You: how to deal with depression
BOT: I am sorry! I don't understand you.
You: Can people with mental illness recover?
BOT: I am sorry! I don't understand you.
You: bye
BOT: Goodbye! Take care <3


using hugging face