In [1]:
import json
import requests
from bs4 import BeautifulSoup
import IPython.display as display
from collections.abc import Iterable
import os

In [2]:
# In-memory knowledge base: a dictionary keyed by question/topic string.
# It starts empty; load_knowledge() rebinds it to the contents of the JSON file.
knowledge_base = {}

### Loading and Saving the file

In [3]:
# Location of the JSON file that persists the knowledge base between sessions
filename = os.path.join(".", "ai_knowledge.json")

def load_knowledge():
    """Load the persisted knowledge base into the global ``knowledge_base``.

    Reads the JSON file at ``filename``. The knowledge base falls back to an
    empty dict when the file is missing, cannot be parsed as JSON (for
    example an empty or truncated file), or does not contain a JSON object,
    because the rest of the program indexes it by question.
    """
    global knowledge_base
    try:
        with open(filename, 'r', encoding='utf-8') as f:
            loaded = json.load(f)
    except FileNotFoundError:
        loaded = {}
    except json.JSONDecodeError as e:
        # A damaged file must not stop the session from starting.
        print(f"Warning: could not parse {filename} ({e}); starting with an empty knowledge base.")
        loaded = {}

    # Valid JSON that is not an object (e.g. a list) cannot be used as the knowledge base.
    knowledge_base = loaded if isinstance(loaded, dict) else {}

In [4]:
# Persist the knowledge base to disk
def save_knowledge(filename=os.path.join(".", "ai_knowledge.json")):
    """Write the global ``knowledge_base`` to ``filename`` as JSON.

    The output is indented by four spaces; an existing file is overwritten.
    """
    with open(filename, 'w') as out_file:
        out_file.write(json.dumps(knowledge_base, indent=4))

### Translate the content from URL into English

In [5]:
# Shared googletrans client used by translate_to_english() to bring fetched
# content into English.
from googletrans import Translator

translator = Translator()

def translate_to_english(text):
    """Return ``text`` in English, translating it only when necessary.

    The language is detected once for the whole text. Text detected as
    English is returned unchanged; anything else is translated as a whole.

    Args:
        text: The text to translate.

    Returns:
        The English text. Empty or whitespace-only input is returned as is
        without calling the translator. On any failure a string starting
        with "Error during translation:" is returned instead of raising.
    """
    # Nothing to detect or translate, so skip the translator entirely.
    if not text or not text.strip():
        return text

    try:
        detected_language = translator.detect(text).lang
        if detected_language == 'en':
            return text  # Already English: hand it back untouched
        return translator.translate(text, dest='en').text
    except Exception as e:
        # Failures are reported in-band so callers can inspect the returned string.
        return f"Error during translation: {str(e)}"

### Function to Fetch content from URL 

In [6]:
# Function to fetch content from url
def fetch_content_from_url(url, timeout=10, max_chars=2000):
    """Download a web page and return its text content, truncated.

    Text is collected tag by tag (all ``p`` elements first, then headings,
    and so on through ``common_tags``), so the result is grouped by tag type
    rather than following document order.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up.
        max_chars: Maximum number of characters of extracted text to return.

    Returns:
        str: Up to ``max_chars`` characters of extracted text, snippets
        separated by blank lines. Failures are reported in-band: the string
        "Error: Could not extract meaningful content." when nothing was
        found, or "Error fetching content from <url>: <reason>" when the
        request or parsing failed.
    """
    try:
        # Without a timeout an unresponsive server would block the session forever.
        response = requests.get(url, timeout=timeout)
        # Treat 4xx/5xx answers as failures instead of scraping the error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Use a list of common tags for extracting meaningful content
        common_tags = [
            'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
            'span', 'strong', 'em', 'blockquote', 'li',
            'div', 'article', 'section', 'main', 'header', 'footer'
        ]
        content_parts = []
        seen_snippets = set()

        for tag in common_tags:
            for element in soup.find_all(tag):
                text = element.get_text(strip=True)
                # Nested tags (e.g. a <span> filling a <p>) produce the very
                # same text again; keep only the first occurrence.
                if text and text not in seen_snippets:
                    seen_snippets.add(text)
                    content_parts.append(text)

        content = "\n\n".join(content_parts)
        return content[:max_chars] if content else "Error: Could not extract meaningful content."

    except Exception as e:
        return f"Error fetching content from {url}: {str(e)}"

### Training Model to sort the information

In [1]:
import pandas as pd
import joblib
import os
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [None]:
# Labelled training data: expected in a 'data' folder inside the working directory
labeled_dataset_path = os.path.join(os.curdir, "data", "Cleaned_output_file.xlsx")

# Persisted classifier: expected in a 'models' folder inside the working directory
model_filename = os.path.join(os.curdir, "models", "voting_classifier_model.joblib")

# Read the labelled sentences into memory once for the whole session
labeled_dataset = pd.read_excel(labeled_dataset_path)

# Train, evaluate and persist the ensemble sentence classifier
def train_model():
    """Train the sentence classifier on ``labeled_dataset`` and save it to disk.

    A TF-IDF vectorizer (at most 5000 uni-/bi-gram features) feeds a
    soft-voting ensemble of logistic regression, an SVM and a random forest.
    20% of the rows are held out to measure accuracy. The fitted pipeline is
    written to ``model_filename``.

    Returns:
        tuple: ``(pipeline, accuracy)``, the fitted pipeline and its
        accuracy on the held-out split.
    """
    # Rows with a missing sentence or label cannot be vectorized or fitted.
    training_rows = labeled_dataset.dropna(subset=['Sentences', 'Labels'])
    X = training_rows['Sentences'].astype(str)  # guard against non-text cells
    y = training_rows['Labels']

    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Individual models; the stochastic ones are seeded like the split so
    # that retraining on the same data gives the same model.
    model1 = LogisticRegression(max_iter=1000)
    model2 = SVC(probability=True, random_state=42)  # probabilities are required for soft voting
    model3 = RandomForestClassifier(random_state=42)

    # Combine models into a voting classifier
    ensemble_model = VotingClassifier(
        estimators=[
            ('logistic', model1),
            ('svc', model2),
            ('random_forest', model3)
        ],
        voting='soft'  # average the predicted class probabilities
    )

    # Vectorizer and classifier travel together so raw sentences can be predicted directly
    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(max_features=5000, ngram_range=(1, 2))),
        ('classifier', ensemble_model)
    ])

    # Train the model
    pipeline.fit(X_train, y_train)

    # Evaluate the model
    y_predict = pipeline.predict(X_test)
    accuracy = accuracy_score(y_test, y_predict)

    # Make sure the target folder exists before saving; otherwise the dump
    # fails only after the whole training run has already been paid for.
    model_dir = os.path.dirname(model_filename)
    if model_dir:
        os.makedirs(model_dir, exist_ok=True)
    joblib.dump(pipeline, model_filename)

    return pipeline, accuracy

def load_model():
    """Return ``(pipeline, accuracy)`` for the sentence classifier.

    When the file at ``model_filename`` exists, the pipeline is loaded from
    it and ``accuracy`` is ``None`` because no evaluation is run. Otherwise
    the result of ``train_model()`` is returned.
    """
    if not os.path.exists(model_filename):
        # No saved model yet: train one now
        return train_model()

    # A saved model is available: reuse it without re-evaluating
    return joblib.load(model_filename), None

# Obtain the classifier for this session and report where it came from
pipeline, accuracy = load_model()
if accuracy is None:
    # load_model() returns no accuracy when it read the model from disk
    print("Model loaded from disk.")
else:
    print(f'Model accuracy: {accuracy:.2f}')

### Sorting useful information fetched from the website URL

In [None]:
# Define the sorting information function
def sorting_information(raw_data):
    """Keep only the sentences of ``raw_data`` that the classifier labels 'Useful'.

    The text is split on '. ', fragments are stripped and blank ones are
    dropped. The remaining sentences are classified in one batch. Sentences
    predicted as 'Useful' are also appended, labelled 'Useful', to the
    labelled dataset, both in memory and in the Excel file at
    ``labeled_dataset_path``.

    Args:
        raw_data: Text to filter.

    Returns:
        list[list[str]]: One single-element list per useful sentence, in
        input order (empty when nothing is useful).
    """
    global labeled_dataset

    model, accuracy = load_model()
    if accuracy is not None:
        print("Model is trained")
    else:
        print("Model loaded from disk")

    # Blank fragments carry no information and would end up as empty rows
    # in the dataset file.
    sentences = [fragment.strip() for fragment in raw_data.split('. ')]
    sentences = [sentence for sentence in sentences if sentence]
    if not sentences:
        return []

    # One batched prediction instead of one model call per sentence
    predictions = model.predict(sentences)
    useful_sentences = [
        [sentence]
        for sentence, prediction in zip(sentences, predictions)
        if prediction == 'Useful'  # must match the label used in the dataset
    ]

    if useful_sentences:
        # NOTE(review): the model's own predictions are fed back as training
        # labels here; confirm this self-labelling is intended.
        new_data = pd.DataFrame({
            'Sentences': [item[0] for item in useful_sentences],
            'Labels': ['Useful'] * len(useful_sentences),
        })
        updated_data = pd.concat([labeled_dataset, new_data], ignore_index=True)
        updated_data.to_excel(labeled_dataset_path, index=False)
        # Keep the in-memory dataset in step with the file. Concatenating
        # onto a stale copy would make the next write drop these rows again.
        labeled_dataset = updated_data

    return useful_sentences

### Function for AI to ask questions

In [None]:
def get_user_input():
    """Interactively collect main points and comments for a topic.

    The user names a topic, then enters main points until typing 'stop';
    for each main point, comments are entered until 'stop'. Every main point
    is appended to ``knowledge_base[topic]`` as a dict with the keys
    "main_point" and "comments", and the knowledge base is saved at the end.
    """
    def read_until_stop(prompt):
        """Yield raw replies to ``prompt`` until one equals 'stop' (any letter case)."""
        while True:
            reply = input(prompt)
            if reply.lower() == 'stop':
                return
            yield reply

    user_input = input("What topic do you want to add information about? ")

    # Reuse the topic's existing entry list, or start a new one
    topic_entries = knowledge_base.setdefault(user_input, [])

    for main_point in read_until_stop("Enter a main point (or type 'stop' to finish): "):
        comment_prompt = f"Enter a comment for '{main_point}' (or type 'stop' to finish): "
        topic_entries.append({
            "main_point": main_point.strip(),
            "comments": [comment.strip() for comment in read_until_stop(comment_prompt)],
        })

    print(f"AI: I learned your answer about '{user_input}'")

    # Persist what was just learned
    save_knowledge()

# Function for AI to ask a question
def ask_ai_question(question):
    """Answer ``question`` from memory, or learn an answer interactively.

    When the question is unknown the user may type 'skip', a URL to learn
    from, or anything else to enter the answer manually.

    Args:
        question: The question text; also the key used in ``knowledge_base``.

    Returns:
        - str: the stored answer rendered as a string, when ``question`` is
          already a key of ``knowledge_base``;
        - list: the sorted information learned from a URL and accepted by
          the user;
        - "I learned your answer": after manual entry via ``get_user_input()``;
        - False: when the URL yielded nothing useful or the user rejected it;
        - None: when the user skipped or the page could not be fetched.
    """
    # Checking if AI already knows the answer
    if question in knowledge_base:
        return f"{knowledge_base[question]}"

    # AI doesn't know the answer, ask the user to provide one
    print(f"AI: I don't know the answer to '{question}'. What should I learn?")
    user_input = input("Provide an answer or a URL for me to learn (or type 'skip' to skip): ").strip()

    if user_input.lower() == 'skip':
        print("AI: You can enter the answer later.")
        return  # Exit from the answer input state

    if not user_input.startswith(('https://', 'http://')):
        # NOTE(review): the text typed above is not stored; get_user_input()
        # prompts for a topic and its points from scratch. Confirm intended.
        get_user_input()
        return "I learned your answer"

    # Learn from the URL
    content = fetch_content_from_url(user_input)
    # fetch_content_from_url() reports failures with these exact prefixes.
    # Searching the text for the word "error" would also reject every
    # healthy page that merely mentions it.
    if content.startswith(("Error fetching content from ", "Error: Could not extract")):
        print(f"Error in the fetched content: {content}")
        return

    # Sorting useful information from the data
    sorted_information = sorting_information(content)
    if not sorted_information:
        # An empty answer would be stored but never be usable afterwards.
        print("AI: No useful information was found at that URL.")
        return False

    print("Fetched and sorted information:")
    print(sorted_information)

    if input("Is this information useful? (y/n): ").strip().lower() != 'y':
        print("AI: The information was not saved.")
        return False

    # Store the answer unless identical content is already stored under some question
    if sorted_information not in knowledge_base.values():
        knowledge_base[question] = sorted_information
    print(f"AI: I learned from the website: {user_input}")
    save_knowledge()
    return sorted_information

### Functions to format, wrap and show all the QUESTIONS/ANSWERS and delete them

In [None]:
import textwrap

def wrap_text(text, width=80):
    """Re-flow ``text`` to the given ``width`` and return it as one newline-joined string."""
    wrapped_lines = textwrap.wrap(text, width)
    return "\n".join(wrapped_lines)

def format_answer(answer):
    """Surround ``answer`` with the ANSI escape codes for bold on and reset."""
    bold_on, reset = "\033[1m", "\033[0m"
    return "{}{}{}".format(bold_on, answer, reset)

def format_data(raw_data):
    """Turn stored or fetched answer data into readable text under one header.

    Args:
        raw_data: A string, or a (possibly nested) list/tuple/dict of
            strings as found in the knowledge base. For dicts the values
            are formatted in order and the keys are dropped; other values
            are converted with ``str()``.

    Returns:
        str: The header "Formatted Data Output:" with an underline, followed
        by the cleaned paragraphs separated by blank lines.
    """
    header = "Formatted Data Output:\n" + "="*30 + "\n"
    # The header is added once here, not once per list item.
    return header + "\n\n".join(_format_paragraphs(raw_data))


def _format_paragraphs(raw_data):
    """Return the cleaned paragraphs of ``raw_data`` as a flat list, without any header."""
    if isinstance(raw_data, dict):
        # Manually entered answers are stored as dicts; only their values are text.
        raw_data = list(raw_data.values())

    if isinstance(raw_data, (list, tuple)):
        paragraphs = []
        for item in raw_data:
            paragraphs.extend(_format_paragraphs(item))
        return paragraphs

    if raw_data is None:
        return []

    text = raw_data if isinstance(raw_data, str) else str(raw_data)

    formatted_paragraphs = []
    for para in text.split("\n"):
        # Remove excessive whitespace
        para = re.sub(r'\s+', ' ', para).strip()

        # Turn bullet characters into markdown-style list items
        if para.startswith('•'):
            para = f"- {para[1:].strip()}"

        # Capitalize the letter that follows ". ". The pattern has to consume
        # that letter; a pure lookaround matches the empty string and would
        # leave the text unchanged.
        para = re.sub(r'(?<=\. )[a-z]', lambda match: match.group(0).upper(), para)

        if para:
            formatted_paragraphs.append(para)

    return formatted_paragraphs

def show_all_answers():
    """Print every stored question with its answer, or a notice when nothing is stored."""
    if not knowledge_base:
        print("AI: No questions and answers have been learned yet.")
        return

    print("Stored Questions and Answers:")
    for questions in knowledge_base:
        answer = knowledge_base[questions]
        print(f"Question {questions} \nAnswer: {answer}")

def show_all_questions():
    """Print every stored question, or a notice when nothing is stored."""
    if not knowledge_base:
        print("AI: No questions have been learned yet.")
        return

    lines = ["Stored Questions:"]
    lines.extend(f"Question: {questions}" for questions in knowledge_base)
    print("\n".join(lines))

def remove_ai_memory(question):
    """Forget one stored question, save the change and print the outcome.

    Prints a notice instead when the knowledge base is empty or the
    question is not stored.
    """
    if not knowledge_base:
        print("AI: No questions have been learned yet.")
        return

    if question not in knowledge_base:
        print(f"AI: The question '{question}' was not found in memory.")
        return

    knowledge_base.pop(question)  # Remove the question together with its answer
    save_knowledge()
    print(f"AI: The Memory for the question '{question}' has been removed.")

# Removing all the knowledge
def remove_all_knowledge():
    """Erase every stored question and answer and save the now-empty knowledge base.

    Prints a notice instead when there is nothing to remove.
    """
    if not knowledge_base:
        print("AI: No questions have been learned yet.")
        return

    # The dictionary is emptied in place, so the module-level name keeps
    # referring to the same object.
    knowledge_base.clear()
    save_knowledge()
    print("AI: All questions and answers have been removed from memory.")

import re

# Helper: walk a nested structure and yield its leaves in order
def flatten(lst):
    """Lazily yield the leaf items of an arbitrarily nested iterable.

    Strings count as leaves and are yielded whole; every other iterable is
    descended into recursively.
    """
    for element in lst:
        is_nested = isinstance(element, Iterable) and not isinstance(element, str)
        if not is_nested:
            yield element
            continue
        for leaf in flatten(element):
            yield leaf

def showing_mathematical_equations(raw_text):
    """Print ``raw_text`` with ``[Tex]...[/Tex]`` spans rewritten as ``$...$``.

    A list input is first flattened with ``flatten`` and joined with single
    spaces. Runs of newlines are collapsed to one newline and surrounding
    whitespace is stripped. The result is printed; nothing is returned.
    """
    text = ' '.join(flatten(raw_text)) if isinstance(raw_text, list) else raw_text

    # DOTALL lets a formula span several lines
    tex_span = re.compile(r'\[Tex\](.*?)\[/Tex\]', re.DOTALL)
    with_dollars = tex_span.sub(r'$\1$', text)

    single_newlines = re.compile(r'\n+').sub('\n', with_dollars)
    print(single_newlines.strip())

### Function to check if question already exists in memory

In [None]:
from sentence_transformers import SentenceTransformer, util

# Load the pre-trained sentence-transformer model once at module level so
# every question lookup reuses the same instance.
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Embed every stored question with the sentence-transformer model
def precompute_question_embeddings(knowledge_base):
    """Encode all keys of ``knowledge_base`` with the module-level ``model``.

    Returns:
        tuple: ``(embeddings, questions)``, the result of ``model.encode``
        for the questions followed by the list of question strings in the
        same order.
    """
    questions = [*knowledge_base.keys()]
    embeddings = model.encode(questions, convert_to_tensor=True)
    return embeddings, questions

# Precompute the embeddings once


def is_question_in_memory(user_question):
    """Find the stored answer whose question best matches ``user_question``.

    All stored questions are embedded on every call and compared with the
    embedding of ``user_question`` by cosine similarity.

    Returns:
        The stored answer of the most similar question when its similarity
        is greater than 0.8; ``False`` otherwise, including when the
        knowledge base is empty.
    """
    if not knowledge_base:
        return False

    # Embeddings of every stored question plus the questions in matching order
    stored_question_embeddings, stored_questions = precompute_question_embeddings(knowledge_base)

    # Embedding of the question being asked
    query_embedding = model.encode(user_question, convert_to_tensor=True)

    # Similarity of the asked question to each stored question
    scores = util.pytorch_cos_sim(query_embedding, stored_question_embeddings)
    best_score = scores.max().item()
    best_position = scores.argmax().item()

    # Only a score strictly above the threshold counts as the same question
    similarity_threshold = 0.8
    if not best_score > similarity_threshold:
        return False

    return knowledge_base[stored_questions[best_position]]

### Voice Recognition function

In [None]:
import speech_recognition as sr

def recognize_until_stop():
    """Transcribe microphone speech phrase by phrase until 'stop' is recognised.

    Each captured phrase is sent to the Google Web Speech API. Phrases that
    cannot be understood, failed requests and other errors are reported on
    stdout and listening continues.

    Returns:
        str: All recognised phrases, excluding the final 'stop', joined by
        single spaces.
    """
    recognizer = sr.Recognizer()
    collected_text = []  # Recognised phrases, in the order they were spoken

    # Use the microphone as the audio source
    with sr.Microphone() as source:
        print("Adjusting for ambient noise. Please wait...")
        recognizer.adjust_for_ambient_noise(source, duration=1)
        print("Listening... (Say 'stop' to end)")

        listening = True
        while listening:
            try:
                audio_data = recognizer.listen(source)
                print("Recognizing...")

                text = recognizer.recognize_google(audio_data)
                print(f"You said: {text}")

                if text.lower() != "stop":
                    collected_text.append(text)
                else:
                    print("Stopping...")
                    listening = False  # Leave the loop once "stop" is said
            except sr.UnknownValueError:
                print("Sorry, I could not understand the audio.")
            except sr.RequestError as e:
                print(f"Could not request results from Google Speech Recognition service; {e}")
            except Exception as e:
                print(f"An error occurred: {e}")

    return " ".join(collected_text)

### Providing Google search links for the user's question

In [None]:
from urllib.parse import quote_plus

def save_to_json(data, filename='search_results.json'):
    """Serialize ``data`` as 4-space-indented JSON into ``filename``, replacing any existing file."""
    with open(filename, 'w') as json_file:
        json_file.write(json.dumps(data, indent=4))

def load_from_json(filename='search_results.json'):
    """Return the parsed JSON content of ``filename``, or an empty dict when the file does not exist."""
    if not os.path.exists(filename):
        return {}

    with open(filename, 'r') as json_file:
        return json.loads(json_file.read())

def search_internet(user_input, max_links=5, timeout=10):
    """Search Google for ``user_input`` and return the first result links.

    Args:
        user_input: The search query.
        max_links: Maximum number of links to return.
        timeout: Seconds to wait for the search page before giving up.

    Returns:
        list[str] | None: Up to ``max_links`` distinct result URLs (possibly
        empty), or ``None`` when the request failed or the response status
        was not 200.
    """
    from urllib.parse import unquote  # local import: only needed to decode redirect targets

    search_url = f"https://www.google.com/search?q={quote_plus(user_input)}"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }
    try:
        # A timeout keeps an unresponsive server from blocking the session.
        response = requests.get(search_url, headers=headers, timeout=timeout)
    except requests.RequestException as e:
        # Connection problems are reported like HTTP errors instead of
        # propagating out of the interactive loop.
        print(f"Request failed: {e}")
        return None

    if response.status_code != 200:
        print(f"HTTP Error: {response.status_code}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')
    links = []

    # Result titles are <h3> elements wrapped in the anchor that carries the link
    for result in soup.find_all('h3'):
        a_tag = result.find_parent('a')
        href = a_tag.get('href') if a_tag else None
        if not href:
            continue

        if href.startswith('/url?q='):
            # Redirect form: /url?q=<percent-encoded target>&<other parameters>
            link = unquote(href[len('/url?q='):].split('&')[0])
        elif href.startswith(('http://', 'https://')):
            link = href
        else:
            continue  # Relative or internal link, not a search result

        if link not in links:
            links.append(link)
        if len(links) >= max_links:
            break

    return links

def provide_links(user_input):
    """Print result links for ``user_input``.

    Links saved for the same question by an earlier call are reused.
    Otherwise the internet is searched, and any links found are printed and
    added to the saved search history.
    """
    def print_numbered(links):
        """Print the links as a list numbered from 1."""
        for index, link in enumerate(links, start=1):
            print(f"{index}. {link}")

    # Previously saved results, keyed by question
    search_history = load_from_json()

    if user_input in search_history:
        print("Here are the saved links for your question:")
        print_numbered(search_history[user_input])
        return

    print("Searching the internet...")
    links = search_internet(user_input)
    if not links:
        print("Error: Unable to retrieve search results.")
        return

    print("Here are the first five links:")
    print_numbered(links)

    # Remember the result for the next time this question is asked
    search_history[user_input] = links
    save_to_json(search_history)

### Main loop to interact with the AI

In [None]:
# Main loop to interact with the AI
def main():
    """Run the interactive question/answer session until the user enters 'exit'.

    Each turn reads a question (typed, or spoken when voice recognition was
    chosen), handles the built-in commands for showing and removing stored
    knowledge, answers from memory when a similar question is stored,
    optionally prints search links, and otherwise asks the user to teach
    the answer.
    """
    show_questions_commands = {'show questions', 'show all the questions', 'show all questions'}
    show_answers_commands = {'show answers', 'show all the answers', 'show all answers'}
    remove_one_commands = {
        'remove question', 'delete question', 'remove the answer',
        'delete answer', 'delete the answer',
    }
    remove_all_commands = {'remove all questions', 'remove all question', 'remove all answers'}

    load_knowledge()

    print("Welcome! You can ask me questions or describe what you're doing.")
    user_choice = input("Do you want to use voice recognition (y/n): ").strip().lower()
    while True:
        if user_choice == 'y':
            question = recognize_until_stop()
        else:
            question = input("\nAsk a question or type 'exit' to stop: ")

        # Normalise typed and spoken input alike so commands (including
        # 'exit') match regardless of letter case or stray whitespace.
        question = question.strip().lower()
        if not question:
            continue  # Nothing was asked; prompt again

        if question == 'exit':
            print("Goodbye!")
            break

        if question in show_questions_commands:
            show_all_questions()
            continue

        if question in show_answers_commands:
            show_all_answers()
            continue

        if question in remove_one_commands:
            if input("Are you sure? y/n").strip().lower() == 'y':
                question_to_remove = input("Enter the question to remove --> ")
                remove_ai_memory(question_to_remove)
            continue

        if question in remove_all_commands:
            # Both confirmations are required. In every other case go back
            # to the prompt, so a declined wipe is never looked up as a question.
            if (input("Are you sure? Doing this will wipe out all the memory of AI (y/n): ").strip().lower() == 'y'
                    and input("Please confirm (y/n): ").strip().lower() == 'y'):
                remove_all_knowledge()
            continue

        # Answer from memory when a sufficiently similar question is stored
        stored_answer = is_question_in_memory(question)
        if stored_answer:
            print("AI Answer:")
            showing_mathematical_equations(format_data(stored_answer))
            continue

        links = input("Do you want me to provide you links for your question (y/n): ").strip().lower()
        if links == 'y':
            provide_links(question)
            continue

        # Unknown question: let the user teach the answer
        answer = ask_ai_question(question)
        if answer:
            showing_mathematical_equations(format_data(answer))
        else:
            print("No change")

In [None]:
# Start the interactive session only when this code is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()