In [1]:
#importing all packages

import pandas as pd #to make csv file into table interpreted by Python
import os #to traverse folders and file system
import numpy as np
import json


from sentence_transformers import SentenceTransformer #for vector embedding creation
from langchain_community.embeddings import SentenceTransformerEmbeddings
import time
import nltk #for natural lang processing
from nltk.tokenize import sent_tokenize #tokenises string by sentence
from langchain_community.vectorstores import FAISS #for vector db
from langchain_core.documents import Document #document is a distinct piece of text

  from tqdm.autonotebook import tqdm, trange


In [2]:
def get_user_chat_content(folder_path):
    """
    Reads user chat logs from text files in the specified folder.

    Args:
        folder_path (str): Path to the folder containing user chat logs.

    Returns:
        dict: A dictionary with user IDs as keys and their chat content as values.
    """
    user_chats = {}
    for filename in os.listdir(folder_path):
        if filename.endswith('.txt'):
            user_id = os.path.splitext(filename)[0]  # Assuming filenames like 'user1.txt'
            file_path = os.path.join(folder_path, filename)
            if os.path.isfile(file_path):
                with open(file_path, 'r', encoding='utf-8') as file:
                    content = file.read()
                    user_chats[user_id] = content
            else:
                print(f"File {file_path} does not exist.")
    print("User chat logs loaded.")
    print("User IDs in user_chats:", user_chats.keys())
    return user_chats

In [3]:
#functions for chunking and adding chunks for embedding

nltk.download('punkt') #module in nltk

def chunk_text(text, chunk_size):
    #Chunks text into smaller pieces of length chunk_size
    return [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)] 

def process_sentences(sentences, model, chunk_size=256):
    """
    Processes sentences by chunking and encoding.

    Args:
        sentences (list): List of sentences.
        model: The SentenceTransformer model instance.
        chunk_size (int): The size of each text chunk.

    Returns:
        list: List of chunks.
        list: List of embeddings.
    """
    embeddings = []
    chunks_list = []
    for sentence in sentences:
        chunks = chunk_text(sentence, chunk_size)
        chunk_embeddings = model.encode(chunks)  # Use the model from sentence-transformers
        embeddings.extend(chunk_embeddings)
        chunks_list.extend(chunks)
    return chunks_list, embeddings


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/aakashnamboodiri/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
def process_user_chat(content, model, chunk_size=256):
    """
    Processes a user's chat content by chunking and encoding.

    Args:
        content (str): The user's chat content.
        model: The SentenceTransformer model instance.
        chunk_size (int): The size of each text chunk.

    Returns:
        np.ndarray: The user's style embedding.
    """
    sentences = sent_tokenize(content)
    chunks, embeddings = process_sentences(sentences, model, chunk_size)
    # Average the embeddings to get a style embedding
    style_embedding = np.mean(embeddings, axis=0)
    return style_embedding


In [5]:
def create_user_style_profiles(user_chats, model):
    """
    Creates style profiles for users based on their chat histories.

    Args:
        user_chats (dict): Dictionary of user IDs and their chat content.
        model: SentenceTransformer model instance.

    Returns:
        dict: A dictionary with user IDs and their style embeddings and labels.
    """
    user_profiles = {}
    for user_id, content in user_chats.items():
        style_embedding = process_user_chat(content, model)
        # For simplicity, assign a style label (this would be more complex in practice)
        style_label = 'friendly'  # Placeholder label
        user_profiles[user_id] = {
            'style_embedding': style_embedding,
            'style_label': style_label
        }
    print("User style profiles created.")
    return user_profiles


In [6]:
import json

def load_user_profiles(json_file):
    """
    Loads user profiles from a JSON file matching the specified format.

    Args:
        json_file (str): Path to the JSON file containing user profiles.

    Returns:
        dict: A dictionary with user IDs and their profiles.
    """
    with open(json_file, 'r', encoding='utf-8') as file:
        profiles = json.load(file)

    user_profiles = {}
    for user_id, profile_data in profiles.items():
        # Exclude sensitive information like passwords
        user_profiles[user_id] = {
            'full_name': profile_data.get('full_name', ''),
            'college': profile_data.get('college', ''),
            'school_year': profile_data.get('school_year', ''),
            'major': profile_data.get('major', ''),
            'age': profile_data.get('age', ''),
            'gender': profile_data.get('gender', ''),
            'smoking_habits': profile_data.get('smoking_habits', ''),
            'sleeping_habits': profile_data.get('sleeping_habits', ''),
            'guest_preferences': profile_data.get('guest_preferences', ''),
            'has_pet': profile_data.get('has_pet', False),
            'bio': profile_data.get('bio', ''),
            # If pronouns are not provided, you might derive them from gender
            'pronouns': profile_data.get('pronouns', derive_pronouns(profile_data.get('gender', ''))),
            # Add any other fields you need
            'preferences': profile_data.get('preferences', '')
        }
    print("User profiles loaded from JSON.")
    return user_profiles

def derive_pronouns(gender):
    """
    Derives pronouns based on the user's gender.

    Args:
        gender (str): The user's gender.

    Returns:
        str: The pronouns corresponding to the gender.
    """
    gender = gender.lower()
    if gender == 'male':
        return 'he/him'
    elif gender == 'female':
        return 'she/her'
    else:
        return 'they/them'


In [7]:
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

start_time = time.time()  # Start computation time

# Get user chat contents; returns a dictionary {user_id: chat_content}
user_chats = get_user_chat_content("Chats")

user_style_profiles = {}  # Dictionary to store user style profiles

for user_id, text in user_chats.items():
    sentences = sent_tokenize(text)
    
    # Process sentences to get chunks and embeddings
    chunks, embeddings = process_sentences(sentences, model)
    
    # Optionally, write embeddings to a file per user
    with open(f'embeddings_{user_id}.txt', 'w') as f:
        for i, embedding in enumerate(embeddings):
            f.write(f"Embedding for chunk {i+1}: {embedding.tolist()}\n")
    
    # Create a style embedding for the user by averaging their chunk embeddings
    style_embedding = np.mean(embeddings, axis=0)
    
    # Store the user's style embedding in the user style profiles dictionary
    user_style_profiles[user_id] = {
        'style_embedding': style_embedding
        # You can add more style features here if needed
    }
    
    # Create Document objects for each chunk if needed
    documents = [Document(page_content=chunk) for chunk in chunks]
    print(f"Documents for user {user_id}:")
    print(documents)
    
print("Process finished --- %s seconds ---" % (time.time() - start_time))

User chat logs loaded.
User IDs in user_chats: dict_keys(['aakash', 'ankith', 'ashwin'])
Documents for user aakash:
Documents for user ankith:
[Document(metadata={}, page_content='[10/1/22, 3:54:03\u202fPM] Varun Hariharan: \u200eMessages and calls are end-to-end encrypted.'), Document(metadata={}, page_content='No one outside of this chat, not even WhatsApp, can read or listen to them.'), Document(metadata={}, page_content='\u200e[10/1/22, 3:54:03\u202fPM] Ankith: \u200esticker omitted\n[10/1/22, 3:54:17\u202fPM] Ankith: Imma use this\n[10/1/22, 4:04:02\u202fPM] Varun Hariharan: daddy\n[10/4/22, 2:23:50\u202fAM] Ankith: Didn’t reach maple\n[10/4/22, 2:23:59\u202fAM] Ankith: Got shot in the balls in the middle of t'), Document(metadata={}, page_content='he street\n[10/4/22, 2:24:10\u202fAM] Ankith: I am now bleeding to death as we speak\n[10/5/22, 10:01:20\u202fAM] Ankith: Oi oi oi\n[10/5/22, 10:01:25\u202fAM] Ankith: Do we have hw on ALEKS\n[10/5/22, 10:06:15\u202fAM] Varun Hariharan:

In [8]:

from langchain_community.vectorstores import FAISS #for similarity search


# Load user chats
user_chats = get_user_chat_content("Chats")

# Load the embedding model
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Create user style profiles
user_style_profiles = create_user_style_profiles(user_chats, model)

# Load user profiles
user_profiles = load_user_profiles('users.json')


User chat logs loaded.
User IDs in user_chats: dict_keys(['aakash', 'ankith', 'ashwin'])
User style profiles created.
User profiles loaded from JSON.


In [9]:
# Import necessary libraries
from langchain.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.schema import Document


# Initialize the embeddings model
embeddings_model = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')

# Prepare documents and embeddings
documents = []
embeddings_list = []

for user_id, text in user_chats.items():
    sentences = sent_tokenize(text)
    chunks, embeddings = process_sentences(sentences, model)
    
    # Create Document objects for each chunk
    user_documents = [Document(page_content=chunk) for chunk in chunks]
    documents.extend(user_documents)
    embeddings_list.extend(embeddings)

# Create text-embedding pairs
text_embeddings = list(zip([doc.page_content for doc in documents], embeddings_list))

# Create the FAISS vector store
vector_db = FAISS.from_documents(documents, embeddings_model)

# Save the FAISS index
vector_db.save_local(folder_path="vector_store", index_name="index")


In [10]:

def load_faiss_index(index_path: str, embeddings_model) -> FAISS:
    return FAISS.load_local(
        index_path,
        embeddings=embeddings_model,
        allow_dangerous_deserialization=True
    )

# Load the FAISS index
faiss_index = load_faiss_index('vector_store', embeddings_model)


In [11]:
message_type_response = { #how to handle different types of queries when entered into chatgpt/gemini
    "Greetings Message" : "Greet the user by saying Hi or Hello",
    "Appreciation/Feedback Messages" : "Reply with thank you or similar response",
    "Questions Asked About the Content" : "Refer the context provided below",
    "Questions Asked Out of Context but Relevant to the Influencer" : "Refer online material and generate response",
    "Questions Asked but Irrelevant to the Influencer": "Don't answer",
    "Spam Messages": "Don't answer"
}

In [12]:
from langchain_core.document_loaders.base import Document
from typing import List

#start of RAG stage

#function for similarity search using an arbitrary number of top documents(5) and take query as a string and return a list of docs

def similarity_search(faiss_index: FAISS, query: str, k: int = 5) -> List[Document]:
    return faiss_index.similarity_search(query, k=k)

In [13]:
#library that converts text to speech
import pyttsx3
def text_to_speech(text, filename): #takes in text and name of file to save speech in
    text_speech = pyttsx3.init()
    voices = text_speech.getProperty('voices') #retrives list of voices
    text_speech.setProperty('voice', voices[0].id) #chooses a particular voice
    text_speech.save_to_file(text, filename) #saves speech as .wav file
    text_speech.runAndWait() #execution

In [14]:
from langchain_google_genai import ChatGoogleGenerativeAI #importing gemini model from google 

def load_model():
    llm = ChatGoogleGenerativeAI(model="gemini-pro",api_key="AIzaSyApivUwDmnG5mR-3SqEv4-x6QZP0ru6iO0") #loads model with my own api key
    return llm

In [33]:
from transformers import AutoTokenizer, AutoModelForCausalLM

def get_retrieval_results(context: List[Document], point: str, user_profile: dict, user_style_label: str) -> str:
    context_text = " ".join([doc.page_content for doc in context])

    prompt = f'''
Assume you are {user_profile.get('full_name', 'an individual')} studying {user_profile.get('major', '')} at {user_profile.get('college', '')}. Your pronouns are {user_profile.get('pronouns', 'they/them')}.
You are looking for a roommate who {user_profile.get('guest_preferences', 'is compatible with your lifestyle')}.
Your chatting style is {user_style_label}.

Below is the conversation so far:
{context_text}

Respond to the following message from a potential roommate:
"{point}"

Your response should continue the conversation naturally, reflecting your chatting style.
'''

    response = generate_response(prompt)
    return response.strip()


tokenizer = AutoTokenizer.from_pretrained('gpt2')
language_model = AutoModelForCausalLM.from_pretrained('gpt2')

def generate_response(prompt):
    input_ids = tokenizer.encode(prompt, return_tensors='pt')
    output_ids = language_model.generate(
        input_ids,
        max_length=1024,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        temperature=0.7,
        num_return_sequences=1
    )
    response = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return response.strip()

In [34]:
def simulate_conversation(user1_id, user2_id, user_profiles, user_style_profiles, faiss_index, max_turns=10):
    conversation_history = ''
    last_message = 'Hello!'
    print(f"{user1_id}: {last_message}")
    conversation_history += f"{user1_id}: {last_message}\n"

    for turn in range(max_turns):
        current_user_id = user2_id if turn % 2 == 0 else user1_id

        # Retrieve relevant context
        docs = similarity_search(faiss_index, last_message, 5)

        # Get user profile and style label
        user_profile = user_profiles[current_user_id]
        user_style_label = user_style_profiles.get(current_user_id, {}).get('style_label', 'friendly')

        # Generate response
        response = get_retrieval_results(
            context=docs,
            point=last_message,
            user_profile=user_profile,
            user_style_label=user_style_label
        )
        print(f"{current_user_id}: {response}")
        conversation_history += f"{current_user_id}: {response}\n"

        # Update last_message for the next turn
        last_message = response

    print("Conversation simulation completed.")
    return conversation_history


In [37]:
# Simulate conversation between two users
user1_id = 'ashwin'
user2_id = 'aakash'

conversation_history = simulate_conversation(
    user1_id,
    user2_id,
    user_profiles,
    user_style_profiles,
    faiss_index
)

print("\nFinal Conversation:")
print(conversation_history)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


ashwin: Hello!
aakash: Assume you are Aakash Namboodiri studying ECE at University of Washington. Your pronouns are he/him.
You are looking for a roommate who I occasionally host people.
Your chatting style is friendly.

Below is the conversation so far:
] Aakash N: submitted
[7/5/24, 23:41:21] Joydeep: amazing
[7/8/24, 11:46:46] Aakash N: Hello sir
[7/8/24, 11:46:49] Aakash N: Where is you? e! Relax!  da. nee?

Respond to the following message from a potential roommate:
"Hello!"

Your response should continue the conversation naturally, reflecting your chatting style.

Aakash N: [6:58:49] Aakash N: Hello sir

[7/8/24, 12:01:02] Joydeep: nice

[7/9/24, 16:45:44] Joydeep: ok

[7/10/24, 13:30:01] Joydeep: so whats up?

[7/11/24, 18:44:33] Joydeep: so im glad to hear you

[7/12/24, 19:43:29] Aakash N: Hello

[7/13/24, 13:34:55] Joydeep: well i dont think im in

[7/14/24, 16:18:10] Joydeep: i dont know

[7/15/24, 11:42:42] Aakash N: Good to hear from a roommate

[7/16/24, 11:45:09] Joydeep

Token indices sequence length is longer than the specified maximum sequence length for this model (1691 > 1024). Running this sequence through the model will result in indexing errors
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


ValueError: Input length of input_ids is 1691, but `max_length` is set to 1024. This can lead to unexpected behavior. You should consider increasing `max_length` or, better yet, setting `max_new_tokens`.

In [None]:
docs #returns list of relevant docs

In [None]:
filename = "output.wav"
text_to_speech(result, filename)

## Notes

- EXTEND FOR MORE EMBEDDINGS WITHOUT REGENERATION EVERYTIME
- LEARN FAISS INTERNAL MECHANISM
- PROMPT ENGINEERING FOR QUALITY RESPONSE AND TO USE OPENAI IN LESS TOKENS(IMP)
- CREATING MORE TEST QUERIES FOR ACCURACY TESTING (IMP)
- CREATE CHAT INTERFACE OR SYSTEM TO USE USER RESPONSE AS CONTEXT FOR NEW RESPONSE

