<a href="https://colab.research.google.com/github/etuckerman/AI-Powered_CS-S_Surfing_Community_Assistant/blob/main/KSF_LLM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [34]:
# Step 1: Install necessary packages
!pip install -q pandas scikit-learn numpy sentence-transformers tqdm langchain langchain_community transformers bitsandbytes accelerate nltk
!pip install gradio==3.35.2
!pip install tqdm
import pandas as pd
from tqdm.auto import tqdm  # Import tqdm



In [45]:
# Step 2: Import necessary libraries
import pandas as pd
import re
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer, util
import pickle
from tqdm import tqdm
from google.colab import files
import os
import torch
from transformers import BitsAndBytesConfig, AutoModelForCausalLM, AutoTokenizer, pipeline
from langchain import HuggingFacePipeline, PromptTemplate
from langchain.schema.runnable import RunnableSequence
from langchain.chains import LLMChain
import nltk # import nltk library
from nltk.corpus import stopwords # import the stopwords object from nltk.corpus
from nltk.stem import WordNetLemmatizer # import WordNetLemmatizer


In [5]:
# Step 3: Check for files in Google Colab
if os.path.exists('messages.csv'):
    # Load the CSV file if it exists
    df = pd.read_csv('messages.csv')
    print("File 'messages.csv' found and loaded.")
else:
    # If the file does not exist, prompt the user to upload it
    print("File 'messages.csv' not found. Please upload the file.")
    uploaded = files.upload()
    df = pd.read_csv(next(iter(uploaded.keys())))
    print("File 'messages.csv' uploaded and loaded.")


File 'messages.csv' found and loaded.


In [15]:
# Data Cleaning
nltk.download('stopwords')
nltk.download('wordnet')

df['Message'] = df['Message'].astype(str)
encoded_emoji_pattern = re.compile(r'[^\x00-\x7F]+')
df['Message'] = df['Message'].apply(lambda x: re.sub(encoded_emoji_pattern, '', x))

# Remove URLs and mentions
df['Message'] = df['Message'].apply(lambda x: re.sub(r'http\S+', '', x))
df['Message'] = df['Message'].apply(lambda x: re.sub(r'@\S+', '', x))

# Remove blank messages
df = df[df['Message'].str.strip() != '']

# Remove short responses and messages with less than 3 words
short_responses = ['ok', 'lol', 'haha', 'yes', 'no', 'sure', 'right', 'cool', 'nah', 'yep', 'nope', 'yeah', 'k', 'hm']
df = df[~df['Message'].str.lower().isin(short_responses)]
df = df[df['Message'].str.contains(r'\b\w{3,}\b')]
df = df[df['Message'].str.strip().str.len() > 2]

# Remove stop words and lemmatize
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
df['Message'] = df['Message'].apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x.split() if word.lower() not in stop_words]))

df = df.reset_index(drop=True)

# Save cleaned messages to a new CSV file
df.to_csv('cleaned_messages.csv', index=False)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [12]:
nltk.download('punkt')
# Step 4: Initialize the SentenceTransformer model
model = SentenceTransformer('paraphrase-MiniLM-L6-v2', device='cuda')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [27]:
len(qa_pairs)

26731

In [22]:
# Step 4: Initialize and test multiple SentenceTransformer models
model_names = ['paraphrase-MiniLM-L6-v2', 'all-mpnet-base-v2', 'all-distilroberta-v1']

for model_name in model_names:
  model = SentenceTransformer(model_name, device='cuda')
  qa_pairs = extract_qa_pairs(df)
  df_name = f'df_{model_name}'
  globals()[df_name] = pd.DataFrame(qa_pairs)
  print(f'Results for model: {model_name}')
  print(globals()[df_name].head(10)) # Print the first 10 rows

# Now you can access the DataFrames using their names (e.g., df_paraphrase-MiniLM-L6-v2, df_all-mpnet-base-v2, etc.)

Results for model: paraphrase-MiniLM-L6-v2
                                        Question  \
0  check someone's profile weird character name?   
1                    hey level vote KSF servers?   
2                         Hi know cheeto banned?   
3                 someone restart eu public pls?   
4                  might thinking different map?   
5                           pls restart 100t eu?   
6                              long backup take?   
7                                   moonkingdom?   
8                      extreme copout cyberwave?   
9                                    whered play   

                                              Answer  
0                                    miniminter live  
1                                           hi rango  
2                                                nan  
3                                  Lol manana public  
4                             surfle channel already  
5  ||I mean first thought borderland cause gate 1...  

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Results for model: all-mpnet-base-v2
                                        Question  \
0  check someone's profile weird character name?   
1                    hey level vote KSF servers?   
2                         Hi know cheeto banned?   
3                 someone restart eu public pls?   
4                  might thinking different map?   
5                           pls restart 100t eu?   
6                              long backup take?   
7                                   moonkingdom?   
8                      extreme copout cyberwave?   
9                                    whered play   

                                              Answer  
0                                    miniminter live  
1                                           hi rango  
2                                                nan  
3                                  Lol manana public  
4                             surfle channel already  
5  ||I mean first thought borderland cause gate 1...  
6    

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.3k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/653 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/328M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/333 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Results for model: all-distilroberta-v1
                                        Question  \
0  check someone's profile weird character name?   
1                    hey level vote KSF servers?   
2                         Hi know cheeto banned?   
3                 someone restart eu public pls?   
4                  might thinking different map?   
5                           pls restart 100t eu?   
6                              long backup take?   
7                                   moonkingdom?   
8                      extreme copout cyberwave?   
9                                    whered play   

                                              Answer  
0                                    miniminter live  
1                                           hi rango  
2                                                nan  
3                                  Lol manana public  
4                             surfle channel already  
5  ||I mean first thought borderland cause gate 1...  
6 

In [28]:
def extract_qa_pairs(df, window_size=50):  # Increased window size to 50
    qa_pairs = []
    for i in range(len(df) - window_size):
        message = df['Message'][i]
        author = df['Author'][i]

        # Check if the message is a question
        if message.endswith('?') or any(word in message.lower() for word in ['who', 'what', 'where', 'when', 'why', 'how']):
            potential_answers = df['Message'][i + 1:min(i + 1 + window_size, len(df))]
            potential_authors = df['Author'][i + 1:min(i + 1 + window_size, len(df))]

            # Check if there are any potential answers
            if len(potential_answers) > 0:
                # Rank potential answers based on cosine similarity
                if len(potential_answers) > 1:
                  question_embedding = model.encode(message)
                  answer_embeddings = model.encode(potential_answers.tolist())
                  similarities = cosine_similarity([question_embedding], answer_embeddings)[0]
                  best_answer_index = similarities.argmax()
                else:
                  best_answer_index = 0

                best_answer = potential_answers.iloc[best_answer_index]
                best_author = potential_authors.iloc[best_answer_index]

                # Check if the best answer is from a different author and not a question
                if best_author != author and not best_answer.endswith('?'):
                    qa_pairs.append({'Question': message, 'Answer': best_answer})

    return qa_pairs

In [29]:
# Extract QA pairs from the cleaned dataframe
qa_pairs = extract_qa_pairs(df)
messages_qa = pd.DataFrame(qa_pairs)

# Save QA pairs to a CSV file
messages_qa.to_csv('qa_pairs.csv', index=False)

In [30]:
messages_qa.head(20)

Unnamed: 0,Question,Answer
0,check someone's profile weird character name?,||not_so_zen probs idk||
1,hey level vote KSF servers?,||I sure nsz 3rd||
2,wwho,worky
3,Hi know cheeto banned?,ye bad
4,pls restart 100t eu?,someone fix manually guess
5,long backup take?,backup time
6,moonkingdom?,maybe like illumination shit pretty bad guess
7,extreme copout cyberwave?,csgo map dont know sure
8,good way see picture every surf map available ...,"load surf server first, load map console, serv..."
9,Oh forgot name map PROUD getting completion on...,"Map like 3 different versions. original, one f..."


In [36]:
if os.path.exists('qa_embeddings.pkl'):
    with open('qa_embeddings.pkl', 'rb') as f:
        messages_qa['Question_Embedding'] = pickle.load(f)
        messages_qa['Answer_Embedding'] = pickle.load(f)
    print("QA Embeddings loaded from 'qa_embeddings.pkl'.")
else:
    tqdm.pandas() # Initialize tqdm for pandas
    messages_qa['Question_Embedding'] = messages_qa['Question'].progress_apply(lambda x: model.encode(x))
    messages_qa['Answer_Embedding'] = messages_qa['Answer'].progress_apply(lambda x: model.encode(x))

    with open('qa_embeddings.pkl', 'wb') as f:
        pickle.dump(messages_qa['Question_Embedding'].tolist(), f)
        pickle.dump(messages_qa['Answer_Embedding'].tolist(), f)
    print("QA Embeddings computed and saved to 'qa_embeddings.pkl'.")

  0%|          | 0/27467 [00:00<?, ?it/s]

  0%|          | 0/27467 [00:00<?, ?it/s]

QA Embeddings computed and saved to 'qa_embeddings.pkl'.


In [37]:
def find_relevant_qa_pair(user_query, messages_qa):
     query_embedding = model.encode(user_query)

     messages_qa['Question_Similarity'] = messages_qa['Question_Embedding'].progress_apply(lambda x: cosine_similarity([query_embedding], [x])[0][0])
     messages_qa['Answer_Similarity'] = messages_qa['Answer_Embedding'].progress_apply(lambda x: cosine_similarity([query_embedding], [x])[0][0])

     messages_qa['Similarity'] = (messages_qa['Question_Similarity'] + messages_qa['Answer_Similarity']) / 2

     best_match = messages_qa.loc[messages_qa['Similarity'].idxmax()]
     return best_match['Question'], best_match['Answer']

In [39]:
# Step 7: Set up the model and tokenizer with 4-bit quantization
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)

model_name = "mistralai/Mistral-7B-v0.1"

# Read the token from huggingface_token.txt
with open('huggingface_token.txt', 'r') as f:
    token = f.read().strip()

model_4bit = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=quantization_config,
    use_auth_token=token # Pass the token for authentication
)
tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=token)



config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]



tokenizer_config.json:   0%|          | 0.00/996 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [40]:
# Step 8: Create the text generation pipeline

pipeline_inst = pipeline(
    "text-generation",
    model=model_4bit,
    tokenizer=tokenizer,
    use_cache=True,
    device_map="auto",
    max_length=2500,
    do_sample=True,
    top_k=5,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.eos_token_id,
)

# Set up LangChain with the new API
llm = HuggingFacePipeline(pipeline=pipeline_inst)

  llm = HuggingFacePipeline(pipeline=pipeline_inst)


In [42]:
# Step 9: Define the template for generating refined responses
template = """<s>[INST] You are a helpful assistant, knowledgeable about Counter-Strike: Source surfing. A user has asked the following question:

{user_query}

Here's a relevant message from the CS:S surfing community:

{most_relevant_message}

Please provide a helpful and informative response to the user's question, considering the relevant message.
</s>
<out>
Refined Answer: [/INST]
"""

In [56]:
# Step 10: Generate refined answers (modified to accept most_relevant_message)
def generate_refined_answer(user_query, most_relevant_message): # Changed function signature
    prompt = PromptTemplate(template=template, input_variables=["user_query", "most_relevant_message"])
    llm_chain = LLMChain(prompt=prompt, llm=llm)
    response = llm_chain.run({"user_query": user_query, "most_relevant_message": most_relevant_message})
    return response

In [57]:
from tqdm.auto import tqdm
from sentence_transformers import SentenceTransformer, util

# Step 11: Find the most relevant message
def find_most_relevant_message(user_query, messages_qa):
    model = SentenceTransformer('all-mpnet-base-v2', device='cuda') # Load the model on CUDA if available

    user_query_embedding = model.encode(user_query, convert_to_tensor=True)
    max_similarity = -1
    most_relevant_message = None

    # Iterate over the rows of the DataFrame
    for index in tqdm(range(len(messages_qa)), desc="Finding relevant message"):
        message_data = messages_qa.iloc[index] # Get the row data using the integer index

        # Access the 'Question' column
        message = message_data['Question']

        message_embedding = model.encode(message, convert_to_tensor=True)
        similarity = util.cos_sim(user_query_embedding, message_embedding)

        if similarity > max_similarity:
            max_similarity = similarity
            most_relevant_message = message

    return most_relevant_message

In [60]:
# Example usage
user_query = "What are some popular CS:S surf maps?"
most_relevant_message = find_most_relevant_message(user_query, messages_qa) # Find the message first
refined_answer = generate_refined_answer(user_query, most_relevant_message) # Then generate the answer
print("Most relevant message:", most_relevant_message)
print("Refined Answer:", refined_answer)

Finding relevant message:   0%|          | 0/27467 [00:00<?, ?it/s]

Finding relevant message:   0%|          | 0/19 [00:00<?, ?it/s]

AttributeError: 'str' object has no attribute 'iloc'

In [53]:
messages_qa.head()

Unnamed: 0,Question,Answer,Question_Embedding,Answer_Embedding
0,check someone's profile weird character name?,||not_so_zen probs idk||,"[-0.008260254, -0.06181579, -0.0012661027, 0.0...","[0.047274515, -0.0339494, 0.0020443453, 0.0313..."
1,hey level vote KSF servers?,||I sure nsz 3rd||,"[0.0042027286, -0.03383143, -0.0050567407, 0.0...","[0.04278438, -0.030553034, 0.011113143, 0.0619..."
2,wwho,worky,"[0.045430966, 0.022987619, 0.0005916415, -0.03...","[0.014592825, 0.023254113, 0.0020521404, -0.04..."
3,Hi know cheeto banned?,ye bad,"[0.0065030013, -0.01924986, -0.017681759, -0.0...","[0.01716209, -0.03768509, 0.015804702, -0.0351..."
4,pls restart 100t eu?,someone fix manually guess,"[0.0034461734, 0.020581365, -0.012955842, 0.01...","[0.026100537, -0.010459516, 0.022285827, 0.010..."


In [None]:
def chatbot_interface(user_query):
   relevant_question, relevant_answer = find_relevant_qa_pair(user_query, messages_qa)
   refined_answer = generate_refined_answer(user_query, relevant_answer)  # Use relevant_answer here
   return refined_answer