<a href="https://colab.research.google.com/github/etuckerman/surf_NLP/blob/main/KSF_LLM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
!pip install pandas scikit-learn numpy sentence-transformers tqdm



In [29]:
# Step 1: Import necessary libraries
import pandas as pd
import re
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from sentence_transformers import SentenceTransformer
import pickle
from tqdm import tqdm
import openai


In [30]:
# Set up OpenAI API key
with open('openai_key.txt', 'r') as f:
    openai.api_key = f.read().strip()

In [3]:
# Step 2: Load the CSV file
from google.colab import files
uploaded = files.upload()

Saving messages.csv to messages.csv


In [4]:
# Assuming the CSV file is called 'messages.csv'
df = pd.read_csv(list(uploaded.keys())[0])

In [5]:
# Step 3: Data Cleaning
# Convert any non-string values in the 'Message' column to an empty string
df['Message'] = df['Message'].astype(str)

# Apply regex to remove encoded emojis
encoded_emoji_pattern = re.compile(r'[^\x00-\x7F]+')
df['Message'] = df['Message'].apply(lambda x: re.sub(encoded_emoji_pattern, '', x))

# Remove messages with no full words or very short messages
short_responses = ['ok', 'lol', 'haha', 'yes', 'no', 'sure', 'right', 'cool', 'nah', 'yep', 'nope', 'yeah', 'k', 'hm']
df = df[~df['Message'].str.lower().isin(short_responses)]
df = df[df['Message'].str.contains(r'\b\w{3,}\b')]

# Remove empty or very short messages after cleaning
df = df[df['Message'].str.strip().str.len() > 2]

# Reset index after cleaning
df = df.reset_index(drop=True)

In [6]:
# Step 4: Initialize model with GPU support if available
model = SentenceTransformer('paraphrase-MiniLM-L6-v2', device='cuda')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [7]:
# Function for batch processing of embeddings
def batch_encode(model, texts, batch_size=32):
    embeddings = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i + batch_size]
        embeddings.extend(model.encode(batch))
    return embeddings

In [8]:
# Generate embeddings using batch processing
df['Embedding'] = batch_encode(model, df['Message'].tolist())

In [9]:
# Save embeddings to a file after computation (optional)
with open('embeddings.pkl', 'wb') as f:
    pickle.dump(df['Embedding'].tolist(), f)

# Load embeddings from a file (optional, for future use)
# with open('embeddings.pkl', 'rb') as f:
#     df['Embedding'] = pickle.load(f)

In [18]:
# Step 5: Function to find the most relevant answer using index as recency
def find_answer(query, df, recency_bias=0.01):
    query_embedding = model.encode(query)


    # Initialize tqdm progress bar
    tqdm.pandas(desc="Finding best answer")

    # Calculate cosine similarity between the query and all messages, progress_apply() for tqdm tracking
    df['Similarity'] = df['Embedding'].progress_apply(lambda x: cosine_similarity([query_embedding], [x])[0][0])

    # Apply recency bias based on index
    df['Recency_Score'] = np.exp(-recency_bias * (len(df) - df.index))

    # Combine similarity and recency scores
    df['Score'] = df['Similarity'] * df['Recency_Score']

    # Find the message with the highest score
    best_match = df.loc[df['Score'].idxmax()]
    return best_match['Message']

In [19]:
# Example usage
query = "how do i strafe?"
answer = find_answer(query, df)
print("Best Answer:", answer)

Finding best answer: 100%|██████████| 658280/658280 [06:02<00:00, 1815.20it/s]

Best Answer: Also you kind of look a bit shaky and reactionary when you board a ramp especially on the blind angles. Get really comfortable with all the ramp boards and flicks in saveloc so you know exactly how the flow of the map is and try to not so abruptly stop but kind of consistently strafe when you're in the air, either towards the side of the next ramp or just weave back and forth if it's straigh in front of you





In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import pipeline
import os

# Set your Hugging Face token (if needed)
os.environ['HF_TOKEN'] = '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'

# Load model and tokenizer
model_name = "mistralai/Mistral-7B-v0.1"

# Load the tokenizer and model with offloading to CPU
tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=os.getenv('HF_TOKEN'))
model = AutoModelForCausalLM.from_pretrained(model_name, use_auth_token=os.getenv('HF_TOKEN')).to('cpu')


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
def generate_text(prompt, max_length=50):
    input_ids = tokenizer.encode(prompt, return_tensors='pt')
    output = model.generate(input_ids, max_length=max_length)
    return tokenizer.decode(output[0], skip_special_tokens=True)

In [None]:
# Assuming you have the following variables
#query = "What is the Counter-Strike surf movement gamemode?"
initial_answer = answer
context = "Counter-Strike surf movement gamemode"

# Generate refined answer
refined_answer = generate_response(query, context, model, tokenizer)
print("Refined Answer:", refined_answer)
