<a href="https://colab.research.google.com/github/etuckerman/surf_NLP/blob/main/surf_NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Log into HuggingFace**

In [1]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [2]:
data_path = "/content/surf_messages.csv"
preprocessed_data = []


In [3]:
import csv
from spacy.lang.en import English
from gensim import corpora, models


In [4]:
def load_and_preprocess_data(data_path):
  """
  This function loads data from a CSV file, extracts messages, and preprocesses them.

  Args:
      data_path: The path to the CSV file containing chat data.

  Returns:
      A list of cleaned text messages.
  """
  data = []
  try:
    with open(data_path, 'r', encoding='utf-8') as csvfile:
      reader = csv.DictReader(csvfile)
      for row in reader:
        # Assuming messages are in the 'Message' column (adjust if different)
        message = row['Message']
        data.append(message)
  except FileNotFoundError:
    print("Error: Could not find the data file. Please check the path and try again.")
    exit()

  # Preprocess data (clean text)
  cleaned_data = load_and_preprocess_data(data)
  return cleaned_data

def clean_text(text):
  """
  This function cleans text data by removing punctuation, stop words, and converting to lemmas.

  Args:
      text: The text string to be cleaned.

  Returns:
      A cleaned text string.
  """
  nlp = English()
  tokens = nlp(text.lower())
  clean_tokens = [token.lemma_ for token in tokens if not token.is_stop and not token.is_punct]
  return " ".join(clean_tokens)


In [5]:
def perform_topic_modeling(preprocessed_data, num_topics=15):
  """
  This function performs topic modeling on the preprocessed data using LDA.

  Args:
      preprocessed_data: A list of cleaned text messages.
      num_topics: The number of topics to identify (default 15).

  Returns:
      A dictionary containing the LDA model and id2word mapping.
  """
  dictionary = corpora.Dictionary(preprocessed_data)
  corpus = [dictionary.doc2bow(text) for text in preprocessed_data]
  ldamodel = models.LdaModel(corpus, id2word=dictionary, num_topics=num_topics)
  return {'model': ldamodel, 'id2word': dictionary.id2word}


In [6]:
# Replace with your function to identify potential questions (consider conversation flow)
def is_question(message):
  """
  This function checks if a message is a question based on patterns.

  Args:
      message: The text message to be analyzed.

  Returns:
      True if the message is a question, False otherwise.
  """
  question_words = ["help", "tip", "best", "smoke", "nade", "flash", "fire", "pop", "molly", "decoy", "inferno", "mirage", "dust2", "overpass", "nuke", "cache", "train", " Vertigo", "cobblestone", "office", "cache", "office", "vertigo", "cobblestone", "how", "what", "where", "when", "why", "can", "could", "should", "which", "do", "be", "would"]
  return any(word in message.lower() for word in question_words) or "?" in message.lower()

def get_qna_pairs(data, window_size=10):
  """
  This function identifies potential question-answer pairs within a time window.

  Args:
      data: A list containing all cleaned chat messages.
      window_size: The size of the time window to consider after a question (default 10 messages).

  Returns:
      A list of potential question-answer pairs as tuples (question, answer).
  """
  qna_pairs = []
  for i in range(len(data)):
    if is_question(data[i]):  # Check if current message is a question
      # Look for answers within the window
      for j in range(i + 1, min(i + window_size + 1, len(data))):
        qna_pairs.append((data[i], data[j]))  # Add potential question-answer pair
  return qna_pairs


In [7]:
# Placeholder function for answer relevance (replace with your logic)
def is_answer_relevant(question, answer):
  """
  This function checks if an answer is potentially relevant to the question based on keywords.

  Args:
      question: The user's question.
      answer: The potential answer message.

  Returns:
      True if the answer seems relevant, False otherwise (replace with your logic).
  """
  # This is a basic example using keywords in the answer. You can improve this!
  question_words = question.lower().split()
  answer_words = answer.lower().split()
  return any(word in answer_words for word in question_words)


In [9]:
# Assuming data_path is already defined (replace with your actual path)
data_path = "/content/surf_messages.csv"

# Load and preprocess data
preprocessed_data = load_and_preprocess_data(data_path)

# Perform topic modeling (assuming you want 15 topics, adjust if desired)
topic_modeling_results = perform_topic_modeling(preprocessed_data)
ldamodel = topic_modeling_results['model']
id2word = topic_modeling_results['id2word']

# Main loop for user interaction
while True:
  user_question = input("You: ")
  if user_question.lower() == "quit":
    break

  # Preprocess user question
  cleaned_question = clean_text(user_question)

  # Identify potential question-answer pairs
  qna_pairs = get_qna_pairs(preprocessed_data)

  # Get dominant topic for the question
  dominant_topic = get_dominant_topic(cleaned_question, ldamodel, id2word)

  # Find potential answers based on topic and conversation flow
  potential_answers = find_answers(cleaned_question, dominant_topic, qna_pairs, is_answer_relevant)

  # Print response
  if potential_answers:
    print("CS:GO Surf Chatbot:")
    for answer in potential_answers:
      print(answer)
  else:
    print("CS:GO Surf Chatbot: I couldn't find an answer related to CS:GO Surf in this conversation.")

print("CS:GO Surf Chatbot: Goodbye!")

# Define functions not implemented yet (replace with their logic)
def get_dominant_topic(message, ldamodel, id2word):
  """
  This function identifies the most dominant topic for a message using LDA.

  Args:
      message: The text message for which to identify the dominant topic.
      ldamodel: The trained LDA model.
      id2word: The id2word mapping from the LDA model.

  Returns:
      The index of the most dominant topic.
  """
  bow = dictionary.doc2bow(clean_text(message))  # Assuming clean_text is defined
  topics = ldamodel[bow]  # Get topic distribution for the message
  dominant_topic_index = max(enumerate(topics), key=lambda x: x[1])[0]  # Find the topic with the highest probability
  return dominant_topic_index

def find_answers(question, topic_index, qna_pairs, is_answer_relevant):
  """
  This function finds potential answers for a question based on topic and conversation flow.

  Args:
      question: The user's question.
      topic_index: The index of the most dominant topic for the question.
      qna_pairs: A list of potential question-answer pairs.
      is_answer_relevant: A function to check answer relevance (replace with your implementation).

  Returns:
      A list of potential answer messages.
  """
  answers = []
  for qna_pair in qna_pairs:
    q, a = qna_pair
    if get_dominant_topic(q, ldamodel, id2word) == topic_index and is_answer_relevant(question, a):  # Check topic and answer relevance
      answers.append(a)
  return answers


TypeError: expected str, bytes or os.PathLike object, not list