In [2]:
import pandas as pd
import ast
import torch
import json
import random
from collections import defaultdict
from langchain_community.llms import Ollama
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler




# Load the per-user article data. The file is opened explicitly as UTF-8 so
# that decoding does not depend on the platform's default locale encoding
# (article titles and usernames may contain non-ASCII characters).
with open('../../data/new_usernames_with_data.json', 'r', encoding='utf-8') as file:
    data = json.load(file)
print(data.keys())

# Mapping of username -> sequence of (item, count) pairs, as unpacked by the
# preview loop below.
usernames_with_data = data['new_usernames_with_data']

print(f"Total usernames with data: {len(usernames_with_data)}")

# Print the first 3 usernames and their data
for i, (username, items) in enumerate(usernames_with_data.items()):
    if i >= 3:
        break
    print(f"Username: {username}")
    print("Items:")
    for item, count in items:
        print(f"  - {item}: {count}")
        

dict_keys(['new_usernames_with_data'])
Total usernames with data: 490
Username: Spark
Items:
  - Freddie Mercury: 53
  - Richard Dawkins: 41
  - The God Delusion: 41
  - Twisted Sister: 28
  - M. C. Escher: 27
  - Billy Joel: 21
  - Mad Max (film): 17
  - Joe Scarborough: 13
  - Parents Music Resource Center: 13
  - Invention of radio: 11
Username: Slash
Items:
  - List of open-source video games: 16
  - Kevlar: 8
  - Friedrich Nietzsche: 8
  - Boxing: 6
  - The Holocaust: 6
  - Transcendental Meditation: 6
  - Fire triangle: 6
  - Unreal Tournament: 5
  - Enabling act: 5
  - Thus Spoke Zarathustra: 5
Username: Mattspac
Items:
  - List of Impact Wrestling personnel: 56
  - List of WWE personnel: 47
  - Night of Champions (2008): 21
  - WWE SmackDown vs. Raw 2009: 20
  - WrestleMania XXIV: 14
  - WWE Raw: 14
  - Dickinson, Texas: 14
  - 2008 WWE Draft: 13
  - WWE: 13
  - Kurt Angle: 13


In [4]:
# Full batch dataset: category vocabulary and its prompt-ready rendering.
from itertools import islice

# To dry-run on a subset, slice the mapping lazily instead of materialising it:
#first_100_users = dict(islice(usernames_with_data.items(), 100))

# Fixed label set the LLM is asked to choose from (40 entries).
categories = [
    "Academic disciplines",
    "Business",
    "Communication",
    "Concepts",
    "Culture",
    "Economy",
    "Education",
    "Energy",
    "Engineering",
    "Entertainment",
    "Entities",
    "Food and drink",
    "Geography",
    "Government",
    "Health",
    "History",
    "Human behavior",
    "Humanities",
    "Information",
    "Internet",
    "Knowledge",
    "Language",
    "Law",
    "Life",
    "Lists",
    "Mass media",
    "Mathematics",
    "Military",
    "Nature",
    "People",
    "Philosophy",
    "Politics",
    "Religion",
    "Science",
    "Society",
    "Sports",
    "Technology",
    "Time",
    "Universe",
    "Other",
]

# One category per line, ready to be embedded in the classification prompt.
categoryString = "\n".join(str(label) for label in categories)

# Function to classify users in batches of 10
def classify_users_in_batches(usernames_with_data, batch_size=10, model="llama3.1"):
    """Classify users into the predefined categories, one LLM call per batch.

    Args:
        usernames_with_data: Mapping of username -> iterable of
            ``(article, count)`` pairs. Only the article part is sent to the
            model; the count is ignored.
        batch_size: Number of users included in each prompt. Must be >= 1.
        model: Name of the Ollama model to query.

    Returns:
        A list holding the raw response of each batch, in batch order. The
        list is empty when ``usernames_with_data`` is empty.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1 (previously a negative
            value silently produced an empty result).
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be at least 1, got {batch_size}")

    # Materialise the key order once; rebuilding the list for every batch made
    # the loop quadratic in the number of users.
    usernames = list(usernames_with_data)
    if not usernames:
        return []

    # The system prompt does not depend on the batch, so build it once.
    content = (
        f"Below is a list of 40 categories:\n{categoryString}\n\n"
        "You will be given a list of users along with their article names. "
        "Your task is to classify each user into one or more of the given categories based on the context and content of their article names. "
        "The response must be in the following format:\n"
        "<user_name>: category1, category2, category3, category4\n\n"
        "Instructions:\n"
        "1. Use only the provided categories.\n"
        "2. Assign at least one category to each user.\n"
        "3. Do not assign more than 4 categories per user.\n"
        "4. Display only the classification in the specified format. Do not include any additional text or explanation."
    )

    # A single client is reused for all batches instead of one per batch.
    llm = Ollama(model=model)

    all_responses = []  # One raw response per batch
    for start in range(0, len(usernames), batch_size):
        sampled_users = usernames[start:start + batch_size]

        # Step 1: one "User / Articles" paragraph per user in this batch
        user_articles = []
        for username in sampled_users:
            titles = [str(article) for article, _ in usernames_with_data[username]]
            user_articles.append(f"User: {username}\nArticles: {', '.join(titles)}")
        llm_input = "\n\n".join(user_articles)

        # Step 2: system instructions + the batch itself as the user message
        prompt = [
            {"role": "system", "content": content},
            {"role": "user", "content": llm_input}
        ]
        response = llm.invoke(prompt)

        # Step 3: keep the raw text; parsing happens downstream
        all_responses.append(response)

        print(f"Batch {start // batch_size + 1} processed.")

    return all_responses

# Run the batched classification over every user in the dataset.
all_classifications = classify_users_in_batches(usernames_with_data)

# Show the raw per-batch responses, one after another (nothing is printed
# when there were no batches).
if all_classifications:
    print("\n".join(map(str, all_classifications)))

Batch 1 processed.
Batch 2 processed.
Batch 3 processed.
Batch 4 processed.
Batch 5 processed.
Batch 6 processed.
Batch 7 processed.
Batch 8 processed.
Batch 9 processed.
Batch 10 processed.
Batch 11 processed.
Batch 12 processed.
Batch 13 processed.
Batch 14 processed.
Batch 15 processed.
Batch 16 processed.
Batch 17 processed.
Batch 18 processed.
Batch 19 processed.
Batch 20 processed.
Batch 21 processed.
Batch 22 processed.
Batch 23 processed.
Batch 24 processed.
Batch 25 processed.
Batch 26 processed.
Batch 27 processed.
Batch 28 processed.
Batch 29 processed.
Batch 30 processed.
Batch 31 processed.
Batch 32 processed.
Batch 33 processed.
Batch 34 processed.
Batch 35 processed.
Batch 36 processed.
Batch 37 processed.
Batch 38 processed.
Batch 39 processed.
Batch 40 processed.
Batch 41 processed.
Batch 42 processed.
Batch 43 processed.
Batch 44 processed.
Batch 45 processed.
Batch 46 processed.
Batch 47 processed.
Batch 48 processed.
Batch 49 processed.
Spark: Entertainment, People,

In [5]:
# Persist the raw LLM output, one batch per paragraph. The file is written
# explicitly as UTF-8 because it is later read back with encoding='utf-8';
# relying on the platform default encoding could corrupt non-ASCII text or
# fail on characters the locale encoding cannot represent.
with open('../../data/classifications_new_user.txt', 'w', encoding='utf-8') as text_file:
    for batch_response in all_classifications:
        text_file.write(f"{batch_response}\n\n")

In [16]:
#process text file
import re

users = list(usernames_with_data.keys())

# Path to your text file
file_path = '../../data/classifications_new_user.txt'

# Optional "User:" / "User " label the model sometimes puts in front of a name.
# Anchored at the start of the line so that "User" occurring inside a username
# or a category is never removed.
_USER_LABEL = re.compile(r"^User(?:\s*:\s*|\s+)")


def _match_known_user(candidate, known_users):
    """Return the known username that `candidate` denotes, or None."""
    if candidate in known_users:
        return candidate
    # Tolerate light decoration around the name, e.g. "**name**" or "<name>".
    undecorated = candidate.strip(" \t*`<>")
    return undecorated if undecorated in known_users else None


def _parse_classification_line(line, known_users):
    """Parse one "<user_name>: cat1, cat2" line.

    Returns a (username, categories) tuple when the text before some colon is
    a known username, otherwise None. Every colon is tried as the separator so
    usernames that themselves contain ':' still resolve. Empty category
    fragments (e.g. from a trailing comma) are dropped.
    """
    text = line.strip()
    # Try the line as written first; only fall back to the label-stripped form
    # so a username that genuinely starts with "User" is not mangled.
    variants = [text]
    unlabeled = _USER_LABEL.sub("", text, count=1)
    if unlabeled != text:
        variants.append(unlabeled)

    for variant in variants:
        for colon in re.finditer(":", variant):
            username = _match_known_user(variant[:colon.start()].strip(), known_users)
            if username is not None:
                labels = (part.strip() for part in variant[colon.end():].split(","))
                return username, [label for label in labels if label]
    return None


# Maps each recognised username to its list of category labels. If a user
# appears on several lines, the last occurrence wins.
user_categories = {}
known_users = set(users)  # O(1) membership instead of scanning all users per line

with open(file_path, 'r', encoding='utf-8') as file:
    for line in file:
        parsed = _parse_classification_line(line, known_users)
        if parsed is not None:
            # Deliberately not named `categories`: that would overwrite the
            # global category vocabulary defined earlier in the notebook.
            found_username, parsed_categories = parsed
            user_categories[found_username] = parsed_categories

# Now `user_categories` contains the usernames and their associated categories
print(user_categories)

# Verification: which known users did not end up with a classification?
missing_users = list(filter(lambda name: name not in user_categories, users))
print(len(missing_users))

# Report the outcome
if missing_users:
    print(f"Missing users: {', '.join(missing_users)}")
else:
    print("All users are present in the user_categories dictionary.")

{'Spark': ['Entertainment', 'People', 'Science', 'Culture'], 'Slash': ['Information', 'Technology', 'Business', 'Concepts'], 'Mattspac': ['Sports', 'Entities', 'Education', 'Society'], 'Az1568': ['Knowledge', 'History', 'Culture', 'Health'], 'Addihockey10': ['Entertainment', 'Human behavior', 'Economy', 'Communication'], 'Arcticocean': ['Government', 'Law', 'Philosophy', 'Geography'], 'Jena Fi': ['Music', 'Family', 'Education', 'Children'], 'LeoFrank': ['Transportation', 'Places', 'Economy', 'Business'], 'Dan1679': ['Technology', 'Education', 'Information', 'People'], 'FormerEditor2718': ['Entertainment', 'Technology', 'Society', 'Information'], 'PhilKnight': ['Politics', 'Religion', 'Entertainment', 'Other'], 'Awilley': ['Culture', 'People', 'Business', 'History'], 'AlasdairEdits': ['Law', 'Government', 'Education', 'Information'], '72Dino': ['Geography', 'Business', 'Sports', 'Education'], 'LadyofShalott': ['Culture', 'People', 'Entertainment', 'Society'], 'AlexJ': ['Sports', 'Techno

In [17]:
# Save the parsed username -> categories mapping as pretty-printed JSON.
output_path = "../../data/user_categories_new.json"
with open(output_path, "w") as json_file:
    json.dump(user_categories, fp=json_file, indent=4)