In [26]:
!pip install openai
!pip install transformers torch huggingface-hub sentencepiece
!pip install openai==0.28.1 pandas numpy scikit-learn torch transformers pytorch-lightning tensorboard python-dotenv
!pip install lightning




In [27]:
import os
from collections import Counter
from typing import List, Tuple

import matplotlib.pyplot as plt
import pandas as pd
import torch
from dotenv import load_dotenv
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from google.colab import drive
import openai

# Ensure that NLTK resources are downloaded before use
import nltk
nltk.download('stopwords')
nltk.download('punkt')


In [None]:
# Check if CUDA is available and set the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

In [28]:
# Load environment variables from .env file
load_dotenv('/content/gdrive/MyDrive/individual_project/.env')

# Set OpenAI API key from environment variable
openai.api_key = os.getenv("OPENAI_API_KEY")

In [30]:
# Mount Google Drive
drive.mount('/content/gdrive')

# Define file paths
absolute_path = "/content/gdrive/My Drive/"
dataset_path = absolute_path + "Dataset/"
input_file = dataset_path + 'balanced_events.csv'

# Load the original dataset
df = pd.read_csv(input_file)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
Mounted at /content/gdrive


In [31]:
# Ensure all values in 'general_rules_tokens' are strings, replacing NaNs with empty strings
df['general_rules_tokens'] = df['general_rules_tokens'].fillna('').astype(str)

# Filter for relevant categories
filtered_df = df[df['category_label'].isin([0, 4])]

# Group data by category
grouped = filtered_df.groupby('category_label')['general_rules_tokens'].apply(lambda x: ' '.join(x)).reset_index()

# Display the grouped data
print(grouped)

   category_label                               general_rules_tokens
0               0  participant found used campaign participant bo...
1               4  participation least old get reward sign direct...


In [22]:
# Function to get most common general rules
def get_most_common_rules(text: str, n: int = 10) -> List[Tuple[str, int]]:
    """
    Returns the `n` most common rules from the input text.

    Args:
        text (str): The input text containing the rules.
        n (int): The number of most common rules to return. Defaults to 10.

    Returns:
        List[Tuple[str, int]]: A list of tuples where each tuple contains a rule and its frequency.
    """
    tokens = text.split()
    counter = Counter(tokens)
    return counter.most_common(n)


# Apply the function to each category
grouped['most_common_rules'] = grouped['general_rules_tokens'].apply(get_most_common_rules)

# Display the results
print(grouped[['category_label', 'most_common_rules']])

   category_label                                  most_common_rules
0               0  [(bounty, 850), (campaign, 659), (must, 437), ...
1               4  [(bounty, 531), (campaign, 450), (must, 372), ...


In [17]:
# Define the category mapping at the top level
category_mapping = {0: "Bounty", 4: "Bounty, ICO"}

def get_most_common_rules_gpt4(text: str, chunk_size: int = 3000, category_label: int = None) -> List[str]:
    """
    Retrieves the most common rules using OpenAI's GPT-4 model by summarizing the input text in chunks.

    Args:
        text (str): The input text containing the rules to be summarized.
        chunk_size (int): The size of the text chunks sent to the GPT-4 model. Defaults to 3000 characters.
        category_label (int): The category label used to reference the rules' context. Defaults to None.

    Returns:
        List[str]: A list of summarized and most common rules for the given category.
    """
    category_name = category_mapping.get(category_label, 'Unknown Category')

    # Split the text into manageable chunks
    chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
    all_rules = []

    for chunk in chunks:
        response = openai.ChatCompletion.create(
            model="gpt-4o-mini",
            messages=[
                {
                    "role": "system",
                    "content": (
                        "You are an expert assistant specializing in analyzing and summarizing text. "
                        "You are tasked with reviewing the rules for the '{category_name}' category. "
                        "Carefully analyze the following text and extract the most frequently mentioned rules or guidelines. "
                        "Your goal is to identify and summarize the essential rules and requirements, focusing on recurring themes and patterns. "
                        "Group similar rules together, consolidate them into clear and complete statements, and avoid any repetition or redundancy. "
                        "Each rule should be presented as a full sentence, clearly conveying the requirement or guideline. "
                        "Do not include phrases like 'Based on the provided text' or 'According to the text,' and do not number the rules. "
                        "The rules should be formatted as bullet points, providing a direct and concise summary for the '{category_name}' category."
                    ).format(category_name=category_name)
                },
                {
                    "role": "user",
                    "content": f"Please extract and summarize the most common rules or guidelines from the following text for the '{category_name}' category: {chunk}"
                }
            ]
        )

        # Extract and clean the rules
        rules = response['choices'][0]['message']['content']
        rules_list = [rule.strip() for rule in rules.split('\n') if rule.strip()]
        all_rules.extend(rules_list)

    # Count the frequency of each rule and select the top 10
    rule_counts = Counter(all_rules)
    top_10_rules = rule_counts.most_common(10)

    # Return only the rules, without their counts
    return [rule for rule, count in top_10_rules]

In [18]:
# Apply the function to each category in the DataFrame
grouped['unique_rules_gpt4'] = grouped.apply(
    lambda row: get_most_common_rules_gpt4(row['general_rules_tokens'], category_label=row['category_label']),
    axis=1
)

# Function to display results in a readable format
def display_unique_results(grouped: pd.DataFrame) -> None:
    """
    Displays the most common rules for each category in the DataFrame.

    Args:
        grouped (pd.DataFrame): A DataFrame containing category labels and their associated most common rules.
    """
    for idx, row in grouped.iterrows():
        category_label = row['category_label']
        unique_rules = row['unique_rules_gpt4']

        print(f"Most common rules for category {category_mapping[category_label]}:")
        for rule in unique_rules:
            print(f"- {rule}")
        print("\n")

# Display the results
display_unique_results(grouped)

Most common rules for category Bounty:
- - Participants must provide all work via a mandatory submission form and are encouraged to post their work in designated threads.
- - Weekly reports must be submitted using a new post, and previous reports should be edited or quoted as necessary, with a clear indication of the work completed.
- - Applicants are required to join the official Telegram chat and avoid using someone else's wallet address for rewards.
- - All submissions must be original, creative, and relevant to the campaign to avoid disqualification for unethical behavior.
- - Any offensive or inappropriate behavior will result in immediate disqualification from the campaign.
- - Participants must check the status of their submissions against the status sheet regularly and adhere to designated deadlines for reporting work.
- - If task requirements are not met within a week, the entire submission may be deemed incomplete and result in disqualification.
- - Participants are responsib