In [290]:
from octoai.text_gen import ChatMessage
from octoai.client import OctoAI
import os
from supabase import create_client, Client
import re
import json
import pandas as pd

In [291]:
# System prompt used as the first (role="system") message of every ChatBot
# conversation. It asks the model to summarise, grade (0-10) and quote the
# Terms of Service for two categories ("Content Availability" and
# "Legal dispute resolution"), plus an optional "Other Information" entry,
# and to answer with JSON only.
# NOTE(review): the template below describes a single JSON object, while the
# responses captured in this notebook are a JSON *list* of such objects —
# confirm which shape downstream consumers of `tldr` expect.
system_prompt = """
You are a legal helper and determined to help users understand Terms of Service of online services. 
The users are helpless and need information on different categories. 
For everyone of the categories, please assess the Terms of Service and evaluate whether they are favorable for the users or not. These are the tasks:
1. Please provide a short and concise summary of the terms of services for each category in less than 50 words.
2. Please give grades on a scale of 0-10 for every category. 0 hereby represents a terrible condition for the user (e.g. unsafe practices, allowing data to be shared with third parties or not rights to sue the company for malpractices) 10 shall reflect highest levels of favorability for users (e.g. on-device storing of passwords and hashings sent to server, content availability that doesn't vary depending on locations, password sharing not forbidden, the right to sue the company)
3. Please provide snippets from the Terms of Service of the specific service that you see as very important for making your final judgement.

These are the categories that the terms of service shall be assessed by:
1. Content Availability: This section is about where the content can be consumed from geographicalls as well as device-wise. It shall also include information on how the catalog of content is differs (e.g. by billing address of geo location).
2. Legal dispute resolution: This section is about how and in which ways the users can sue the company. In unfavorable situations this requires an arbitrage, in more favorable situations, the users can sue the company directly.

After giving the summary, scores per category and providing text snippets, please assess if there are any other information that you consider important for the users to know and share that, too.

Provide the answer in JSON format:
{
  "category_title": title of the categories (e.g. Content Availability),
  "score": the evaluation score of that category (e.g. 8),
  "description": the consise summary of the content in the category,
  "reference": snippets from the Terms of Service that were taken into account for making the score assessment
  }

For the other important information structure it like this:
{
  "category_title": Other Information,
  "score": null,
  "description": title of Additional important information category as well as Text about additional important information
  }

Only output 1 JSON code snippet without any accompanying text. This is very important!
DO NOT OUTPUT ANYTHING ELSE THAT WHAT I ASKED YOU 
DO OUTPUT YOUR ANSWER IN JSON FORMAT 
DO FOLLOW THE JSON STRUCTURE AS DESCRIBED
DO NOT USE ANY OTHER TYPE THAN THE ONE I GAVE YOU, YOU MUST FOLLOW IT OTHERWISE USERS WILL HAVE A BAD EXPERIENCE
"""

In [295]:
class ChatBot:
    """
    Grades a company's Terms of Service with an OctoAI-hosted LLM and stores
    the parsed result in the ``tldr`` column of the Supabase ``websites`` table.

    Required environment variables:
        OCTOAI_API_TOKEN: API token for the OctoAI client.
        SUPABASE_KEY: API key for the Supabase project.

    Optional environment variables:
        SUPABASE_URL: Supabase project URL (defaults to the project this
            notebook was written against).

    Attributes:
        message_conversation: list of ``ChatMessage`` sent to the LLM.
        companies: names of the companies found in the ``websites`` table.
        available_companies: alias of ``companies`` (kept for compatibility).
    """

    # Generation settings used by ``run_llm``.
    MODEL_NAME = "mixtral-8x22b-finetuned"
    MAX_TOKENS = 512

    # First fenced block of an answer (```json ... ``` or ``` ... ```); the
    # closing fence is optional so an unterminated answer is still captured.
    _JSON_FENCE = re.compile(r"```(?:json)?\s*(.*?)\s*(?:```|\Z)", re.DOTALL | re.IGNORECASE)

    def __init__(self):
        """
        Starts a conversation holding only the system prompt, then connects to
        OctoAI and Supabase and loads the list of available companies.
        """
        self.companies = []
        self.available_companies = self.companies
        self.reset_conversation()
        self.initialize_client()
        self.initialize_supabase()

    def reset_conversation(self):
        """
        Resets the conversation so it contains only the system prompt.
        """
        self.message_conversation = [
            ChatMessage(
                content=system_prompt,
                role="system"
            )
        ]

    def initialize_client(self):
        """
        Initializes the OctoAI client with the API key from OCTOAI_API_TOKEN.
        """
        self.client = OctoAI(api_key=os.environ["OCTOAI_API_TOKEN"])
        print('🤖 log: OctoAI client initialized')

    def initialize_supabase(self):
        """
        Initializes the Supabase client and loads the available companies.

        The API key is read from the SUPABASE_KEY environment variable.
        SECURITY: a service-role key used to be embedded in this method; it
        must be treated as leaked and rotated in the Supabase dashboard.

        Raises:
            KeyError: if SUPABASE_KEY is not set.
        """
        url = os.environ.get("SUPABASE_URL", 'https://ceaulemroculazvkgwqe.supabase.co')
        key = os.environ["SUPABASE_KEY"]
        self.supabase: Client = create_client(url, key)
        print('🤖 log: supabase initialized')

        self.set_companies()

    def set_companies(self):
        """
        Loads the company names from the ``websites`` table.
        """
        # Only the name column is needed; "*" would also download every ToS text.
        data = self.supabase.table('websites').select("name").execute().data or []
        self.companies = [d['name'] for d in data]
        self.available_companies = self.companies
        print('🤖 log: companies available:', ', '.join(self.companies))

    def add_message(self, message, role="user"):
        """
        Appends a message to the conversation. Does not call the LLM; use
        ``run_llm`` for that.
        """
        self.message_conversation.append(
            ChatMessage(
                content=message,
                role=role
            )
        )
        print('✅ log: message added')

    def run_llm(self):
        """
        Streams a chat completion for the current conversation.

        Returns:
            str: the concatenated streamed text. If the stream fails midway,
            the error is logged and the text received so far is returned.
        """
        print('🤖 log: running LLM')
        response = self.client.text_gen.create_chat_completion_stream(
            max_tokens=self.MAX_TOKENS,
            messages=self.message_conversation,
            model=self.MODEL_NAME,
            presence_penalty=0,
            temperature=0.1,
            top_p=0.9
        )

        parts = []
        try:
            for chunk in response:
                try:
                    content = chunk.choices[0].delta.content
                except (AttributeError, IndexError, TypeError):
                    # Chunks without a text delta (e.g. empty `choices`) carry
                    # no content to append; skip them.
                    continue
                if content:  # content may be None
                    parts.append(content)
        except Exception as e:
            # Best effort: keep what was streamed, but do not hide the failure.
            print(f'🚫 log: LLM stream aborted, returning partial answer ({e!r})')

        return "".join(parts)

    def fetch_company_terms(self, company_name):
        """
        Fetches the company's terms of service from the Supabase database.

        Raises:
            IndexError: if no row exists for ``company_name``.
        """
        rows = self.supabase.table('websites').select("terms").eq('name', company_name).execute().data
        if not rows:
            raise IndexError(f"no 'websites' row found for company {company_name!r}")
        return rows[0]['terms']

    def ask_about_company(self, company_name):
        """
        Asks the LLM to assess the Terms of Service of ``company_name``.

        The conversation is reset first, so the request contains only the
        system prompt and this company's terms and not those of companies
        asked about earlier.

        Returns:
            str: the raw LLM answer, or "" when the company is unknown or has
            no terms stored (the LLM is not called in those cases).
        """
        if company_name not in self.companies:
            print(f"🚫 logs: Company {company_name} not found. Please choose from the following companies: {self.companies}")
            return ""

        company_term_of_service = self.fetch_company_terms(company_name)
        if not company_term_of_service:
            print(f"🚫 logs: No terms of service stored for {company_name}.")
            return ""

        self.reset_conversation()
        user_prompt = f"""
                I would like to know about the terms of Service for {company_name}. Adhere to the JSON structure as described.

                {company_term_of_service}
            """
        self.add_message(user_prompt, role="user")

        return self.run_llm()

    @classmethod
    def _extract_json(cls, answer):
        """
        Parses the JSON payload of an LLM answer, preferring the first fenced
        code block and falling back to the whole answer.

        Raises:
            ValueError: if the candidate text is not valid JSON.
        """
        match = cls._JSON_FENCE.search(answer)
        candidate = match.group(1) if match else answer
        # strict=False accepts raw newlines inside string values, which the
        # model sometimes emits.
        return json.loads(candidate.strip(), strict=False)

    def overall_pipeline(self, company_name):
        """
        Calls ``ask_about_company``, parses the answer and uploads it to the
        ``tldr`` column of the company's row. Failures are logged, not raised.
        """
        answer = self.ask_about_company(company_name)
        if not answer:
            print(f'🚫 log: no LLM answer for {company_name}, nothing uploaded')
            return
        print(answer)

        try:
            final_answer = self._extract_json(answer)
        except ValueError as e:
            print(f'🚫 log: LLM answer is not valid JSON ({e})')
            return
        print(final_answer)
        print('✅ log: JSON with LLM response extracted successfully')

        try:
            self.supabase.table('websites').update({'tldr': final_answer}).eq('name', company_name).execute()
        except Exception as e:
            print(f'🚫 log: uploading the summary for {company_name} failed ({e})')
            return
        print('✅ log: pipeline completed')





# Example usage: constructing the bot initializes the OctoAI and Supabase
# clients and loads the list of available companies (see the log lines below).
chat_bot = ChatBot()

🤖 log: OctoAI client initialized
🤖 log: supabase initialized
🤖 log: companies available: Twitch, Spotify, TikTok, YouTube, Apple, Vercel, X, Facebook, Wikipedia, Netflix


In [289]:
# Fetch name + tldr for every company that has Terms of Service stored
result = chat_bot.supabase.table('websites').select("name, tldr").neq('terms', "").execute()

# A missing payload is reported, then handled like an empty result so the
# rest of the cell still runs (previously `df` stayed undefined here).
if result.data is None:
    print("Error:", getattr(result, "error", "query returned no data"))

# Explicit columns keep the frame well-formed even when no rows come back
df = pd.DataFrame(result.data or [], columns=['name', 'tldr'])

# Companies whose tldr is still empty, in reversed row order
missing_tldr = df.loc[df['tldr'].isna(), 'name'].to_list()[::-1]

print(len(missing_tldr))

# Backfill one company at a time; a failure for one company is logged and
# must not abort the remaining ones.
for item in missing_tldr:
    print(item)

    try:
        chat_bot.overall_pipeline(item)
    except Exception as exc:
        print(f'🚫: Backfill for {item} failed: {exc}')
        continue

    # overall_pipeline logs its own parse/upload problems
    print(f'✅: Backfill for {item} completed')
    

2
Apple
✅ log: message added
🤖 log: running LLM
 ```json
[
  {
    "category_title": "Content Availability",
    "score": 5,
    "description": "Apple's terms allow for personal, non-commercial use of content, with some limitations on device usage and content availability varying by Home Country.",
    "reference": "A. INTRODUCTION: Content may be used on compatible devices, with certain content subject to geographic and device restrictions. I. CONTENT AND SERVICE AVAILABILITY: Terms vary by Home Country and content types; some services/content may be unavailable when traveling."
  },
  {
    "category_title": "Legal dispute resolution",
    "score": 3,
    "description": "Apple's terms heavily favor arbitration and limit class action participation, which can be less favorable for users compared to direct lawsuits.",
    "reference": "S. MISCELLANEOUS TERMS APPLICABLE TO ALL SERVICES: Disputes are subject to the laws of the State of California, and users generally must resolve disputes

In [297]:
# Run the ask -> parse -> upload pipeline for a single company
chat_bot.overall_pipeline("Netflix")

✅ log: message added
🤖 log: running LLM
 ```json
[
  {
    "category_title": "Content Availability",
    "score": 5,
    "description": "Content access varies by subscription plan and geographic location. Offline viewing available for some content based on plan.",
    "reference": "You may access Netflix content primarily within the country in which you have established your account and only in geographic locations where we offer your subscription plan and have licensed such content. The content that may be available will vary by geographic location and will change from time to time."
  },
  {
    "category_title": "Legal dispute resolution",
    "score": 2,
    "description": "Arbitration required for disputes, limiting users' ability to sue. Class action waiver included.",
    "reference": "You and Netflix agree that any dispute, claim or controversy arising out of or relating in any way to the Netflix service, these Terms of Use and this Arbitration Agreement, shall be determined by

In [282]:
# Show the tldr currently stored for Netflix
(
    chat_bot.supabase
    .table('websites')
    .select("*")
    .eq('name', 'Netflix')
    .execute()
    .data[0]['tldr']
)

[{'category_title': 'Content Availability',
  'score': 7,
  'description': 'Content can be accessed globally, but availability may vary by location due to licensing restrictions.',
  'reference': 'The Service allows you to discover, watch and share videos and other content, provides a forum for people to connect, inform, and inspire others across the globe, and acts as a distribution platform for original content creators and advertisers large and small.'},
 {'category_title': 'Legal dispute resolution',
  'score': 5,
  'description': 'Users can only bring legal disputes in California courts, and must do so within one year. Arbitration is not explicitly mentioned.',
  'reference': 'All claims arising out of or relating to these terms or the Service will be governed by California law, except California’s conflict of laws rules, and will be litigated exclusively in the federal or state courts of Santa Clara County, California, USA. You and YouTube consent to personal jurisdiction in thos

In [284]:
# def fetch_tos(service):
#     """
#     Function to retrieve the TOS for the specific Service
#     """

#     file_path = f"{service}_terms.md"
#     try:
#         with open(file_path, 'r', encoding='utf-8') as file:
#             markdown_content = file.read()
#         return markdown_content
#     except FileNotFoundError:
#         print(f"File '{file_path}' not found.")
#         return None

# # Calling the function to retrieve the markdown TOS for the requested service
# service = 'youtube'
# markdown_string = fetch_tos(service)
# if markdown_string:
#     print('✅ log: Service TOS added')