In [41]:
from mistralai.client import MistralClient
import os
import pandas as pd

In [42]:
# The key is read from the environment rather than written into the notebook;
# a missing MISTRAL_API_KEY variable raises KeyError on this line.
api_key = os.environ["MISTRAL_API_KEY"]

# Single shared client used by the cells below.
# NOTE(review): timeout=120 is presumably seconds — confirm against the client docs.
client = MistralClient(
    timeout=120,
    api_key=api_key,
)

In [43]:

# File-kind mapping (the optional ".N" between the date and the extension):
#   no suffix -> raw
#   0 -> summary
#   1 -> consolidated summary
#   2 -> directionality

import os
import re
from datetime import datetime

def get_filenames(kind=None, start_date=None, end_date=None,
                  directory="data/google/search_history/"):
    """Collect the data files under ``directory`` matching a kind and date range.

    Recognised file names look like ``YYYY-MM-DD.csv`` / ``YYYY-MM-DD.txt``
    (no kind suffix) or ``YYYY-MM-DD.<N>.csv`` / ``YYYY-MM-DD.<N>.txt`` where
    ``<N>`` is a run of digits naming the kind. Sub-directories are searched
    recursively; any other file name is ignored.

    Args:
        kind: ``None`` selects only files without a kind suffix. Any other
            value is compared, as ``str(kind)``, with the digits in the name.
        start_date: Inclusive lower bound, or ``None`` for no lower bound.
            Either a ``"YYYY-MM-DD"`` string or an object with ``strftime``
            (for example ``datetime.date`` or ``datetime.datetime``).
        end_date: Inclusive upper bound; same accepted forms as ``start_date``.
        directory: Root folder to walk. Defaults to the search-history folder.

    Returns:
        A sorted list of matching paths, each built as
        ``os.path.join(root, name)``.
    """
    file_pattern = re.compile(r"(\d{4}-\d{2}-\d{2})(?:\.(\d+))?\.(?:csv|txt)")

    def to_iso(bound):
        # ISO dates order chronologically when compared as text, so both
        # bounds are brought to the same "YYYY-MM-DD" form as the file names.
        # Comparing a str with a date object would raise TypeError.
        if bound is None or isinstance(bound, str):
            return bound
        return bound.strftime("%Y-%m-%d")

    lower = to_iso(start_date)
    upper = to_iso(end_date)
    # The regex group is None when the name carries no kind suffix, so a
    # direct equality test covers both the "raw" and the numbered case.
    wanted_kind = None if kind is None else str(kind)

    filenames = []
    for root, _dirs, files in os.walk(directory):
        for file in files:
            match = file_pattern.fullmatch(file)
            if match is None:
                continue
            file_date, file_kind = match.groups()
            if lower is not None and file_date < lower:
                continue
            if upper is not None and file_date > upper:
                continue
            if file_kind != wanted_kind:
                continue
            filenames.append(os.path.join(root, file))

    # os.walk yields directory entries in arbitrary order; sorting makes the
    # processing order reproducible from run to run.
    return sorted(filenames)

In [44]:
from mistralai.models.chat_completion import ChatMessage

# Module-level accumulators shared by every get_completion call:
#   errors - filenames whose request raised an exception
#   usage  - total token count reported for each successful request
errors = []
usage = []


def get_completion(prompt, model="mistral-small", filename=None):
    """Send ``prompt`` as a single user message and return the reply text.

    On success the response's ``usage.total_tokens`` is appended to ``usage``
    and the content of the first choice is returned. If the chat call raises,
    the exception is not propagated: ``filename`` (when given) is appended to
    ``errors`` and the string ``"ERROR: <exception text>"`` is returned.

    Args:
        prompt: Full text of the user message.
        model: Model name forwarded to ``client.chat``.
        filename: Optional label recorded in ``errors`` when the call fails.

    Returns:
        The reply content, or an ``"ERROR: ..."`` string on failure.
    """
    request = [ChatMessage(role="user", content=prompt)]

    try:
        response = client.chat(model=model, messages=request)
    except Exception as exc:
        # Best-effort: remember which input failed and keep the caller going.
        if filename is not None:
            errors.append(filename)
        return f"ERROR: {exc!s}"
    else:
        usage.append(response.usage.total_tokens)
        return response.choices[0].message.content

In [45]:
from tqdm import tqdm 

In [46]:
# Instruction text for the per-day summary request. The loop below appends one
# day's raw search-history file to this string and sends the result as the
# user message. The model is asked for a free-text analysis followed by a JSON
# object with "broad_categories" (restricted to the tag list given here) and
# "narrow_interests" (free-form).
summary_prompt = '''
Here is a list of Google search history records for a given day.
What can you guess about the user? What is the user's intent behind the main sessions?

At the end of your analysis, provide a JSON object categorizing the day with any of these broad category tags that apply to the activities: 
[
  "science", 
  "arts and culture", 
  "organization and planning", 
  "goal-setting and self-improvement", 
  "educational content", 
  "random browsing and procrastination", 
  "entertainment-focused", 
  "social/extroverted activities", 
  "solitary/introspective content", 
  "solo hobbies",
  "helping others and charity work", 
  "empathy and emotional intelligence", 
  "relationships", 
  "competitive content", 
  "critical content", 
  "individual success",
  "anxiety", 
  "stress management and coping mechanisms", 
  "health-related concerns", 
  "relaxation content",  
  "well-being/positivity"
]

Additionally, add a field that contains a more fine-grained set of interests that you can infer from the data.
The final result should look something like this:
{
  "broad_categories": [
    "science", 
    "arts and culture", 
    "educational content", 
    "random browsing and procrastination",
    "critical content", 
    "individual success"
  ], 
  "narrow_interests": [ 
    "U2 rock band", 
    "Javascript programming"
  ]
}

Make sure to only use the provided tags for the "broad_categories" field.
'''

# Summarise every raw daily file (get_filenames() with no kind returns the
# files that carry no kind suffix) and store each reply next to its input as
# "<date>.0.txt".
filenames = get_filenames()
pbar = tqdm(filenames)
for filename in pbar:
    pbar.set_postfix({'current': filename})

    # Read the whole day first so the file is closed before the slow API call.
    with open(filename, "r", encoding="utf-8") as f:
        text = f.read()

    errors_before = len(errors)
    result = get_completion(summary_prompt + text, model="mistral-small", filename=filename)
    if len(errors) > errors_before:
        # get_completion recorded a failure for this file and returned an
        # "ERROR: ..." string. Saving that as a summary would feed error text
        # into later stages, so skip it; the file stays listed in `errors`.
        continue

    # Build the output name from the stem. str.replace(".csv", ...) leaves a
    # raw ".txt" file name unchanged, which would overwrite the input, and it
    # also rewrites ".csv" anywhere else in the path.
    stem, _ext = os.path.splitext(filename)
    with open(stem + ".0.txt", "w", encoding="utf-8") as f:
        f.write(result)

print(f"ERRORS: {len(errors)}. {errors}")

# One row per successful request: the total token count reported by the API.
usage_df = pd.DataFrame(usage)
usage_df.to_csv("usage.csv")

  0%|          | 0/1506 [00:00<?, ?it/s, current=data/google/search_history/2018-11/2018-11-07.csv]

100%|██████████| 1506/1506 [4:04:02<00:00,  9.72s/it, current=data/google/search_history/2023-08/2023-08-17.csv]  

ERRORS: 2. ['data/google/search_history/2019-01/2019-01-22.csv', 'data/google/search_history/2022-05/2022-05-26.csv']



