In [146]:
%load_ext dotenv
%dotenv

The dotenv extension is already loaded. To reload it, use:
  %reload_ext dotenv


In [147]:
import pandas as pd

# Load the usage log and total the column labelled "0".
# NOTE(review): presumably a per-request token count — confirm against whatever writes usage.csv.
usage_df = pd.read_csv('usage.csv')
usage_df["0"].sum()

3929079

In [148]:
import json_repair


def extract_json(text):
    """Extract the last JSON object embedded in ``text``.

    The text is typically a model reply in which the JSON object is
    surrounded by prose or a Markdown code fence. Newlines are stripped,
    the final ``}`` is located, and its matching ``{`` is found by walking
    backwards so that nested objects are returned whole rather than cut
    down to their innermost object.

    Args:
        text: The raw text to search. ``None`` or an empty string is
            accepted and yields an empty result.

    Returns:
        dict: The parsed object, or ``{}`` when no object is found, the
        text cannot be parsed, or the parsed value is not a dict.
    """
    if not text:
        return {}

    text = text.replace("\n", "")
    end_index = text.rfind("}")
    if end_index == -1:
        return {}

    # Walk backwards from the final "}" to the "{" that balances it.
    depth = 0
    start_index = -1
    for position in range(end_index, -1, -1):
        char = text[position]
        if char == "}":
            depth += 1
        elif char == "{":
            depth -= 1
            if depth == 0:
                start_index = position
                break

    if start_index == -1:
        # Unbalanced braces: fall back to the last "{" before the final "}"
        # and let json_repair make the best of it.
        start_index = text.rfind("{", 0, end_index)
        if start_index == -1:
            return {}

    try:
        parsed = json_repair.loads(text[start_index : end_index + 1])
    except Exception:
        # Best-effort extraction: an unparseable payload counts as "no JSON".
        return {}

    # Callers look fields up by key, so only hand back real objects.
    return parsed if isinstance(parsed, dict) else {}

In [149]:
import os
import re
from datetime import datetime

def get_filenames(kind=None, start_date=None, end_date=None,
                  directory="data/google/search_history/"):
    """List search-history files, filtered by kind and date range.

    File names must look like ``YYYY-MM-DD.csv``/``.txt`` (no kind) or
    ``YYYY-MM-DD.<digits>.csv``/``.txt`` (the digits are the kind).

    Args:
        kind: ``None`` selects only files without a kind suffix; any other
            value selects files whose suffix equals ``str(kind)``.
        start_date: Inclusive lower bound, as a ``'YYYY-MM-DD'`` string or
            a ``date``/``datetime`` object. ``None`` means unbounded.
        end_date: Inclusive upper bound, same forms as ``start_date``.
        directory: Root directory to walk recursively.

    Returns:
        list[str]: Matching file paths, sorted so the result does not
        depend on the order in which the file system lists entries.
    """
    file_pattern = re.compile(r"^(\d{4}-\d{2}-\d{2})(?:\.(\d+))?\.(?:csv|txt)$")

    def as_iso_date(bound):
        # ISO dates compare correctly as strings, so normalise date-like
        # objects to 'YYYY-MM-DD' and leave strings / None untouched.
        if bound is None or isinstance(bound, str):
            return bound
        return bound.strftime("%Y-%m-%d")

    lower = as_iso_date(start_date)
    upper = as_iso_date(end_date)
    wanted_kind = None if kind is None else str(kind)

    filenames = []
    for root, _dirs, files in os.walk(directory):
        for file in files:
            match = file_pattern.match(file)
            if not match:
                continue
            file_date, file_kind = match.groups()
            if lower is not None and file_date < lower:
                continue
            if upper is not None and file_date > upper:
                continue
            # file_kind is None when the name carries no kind suffix.
            if file_kind != wanted_kind:
                continue
            filenames.append(os.path.join(root, file))

    return sorted(filenames)

In [150]:
# For each personality trait, the broad search categories that count as
# evidence for it ("positive") or against it ("negative").
markers = {
    "openness": {
        "positive": [
            "science",
            "arts and culture",
        ],
    },
    "conscientiousness": {
        "positive": [
            "organization and planning",
            "goal-setting and self-improvement",
            "educational content",
        ],
        "negative": [
            "random browsing and procrastination",
            "entertainment-focused",
        ],
    },
    "extraversion": {
        "positive": [
            "social/extroverted activities",
        ],
        "negative": [
            "solitary/introspective content",
            "solo hobbies",
        ],
    },
    "agreeableness": {
        "positive": [
            "helping others and charity work",
            "empathy and emotional intelligence",
            "relationships",
        ],
        "negative": [
            "competitive content",
            "critical content",
            "individual success",
        ],
    },
    "neuroticism": {
        "positive": [
            "anxiety",
            "stress management and coping mechanisms",
        ],
        "negative": [
            "health-related concerns",
            "relaxation content",
            "well-being/positivity",
        ],
    },
}


In [151]:
def intersection(a, b):
    """Return True when ``a`` and ``b`` share at least one element.

    Despite the name, the result is a boolean, not the set of common items.
    """
    members = set(a)
    return any(item in members for item in b)

In [152]:
from collections import defaultdict

def defaultdict_of_dict():
    """Return a fresh ``defaultdict(int)``, i.e. a counter whose missing keys start at 0.

    Used as the factory for the outer tally so each new key gets its own
    independent inner counter.
    """
    inner_counts = defaultdict(int)
    return inner_counts

def defaultdict_to_dict(d):
    """Recursively turn a ``defaultdict`` (and any nested ones) into plain dicts.

    Anything that is not a ``defaultdict`` is returned unchanged, so the
    recursion stops at the first non-defaultdict value on each branch.
    """
    if not isinstance(d, defaultdict):
        return d

    plain = {}
    for key, value in d.items():
        plain[key] = defaultdict_to_dict(value)
    return plain

In [153]:
# Walk every kind-0 file, parse the tags it holds, and tally for each
# trait/polarity how many files mention at least one of its marker categories.
json_errors = 0  # files without a usable "broad_categories" entry
counter = defaultdict(defaultdict_of_dict)
interests = set()  # every distinct "narrow_interests" entry seen

for file in get_filenames(0):
    with open(file, "r") as f:
        text = f.read()
    tags = extract_json(text)

    # Narrow interests are collected even when the broad categories are missing.
    if "narrow_interests" in tags:
        interests.update(tags["narrow_interests"])

    if "broad_categories" not in tags:
        json_errors += 1
        continue

    bc = tags["broad_categories"]
    for trait, values in markers.items():
        for polarity, keywords in values.items():
            # A file counts at most once per trait/polarity pair.
            if intersection(bc, keywords):
                counter[trait][polarity] += 1

In [154]:
# Number of distinct narrow interests collected from the parsed files.
len(interests)

9012

In [155]:
# Kind-0 files that produced a usable "broad_categories" list: all such files minus the parse failures.
# NOTE(review): this re-lists the directory and treats one file as one day — confirm both still hold.
total_days = len(get_filenames(0)) - json_errors

In [156]:
# Replace the nested defaultdicts with plain dicts; re-running is harmless because
# defaultdict_to_dict returns non-defaultdict input unchanged.
counter = defaultdict_to_dict(counter)
counter

{'conscientiousness': {'positive': 1207, 'negative': 697},
 'extraversion': {'negative': 724, 'positive': 147},
 'openness': {'positive': 620},
 'agreeableness': {'negative': 365, 'positive': 173},
 'neuroticism': {'positive': 155, 'negative': 608}}

In [157]:
# Turn each trait's tallies into a single score: the net count of
# positive-minus-negative files per day, halved to span -0.5..+0.5 and then
# shifted by a 0.5 baseline.
normalized_scores = {}
for trait, values in counter.items():
    net_days = values.get('positive', 0) - values.get('negative', 0)
    normalized_scores[trait] = net_days / total_days / 2 + 0.5

normalized_scores

{'conscientiousness': 0.7011041009463722,
 'extraversion': 0.2724763406940063,
 'openness': 0.7444794952681388,
 'agreeableness': 0.4242902208201893,
 'neuroticism': 0.32137223974763407}

In [161]:
# Chat model used for the follow-up analysis.
openai_model = 'gpt-4-1106-preview'

# Context window of each supported model, in tokens.
# gpt-4-1106-preview is documented as 128,000 tokens (not 128 * 1024 = 131,072);
# using the larger figure lets an over-long prompt pass the local check and
# then be rejected by the API.
context_length = {
    'gpt-4': 8_192,
    'gpt-4-32k': 32_768,
    'gpt-4-1106-preview': 128_000,
}

import os, tiktoken
import openai
import random

# The API key is read from the environment rather than written into the notebook.
openai.api_key = os.environ["OPENAI_API_KEY"]

# Tokens reserved for the model's reply; the prompt may use the rest of the window.
output_tokens = 1000
max_context = context_length[openai_model] - output_tokens

# Tokenizer matching the selected model, used to measure prompt size.
encoding = tiktoken.encoding_for_model(openai_model)


def get_n_tokens(text):
    """Return how many tokens ``encoding`` produces for ``text``."""
    return len(encoding.encode(text))


# Fixed preamble of the request: states the baseline openness score and asks
# for a JSON object with a "modifier" field. The key name must be spelled the
# way downstream code expects to read it back from the reply.
prompt_base = f"""
I am conducting an analysis to estimate my "openness" personality trait based on my Google search history.
After a preliminary analysis, I arrived at a baseline score of {normalized_scores['openness']:.2f}, by taking into account how often I search for topics related to science, arts and culture.

To improve the accuracy of the score, I want to take into account the diversity (or lack thereof) of my more fine-grained interests.
I will now provide you with a summary of my interests extracted from my Google search history over a period of 5 years.
After the analysis, provide a JSON object with a "modifier" value between -0.5 and 0.5, which will be added or subtracted to the baseline score.

Here is the list of interests:
\n\n
"""

def get_prompt(interests):
    """Build the full prompt: ``prompt_base`` followed by a comma-separated interest list.

    Interests are added in the given order until the token budget
    (``max_context`` minus the tokens of ``prompt_base``) would be exceeded.
    Each interest is charged its own token count plus one token for the
    separator, so the truncated list honours the same accounting as the
    "does everything fit" estimate.

    Args:
        interests: Sequence of interest strings, in the order they should
            appear in the prompt.

    Returns:
        str: ``prompt_base`` with as many interests appended as fit.

    Side effects:
        Prints how many interests were kept.
    """
    remaining_tokens = max_context - get_n_tokens(prompt_base)

    truncated_interests = []
    used_tokens = 0
    for interest in interests:
        cost = get_n_tokens(interest) + 1  # +1 for the "," separator
        if used_tokens + cost > remaining_tokens:
            # Stop at the first interest that does not fit, keeping the
            # kept list a prefix of the input order.
            break
        truncated_interests.append(interest)
        used_tokens += cost

    print(f"Saved {len(truncated_interests)} interests out of {len(interests)}")

    # Join outside the f-string: reusing the same quote character inside a
    # replacement field is a SyntaxError before Python 3.12.
    joined_interests = ",".join(truncated_interests)
    return f"{prompt_base}{joined_interests}"

# Shuffle so that any truncation in get_prompt drops a random subset rather
# than a systematic one. Sort first and use a seeded generator: iteration
# order of a set of strings differs between interpreter runs, so without both
# steps the prompt would not be reproducible.
data = sorted(map(str, interests))
random.Random(42).shuffle(data)

client = openai.OpenAI(api_key=os.environ["OPENAI_API_KEY"])
completion = client.chat.completions.create(
    model=openai_model,
    messages=[
        {"role": "user", "content": get_prompt(data)},
    ],
    max_tokens=output_tokens,
)

# The reply is a message object; its text lives in `result.content`.
result = completion.choices[0].message

print(result)

Saved 9012 interests out of 9012
ChatCompletionMessage(content='```json\n{\n  "modifier": 0.25\n}\n```\n\nThe reason for the modifier value of 0.25 is based on the diversity and breadth of interests and topics provided. The list indicates an array of subjects, ranging from arts, culture, science, technology, gaming, history, psychology, finance, and global events, suggesting a high degree of openness to experience—characteristic of intellectual curiosity and a willingness to engage with a wide variety of ideas. This positive modifier acknowledges the exploration of complex and varied fields, which complements the baseline "openness" score by reflecting an individual\'s nuanced and expansive set of interests.', role='assistant', function_call=None, tool_calls=None)


AttributeError: 'ChatCompletionMessage' object has no attribute 'replace'

In [163]:
# Pull the JSON object out of the reply text (pass `.content`, not the message object itself).
json_answer = extract_json(result.content)
json_answer

{'modifier': 0.25}