In [15]:
import json
import os
import re
import time

import pandas as pd
from supabase import create_client

In [18]:
from together import Together

# The Together API key is read from the environment so that no secret is stored in the notebook.
# NOTE(review): the key that used to be hard-coded on this line has been exposed and should be rotated.
TOGETHER_API_KEY = os.environ.get("TOGETHER_API_KEY")


class GooperBackendModel:
    """Wrapper around the Together client for the three model calls used in this notebook.

    Attributes:
        client: Together client created with ``TOGETHER_API_KEY``.
        get_post_description_system_prompt: text of ``../prompts/describe_image.txt``.
        get_profile_description_system_prompt: text of ``../prompts/summarize_influencer.txt``.
        embedding_model / text_model / vision_model: model identifiers passed to the API.
    """

    def __init__(self):
        # NOTE(review): when the key is unset this passes api_key=None; presumably the
        # SDK then raises its own authentication error -- confirm.
        self.client = Together(api_key=TOGETHER_API_KEY)
        self.get_post_description_system_prompt = self._read_text("../prompts/describe_image.txt")
        self.get_profile_description_system_prompt = self._read_text("../prompts/summarize_influencer.txt")

        self.embedding_model = "BAAI/bge-base-en-v1.5"
        self.text_model = "meta-llama/Llama-3.2-3B-Instruct-Turbo"
        self.vision_model = "meta-llama/Llama-Vision-Free"

    @staticmethod
    def _read_text(path):
        """Return the full content of the text file at ``path``, closing the handle afterwards."""
        with open(path, "r") as handle:
            return handle.read()

    def get_post_description(self, imageUrl):
        """Ask the vision model to describe the image at ``imageUrl``.

        Args:
            imageUrl: URL of the image, forwarded as an ``image_url`` content part.

        Returns:
            The content of the first choice of the chat completion.
        """
        systemPrompt = self.get_post_description_system_prompt

        # The instructions travel inside the single user message, next to the image.
        # The previous dict literal repeated the "role"/"content" keys, so its "system"
        # entry was silently overwritten and exactly this one user message was sent;
        # the request is therefore unchanged, only the dead keys are gone.
        response = self.client.chat.completions.create(
            model=self.vision_model,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": systemPrompt},
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": imageUrl,
                            },
                        },
                    ],
                }
            ],
        )

        return response.choices[0].message.content

    def get_profile_description(self, prompt):
        """Ask the text model to summarise a profile.

        Args:
            prompt: profile text; it is appended to the fixed line
                "Write a summary for this profile:" in the user message.

        Returns:
            The content of the first choice of the chat completion.
        """
        systemPrompt = self.get_profile_description_system_prompt

        response = self.client.chat.completions.create(
            model=self.text_model,
            messages=[
                {
                    "role": "system",
                    "content": systemPrompt
                },
                {
                    "role": "user",
                    "content": "Write a summary for this profile:\n" + prompt,
                }
            ],
            max_tokens=512,
            temperature=0.7,
            top_p=0.7,
            top_k=50,
            stream=False,
        )

        return response.choices[0].message.content

    def get_embeddings(self, text):
        """Return the embedding of ``text`` computed by ``self.embedding_model``.

        Returns:
            The ``embedding`` field of the first item in the response data.
        """
        out = self.client.embeddings.create(
            input=text,
            model=self.embedding_model,
        )

        return out.data[0].embedding

In [92]:
# Load the raw Apify export; the context manager closes the file once it has been parsed.
with open('241221_apify.json') as f:
    data = json.load(f)

Create a custom, polished dataset from the Apify .json export

In [93]:
# Build the polished dataset: one entry per profile with 2k-10k followers and at
# least five non-video posts, keeping the first five such posts.
MIN_FOLLOWERS = 2000
MAX_FOLLOWERS = 10000
POSTS_PER_PROFILE = 5

df = []

for i, profile in enumerate(data):
    try:
        entry = {
            'url': profile['inputUrl'],
            'username': profile['username'],
            'biography': profile['biography'],
            'followersCount': profile['followersCount'],
            'followsCount': profile['followsCount'],
            'businessCategoryName': profile['businessCategoryName'],
            'latestPosts': []
        }

        # keep only those with [2k, 10k] followers
        if not MIN_FOLLOWERS <= entry['followersCount'] <= MAX_FOLLOWERS:
            continue

        # retrieve the first 5 non-video posts (image URL and caption)
        for post in profile['latestPosts']:
            # NOTE(review): this comparison is case-sensitive; if the scraper emits
            # 'Video' (capitalised) videos are not filtered out -- confirm against the data.
            if post['type'] == 'video':
                continue

            entry['latestPosts'].append({'displayUrl': post['displayUrl'],
                                         'caption': post['caption'],
                                         })

            # Stop scanning once enough posts are collected (the original kept
            # iterating and skipping); posts past this point are no longer inspected.
            if len(entry['latestPosts']) >= POSTS_PER_PROFILE:
                break

        if len(entry['latestPosts']) < POSTS_PER_PROFILE:
            continue

        df.append(entry)

    # Catch Exception rather than everything, so Ctrl-C still interrupts the cell,
    # and report the reason so malformed records can be diagnosed.
    except Exception as e:
        print(f'Failed at {i}: {e!r}')

print(f'Successfully saved {len(df)} entries')

Failed at 168
Failed at 175
Failed at 667
Successfully saved 90 entries


Get description of each image

In [None]:
# Ask the vision model for a description of every stored post image and attach it
# to the post under 'description'. A failed call is reported and leaves that post
# without a 'description' key.
model = GooperBackendModel()

for i, profile in enumerate(df):

    print('-'*50)
    # The total is len(df); the previous "+1" displayed one profile too many.
    print(f'Processing profile {i+1:03}/{len(df)}')
    start = time.time()

    for j, post in enumerate(profile['latestPosts']):
        try:
            post['description'] = model.get_post_description(post['displayUrl'])

            print(f'\t{j}. ', post['description'])
        except Exception as e:
            print(f'Failed at {i} {j}: {e}')

    end = time.time()

    print(f'Completed in {end-start:.2f}s')

# Persist the enriched dataset under a timestamped name so earlier runs are not overwritten.
with open(f'241221_apify_desc_{int(time.time())}.json', 'w') as f:
    json.dump(df, f)

Summarize entire profile given name, bio, image descriptions and captions

In [5]:
def format_prompt(profile):
    """Build the text prompt that describes one profile to the summarisation model.

    Args:
        profile: dict with the keys 'username', 'businessCategoryName', 'biography'
            and 'latestPosts'; each post is a dict with a 'caption' and, normally,
            a 'description'.

    Returns:
        A string with the handle, business category and bio, followed by one
        description/caption section per post.

    A post without a 'description' (its vision call failed upstream) is rendered
    as "N/A" instead of raising KeyError.
    """
    header = (
        "Write a summary of this Instagram profile:\n\n"
        f"Instagram Handle:\n@{profile['username']}\n\n"
        f"Business Category:\n{profile['businessCategoryName']}\n\n"
        f"Bio:\n{profile['biography']}\n\n"
    )

    sections = [
        f"Description of post {number}:\n{post.get('description', 'N/A')} \n"
        f"Caption of post {number}:\n{post['caption']} \n\n"
        for number, post in enumerate(profile['latestPosts'], start=1)
    ]

    return header + "".join(sections)


In [13]:
# Generate a one-paragraph summary for every profile and store it under 'description'.
with open("241221_apify_desc_1734810510.json", "r") as f:
    df = json.load(f)
model = GooperBackendModel()
timestamp = int(time.time())

for i, profile in enumerate(df):

    start = time.time()
    print('-'*50)
    # The total is len(df); the previous "+1" displayed one profile too many.
    print(f'Processing profile {i+1:03}/{len(df)}')

    prompt = format_prompt(profile)
    desc = model.get_profile_description(prompt)
    # Collapse line breaks so the summary is stored as a single line.
    desc = re.sub(r'\n+', ' ', desc)
    profile['description'] = desc

    print(f'\t{desc}')
    end = time.time()
    print(f'Completed in {end-start:.2f}s')

    # Checkpoint after every profile: the whole list is rewritten to the same file,
    # so an interrupted run keeps the summaries produced so far.
    with open(f'241221_apify_complete_{timestamp}.json', 'w') as f:
        json.dump(df, f)

self.get_post_description_system_prompt:
You are a social media analyst and your task is to summarize the content and context of a given image, which represents an Instagram post.
Describe the main objects, people, or scenes in the image, including their appearance, actions, and expressions.
Identify the mood, aesthetic, or theme of the image and any notable unique features.  
Make the summary concise yet detailed enough to help understand the overall content and intent of the post.

Guidelines:
- Do not use bullet points, titles or lists.
- Write a single sentence starting with "This image..."
- Use not more than 30 words

self.get_profile_description_system_prompt:
You are a social media analyst.
Your task is to generate a concise, professional summary of an influencer's Instagram profile based on its publicly available information.

Guidelines for the Summary:
- Tone: Informative and professional.
- Content: Highlight the profile's main themes, the type of content posted, audienc

Generate embeddings and save to Supabase

In [30]:
from supabase import create_client

# Connection settings come from the environment so that no credential is stored in the notebook.
# NOTE(review): the key that used to be hard-coded here has been exposed; consider rotating it.
SUPABASE_URL = os.environ.get("SUPABASE_URL", "https://edcqmzluacqdqqmmklik.supabase.co")
SUPABSE_URL = SUPABASE_URL  # misspelled original name, kept in case other cells still use it
SUPABASE_KEY = os.environ["SUPABASE_KEY"]

model = GooperBackendModel()
supabase = create_client(SUPABASE_URL, SUPABASE_KEY)

with open("./241221_apify_complete_1734869240.json", "r") as f:
    df = json.load(f)

# Embed each profile summary and insert one row per profile into 'influencers_new'.
# NOTE(review): this is a plain insert, so re-running the cell presumably adds the
# rows again -- confirm whether the table enforces uniqueness.
for i, profile in enumerate(df):
    print('-'*50)
    # The total is len(df); the previous "+1" displayed one profile too many.
    print(f'Processing profile {i+1:03}/{len(df)}')
    try:
        emb = model.get_embeddings(profile['description'])

        supabase.table('influencers_new').insert([{
            'username': profile['username'],
            'biography': profile['biography'],
            'followers_count': profile['followersCount'],
            'follows_count': profile['followsCount'],
            'business_category': profile['businessCategoryName'],
            'description': profile['description'],
            'embeddings': emb,
        }]).execute()
        print('Done')
    except Exception as e:
        # Report the reason; the original discarded the exception.
        print(f'Failed: {e}')


--------------------------------------------------
Processing profile 001/91
Done
--------------------------------------------------
Processing profile 002/91
Done
--------------------------------------------------
Processing profile 003/91
Done
--------------------------------------------------
Processing profile 004/91
Done
--------------------------------------------------
Processing profile 005/91
Done
--------------------------------------------------
Processing profile 006/91
Done
--------------------------------------------------
Processing profile 007/91
Done
--------------------------------------------------
Processing profile 008/91
Done
--------------------------------------------------
Processing profile 009/91
Done
--------------------------------------------------
Processing profile 010/91
Done
--------------------------------------------------
Processing profile 011/91
Done
--------------------------------------------------
Processing profile 012/91
Done
----------------

In [39]:
# Sanity check: every stored URL must be the canonical Instagram URL of its username.
# Prints 1 when all profiles match (or the list is empty) and 0 otherwise.
url_matches = [
    profile['url'] == f'https://www.instagram.com/{profile["username"]}/'
    for profile in df
]
ans = int(all(url_matches))
print(ans)

1
