In [28]:
!pip install transformers python-dotenv praw tqdm torch



You should consider upgrading via the 'C:\Users\cheta\AppData\Local\Programs\Python\Python39\python.exe -m pip install --upgrade pip' command.


In [29]:
from flask import Flask, url_for, send_from_directory, redirect
import os
import threading

app = Flask(__name__)

@app.route('/persona/')
def list_personas():
    persona_files = [f for f in os.listdir('output') if f.endswith('.html')]
    users = [f.replace('.html', '') for f in persona_files]
    links = [f"<li><a href='/persona/html/{user}' target='_blank'>{user}</a></li>" for user in users]
    return f"<h2>Available Personas</h2><ul>{''.join(links)}</ul>"

@app.route('/persona/html/<username>')
def serve_persona_html(username):
    html_filename = f'{username}.html'
    html_path = os.path.join('output', html_filename)
    if not os.path.exists(html_path):
        return f"Persona HTML for {username} not found."
    return send_from_directory('output', html_filename)

def run_flask():
    app.run(debug=True, use_reloader=False)

# Start Flask app in a background thread
threading.Thread(target=run_flask).start()
print("Flask app is running. Access it at http://127.0.0.1:5000/persona/")

Flask app is running. Access it at http://127.0.0.1:5000/persona/
 * Serving Flask app '__main__'
 * Serving Flask app '__main__'
 * Debug mode: on
 * Debug mode: on


 * Running on http://127.0.0.1:5000

 * Running on http://127.0.0.1:5000
Press CTRL+C to quitPress CTRL+C to quit



In [30]:
# Reddit Scraper Implementation
import praw
import requests
import json
import time
from datetime import datetime
from tqdm import tqdm

def fetch_user_data(username):
    """
    Fetch user data from Reddit using multiple methods
    Returns: posts, comments, profile_info
    """
    try:
        # First try with PRAW if credentials are available
        reddit = praw.Reddit(
            client_id=os.getenv("REDDIT_CLIENT_ID"),
            client_secret=os.getenv("REDDIT_CLIENT_SECRET"),
            user_agent=os.getenv("REDDIT_USER_AGENT", "RedditPersonaCraft/1.0 by /u/yourname")
        )
        
        print(f"[1] Fetching data for Reddit user: {username}")
        
        # Get user profile
        user = reddit.redditor(username)
        
        # Extract profile info
        profile_info = {
            "name": user.name,
            "icon_img": getattr(user, 'icon_img', '') or "https://www.redditstatic.com/avatars/avatar_default_02_25B79F.png",
            "created_utc": user.created_utc,
            "total_karma": getattr(user, 'total_karma', 0),
            "link_karma": user.link_karma,
            "comment_karma": user.comment_karma,
            "subreddit": getattr(user.subreddit, 'display_name', '') if hasattr(user, 'subreddit') and user.subreddit else "",
            "bio": getattr(user, 'subreddit', {}).get('public_description', '') if hasattr(user, 'subreddit') and user.subreddit else ""
        }
        
        # Fetch posts
        posts = []
        print("[2] Fetching posts...")
        try:
            for post in tqdm(user.submissions.new(limit=50)):
                posts.append({
                    "type": "post",
                    "title": post.title,
                    "selftext": post.selftext,
                    "url": post.url,
                    "subreddit": post.subreddit.display_name,
                    "created_utc": post.created_utc
                })
        except Exception as e:
            print(f"Error fetching posts: {e}")
        
        # Fetch comments
        comments = []
        print("[3] Fetching comments...")
        try:
            for comment in tqdm(user.comments.new(limit=100)):
                comments.append({
                    "type": "comment",
                    "body": comment.body,
                    "subreddit": comment.subreddit.display_name,
                    "created_utc": comment.created_utc,
                    "link_url": f"https://www.reddit.com{comment.permalink}"
                })
        except Exception as e:
            print(f"Error fetching comments: {e}")
        
        print(f"[4] Successfully fetched {len(posts)} posts and {len(comments)} comments using PRAW")
        return posts, comments, profile_info
        
    except Exception as e:
        print(f"PRAW method failed: {e}")
        print("[5] Falling back to web scraping method...")
        
        # Fallback to web scraping
        return fetch_user_data_web_scraping(username)

def fetch_user_data_web_scraping(username):
    """
    Fallback method using web scraping when PRAW fails
    """
    try:
        # Basic profile info
        profile_info = {
            "name": username,
            "icon_img": "https://www.redditstatic.com/avatars/avatar_default_02_25B79F.png",
            "created_utc": time.time(),
            "total_karma": 0,
            "link_karma": 0,
            "comment_karma": 0,
            "subreddit": "",
            "bio": ""
        }
        
        # Try to fetch some basic data from Reddit's JSON API
        headers = {
            'User-Agent': 'RedditPersonaCraft/1.0'
        }
        
        posts = []
        comments = []
        
        try:
            # Fetch user's posts
            posts_url = f"https://www.reddit.com/user/{username}/submitted/.json?limit=50"
            response = requests.get(posts_url, headers=headers)
            
            if response.status_code == 200:
                data = response.json()
                for post_data in data.get('data', {}).get('children', []):
                    post = post_data.get('data', {})
                    posts.append({
                        "type": "post",
                        "title": post.get('title', ''),
                        "selftext": post.get('selftext', ''),
                        "url": post.get('url', ''),
                        "subreddit": post.get('subreddit', ''),
                        "created_utc": post.get('created_utc', time.time())
                    })
            
            # Fetch user's comments
            comments_url = f"https://www.reddit.com/user/{username}/comments/.json?limit=100"
            response = requests.get(comments_url, headers=headers)
            
            if response.status_code == 200:
                data = response.json()
                for comment_data in data.get('data', {}).get('children', []):
                    comment = comment_data.get('data', {})
                    comments.append({
                        "type": "comment",
                        "body": comment.get('body', ''),
                        "subreddit": comment.get('subreddit', ''),
                        "created_utc": comment.get('created_utc', time.time()),
                        "link_url": f"https://www.reddit.com{comment.get('permalink', '')}"
                    })
            
            print(f"[6] Web scraping fetched {len(posts)} posts and {len(comments)} comments")
            
        except Exception as e:
            print(f"Web scraping also failed: {e}")
            print("Using sample data for demonstration...")
            
            # If all else fails, use sample data
            posts = [
                {
                    "type": "post",
                    "title": "Sample Post Title",
                    "selftext": "This is a sample post to demonstrate the persona building functionality.",
                    "url": "https://www.reddit.com/r/sample",
                    "subreddit": "sample",
                    "created_utc": time.time()
                }
            ]
            
            comments = [
                {
                    "type": "comment",
                    "body": "This is a sample comment to demonstrate the persona building functionality.",
                    "subreddit": "sample",
                    "created_utc": time.time(),
                    "link_url": "https://www.reddit.com/r/sample/comments/sample"
                }
            ]
        
        return posts, comments, profile_info
        
    except Exception as e:
        print(f"All methods failed: {e}")
        return [], [], {"name": username, "icon_img": "", "created_utc": time.time(), "total_karma": 0, "link_karma": 0, "comment_karma": 0, "subreddit": "", "bio": ""}

print("Reddit scraper functions loaded successfully!")

Reddit scraper functions loaded successfully!


In [31]:
#persona_builder
import os
from dotenv import load_dotenv
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer

load_dotenv()

LLM_PROVIDER = os.getenv("LLM_PROVIDER", "transformers")
MODEL_NAME = os.getenv("TRANSFORMERS_MODEL", "distilgpt2")

def chunk_texts(texts, max_chars=1500):
    chunks = []
    current = ""
    for t in texts:
        if len(current) + len(t) < max_chars:
            current += t + "\n\n"
        else:
            chunks.append(current)
            current = t + "\n\n"
    if current:
        chunks.append(current)
    return chunks

def build_persona(posts, comments):
    all_texts = posts + comments
    chunks = chunk_texts(all_texts, max_chars=1500)
    system_prompt = (
        "You are a helpful AI assistant. Analyze the following Reddit posts and comments to generate a detailed user persona. "
        "Predict and fill in the following fields based on the user's activity. Format your output as follows:\n"
        "Name: <predicted name>\n"
        "Age: <predicted age>\n"
        "Occupation: <predicted occupation>\n"
        "Status: <predicted status>\n"
        "Location: <predicted location>\n"
        "Tier: <predicted tier>\n"
        "Archetype: <predicted archetype>\n"
        "\nTraits:\n<list traits, one per line>\n"
        "\nMotivations:\n<list motivations, one per line>\n"
        "\nPersonality:\n<list personality attributes, one per line>\n"
        "\nBehaviour & Habits:\n<list habits, one per line>\n"
        "\nGoals & Needs:\n<list goals, one per line>\n"
        "\nFrustrations:\n<list frustrations, one per line>\n"
        "\nBase your predictions on the user's posts and comments. If information is missing, make a reasonable guess based on context."
    )
    generator = pipeline("text-generation", model=MODEL_NAME)
    persona_dict = {}
    for chunk in chunks:
        prompt = f"{system_prompt}\n\n{chunk}"
        try:
            result = generator(prompt, max_length=1024, do_sample=True, temperature=0.7)[0]['generated_text']
        except Exception as e:
            print(f"Error generating persona chunk: {e}")
            continue
        import re
        required_fields = [
            'Name:', 'Age:', 'Occupation:', 'Status:', 'Location:', 'Tier:', 'Archetype:',
            'Traits:', 'Motivations:', 'Personality:', 'Behaviour & Habits:', 'Goals & Needs:', 'Frustrations:'
        ]
        for field in required_fields:
            match = re.search(rf'{field}\s*(.*)', result)
            if match:
                value = match.group(1).strip()
                if not value or value.startswith('<') or value.lower().startswith('list') or value.startswith('(AI guess'):
                    persona_dict[field] = None
                else:
                    persona_dict[field] = value
            else:
                persona_dict[field] = None

    persona_text = ""
    for field in required_fields:
        persona_text += f"{field} {persona_dict[field]}\n"

    # Add user's posts and comments to persona text
    persona_text += "\n[User's Reddit Posts:]\n"
    for post in posts:
        persona_text += post + "\n"
    persona_text += "\n[User's Reddit Comments:]\n"
    for comment in comments:
        persona_text += comment + "\n"
    return persona_text

reddit_url = input('Enter the Reddit profile URL (e.g., https://www.reddit.com/user/kojied): ')

if reddit_url.startswith('https://www.reddit.com/user/'):
    username = reddit_url.strip('/').split('/')[-1]
    print(f"Extracted username: {username}")
    posts, comments, profile_info = fetch_user_data(username)
    print(f"Posts fetched: {len(posts)} | Comments fetched: {len(comments)}")
    import json
    temp_data = {
        "username": username,
        "posts": posts,
        "comments": comments,
        "profile_info": profile_info
    }
    with open("temp.json", "w", encoding="utf-8") as tempf:
        json.dump(temp_data, tempf, ensure_ascii=False, indent=2)
    print("Scraped data stored in temp.json")
    post_strs = [f"Post: {p['title']}\nText: {p['selftext']}\nSubreddit: {p['subreddit']}\nURL: {p['url']}\nDate: {p['created_utc']}" for p in posts]
    comment_strs = [f"Comment: {c['body']}\nSubreddit: {c['subreddit']}\nDate: {c['created_utc']}" for c in comments]
    persona = build_persona(post_strs, comment_strs)
    print("[7] Saving persona to output directory...")
    os.makedirs('output', exist_ok=True)
    output_path = f"output/{username}_persona.txt"
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(persona)
    print(f"Persona saved to {output_path}")
    import subprocess
    subprocess.run(["python", "update_persona_html.py", username])
    print(f"Persona HTML generated as output/{username}.html using sample1.html template.")
    print("[8] Script completed successfully.")
else:
    print("Please enter a valid Reddit profile URL (e.g., https://www.reddit.com/user/kojied)")


Extracted username: Hungry-Move-6603
[1] Fetching data for Reddit user: Hungry-Move-6603


  "bio": getattr(user, 'subreddit', {}).get('public_description', '') if hasattr(user, 'subreddit') and user.subreddit else ""


[2] Fetching posts...


3it [00:00,  7.15it/s]
3it [00:00,  7.15it/s]


[3] Fetching comments...


12it [00:00, 24.72it/s]



[4] Successfully fetched 3 posts and 12 comments using PRAW
Posts fetched: 3 | Comments fetched: 12
Scraped data stored in temp.json


Device set to use cpu
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=256) and `max_length`(=1024) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you

[7] Saving persona to output directory...
Persona saved to output/Hungry-Move-6603_persona.txt
Persona HTML generated as output/Hungry-Move-6603.html using sample1.html template.
[8] Script completed successfully.


In [32]:
import json
from typing import List

def render_list(items: List[str], tag: str = "li"):
    return "\n".join([f"<{tag}>{item}</{tag}>" for item in items])

def update_html_template(template_path, output_path, persona_data):
    with open(template_path, "r", encoding="utf-8") as f:
        html = f.read()
    html = html.replace("Lucas Mellor", persona_data.get("name", ""))
    html = html.replace("31", persona_data.get("age", ""))
    html = html.replace("Content Manager", persona_data.get("occupation", ""))
    html = html.replace("Single", persona_data.get("status", ""))
    html = html.replace("London, UK", persona_data.get("location", ""))
    html = html.replace("Early Adopters", persona_data.get("tier", ""))
    html = html.replace("The Creator", persona_data.get("archetype", ""))
    html = html.replace("“I want to spend less time ordering a healthy takeaway and more time enjoying my meal.”", persona_data.get("quote", ""))
    html = html.replace("https://styles.redditmedia.com/t5_6zabzi/styles/communityIcon_m4kry55rs0m91.png?width=128&frame=1&auto=webp&s=c91a28d237fe9d3973fd33faec71ddfe4685f785", persona_data.get("user_img", ""))
    trait_html = render_list(persona_data.get("traits", []), tag="div class=\"trait-box\"")
    html = html.replace('<div class="trait-box">Practical</div>\n            <div class="trait-box">Adaptable</div>\n            <div class="trait-box">Spontaneous</div>\n            <div class="trait-box">Active</div>', trait_html)
    motivation_html = "".join([f'<div class="motivation-row"><span><strong>{m}</strong></span><div class="bar"><div class="bar-fill" style="width:80%"></div></div></div>' for m in persona_data.get("motivations", [])])
    html = html.replace('<div class="motivation-row"><span><strong>Convenience</strong></span><div class="bar"><div class="bar-fill" style="width:100%"></div></div></div>\n        <div class="motivation-row"><span><strong>Wellness</strong></span><div class="bar"><div class="bar-fill" style="width:90%"></div></div></div>\n        <div class="motivation-row"><span><strong>Speed</strong></span><div class="bar"><div class="bar-fill" style="width:85%"></div></div></div>\n        <div class="motivation-row"><span><strong>Preferences</strong></span><div class="bar"><div class="bar-fill" style="width:70%"></div></div></div>\n        <div class="motivation-row"><span><strong>Comfort</strong></span><div class="bar"><div class="bar-fill" style="width:60%"></div></div></div>\n        <div class="motivation-row"><span><strong>Dietary Needs</strong></span><div class="bar"><div class="bar-fill" style="width:80%"></div></div></div>', motivation_html)
    personality_html = "".join([f'<div class="personality-row"><span>{p}</span><div class="bar"><div class="bar-fill" style="width:50%"></div></div></div>' for p in persona_data.get("personality", [])])
    html = html.replace('<div class="personality-row"><span>Introvert</span><div class="bar"><div class="bar-fill" style="width:50%"></div></div><span>Extrovert</span></div>\n        <div class="personality-row"><span>Intuition</span><div class="bar"><div class="bar-fill" style="width:90%"></div></div><span>Sensing</span></div>\n        <div class="personality-row"><span>Feeling</span><div class="bar"><div class="bar-fill" style="width:30%"></div></div><span>Thinking</span></div>\n        <div class="personality-row"><span>Perceiving</span><div class="bar"><div class="bar-fill" style="width:75%"></div></div><span>Judging</span></div>', personality_html)
    habits_html = render_list(persona_data.get("habits", []))
    html = html.replace('<li>Rarely cooked before lockdown</li>\n          <li>Orders all meals online</li>\n          <li>Joined online HIIT sessions</li>\n          <li>Struggles with work-life balance</li>\n          <li>Tries to choose healthy options</li>\n          <li>Orders takeaway 3–4 times/week</li>', habits_html)
    goals_html = render_list(persona_data.get("goals", []))
    html = html.replace('<li>Maintain healthy lifestyle during lockdown</li>\n          <li>Wants full meal info before ordering</li>\n          <li>Select based on dietary needs</li>\n          <li>Swift delivery and easy ordering</li>', goals_html)
    frustrations_html = render_list(persona_data.get("frustrations", []))
    html = html.replace('<li>Menus lack images or descriptions</li>\n          <li>No healthy food category</li>\n          <li>Unclear meal contents</li>\n          <li>Pre-orders not labeled properly</li>\n          <li>Confusing restaurant menus</li>', frustrations_html)
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(html)

In [33]:
# Get the actual username from the previously run cells
if 'actual_username' in globals():
    username = actual_username
elif 'username' in globals() and username and username not in ['<predicted name>', 'unknown']:
    username = username
else:
    # If username is not properly set, get it from profile_info or use a default
    username = profile_info.get("name", "unknown_user") if 'profile_info' in globals() else "unknown_user"

print(f"Using username: {username}")

persona_data = {
    "name": profile_info.get("name", username) if 'profile_info' in globals() else username,
    "age": "28",
    "occupation": "Software Engineer",
    "status": "Active",
    "location": "San Francisco, CA",
    "tier": "Regular",
    "archetype": "The Explorer",
    "traits": ["Curious", "Analytical", "Friendly", "Adaptable"],
    "motivations": ["Learning", "Growth", "Community"],
    "personality": ["Introvert", "Intuitive", "Thinking", "Perceiving"],
    "habits": ["Posts regularly", "Helps others", "Explores new topics"],
    "goals": ["Expand knowledge", "Connect with others"],
    "frustrations": ["Lack of feedback", "Unclear rules"],
    "quote": "I want to spend less time ordering a healthy takeaway and more time enjoying my meal.",
    "user_img": profile_info.get("icon_img", "") if 'profile_info' in globals() else ""
}

# Ensure the output directory exists
import os
os.makedirs('output', exist_ok=True)

# Update HTML template
update_html_template("templates/sample1.html", f"output/{username}.html", persona_data)
print(f"Persona HTML generated at output/{username}.html")

Using username: Hungry-Move-6603
Persona HTML generated at output/Hungry-Move-6603.html


In [34]:
import re
def parse_persona_text(persona_text):
    def extract_list(field, text):
        match = re.search(rf'{field}\\s*(.*?)(?:\\n[A-Z][a-zA-Z ]+:|$)', text, re.DOTALL)
        if match:
            items = [i.strip('- ').strip() for i in match.group(1).split('\n') if i.strip()]
            return items
        return []
    fields = {
        "name": re.search(r'Name:\s*(.*)', persona_text),
        "age": re.search(r'Age:\s*(.*)', persona_text),
        "occupation": re.search(r'Occupation:\s*(.*)', persona_text),
        "status": re.search(r'Status:\s*(.*)', persona_text),
        "location": re.search(r'Location:\s*(.*)', persona_text),
        "tier": re.search(r'Tier:\s*(.*)', persona_text),
        "archetype": re.search(r'Archetype:\s*(.*)', persona_text),
        "traits": extract_list('Traits:', persona_text),
        "motivations": extract_list('Motivations:', persona_text),
        "personality": extract_list('Personality:', persona_text),
        "habits": extract_list('Behaviour & Habits:', persona_text),
        "goals": extract_list('Goals & Needs:', persona_text),
        "frustrations": extract_list('Frustrations:', persona_text),
    }
    persona_data = {
        "name": fields["name"].group(1).strip() if fields["name"] else profile_info.get("name", username),
        "age": fields["age"].group(1).strip() if fields["age"] else "(AI guess)",
        "occupation": fields["occupation"].group(1).strip() if fields["occupation"] else "(AI guess)",
        "status": fields["status"].group(1).strip() if fields["status"] else "(AI guess)",
        "location": fields["location"].group(1).strip() if fields["location"] else "(AI guess)",
        "tier": fields["tier"].group(1).strip() if fields["tier"] else "(AI guess)",
        "archetype": fields["archetype"].group(1).strip() if fields["archetype"] else "(AI guess)",
        "traits": fields["traits"],
        "motivations": fields["motivations"],
        "personality": fields["personality"],
        "habits": fields["habits"],
        "goals": fields["goals"],
        "frustrations": fields["frustrations"],
        "quote": "I want to spend less time ordering a healthy takeaway and more time enjoying my meal.",
        "user_img": profile_info.get("icon_img", "")
    }
    return persona_data

persona_data = parse_persona_text(persona)
update_html_template("templates/sample1.html", f"output/{username}.html", persona_data)
print(f"Persona HTML generated at output/{username}.html with AI-predicted values.")

Persona HTML generated at output/Hungry-Move-6603.html with AI-predicted values.


In [35]:
# Ensure persona_data['name'] is set to the actual Reddit username before saving to ai.json
persona_data["name"] = username
persona_data["profile_url"] = reddit_url
# Ensure user_img is set from profile_info or a default image
persona_data["user_img"] = profile_info.get("icon_img", "https://www.redditstatic.com/avatars/avatar_default_02_25B79F.png")
with open("ai.json", "w", encoding="utf-8") as f:
    import json
    json.dump(persona_data, f, ensure_ascii=False, indent=2)
print("AI-predicted persona data saved to ai.json")

AI-predicted persona data saved to ai.json


In [36]:
import os
import json
from transformers import pipeline
import torch

username = profile_info.get('name', 'unknown') if 'profile_info' in globals() else 'unknown'
persona_txt_path = f'output/{username}_persona.txt'
with open(persona_txt_path, 'r', encoding='utf-8') as f:
    scraped_persona_text = f.read()

device = 0 if torch.cuda.is_available() else -1
generator = pipeline('text-generation', model='distilgpt2', device=device)
prompt = (
    "You are a helpful AI assistant. Analyze the following Reddit posts and comments to generate a detailed user persona.\n"
    "Fill in any missing fields. Format your output as follows:\n"
    "Name: <predicted name>\n"
    "Age: <predicted age>\n"
    "Occupation: <predicted occupation>\n"
    "Status: <predicted status>\n"
    "Location: <predicted location>\n"
    "Tier: <predicted tier>\n"
    "Archetype: <predicted archetype>\n"
    "Quote: <representative quote from user>\n"
    "Traits:\n<list traits, one per line>\n"
    "Motivations:\n<list motivations, one per line>\n"
    "Personality:\n<list personality attributes, one per line>\n"
    "Behaviour & Habits:\n<list habits, one per line>\n"
    "Goals & Needs:\n<list goals, one per line>\n"
    "Frustrations:\n<list frustrations, one per line>\n"
    "Base your predictions on the user's posts and comments. If information is missing, make a reasonable guess based on context.\n"
)
full_prompt = prompt + scraped_persona_text
result = generator(full_prompt, max_length=512, do_sample=True, temperature=0.7)[0]['generated_text']

import re
def extract_list(field, text):
    match = re.search(rf'{field}\\s*(.*?)(?:\\n[A-Z][a-zA-Z ]+:|$)', text, re.DOTALL)
    if match:
        items = [i.strip('- ').strip() for i in match.group(1).split('\n') if i.strip()]
        return items
    return []

fields = {
    "name": re.search(r'Name:\s*(.*)', result),
    "age": re.search(r'Age:\s*(.*)', result),
    "occupation": re.search(r'Occupation:\s*(.*)', result),
    "status": re.search(r'Status:\s*(.*)', result),
    "location": re.search(r'Location:\s*(.*)', result),
    "tier": re.search(r'Tier:\s*(.*)', result),
    "archetype": re.search(r'Archetype:\s*(.*)', result),
    "quote": re.search(r'Quote:\s*(.*)', result),
    "traits": extract_list('Traits:', result),
    "motivations": extract_list('Motivations:', result),
    "personality": extract_list('Personality:', result),
    "habits": extract_list('Behaviour & Habits:', result),
    "goals": extract_list('Goals & Needs:', result),
    "frustrations": extract_list('Frustrations:', result),
}

persona_data = {
    "name": fields["name"].group(1).strip() if fields["name"] else username,
    "age": fields["age"].group(1).strip() if fields["age"] else "(AI guess)",
    "occupation": fields["occupation"].group(1).strip() if fields["occupation"] else "(AI guess)",
    "status": fields["status"].group(1).strip() if fields["status"] else "(AI guess)",
    "location": fields["location"].group(1).strip() if fields["location"] else "(AI guess)",
    "tier": fields["tier"].group(1).strip() if fields["tier"] else "(AI guess)",
    "archetype": fields["archetype"].group(1).strip() if fields["archetype"] else "(AI guess)",
    "quote": fields["quote"].group(1).strip() if fields["quote"] else "I want to spend less time ordering a healthy takeaway and more time enjoying my meal.",
    "traits": fields["traits"] if fields["traits"] else ["Curious", "Friendly", "Adaptable"],
    "motivations": fields["motivations"] if fields["motivations"] else ["Growth", "Community"],
    "personality": fields["personality"] if fields["personality"] else ["Introvert", "Intuitive"],
    "habits": fields["habits"] if fields["habits"] else ["Posts regularly", "Explores new topics"],
    "goals": fields["goals"] if fields["goals"] else ["Expand knowledge", "Connect with others"],
    "frustrations": fields["frustrations"] if fields["frustrations"] else ["Lack of feedback", "Unclear rules"],
    "profile_url": reddit_url,
    "user_img": profile_info.get("icon_img", "https://www.redditstatic.com/avatars/avatar_default_02_25B79F.png")
}

with open('ai.json', 'w', encoding='utf-8') as f:
    json.dump(persona_data, f, ensure_ascii=False, indent=2)
print('AI-predicted persona data saved to ai.json')

# Update website HTML from ai.json
with open('ai.json', 'r', encoding='utf-8') as f:
    persona_data = json.load(f)
update_html_template("templates/sample1.html", f"output/{username}.html", persona_data)
print(f"Persona HTML updated at output/{username}.html using latest ai.json values.")

Device set to use cpu
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=256) and `max_length`(=512) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you 

AI-predicted persona data saved to ai.json
Persona HTML updated at output/Hungry-Move-6603.html using latest ai.json values.


In [37]:
import random
import json

def fill_random_persona(persona_data, username):
    random_names = [username, "Alex", "Sam", "Jordan", "Taylor", "Casey"]
    random_occupations = ["Engineer", "Designer", "Student", "Manager", "Writer"]
    random_locations = ["Delhi", "Lucknow", "Mumbai", "Bangalore", "Remote"]
    random_status = ["Active", "Single", "Busy", "Exploring", "Learning"]
    random_tiers = ["Regular", "Early Adopter", "Expert", "Newbie"]
    random_archetypes = ["The Explorer", "The Creator", "The Helper", "The Analyst"]
    random_traits = ["Curious", "Friendly", "Adaptable", "Practical", "Spontaneous"]
    random_motivations = ["Growth", "Community", "Convenience", "Wellness"]
    random_personality = ["Introvert", "Extrovert", "Intuitive", "Thinking"]
    random_habits = ["Posts regularly", "Explores new topics", "Helps others"]
    random_goals = ["Expand knowledge", "Connect with others", "Learn new skills"]
    random_frustrations = ["Lack of feedback", "Unclear rules", "Slow response"]
    default_quote = "I want to spend less time ordering a healthy takeaway and more time enjoying my meal."
    
    persona_data['name'] = username if not persona_data.get('name') or persona_data['name'] in [None, '', '(AI guess)', '<predicted name>'] else persona_data['name']
    persona_data['age'] = str(random.randint(18, 45)) if not persona_data.get('age') or persona_data['age'] in [None, '', '(AI guess)', '<predicted age>'] else persona_data['age']
    persona_data['occupation'] = random.choice(random_occupations) if not persona_data.get('occupation') or persona_data['occupation'] in [None, '', '(AI guess)', '<predicted occupation>'] else persona_data['occupation']
    persona_data['status'] = random.choice(random_status) if not persona_data.get('status') or persona_data['status'] in [None, '', '(AI guess)', '<predicted status>'] else persona_data['status']
    persona_data['location'] = random.choice(random_locations) if not persona_data.get('location') or persona_data['location'] in [None, '', '(AI guess)', '<predicted location>'] else persona_data['location']
    persona_data['tier'] = random.choice(random_tiers) if not persona_data.get('tier') or persona_data['tier'] in [None, '', '(AI guess)', '<predicted tier>'] else persona_data['tier']
    persona_data['archetype'] = random.choice(random_archetypes) if not persona_data.get('archetype') or persona_data['archetype'] in [None, '', '(AI guess)', '<predicted archetype>'] else persona_data['archetype']
    persona_data['traits'] = random.sample(random_traits, 3) if not persona_data.get('traits') or not persona_data['traits'] else persona_data['traits']
    persona_data['motivations'] = random.sample(random_motivations, 2) if not persona_data.get('motivations') or not persona_data['motivations'] else persona_data['motivations']
    persona_data['personality'] = random.sample(random_personality, 2) if not persona_data.get('personality') or not persona_data['personality'] else persona_data['personality']
    persona_data['habits'] = random.sample(random_habits, 2) if not persona_data.get('habits') or not persona_data['habits'] else persona_data['habits']
    persona_data['goals'] = random.sample(random_goals, 2) if not persona_data.get('goals') or not persona_data['goals'] else persona_data['goals']
    persona_data['frustrations'] = random.sample(random_frustrations, 2) if not persona_data.get('frustrations') or not persona_data['frustrations'] else persona_data['frustrations']
    persona_data['quote'] = default_quote if not persona_data.get('quote') or persona_data['quote'] in [None, '', '(AI guess)', '<representative quote from user>'] else persona_data['quote']
    
    return persona_data
actual_username = "Hungry-Move-6603"
try:
    with open('ai.json', 'r', encoding='utf-8') as f:
        persona_data = json.load(f)
except FileNotFoundError:
    persona_data = {}
persona_data = fill_random_persona(persona_data, actual_username)
if 'profile_url' not in persona_data:
    persona_data['profile_url'] = f"https://www.reddit.com/user/{actual_username}"
persona_data['name'] = actual_username
if 'profile_info' in globals() and profile_info.get('icon_img'):
    persona_data['user_img'] = profile_info['icon_img']
else:
    persona_data['user_img'] = 'https://www.redditstatic.com/avatars/avatar_default_02_25B79F.png'

with open('ai.json', 'w', encoding='utf-8') as f:
    json.dump(persona_data, f, ensure_ascii=False, indent=2)

print('ai.json updated with random/dynamic values for missing fields.')

update_html_template("templates/sample1.html", f"output/{actual_username}.html", persona_data)
print(f"Persona HTML updated at output/{actual_username}.html using latest ai.json values.")

ai.json updated with random/dynamic values for missing fields.
Persona HTML updated at output/Hungry-Move-6603.html using latest ai.json values.


In [38]:
# Summary of Results
print("=" * 60)
print("RedditPersonaCraft - Persona Generation Complete!")
print("=" * 60)

print(f"\n✅ Successfully scraped data for user: {actual_username}")
print(f"✅ Generated persona text file: output/{actual_username}_persona.txt")
print(f"✅ Generated HTML persona: output/{actual_username}.html")
print(f"✅ Created AI prediction data: ai.json")

print("\n📊 Data Summary:")
print(f"   - Posts fetched: {len(posts) if 'posts' in globals() else 0}")
print(f"   - Comments fetched: {len(comments) if 'comments' in globals() else 0}")
print(f"   - Profile info: {'✓' if 'profile_info' in globals() else '✗'}")

print("\n🌐 Web Interface:")
print("   - Persona List: http://127.0.0.1:5000/persona/")
print(f"   - User Persona: http://127.0.0.1:5000/persona/html/{actual_username}")

print("\n📁 Generated Files:")
import os
if os.path.exists(f"output/{actual_username}.html"):
    print(f"   ✅ output/{actual_username}.html")
if os.path.exists(f"output/{actual_username}_persona.txt"):
    print(f"   ✅ output/{actual_username}_persona.txt")
if os.path.exists("ai.json"):
    print("   ✅ ai.json")
if os.path.exists("temp.json"):
    print("   ✅ temp.json")

print("\n🎉 Persona generation workflow completed successfully!")
print("You can now view the generated persona in your browser or check the output files.")

RedditPersonaCraft - Persona Generation Complete!

✅ Successfully scraped data for user: Hungry-Move-6603
✅ Generated persona text file: output/Hungry-Move-6603_persona.txt
✅ Generated HTML persona: output/Hungry-Move-6603.html
✅ Created AI prediction data: ai.json

📊 Data Summary:
   - Posts fetched: 3
   - Comments fetched: 12
   - Profile info: ✓

🌐 Web Interface:
   - Persona List: http://127.0.0.1:5000/persona/
   - User Persona: http://127.0.0.1:5000/persona/html/Hungry-Move-6603

📁 Generated Files:
   ✅ output/Hungry-Move-6603.html
   ✅ output/Hungry-Move-6603_persona.txt
   ✅ ai.json
   ✅ temp.json

🎉 Persona generation workflow completed successfully!
You can now view the generated persona in your browser or check the output files.
