### Pull in larger format data

In [1]:
# Imports
import json
import pandas as pd
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup  # Added for HTML cleaning

In [11]:
# Import required libraries
import pandas as pd
import json
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns

# Read the bug report data.
# The encoding is passed explicitly so the read does not depend on the
# platform's locale default: JSON is UTF-8, and the post bodies contain
# non-ASCII characters (curly quotes, ellipses) that would otherwise be
# mis-decoded on systems whose default encoding is not UTF-8.
with open('../data/bug_report_small.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# One flat record per post; each record also carries its discussion's title
posts_data = []

# Process each discussion
for i, discussion in enumerate(data['discussions']):
    discussion_title = discussion.get('title', '')
    post_stream = discussion.get('post_stream')

    # Sanity check on the first two discussions: show what the payload looks like
    if i < 2:
        print(f"\nDiscussion {i}:")
        print("Title:", discussion.get('title'))
        print("post_stream type:", type(post_stream))

    # Only a dict with a non-null 'posts' entry is usable. Anything else
    # (missing key, null, empty dict, unexpected type) is reported and skipped
    # instead of raising halfway through the load.
    stream_posts = post_stream.get('posts') if isinstance(post_stream, dict) else None
    if stream_posts is None:
        print(f"Warning: No valid post_stream for discussion titled: {discussion_title}")
        continue

    for post in stream_posts:
        posts_data.append({
            'post_discussion_title': discussion_title,
            'post_number': post.get('post_number', ''),
            'post_content': post.get('cooked', ''),  # 'cooked' contains the post content
            'created_at': post.get('created_at', ''),
            'username': post.get('username', ''),
            'post_type': post.get('post_type', ''),
            'like_count': post.get('like_count', 0),
            'reads': post.get('reads', 0)
        })

# Create DataFrame
df = pd.DataFrame(posts_data)

# Convert timestamp to datetime if we have any data
# (an empty frame has no 'created_at' column to convert)
if not df.empty:
    df['created_at'] = pd.to_datetime(df['created_at'])

# Display basic information about the dataset
print("\nDataset Info:")
print(f"Number of discussions: {len(data['discussions'])}")
print(f"Number of posts: {len(df)}")
print("\nDataFrame Head:")
df.head()


Discussion 0:
Title: Long context mode gone in newest update
post_stream type: <class 'dict'>

Discussion 1:
Title: Cursor removing Itself?
post_stream type: <class 'dict'>

Dataset Info:
Number of discussions: 20
Number of posts: 360

DataFrame Head:


Unnamed: 0,post_discussion_title,post_number,post_content,created_at,username,post_type,like_count,reads
0,Long context mode gone in newest update,1,<p>I was using the long context mode quite a l...,2024-11-24 11:40:28.914000+00:00,Dryec,1,0,289
1,Long context mode gone in newest update,2,<p>Pro user here. I’ve noticed it’s gone for m...,2024-11-24 12:11:19.442000+00:00,saeedmahmud,1,0,284
2,Long context mode gone in newest update,4,<p>Missing it too…</p>,2024-11-24 15:17:38.616000+00:00,mtf,1,0,278
3,Long context mode gone in newest update,5,<p>Without Long Context Cursor would lose its ...,2024-11-24 22:37:22.014000+00:00,fun_strange,1,0,259
4,Long context mode gone in newest update,6,"<p>Pro heavy user here, got everyone I know ex...",2024-11-25 09:31:30.920000+00:00,one1zero1one,1,0,243


In [13]:
# Lookup table: numeric forum category id -> readable category name.
# Built from (id, name) pairs; insertion order follows the list below.
CATEGORY_MAP = dict(
    [
        (6, 'bug-report'),
        (5, 'feature-request'),
        (4, 'general'),
        (7, 'feedback'),
        (8, 'help'),
    ]
)

In [14]:
# Read the bug report data.
# The encoding is passed explicitly so the read does not depend on the
# platform's locale default (the post bodies contain non-ASCII characters).
with open('../data/bug_report_small.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# One record per post; every record repeats its discussion's features
posts = []

# Process each discussion
for thread in data['discussions']:
    # The `or` fallbacks in this loop cover keys that are present but null in
    # the JSON, which a plain .get(key, default) would pass through as None.
    tags = thread.get('tags') or []
    thread_url = f"https://forum.cursor.com/t/{thread.get('slug')}/{thread.get('id')}"

    # Discussion-level features (shared across all posts in thread)
    discussion_features = {
        'post_discussion_id': thread.get('id'),
        'post_discussion_title': thread.get('title'),
        'post_discussion_created_at': pd.to_datetime(thread.get('created_at')),
        'post_discussion_views': thread.get('views'),
        'post_discussion_reply_count': thread.get('posts_count'),
        'post_discussion_like_count': thread.get('like_count'),
        'post_discussion_participant_count': thread.get('participant_count'),
        'post_discussion_word_count': float(thread.get('word_count') or 0),
        'post_category_id': thread.get('category_id'),
        # Category ids missing from CATEGORY_MAP yield None here
        'post_category_name': CATEGORY_MAP.get(thread.get('category_id')),
        'post_discussion_tags': tags,
        'post_discussion_url': thread_url,
    }

    # Spread the first four tags into fixed columns tag1..tag4, padding with None
    for i in range(4):
        discussion_features[f'tag{i+1}'] = tags[i] if i < len(tags) else None

    # A missing, null, or non-dict post_stream is treated as "no posts" so one
    # malformed discussion cannot abort the whole load.
    post_stream = thread.get('post_stream')
    thread_posts = post_stream.get('posts') if isinstance(post_stream, dict) else None

    # Post-level features
    for post in thread_posts or []:
        raw_html = post.get('cooked', '')
        post_data = {
            **discussion_features,  # Include all discussion features
            'post_id': post.get('id'),
            'post_author': post.get('username'),
            'post_author_id': post.get('user_id'),
            # Kept as the raw timestamp string (not parsed, unlike
            # post_discussion_created_at); use pd.to_datetime downstream if needed
            'post_created_at': post.get('created_at'),
            # Plain-text rendering of the post HTML; a null body becomes ''
            'post_content': BeautifulSoup(raw_html or '', 'html.parser').get_text(),
            'post_content_raw': raw_html,
            'post_read_count': post.get('reads', 0),
            'post_reply_count': post.get('reply_count', 0),
            'post_number': post.get('post_number'),
            # Stored as 0.0 / 1.0; a missing or null flag counts as not accepted
            'accepted_answer_post': float(post.get('accepted_answer') or False),
            'post_url': f"{thread_url}/{post.get('post_number')}",
        }
        posts.append(post_data)

df = pd.DataFrame(posts)

# Display basic information about the dataset
print("Dataset Info:")
print(f"Number of discussions: {len(data['discussions'])}")
print(f"Number of posts: {len(df)}")
print("\nDataFrame Head:")
df.head()

Dataset Info:
Number of discussions: 20
Number of posts: 360

DataFrame Head:


Unnamed: 0,post_discussion_id,post_discussion_title,post_discussion_created_at,post_discussion_views,post_discussion_reply_count,post_discussion_like_count,post_discussion_participant_count,post_discussion_word_count,post_category_id,post_category_name,...,post_author,post_author_id,post_created_at,post_content,post_content_raw,post_read_count,post_reply_count,post_number,accepted_answer_post,post_url
0,29449,Long context mode gone in newest update,2024-11-24 11:40:28.870000+00:00,2851,59,280,42,3534.0,6,bug-report,...,Dryec,14170,2024-11-24T11:40:28.914Z,I was using the long context mode quite a lot ...,<p>I was using the long context mode quite a l...,289,0,1,0.0,https://forum.cursor.com/t/long-context-mode-g...
1,29449,Long context mode gone in newest update,2024-11-24 11:40:28.870000+00:00,2851,59,280,42,3534.0,6,bug-report,...,saeedmahmud,574,2024-11-24T12:11:19.442Z,Pro user here. I’ve noticed it’s gone for me a...,<p>Pro user here. I’ve noticed it’s gone for m...,284,0,2,0.0,https://forum.cursor.com/t/long-context-mode-g...
2,29449,Long context mode gone in newest update,2024-11-24 11:40:28.870000+00:00,2851,59,280,42,3534.0,6,bug-report,...,mtf,952,2024-11-24T15:17:38.616Z,Missing it too…,<p>Missing it too…</p>,278,0,4,0.0,https://forum.cursor.com/t/long-context-mode-g...
3,29449,Long context mode gone in newest update,2024-11-24 11:40:28.870000+00:00,2851,59,280,42,3534.0,6,bug-report,...,fun_strange,1646,2024-11-24T22:37:22.014Z,Without Long Context Cursor would lose its use...,<p>Without Long Context Cursor would lose its ...,259,0,5,0.0,https://forum.cursor.com/t/long-context-mode-g...
4,29449,Long context mode gone in newest update,2024-11-24 11:40:28.870000+00:00,2851,59,280,42,3534.0,6,bug-report,...,one1zero1one,14196,2024-11-25T09:31:30.920Z,"Pro heavy user here, got everyone I know excit...","<p>Pro heavy user here, got everyone I know ex...",243,1,6,0.0,https://forum.cursor.com/t/long-context-mode-g...
