In [3]:
import pandas as pd
import json

# Read the JSON Lines file: every non-empty line is one standalone JSON object.
records = []
with open('mindset.jl', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        # An empty line (e.g. a stray blank line at the end of the file) is not a
        # JSON document and json.loads('') raises, so skip it instead of crashing.
        if not line:
            continue
        records.append(json.loads(line))

df = pd.DataFrame(records)

# Keep only the users whose `label` contains the condition of interest.
# NOTE: despite its name, `df_adhd` holds the users matching `condition`
# ('ptsd' here); the name is kept unchanged so existing references keep working.
# Presumably `label` is a list of condition names (the displayed output shows lists).
condition = 'ptsd'
df_adhd = df[df['label'].apply(lambda x: condition in x)]

# One row per (user, post): `posts` holds a list of post dicts per user.
# NOTE(review): a user with an empty `posts` list explodes to a single NaN row;
# confirm whether such placeholder rows should be dropped here.
data = df_adhd.explode('posts').reset_index(drop=True)

# Expand each post dict into its own columns and glue them next to the
# user-level columns (both sides share the same fresh 0..n-1 index).
posts_normalized = pd.json_normalize(data['posts'])
keep_cols = [col for col in data.columns if col != 'posts']
data = pd.concat([data[keep_cols].reset_index(drop=True), posts_normalized], axis=1)
data.head()

Unnamed: 0,username,label,created_at,subreddit,id,title,score,selftext
0,124890,"[bipolar, ocd, anxiety, autism, ptsd, depressi...",1661153709,GeneralHospital,wun2j9,now that spencer admitted his love for trina..,2,how long do you all think it ll be before the ...
1,124890,"[bipolar, ocd, anxiety, autism, ptsd, depressi...",1658631113,SkincareAddiction,w6l9l8,can i use good molecules discoloration correct...,4,i have been dealing with hormonal acne due to ...
2,124890,"[bipolar, ocd, anxiety, autism, ptsd, depressi...",1654939984,DipPowderNails,v9u0ul,is white dip usually hard to work with or was ...,1,"i dipped my nails for the first time today, an..."
3,124890,"[bipolar, ocd, anxiety, autism, ptsd, depressi...",1654582077,scientology,v6o2yt,South Park Scientology episode End Credits,38,i noticed that they used the same two names fo...
4,124890,"[bipolar, ocd, anxiety, autism, ptsd, depressi...",1653978258,SkincareAddiction,v1jusy,[Skin Concerns] what to do/use for breakouts d...,3,i have always had clear but dry skin. but ever...


In [4]:
# Load the mapping whose keys are user ids and whose values are lists of posts
# (each post is a dict that carries at least `created_utc`).
# JSON is read as UTF-8 explicitly so the result does not depend on the
# platform's default encoding (consistent with how 'mindset.jl' is opened).
# NOTE: `map` shadows the built-in of the same name; the name is kept because
# a later cell looks users up in this dict under that name.
with open('ptsd_diagnosed_posts_map.json', encoding='utf-8') as f:
    map = json.load(f)

# One record per user that has at least one post in the map.
# Assumption (unchanged from the original): the first post in each list is the
# relevant diagnosis post.
diagnosis_data = [
    {
        'user_id': int(user_id),
        'diagnosis_timestamp': posts[0]['created_utc'],
    }
    for user_id, posts in map.items()
    if posts
]

# Passing `columns` keeps the expected schema even when no user qualifies,
# so the later merge on `user_id` does not fail on a column-less frame.
diagnosis_df = pd.DataFrame(diagnosis_data, columns=['user_id', 'diagnosis_timestamp'])
diagnosis_df.head()

Unnamed: 0,user_id,diagnosis_timestamp
0,236306,1643811026
1,547022,1596086102
2,220717,1715904688
3,544079,1624955284
4,161818,1688122939


In [6]:
# Bring both join keys to the same dtype (str) so the merge compares like with like.
data['username'] = data['username'].astype(str)
diagnosis_df['user_id'] = diagnosis_df['user_id'].astype(str)

# Left join: every post row is kept; users without an entry in `diagnosis_df`
# get a missing `diagnosis_timestamp`.
diagnosis_lookup = diagnosis_df[['user_id', 'diagnosis_timestamp']]
df_with_diagnosis = pd.merge(
    data,
    diagnosis_lookup,
    how='left',
    left_on='username',
    right_on='user_id',
)

# `user_id` duplicates `username` after the join, so it is removed again.
df_with_diagnosis = df_with_diagnosis.drop(columns='user_id')

In [7]:
# Keep only posts of users that have a diagnosis timestamp, then turn both
# epoch-second columns into pandas datetimes (column positions are unchanged).
df_with_diagnosis = (
    df_with_diagnosis
    .dropna(subset=['diagnosis_timestamp'])
    .assign(
        diagnosis_timestamp=lambda frame: pd.to_datetime(frame['diagnosis_timestamp'], unit='s'),
        created_at=lambda frame: pd.to_datetime(frame['created_at'], unit='s'),
    )
)

In [8]:
# Build a per-user timeline: username -> list of post records ordered by
# creation time, with the user's diagnosis post (taken from `map`) added and
# flagged via `is_diagnosis_post`.
user_posts_map = {}

for username, group in df_with_diagnosis.groupby('username'):
    # This user's posts from the merged frame (not yet sorted); none of them is
    # flagged as the diagnosis post.
    posts_list = group[['created_at', 'subreddit', 'title', 'selftext', 'diagnosis_timestamp']].copy()
    posts_list['is_diagnosis_post'] = False

    # Look the user up in the diagnosis map; a missing user or an empty list
    # means there is nothing to add.
    diagnosis_posts = map.get(username)
    if diagnosis_posts:
        # Same convention as when `diagnosis_df` was built: the first post wins.
        diag_post = diagnosis_posts[0]
        diagnosis_post_entry = {
            'created_at': pd.to_datetime(diag_post['created_utc'], unit='s'),
            'subreddit': diag_post.get('subreddit', ''),
            'title': diag_post.get('title', ''),
            'selftext': diag_post.get('selftext', ''),
            'diagnosis_timestamp': posts_list['diagnosis_timestamp'].iloc[0],
            'is_diagnosis_post': True
        }
        # NOTE(review): if the user's posts already contain the diagnosis post,
        # it will now appear twice (once unflagged) - confirm against the data.
        posts_list = pd.concat([posts_list, pd.DataFrame([diagnosis_post_entry])], ignore_index=True)

    # Chronological order. A stable sort keeps posts that share a timestamp in
    # their existing relative order, so the appended diagnosis post always comes
    # after an equally-timestamped regular post instead of landing arbitrarily.
    posts_list = posts_list.sort_values('created_at', kind='mergesort').reset_index(drop=True)

    # Convert to list of dicts
    user_posts_map[username] = posts_list.to_dict('records')

print(f"Total users: {len(user_posts_map)}")

# Preview the first user; guard against an empty map so the cell does not
# fail with an IndexError when no user has a diagnosis timestamp.
sample_user = next(iter(user_posts_map), None)
if sample_user is None:
    print("\nNo users available - nothing to preview.")
else:
    print("\nSample entry:")
    print(f"User: {sample_user}")
    print(f"Number of posts: {len(user_posts_map[sample_user])}")
    print("\nPosts (sorted by date):")
    for i, post in enumerate(user_posts_map[sample_user][:5]):
        print(f"  {i+1}. {post['created_at']} - Diagnosis post: {post['is_diagnosis_post']}")

Total users: 1226

Sample entry:
User: 100555
Number of posts: 68

Posts (sorted by date):
  1. 2019-06-04 04:03:15 - Diagnosis post: False
  2. 2019-06-04 05:42:57 - Diagnosis post: False
  3. 2019-06-04 23:12:59 - Diagnosis post: False
  4. 2019-06-05 16:33:44 - Diagnosis post: False
  5. 2019-06-05 16:55:19 - Diagnosis post: False


In [10]:
# Flatten the username -> posts mapping into one row per post, with the
# username as the leading column.
rows = [
    {'username': username, **post}
    for username, posts in user_posts_map.items()
    for post in posts
]
output_df = pd.DataFrame(rows)

# Only the first 10 users (in the map's iteration order) are written out.
limited_usernames = list(user_posts_map)[:10]
is_limited_user = output_df['username'].isin(limited_usernames)
output_df_limited = output_df[is_limited_user]
output_df_limited.to_csv('ptsd_user_posts_map.csv', index=False)