This notebook helps identify the main files we are working with and the important data they contain.

How many unique tweets are viewed by users in the dataset?<br>
    Among them, how many tweets are valid?<br>
    Produce a user-viewed, content-valid, unique, GPT-predicted dataset of tweets. <br>
    => This is the dataset we want to work with.


In [6]:
import json
import re

import pandas as pd
from langdetect import DetectorFactory, LangDetectException, detect

In [2]:
# Drop exposure rows whose '3s' flag is not 1, i.e. tweets that were not
# viewed for 3 seconds or more.
# NOTE(review): the filtered frame is written back over the same CSV it was
# read from, so the unfiltered rows are gone after the first run (later runs
# report "Removed 0 rows") — confirm this overwrite is intended.
df = pd.read_csv('../csvs/exposure_features.csv')
df_filtered = df.loc[df['3s'].eq(1)]

df_filtered.to_csv('../csvs/exposure_features.csv', index=False)

# Report how many rows the filter kept and removed.
print(f"Original dataset had {len(df)} rows")
print(f"After filtering for 3s=1: {len(df_filtered)} rows")
print(f"Removed {len(df) - len(df_filtered)} rows")


Original dataset had 1401643 rows
After filtering for 3s=1: 1401643 rows
Removed 0 rows


In [8]:
# Print the first 100 lines of tweets.json as a sample of the record format.
# The encoding is pinned to UTF-8 so that decoding the tweet text does not
# depend on the platform's default locale encoding.
with open('../tweets.json', 'r', encoding='utf-8') as file:
    for i, line in enumerate(file):
        if i >= 100:
            # Stop early; the rest of the file is not needed for a sample.
            break
        print(line.strip())

{"position": 0, "sortIndex": "1810472417504526336", "text": "", "is_retweet": false, "card": {}, "quoting": {}, "tweet_id": "messageprompt-premium-plus-upsell-prompt", "batch_id": "00e9dbc788164dd79b3f1e2c8f12ecbd", "user_id": "C26E207D61F94497895EF589F2E6CF26"}
{"position": 1, "sortIndex": "1810472417504526334", "text": "", "is_retweet": false, "card": {}, "quoting": {}, "tweet_id": "promoted-tweet-1720034014164070423-a74eb45854f9d6d", "batch_id": "00e9dbc788164dd79b3f1e2c8f12ecbd", "user_id": "C26E207D61F94497895EF589F2E6CF26"}
{"position": 2, "sortIndex": "1810472417504526333", "text": "SKETCH DID WHAT??? https://t.co/xe9fKFsMXq", "is_retweet": false, "card": {}, "quoting": {}, "tweet_id": "tweet-1810210153385578638", "bookmark_count": 10910, "favorite_count": 119495, "lang": "en", "reply_count": 1399, "retweet_count": 2452, "is_quote_status": false, "quote_count": 182, "conversation_id_str": "1810210153385578638", "created_at": "Mon Jul 08 07:11:47 +0000 2024", "media": [{"type": "

In [14]:
# Walk through tweets.json and print a progress marker every 10,000 lines,
# which gives a rough sense of how many lines the file holds.
# The encoding is pinned to UTF-8 so that decoding does not depend on the
# platform's default locale encoding.
with open('../tweets.json', 'r', encoding='utf-8') as file:
    for i, _ in enumerate(file):
        if i % 10000 == 0:
            print(f"Line: {i}")


Line: 0
Line: 10000
Line: 20000
Line: 30000
Line: 40000
Line: 50000
Line: 60000
Line: 70000
Line: 80000
Line: 90000
Line: 100000
Line: 110000
Line: 120000
Line: 130000
Line: 140000
Line: 150000
Line: 160000
Line: 170000
Line: 180000
Line: 190000
Line: 200000
Line: 210000
Line: 220000
Line: 230000
Line: 240000
Line: 250000
Line: 260000
Line: 270000
Line: 280000
Line: 290000
Line: 300000
Line: 310000
Line: 320000
Line: 330000
Line: 340000
Line: 350000
Line: 360000
Line: 370000
Line: 380000
Line: 390000
Line: 400000
Line: 410000
Line: 420000
Line: 430000
Line: 440000
Line: 450000
Line: 460000
Line: 470000
Line: 480000
Line: 490000
Line: 500000
Line: 510000
Line: 520000
Line: 530000
Line: 540000
Line: 550000
Line: 560000
Line: 570000
Line: 580000
Line: 590000
Line: 600000
Line: 610000
Line: 620000
Line: 630000
Line: 640000
Line: 650000
Line: 660000
Line: 670000
Line: 680000
Line: 690000
Line: 700000
Line: 710000
Line: 720000
Line: 730000
Line: 740000
Line: 750000
Line: 760000
Line: 770000


KeyboardInterrupt: 

In [None]:
# Create a set of tweet ids to work with
#
# Builds `unique_tweet_ids`: the set of unique numeric tweet ids that pass
# every filter. A tweet is excluded, with its reasons recorded in
# `excluded_tweet_ids` (keyed by the raw tweet_id string), when any of the
# following holds:
# -> its tweet_id contains no run of 10+ digits            ('invalid_id')
# -> its text has 6 words or fewer                         ('short')
# -> its text is not detected as English, or langdetect
#    cannot process it (e.g. empty text)                   ('non_english')
# -> its numeric id is not in df_filtered's
#    'tweet_id_numeric' column                             ('not_viewed')
# Lines that cannot be parsed into an object with 'tweet_id' and 'text'
# are counted in `skipped_count` and otherwise ignored.

# langdetect is non-deterministic unless seeded; fix the seed so that the
# same input file always yields the same included/excluded sets.
DetectorFactory.seed = 0

unique_tweet_ids = set()
excluded_tweet_ids = dict()
# Every numeric id that has already been classified (included OR excluded).
# The exclusion dict is keyed by the raw string id, so it cannot be used to
# recognise a numeric id seen before; without this set, each repeat of an
# excluded tweet would go through language detection again.
seen_tweet_ids = set()
viewed_tweet_ids = set(df_filtered['tweet_id_numeric'].values)
skipped_count = 0

# Pin the encoding so decoding does not depend on the platform locale.
with open('../tweets.json', 'r', encoding='utf-8') as file:
    for i, line in enumerate(file):
        if i % 10000 == 0:
            print(f"Processed {i} lines. Included: {len(unique_tweet_ids)}, Excluded: {len(excluded_tweet_ids)}")
            print(f"Skipped lines due to Json error: {skipped_count}")

        try:
            line_object = json.loads(line)
            tweet_id_raw = line_object['tweet_id']
            line_text = line_object['text']
        except (json.JSONDecodeError, KeyError, TypeError):
            # TypeError covers lines that are valid JSON but not an object
            # (e.g. a bare number or list), which cannot be indexed by key.
            skipped_count += 1
            continue

        # Extract tweet_id: the first standalone run of 10 or more digits.
        match = re.search(r'\b\d{10,}\b', tweet_id_raw)
        if not match:
            excluded_tweet_ids[tweet_id_raw] = {'invalid_id': 1}
            continue

        tweet_id_numeric = int(match.group())

        # Skip duplicates: each numeric id is classified exactly once.
        if tweet_id_numeric in seen_tweet_ids:
            continue
        seen_tweet_ids.add(tweet_id_numeric)

        # Exclusion reasons (a tweet may accumulate more than one).
        reason = {}

        if len(line_text.split()) <= 6:
            reason['short'] = 1

        try:
            if detect(line_text) != 'en':
                reason['non_english'] = 1
        except LangDetectException:
            # Raised when langdetect cannot process the text;
            # such tweets are treated as non-English.
            reason['non_english'] = 1

        if tweet_id_numeric not in viewed_tweet_ids:
            reason['not_viewed'] = 1

        if reason:
            excluded_tweet_ids[tweet_id_raw] = reason
        else:
            unique_tweet_ids.add(tweet_id_numeric)

print("\nFinished processing.")
print(f"Total included tweets: {len(unique_tweet_ids)}")
print(f"Total excluded tweets: {len(excluded_tweet_ids)}")
print(f"Skipped lines due to Json error: {skipped_count}")

# Convert exclusions for analysis: one row per excluded raw id, one column
# per reason (NaN where a reason does not apply).
excluded_tweets_df = pd.DataFrame([
    {'tweet_id': tid, **reason}
    for tid, reason in excluded_tweet_ids.items()
])
sample_excluded = list(excluded_tweet_ids.items())[:10]
print("\nSample of excluded tweets and reasons:")
for tid, reason in sample_excluded:
    print(f"Tweet ID: {tid}, Reason: {reason}")


Processed 0 lines. Included: 0, Excluded: 0
Skipped 0 lines due to invalid tweet_id format.
Processed 10000 lines. Included: 1414, Excluded: 3557
Processed 20000 lines. Included: 2437, Excluded: 6260
Processed 30000 lines. Included: 3331, Excluded: 9009
Processed 40000 lines. Included: 4342, Excluded: 11673
Processed 50000 lines. Included: 5314, Excluded: 14310
Processed 60000 lines. Included: 6199, Excluded: 17085
Processed 70000 lines. Included: 7331, Excluded: 20089
Processed 80000 lines. Included: 8268, Excluded: 22839
Processed 90000 lines. Included: 9349, Excluded: 25762
Processed 100000 lines. Included: 10152, Excluded: 28267
Processed 110000 lines. Included: 11076, Excluded: 30711
Processed 120000 lines. Included: 12167, Excluded: 33566
Processed 130000 lines. Included: 13006, Excluded: 36522
Processed 140000 lines. Included: 14020, Excluded: 39445
Processed 150000 lines. Included: 14888, Excluded: 41695
Processed 160000 lines. Included: 15820, Excluded: 44035
Processed 170000 

KeyboardInterrupt: 

In [3]:
# Show the dtype of the 'tweet_id_numeric' column in the filtered exposure
# data, to check which type the ids must be compared as.
print("\nType of tweet_id in exposure_features.csv:")
tweet_id_column = df_filtered['tweet_id_numeric']
print(tweet_id_column.dtype)


Type of tweet_id in exposure_features.csv:
int64
