In [1]:
import json
import os

from lib.util import load_file

In [2]:
def fix_json_file(file_path):
    """Rewrite a one-JSON-value-per-line file as a single JSON array, in place.

    Every non-blank line that does not start with ``#`` is stripped and kept
    as one array element; the elements are joined with ``,`` and wrapped in
    ``[`` / ``]``. The result overwrites ``file_path``.

    Behaviour worth knowing:
      * Blank lines are skipped. Joining them would emit empty array elements
        and make the output invalid JSON.
      * If the file content already parses as a JSON array it is left
        untouched, so running this twice does not wrap the data a second
        time. (A file whose entire content is one JSON array is therefore
        never wrapped.)
      * The file is read and written as UTF-8 rather than the platform
        default encoding.

    Args:
        file_path: Path of the file to rewrite.

    Returns:
        None. The outcome is reported with ``print``. I/O and decoding errors
        are caught and printed rather than raised.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()

        # Idempotence guard: an already-wrapped file must not be wrapped again.
        try:
            already_array = isinstance(json.loads(content), list)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            already_array = False
        if already_array:
            print(f"{file_path} is already a JSON array; nothing to do")
            return

        # Split on '\n' only (text mode already normalised other newlines);
        # str.splitlines() would also split on characters such as U+2028,
        # which may legally appear inside a JSON string.
        records = []
        for raw_line in content.split('\n'):
            stripped = raw_line.strip()
            if stripped and not stripped.startswith('#'):
                records.append(stripped)

        fixed_json = '[\n' + ',\n'.join(records) + '\n]'

        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(fixed_json)

        print(f"Fixed JSON file saved to {file_path}")

    except (OSError, UnicodeError) as e:
        # Best-effort helper: report expected file problems instead of raising.
        print(f"An error occurred: {e}")

In [3]:
# fix_json_file('pheme-raw/annotations/en-scheme-annotations.json')

def create_and_classify_annotations():
    """Load the scheme annotations and index them by integer tweet ID.

    Reads ``pheme-raw/annotations/en-scheme-annotations.json`` through
    ``load_file`` (presumably yielding a sequence of dict records — confirm
    against ``lib.util``), prints how many records were loaded, and returns a
    dict mapping ``int(record["tweetid"])`` to a dict with the keys
    ``class``, ``support``, ``responsetype-vs-source`` and
    ``responsetype-vs-previous``.

    The ``class`` label is derived from which annotation fields are truthy:
      * ``support`` set                          -> "source"
      * both response-type fields set            -> "deep reply"
      * only ``responsetype-vs-source`` set      -> "direct reply"
      * none of the above                        -> "unknown"
    """
    records = load_file("pheme-raw/annotations/en-scheme-annotations.json")
    print(f"Number of annotations: {len(records)}")

    def label_for(support, vs_source, vs_previous):
        # Guard clauses, checked in priority order.
        if support:
            return "source"
        if not vs_source:
            return "unknown"
        return "deep reply" if vs_previous else "direct reply"

    lookup = {}
    for record in records:
        # Key is tweet ID: int
        tweet_id = int(record["tweetid"])
        support = record.get("support")
        vs_source = record.get("responsetype-vs-source")
        vs_previous = record.get("responsetype-vs-previous")
        lookup[tweet_id] = {
            "class": label_for(support, vs_source, vs_previous),
            "support": support,
            "responsetype-vs-source": vs_source,
            "responsetype-vs-previous": vs_previous,
        }

    return lookup

# Module-level lookup (int tweet ID -> annotation fields) built once here;
# TweetParser looks tweets up in this mapping by name.
annotations_dict = create_and_classify_annotations()

Number of annotations: 4560


In [6]:
import json
import pandas as pd

class TweetParser:
    """Collect one flat row per tweet for an event and export the rows to CSV.

    Args:
        event_name: Event identifier; used as the CSV file name.
        output_dir: Directory the CSV is written to. It is created on export
            if it does not exist yet.
        annotations: Optional mapping of tweet ID -> annotation dict (with the
            keys ``class``, ``support``, ``responsetype-vs-source`` and
            ``responsetype-vs-previous``). When omitted, the module-level
            ``annotations_dict`` is used, which is the original behaviour.
    """

    def __init__(self, event_name, output_dir="data/tweets", annotations=None):
        self.event = event_name
        self.data = []
        self.output_dir = output_dir
        # None means "fall back to the notebook-level annotations_dict".
        self.annotations = annotations

    def _annotation_for(self, tweet_id):
        """Return the annotation entry for ``tweet_id``, or ``{}`` if absent."""
        source = self.annotations if self.annotations is not None else annotations_dict
        return source.get(tweet_id, {})

    def append(self, tweet, thread_id):
        """Flatten one tweet into a row and store it in ``self.data``.

        Args:
            tweet: Tweet dict; must provide the keys read below (``id``,
                ``favorite_count``, ``retweeted``, ``retweet_count``,
                ``in_reply_to_user_id``, ``favorited``, ``user['id']``,
                ``created_at``, ``place``).
            thread_id: Identifier of the thread the tweet belongs to; stored
                as given.

        Raises:
            KeyError: If ``tweet`` lacks one of the required keys.
        """
        tweet_id = tweet['id']
        # Single lookup; tweets without an annotation are labelled 'retweet'.
        annotation = self._annotation_for(tweet_id)
        tweet_data = {
            "tweet_id": tweet_id,
            "thread_id": thread_id,
            "class": annotation.get('class', 'retweet'),
            "support": annotation.get('support'),
            "responsetype-vs-source": annotation.get('responsetype-vs-source'),
            "responsetype-vs-previous": annotation.get('responsetype-vs-previous'),
            "favorite_count": tweet['favorite_count'],
            "retweeted": tweet['retweeted'],
            "retweet_count": tweet['retweet_count'],
            "in_reply_to_user_id": tweet['in_reply_to_user_id'],
            "favorited": tweet['favorited'],
            "user_id": tweet['user']['id'],
            "created_at": tweet['created_at'],
            "place": tweet['place'],
        }

        self.data.append(tweet_data)

    def export(self):
        """Write the collected rows to ``<output_dir>/<event>.csv``.

        Returns:
            The path of the written CSV file.
        """
        output_file = f"{self.output_dir}/{self.event}.csv"
        # to_csv does not create missing directories, so create it up front.
        if self.output_dir:
            os.makedirs(self.output_dir, exist_ok=True)
        df = pd.DataFrame(data=self.data)
        df.to_csv(output_file, index=False)
        return output_file

    def print_data_info(self, limit=None):
        """Print the collected rows as indented JSON, then the total count.

        Args:
            limit: If given, print only the first ``limit`` rows. The total
                printed at the end always counts every collected row.
        """
        rows = self.data if limit is None else self.data[:limit]
        for tweet_data in rows:
            print(json.dumps(tweet_data, indent=4))

        print(f"Total tweets: {len(self.data)}")

In [9]:
import time
import os

def pheme_to_csv(event, Parser=TweetParser, output="data/tweets",
                 threads_root="pheme-raw/threads/en"):
    """Export the retweets of every thread of one event to a CSV file.

    Walks ``<threads_root>/<event>/<thread>/retweets.json`` for each thread
    entry, feeds every retweet to the parser, exports the CSV, prints the
    elapsed time and then prints the collected rows.

    Only ``retweets.json`` is read at the moment: parsing of source tweets
    and reactions is disabled (see the commented-out code below).

    Args:
        event: Event directory name under ``threads_root``.
        Parser: Class instantiated as ``Parser(event, output_dir=output)``;
            it must provide ``append(tweet, thread_id)``, ``export()`` and
            ``print_data_info()``.
        output: Directory handed to the parser for the CSV.
        threads_root: Directory that contains one sub-directory per event.

    Returns:
        None.
    """
    start = time.time()
    parser = Parser(event, output_dir=output)
    path = threads_root

    # os.listdir returns entries in arbitrary order; sorting keeps the CSV
    # row order reproducible between runs and machines.
    for thread in sorted(os.listdir(f"{path}/{event}")):

        # Disabled: source tweets and reactions are not exported right now.
        # for source_tweet in os.listdir(f"{path}/{event}/{thread}/source-tweets"):   # json 
        #     src_twt = load_file(f"{path}/{event}/{thread}/source-tweets/{source_tweet}")
        #     parser.append(src_twt, thread)

        # for reaction in os.listdir(f"{path}/{event}/{thread}/reactions"):
        #     tweet = load_file(f"{path}/{event}/{thread}/reactions/{reaction}")
        #     parser.append(tweet, thread)

        retweets_file = f"{path}/{event}/{thread}/retweets.json"
        # Entries without a retweets.json (including stray files) are skipped.
        if os.path.exists(retweets_file):
            # fix_json_file(retweets_file)
            retweets = load_file(retweets_file)
            for retweet in retweets:
                parser.append(retweet, thread)

    output_file = parser.export()
    print(f"{output_file} was generated in {(time.time() - start) / 60} minutes")

    parser.print_data_info()

# Export the "ebola-essien" event with the default parser and output directory.
pheme_to_csv("ebola-essien")

data/tweets/ebola-essien.csv was generated in 0.0006623029708862305 minutes
{
    "tweet_id": 521457341490012160,
    "thread_id": "521346721226711040",
    "class": "retweet",
    "support": null,
    "responsetype-vs-source": null,
    "responsetype-vs-previous": null,
    "favorite_count": 0,
    "retweeted": false,
    "retweet_count": 575,
    "in_reply_to_user_id": null,
    "favorited": false,
    "user_id": 2603092341,
    "created_at": "Mon Oct 13 00:28:13 +0000 2014",
    "place": null
}
{
    "tweet_id": 521545470670163968,
    "thread_id": "521346721226711040",
    "class": "retweet",
    "support": null,
    "responsetype-vs-source": null,
    "responsetype-vs-previous": null,
    "favorite_count": 0,
    "retweeted": false,
    "retweet_count": 575,
    "in_reply_to_user_id": null,
    "favorited": false,
    "user_id": 583974728,
    "created_at": "Mon Oct 13 06:18:24 +0000 2014",
    "place": null
}
{
    "tweet_id": 521560712754528257,
    "thread_id": "521346721226711