In [None]:
%%capture
! pip install twscrape
! pip install pandas

## Authentication

In [None]:
import subprocess

credentials = './fajarshiddiqqq_credentials.txt' # NOTE: Change this to the path of your credentials file

# Register the accounts listed in the credentials file with twscrape.
# The last argument describes how the colon-separated fields on each line of
# that file are laid out.
process = subprocess.run(
    ['twscrape', 'add_accounts', credentials, 'username:password:email:email_password:_:_'],
    capture_output=True,
    text=True,
)
stdout, stderr = process.stdout, process.stderr

# Non-empty stderr is not treated as failure on its own: command-line tools
# commonly write log lines there even when they succeed. Show both streams and
# rely on the exit code to report an actual error.
if stdout:
    print(stdout)
if stderr:
    print(stderr)
if process.returncode != 0:
    print(f"twscrape add_accounts exited with code {process.returncode}")

In [None]:
import subprocess

# Log in every account that was previously registered with twscrape.
process = subprocess.run(
    ['twscrape', 'login_accounts'],
    capture_output=True,
    text=True,
)
stdout, stderr = process.stdout, process.stderr

# Print both streams: hiding stdout whenever stderr is non-empty would drop the
# command's regular output as soon as a single log line is written.
if stdout:
    print(stdout)
if stderr:
    print(stderr)
if process.returncode != 0:
    print(f"twscrape login_accounts exited with code {process.returncode}")

## Scraping Methods

In [11]:
import subprocess
import json

def get_nested_value(data_dict, nested_key):
    """
    Look up a value inside nested dictionaries using a dot-separated path.

    Args:
        data_dict: The outermost object to search (normally a dict).
        nested_key: Path such as "user.username"; each dot-separated part is
            used as a key one level deeper.

    Returns:
        The value found at the end of the path. None is returned when a key is
        missing or when a non-dict object is reached while parts of the path
        remain.
    """
    current = data_dict
    for part in nested_key.split("."):
        # Cannot descend any further once the current level is not a dict.
        if not isinstance(current, dict):
            return None
        current = current.get(part)
    return current

def tweet_details(tweet_id: str):
    """
    Fetch one tweet through the ``twscrape tweet_details`` command and return a
    trimmed-down record of it.

    Args:
        tweet_id: ID of the tweet to look up. It is converted with ``str()``
            before being passed to the command line.

    Returns:
        A dict with three keys:
        - "status": True when a tweet was parsed, False otherwise.
        - "meta": "success", or a description of what went wrong (the
          command's stderr text, the start-up error, or a parse error).
        - "data": dict with "tweet_id", "date", "username" and "rawContent",
          or None on failure.

    Available fields:
    id, id_str, url, date, user.id, user.id_str, user.url, user.username, user.displayname, user.rawDescription, user.created, user.followersCount, user.friendsCount, user.statusesCount, user.favouritesCount, user.listedCount, user.mediaCount, user.location, user.profileImageUrl, user.profileBannerUrl, user.protected, user.verified, user.blue, user.blueType, user.descriptionLinks, user._type, lang, rawContent, replyCount, retweetCount, likeCount, quoteCount, conversationId, conversationIdStr, hashtags, cashtags, mentionedUsers, links, viewCount, retweetedTweet, quotedTweet, place, coordinates, inReplyToTweetId, inReplyToTweetIdStr, inReplyToUser, source, sourceUrl, sourceLabel, media.photos, media.videos, media.animated, _type
    """
    try:
        completed = subprocess.run(
            ["twscrape", "tweet_details", str(tweet_id)],
            capture_output=True,
            text=True,
        )
    except OSError as exc:
        # The executable could not be started (for example, it is not installed).
        return {"status": False, "meta": str(exc), "data": None}

    # Only the first non-blank line of output is used.
    first_line = next(
        (line.strip() for line in completed.stdout.split("\n") if line.strip()),
        "",
    )

    # Failure is decided by the exit code and by missing output. Text on stderr
    # alone is not a failure, since it may just be log output next to a valid
    # result on stdout.
    if completed.returncode != 0 or not first_line:
        reason = completed.stderr.strip() or (
            f"twscrape produced no output (exit code {completed.returncode})"
        )
        return {"status": False, "meta": reason, "data": None}

    try:
        tweet_json = json.loads(first_line)
    except json.JSONDecodeError:
        # Non-JSON output, e.g. a plain-text message instead of a tweet record.
        return {
            "status": False,
            "meta": f"unexpected twscrape output: {first_line!r}",
            "data": None,
        }

    selected_fields = {
        "tweet_id": tweet_json.get("id_str"),
        "date": tweet_json.get("date"),
        "username": get_nested_value(tweet_json, "user.username"),
        "rawContent": tweet_json.get("rawContent"),
    }
    return {"status": True, "meta": "success", "data": selected_fields}

def tweet_replies(tweet_id: str, limit=5):
    """
    Fetch replies to a tweet through the ``twscrape tweet_replies`` command.

    Args:
        tweet_id: ID of the tweet whose replies are requested. It is converted
            with ``str()`` before being passed to the command line.
        limit: Value for the command's ``--limit`` option. Values below 2 are
            raised to 2.

    Returns:
        A dict with three keys:
        - "status": True when the command succeeded and its output was parsed,
          False otherwise.
        - "meta": "success" when replies were parsed; when the command
          succeeded without printing any reply, the stderr text if there is
          any, else "no replies returned"; on failure, a description of what
          went wrong.
        - "data": list of dicts with "comment_id", "date", "rawContent" and
          "tweet_id" (taken from the reply's inReplyToTweetIdStr); an empty
          list when nothing was returned; None on failure.

    Available fields:
    id, id_str, url, date, user.id, user.id_str, user.url, user.username, user.displayname, user.rawDescription, user.created, user.followersCount, user.friendsCount, user.statusesCount, user.favouritesCount, user.listedCount, user.mediaCount, user.location, user.profileImageUrl, user.profileBannerUrl, user.protected, user.verified, user.blue, user.blueType, user.descriptionLinks, user._type, lang, rawContent, replyCount, retweetCount, likeCount, quoteCount, conversationId, conversationIdStr, hashtags, cashtags, mentionedUsers, links, viewCount, retweetedTweet, quotedTweet, place, coordinates, inReplyToTweetId, inReplyToTweetIdStr, inReplyToUser.id, inReplyToUser.id_str, inReplyToUser.username, inReplyToUser.displayname, inReplyToUser._type, source, sourceUrl, sourceLabel, media.photos, media.videos, media.animated, _type
    """
    limit = max(limit, 2)
    try:
        completed = subprocess.run(
            ["twscrape", "tweet_replies", str(tweet_id), f"--limit={limit}"],
            capture_output=True,
            text=True,
        )
    except OSError as exc:
        # The executable could not be started (for example, it is not installed).
        return {"status": False, "meta": str(exc), "data": None}

    # Failure is decided by the exit code. Text on stderr alone is not a
    # failure, since it may just be log output next to valid results on stdout.
    if completed.returncode != 0:
        reason = completed.stderr.strip() or (
            f"twscrape exited with code {completed.returncode}"
        )
        return {"status": False, "meta": reason, "data": None}

    tweet_comments_json = []
    # One JSON document is expected per output line; blank lines are skipped.
    for raw_line in completed.stdout.split("\n"):
        line = raw_line.strip()
        if not line:
            continue
        try:
            comment = json.loads(line)
        except json.JSONDecodeError:
            return {
                "status": False,
                "meta": f"unexpected twscrape output: {line!r}",
                "data": None,
            }
        tweet_comments_json.append(
            {
                "comment_id": comment.get("id_str"),
                "date": comment.get("date"),
                "rawContent": comment.get("rawContent"),
                "tweet_id": comment.get("inReplyToTweetIdStr"),
            }
        )

    if not tweet_comments_json:
        # The command ran but printed no reply; keep any stderr text so the
        # caller can tell a tweet without replies from a silent problem.
        note = completed.stderr.strip() or "no replies returned"
        return {"status": True, "meta": note, "data": []}

    return {"status": True, "meta": "success", "data": tweet_comments_json}

## Usage

In [None]:
import pandas as pd
from datetime import datetime

# One timestamp is shared by both output files so the CSVs of a run can be paired.
timenow = datetime.now().strftime("%Y%m%d%H%M%S")
id_to_scrape = ["1772965377338741248"]

# Collect plain records first and build each DataFrame once after the loop;
# growing a DataFrame with pd.concat on every iteration copies all rows each time.
reply_records = []
tweet_records = []

for tweet_id in id_to_scrape:
    replies = tweet_replies(tweet_id, 5)
    tweet = tweet_details(tweet_id)

    # A tweet is kept only when both its details and its replies were fetched.
    if replies["status"] and tweet["status"]:
        reply_records.extend(replies["data"])
        tweet_records.append(tweet["data"])
    else:
        print(f"Failed to scrape tweet {tweet_id}")

replies_df = pd.DataFrame(reply_records)
tweet_df = pd.DataFrame(tweet_records)

filename = f"replies_{timenow}.csv"
replies_df.to_csv(filename, index=False)

filename = f"tweet_{timenow}.csv"
tweet_df.to_csv(filename, index=False)
