In [1]:
%%capture
! pip install twscrape
! pip install pandas

## Authentication

In [2]:
import subprocess

credentials = 'credentials.txt' # NOTE: Change this to the path of your credentials file

# Register every account listed in the credentials file with twscrape.
# The last argument describes the column layout of each line in that file.
result = subprocess.run(
    ['twscrape', 'add_accounts', credentials, 'username:password:email:email_password:_:_'],
    capture_output=True,
    text=True,
)

# Print both streams. twscrape's log lines appear to be written to stderr even
# when the command succeeds (loguru's default sink), so a non-empty stderr is
# not by itself a failure and must not hide whatever was written to stdout.
for stream_text in (result.stdout, result.stderr):
    if stream_text.strip():
        print(stream_text)

# The exit code is the reliable failure signal.
if result.returncode != 0:
    print(f"twscrape add_accounts failed with exit code {result.returncode}")

2024-04-10 08:17:34.813 | INFO     | twscrape.db:migrate:92 - Running migration to v1
2024-04-10 08:17:34.821 | INFO     | twscrape.db:migrate:92 - Running migration to v2
2024-04-10 08:17:34.832 | INFO     | twscrape.db:migrate:92 - Running migration to v3
2024-04-10 08:17:34.846 | INFO     | twscrape.accounts_pool:add_account:97 - Account fajarshiddiqqq added successfully (active=False)



In [3]:
import subprocess

# Log in every account that was previously added to the twscrape account pool.
result = subprocess.run(
    ['twscrape', 'login_accounts'],
    capture_output=True,
    text=True,
)

# Print both streams. twscrape's log lines appear to be written to stderr even
# when the command succeeds (loguru's default sink), so a non-empty stderr is
# not by itself a failure and must not hide whatever was written to stdout.
for stream_text in (result.stdout, result.stderr):
    if stream_text.strip():
        print(stream_text)

# The exit code is the reliable failure signal.
if result.returncode != 0:
    print(f"twscrape login_accounts failed with exit code {result.returncode}")

2024-04-10 08:17:50.411 | INFO     | twscrape.accounts_pool:login_all:166 - [1/1] Logging in fajarshiddiqqq - fajarshiddiqqq@gmail.com
2024-04-10 08:17:53.175 | INFO     | twscrape.accounts_pool:login:145 - Logged in to fajarshiddiqqq successfully



## Scraping Methods

In [5]:
import subprocess
import json

def get_nested_value(data_dict, nested_key):
    """
    Look up a value in nested dictionaries using a dot-separated path.

    Args:
        data_dict: The outermost dictionary to start from.
        nested_key: Path such as "user.username"; each dot descends one level.

    Returns:
        The value found at the end of the path. None is returned when a key is
        missing or when a non-dict value is reached before the path is used up.
    """
    current = data_dict
    for part in nested_key.split("."):
        # Cannot descend any further into something that is not a dict.
        if not isinstance(current, dict):
            return None
        current = current.get(part)
    return current

def tweet_details(tweet_id: str):
    """
    Fetch one tweet through the ``twscrape tweet_details`` command.

    Args:
        tweet_id: ID of the tweet to look up. It is converted with ``str()``
            before being passed to the command, so an int is accepted too.

    Returns:
        A dict with three keys:
        - "status": True when a tweet was fetched and parsed, otherwise False.
        - "meta": "success", or a description of the failure (the command's
          stderr text when there is any).
        - "data": dict with "tweet_id", "date", "username" and "rawContent"
          on success, otherwise None.

    Available fields:
    id, id_str, url, date, user.id, user.id_str, user.url, user.username, user.displayname, user.rawDescription, user.created, user.followersCount, user.friendsCount, user.statusesCount, user.favouritesCount, user.listedCount, user.mediaCount, user.location, user.profileImageUrl, user.profileBannerUrl, user.protected, user.verified, user.blue, user.blueType, user.descriptionLinks, user._type, lang, rawContent, replyCount, retweetCount, likeCount, quoteCount, conversationId, conversationIdStr, hashtags, cashtags, mentionedUsers, links, viewCount, retweetedTweet, quotedTweet, place, coordinates, inReplyToTweetId, inReplyToTweetIdStr, inReplyToUser, source, sourceUrl, sourceLabel, media.photos, media.videos, media.animated, _type
    """
    result = subprocess.run(
        ["twscrape", "tweet_details", str(tweet_id)],
        capture_output=True,
        text=True,
    )
    error_text = result.stderr.strip()

    # The exit code, not the mere presence of stderr output, signals failure:
    # log lines may be written to stderr even when the command succeeds.
    if result.returncode != 0:
        meta = error_text or f"twscrape exited with code {result.returncode}"
        return {"status": False, "meta": meta, "data": None}

    # The tweet is expected as one JSON object on the first non-blank line.
    first_line = next(
        (line for line in result.stdout.split("\n") if line.strip()), ""
    )
    try:
        tweet_json = json.loads(first_line)
    except json.JSONDecodeError:
        # Empty output or a plain-text message instead of JSON.
        tweet_json = None

    if not isinstance(tweet_json, dict):
        if first_line:
            fallback = f"unexpected twscrape output: {first_line!r}"
        else:
            fallback = "twscrape returned no output"
        return {"status": False, "meta": error_text or fallback, "data": None}

    selected_fields = {
        "tweet_id": tweet_json.get("id_str"),
        "date": tweet_json.get("date"),
        "username": get_nested_value(tweet_json, "user.username"),
        "rawContent": tweet_json.get("rawContent"),
    }
    return {"status": True, "meta": "success", "data": selected_fields}

def tweet_replies(tweet_id: str, limit=5):
    """
    Fetch replies to a tweet through the ``twscrape tweet_replies`` command.

    Args:
        tweet_id: ID of the tweet whose replies are requested. It is converted
            with ``str()`` before being passed to the command, so an int is
            accepted too.
        limit: Value passed as ``--limit``. Values below 2 are raised to 2.

    Returns:
        A dict with three keys:
        - "status": True when the command succeeded, otherwise False.
        - "meta": "success", or a description of the failure (the command's
          stderr text when there is any).
        - "data": list of dicts with "comment_id", "date", "rawContent" and
          "tweet_id" on success (empty when there are no replies), otherwise None.

    Available fields:
    id, id_str, url, date, user.id, user.id_str, user.url, user.username, user.displayname, user.rawDescription, user.created, user.followersCount, user.friendsCount, user.statusesCount, user.favouritesCount, user.listedCount, user.mediaCount, user.location, user.profileImageUrl, user.profileBannerUrl, user.protected, user.verified, user.blue, user.blueType, user.descriptionLinks, user._type, lang, rawContent, replyCount, retweetCount, likeCount, quoteCount, conversationId, conversationIdStr, hashtags, cashtags, mentionedUsers, links, viewCount, retweetedTweet, quotedTweet, place, coordinates, inReplyToTweetId, inReplyToTweetIdStr, inReplyToUser.id, inReplyToUser.id_str, inReplyToUser.username, inReplyToUser.displayname, inReplyToUser._type, source, sourceUrl, sourceLabel, media.photos, media.videos, media.animated, _type
    """
    # NOTE(review): the reason for the minimum of 2 is not visible here; kept as-is.
    limit = max(limit, 2)
    result = subprocess.run(
        ["twscrape", "tweet_replies", str(tweet_id), f"--limit={limit}"],
        capture_output=True,
        text=True,
    )
    error_text = result.stderr.strip()

    # The exit code, not the mere presence of stderr output, signals failure:
    # log lines may be written to stderr even when the command succeeds.
    if result.returncode != 0:
        meta = error_text or f"twscrape exited with code {result.returncode}"
        return {"status": False, "meta": meta, "data": None}

    tweet_comments_json = []
    for line in result.stdout.split("\n"):
        # Blank lines (including completely empty output) carry no reply.
        if not line.strip():
            continue
        try:
            comment = json.loads(line)
        except json.JSONDecodeError:
            comment = None
        if not isinstance(comment, dict):
            # Report unparseable output explicitly instead of raising.
            meta = error_text or f"unexpected twscrape output: {line!r}"
            return {"status": False, "meta": meta, "data": None}
        tweet_comments_json.append(
            {
                "comment_id": comment.get("id_str"),
                "date": comment.get("date"),
                "rawContent": comment.get("rawContent"),
                "tweet_id": comment.get("inReplyToTweetIdStr"),
            }
        )

    # No replies plus something on stderr is ambiguous; stay conservative and
    # surface the stderr text as a failure, as the caller would expect.
    if not tweet_comments_json and error_text:
        return {"status": False, "meta": error_text, "data": None}

    return {"status": True, "meta": "success", "data": tweet_comments_json}

## Usage

In [7]:
import pandas as pd
import os

id_to_scrape = ["1777668029624574360"]

replies_csv = "replies.csv"
tweet_csv = "tweet.csv"
reply_limit = 500  # value passed as the reply limit for every tweet


def _load_existing(csv_path):
    """Return the rows saved by a previous run, or an empty frame if there are none."""
    if not os.path.exists(csv_path):
        return pd.DataFrame()
    try:
        # Every saved column is textual. Reading as str keeps the IDs as
        # strings so they compare equal to freshly scraped IDs.
        return pd.read_csv(csv_path, dtype=str)
    except pd.errors.EmptyDataError:
        # The file exists but has no columns to parse.
        return pd.DataFrame()


def _combine(frames, key):
    """Concatenate the non-empty frames and keep one row per value of ``key``."""
    frames = [frame for frame in frames if not frame.empty]
    if not frames:
        return pd.DataFrame()
    combined = pd.concat(frames, ignore_index=True)
    if key in combined.columns:
        # Re-running the cell scrapes the same tweets again; keep the newest
        # copy of each row instead of piling up duplicates in the CSV.
        combined = combined.drop_duplicates(subset=key, keep="last", ignore_index=True)
    return combined


# Start from whatever earlier runs already saved.
reply_frames = [_load_existing(replies_csv)]
tweet_frames = [_load_existing(tweet_csv)]

# Scraping process: collect the frames and concatenate once at the end.
for tweet_id in id_to_scrape:
    replies = tweet_replies(tweet_id, reply_limit)
    tweet = tweet_details(tweet_id)

    if replies["status"] and tweet["status"]:
        reply_frames.append(pd.DataFrame(replies["data"]))
        tweet_frames.append(pd.DataFrame([tweet["data"]]))
    else:
        print(f"Failed to scrape tweet {tweet_id}")
        # Show why, so the failure can be diagnosed.
        for label, outcome in (("replies", replies), ("details", tweet)):
            if not outcome["status"]:
                print(f"  {label}: {outcome['meta']}")

replies_df = _combine(reply_frames, "comment_id")
tweet_df = _combine(tweet_frames, "tweet_id")

# Save to CSV. Empty frames are skipped: a CSV written from a frame without
# columns could not be read back by pd.read_csv on the next run.
if not replies_df.empty:
    replies_df.to_csv(replies_csv, index=False)
if not tweet_df.empty:
    tweet_df.to_csv(tweet_csv, index=False)