Social media scraping
===

In [25]:
import os
import json
import subprocess
import pandas as pd
import seaborn as sns


# Directory in which the scraped tweets are stored (one file per query).
data_dir = "data/twitter"

# exist_ok=True creates the directory (and any missing parents) only when it
# is absent, so no separate existence check is needed.
os.makedirs(data_dir, exist_ok=True)

## Tweets by User

In [7]:
def dl_user(user, max_results=None, local=False):
    """
    Download tweets by username and load them into a DataFrame.

    Unless ``local`` is true, ``snscrape --jsonl twitter-user <user>`` is run
    and its standard output is written to ``<data_dir>/user-<user>.json``,
    replacing any earlier download. The file is then read back line by line,
    each non-blank line being parsed as one JSON object.

    Parameters
    ----------
    user : str
        Username handed to snscrape's ``twitter-user`` scraper.
    max_results : int or None
        Value for snscrape's ``-n`` option. ``None`` (default) omits the
        option.
    local : bool
        Set to True if the tweets have already been downloaded and are
        available in ``data_dir``; the scraper is then not invoked.

    Returns
    -------
    pandas.DataFrame
        One row per parsed line. If a ``date`` column is present it is
        converted with ``pd.to_datetime``. An empty file yields an empty
        DataFrame.
    """
    path = f"{data_dir}/user-{user}.json"

    if not local:
        cmd_list = ["snscrape", "--jsonl"]
        if max_results is not None:
            # Option and value are separate argv entries so the value does
            # not reach the parser with a leading space.
            cmd_list += ["-n", str(max_results)]
        cmd_list += ["twitter-user", user]

        with open(path, "w+") as fo:
            returncode = subprocess.run(cmd_list, stdout=fo).returncode
        if returncode != 0:
            # Keep going with whatever was written, but do not hide the failure.
            print("warning: snscrape exited with status", returncode)

    with open(path, "r") as fo:
        # Blank lines (e.g. a trailing newline) are not valid JSON; skip them.
        tweets = [json.loads(line) for line in fo if line.strip()]
    print("loaded", len(tweets), "tweets\n")

    df_tweets = pd.DataFrame(tweets)
    # With no tweets there is no "date" column to convert.
    if "date" in df_tweets.columns:
        df_tweets["date"] = pd.to_datetime(df_tweets["date"])

    return df_tweets

Scrape the last 100 tweets from `@derstandardat` and store them in a JSON file (one tweet per line)

In [8]:
# Download at most 100 tweets of the account and load them into a DataFrame.
df_tweets = dl_user("derstandardat", max_results=100)

# Preview the first rows, restricted to the columns of interest.
df_tweets.loc[:, ["date", "rawContent", "hashtags"]].head()

loaded 100 tweets



Unnamed: 0,date,rawContent,hashtags
0,2023-01-25 09:33:54+00:00,"""IBES"" 2023: Jana ist raus – Werden Sie sie ve...",
1,2023-01-25 09:04:32+00:00,Chanels Haute Couture: Show führte in den Zirk...,
2,2023-01-25 09:04:31+00:00,Künstliche Intelligenz soll neue Gerüche entde...,
3,2023-01-25 09:04:30+00:00,Regierung einigt sich auf höhere Strafen für D...,
4,2023-01-25 08:51:15+00:00,"Freigabe: Investor IFM darf weitere 9,99 Proze...",


## Tweets by Hashtag

In [9]:
def dl_hashtag(hashtag, max_results=None, local=False):
    """
    Download tweets by hashtag and load them into a DataFrame.

    Unless ``local`` is true, ``snscrape --jsonl twitter-hashtag <hashtag>``
    is run and its standard output is written to
    ``<data_dir>/hashtag-<hashtag>.json``, replacing any earlier download.
    The file is then read back line by line, each non-blank line being
    parsed as one JSON object.

    Parameters
    ----------
    hashtag : str
        Hashtag handed to snscrape's ``twitter-hashtag`` scraper.
    max_results : int or None
        Value for snscrape's ``-n`` option. ``None`` (default) omits the
        option.
    local : bool
        Set to True if the tweets have already been downloaded and are
        available in ``data_dir``; the scraper is then not invoked.

    Returns
    -------
    pandas.DataFrame
        One row per parsed line. If a ``date`` column is present it is
        converted with ``pd.to_datetime``. An empty file yields an empty
        DataFrame.
    """
    path = f"{data_dir}/hashtag-{hashtag}.json"

    if not local:
        cmd_list = ["snscrape", "--jsonl"]
        if max_results is not None:
            # Option and value are separate argv entries so the value does
            # not reach the parser with a leading space.
            cmd_list += ["-n", str(max_results)]
        cmd_list += ["twitter-hashtag", hashtag]

        with open(path, "w+") as fo:
            returncode = subprocess.run(cmd_list, stdout=fo).returncode
        if returncode != 0:
            # Keep going with whatever was written, but do not hide the failure.
            print("warning: snscrape exited with status", returncode)

    with open(path, "r") as fo:
        # Blank lines (e.g. a trailing newline) are not valid JSON; skip them.
        tweets = [json.loads(line) for line in fo if line.strip()]
    print("loaded", len(tweets), "tweets\n")

    df_tweets = pd.DataFrame(tweets)
    # With no tweets there is no "date" column to convert.
    if "date" in df_tweets.columns:
        df_tweets["date"] = pd.to_datetime(df_tweets["date"])

    return df_tweets

Scrape the last 100 tweets tagged `#chinesevirus` and store them in a JSON file (one tweet per line)

In [10]:
# Download at most 100 tweets for the hashtag.
# Pass max_results=None (the default) to fetch all of them.
df_tweets = dl_hashtag("chinesevirus", max_results=100)

# Preview the first rows, restricted to the columns of interest.
df_tweets.loc[:, ["date", "rawContent", "hashtags"]].head()

loaded 100 tweets



Unnamed: 0,date,rawContent,hashtags
0,2023-01-25 03:03:00+00:00,@visitmaldives And finally the #ChineseVirus w...,[ChineseVirus]
1,2023-01-24 20:34:49+00:00,@zhang_heqing #chineseVirus,[chineseVirus]
2,2023-01-24 05:10:21+00:00,@visitmaldives Welcoming first batch of #Chine...,[ChineseVirus]
3,2023-01-23 21:50:41+00:00,"Kammie, sweety...#ChineseVirus!!\nAny other si...",[ChineseVirus]
4,2023-01-23 13:28:54+00:00,3 years to the deadly outbreak @ Wuhan and eve...,"[ChineseVirus, WuhanVirus]"


In [11]:
# Download at most 100 tweets for the hashtag.
df_tweets = dl_hashtag("plandemie", max_results=100)

# Preview the first rows, restricted to the columns of interest.
df_tweets.loc[:, ["date", "rawContent", "hashtags"]].head()

loaded 100 tweets



Unnamed: 0,date,rawContent,hashtags
0,2023-01-25 09:32:53+00:00,@MartinStamer Du nervst #Panimache\n#CoronaSca...,"[Panimache, CoronaScam, CoronaIstNurEinMachtin..."
1,2023-01-25 09:04:42+00:00,@DigitalerC @zeitonline Heute am Jahrestag der...,"[Plandemie, Immunsystem, Wieler, BioNTech]"
2,2023-01-25 08:34:11+00:00,@DilanYesilgoz #goedemorgen #woensdag #koffie ...,"[goedemorgen, woensdag, koffie, wordwakker, de..."
3,2023-01-25 07:03:14+00:00,@UteLehmann5 Die Altbestände müssen vernichtet...,"[plandemie, ploetzlichundunerwartet]"
4,2023-01-25 01:13:11+00:00,Seit der ganzen #Plandemie und #Putin haben wi...,"[Plandemie, Putin, Erdogan]"


Scrape all available tweets tagged `#plandemic` (no result limit) and store them in a JSON file

In [None]:
# No result limit: max_results=None omits the cap, local=False forces a fresh download.
df_tweets = dl_hashtag("plandemic", max_results=None, local=False)

# Preview the first rows, restricted to the columns of interest.
df_tweets.loc[:, ["date", "rawContent", "hashtags"]].head()

In [None]:
# Work on a copy so the scraped frame stays untouched when this cell is re-run.
plandemic = df_tweets.copy()

# Derive calendar year and month from the tweet timestamp.
# assign() evaluates its keyword arguments in order, so `year` and `month`
# see the already converted `date` column.
df_plandemic = plandemic.assign(
    date=lambda d: pd.to_datetime(d["date"]),
    year=lambda d: d["date"].dt.year,
    month=lambda d: d["date"].dt.month,
)

# Number of tweets per (year, month); the count lives in the "id" column.
df_grp = df_plandemic.groupby(["year", "month"])["id"].count().reset_index()

# Rebuild a real timestamp (first day of each month) for the x-axis so the
# line is drawn against time rather than against the positional row index.
month_start = pd.to_datetime(df_grp[["year", "month"]].assign(day=1))

ax = sns.lineplot(x=month_start, y=df_grp["id"])
ax.set(
    xlabel="month",
    ylabel="number of tweets",
    title="#plandemic tweets per month",
);