In [1]:
import pandas as pd
import json
import pytumblr
from datetime import datetime as dt
from bs4 import BeautifulSoup

In [2]:
# Build an authenticated Tumblr REST client.
# The four OAuth values are read from a local JSON file so that no secret
# is written into the notebook itself.
with open("credentials.json") as fin:
    credentials = json.load(fin)

# Unpack the keys in the same order the client constructor expects them.
CONSUMER_KEY, CONSUMER_SECRET, OAUTH_TOKEN, OAUTH_SECRET = (
    credentials[key]
    for key in ("CONSUMER_KEY", "CONSUMER_SECRET", "OAUTH_TOKEN", "OAUTH_SECRET")
)

client = pytumblr.TumblrRestClient(
    CONSUMER_KEY,
    CONSUMER_SECRET,
    OAUTH_TOKEN,
    OAUTH_SECRET,
)

In [3]:
# Retrieve posts from Tumblr and create dataset
#
# The API hands back at most `limit` posts per request, so the blog is read
# page by page. Each request must carry an `offset`; without it every call
# returns the same newest page and the dataset is just that page repeated.
blog_name = "redglassbird"  # Source:   https://www.tumblr.com/redglassbird
num_posts = 1050  # Total number of posts
limit = 50  # Max number of posts that can be retrieved in a single request
list_posts = []
for offset in range(0, num_posts, limit):
    # Only ask for what is still missing, so a num_posts that is not a
    # multiple of limit is honoured instead of being rounded down.
    page_size = min(limit, num_posts - offset)
    posts = client.posts(blog_name, limit=page_size, offset=offset)

    # An empty page means the blog has fewer posts than num_posts; stop early
    # rather than issuing further requests that cannot return anything.
    if not posts["posts"]:
        break

    for post in posts["posts"]:
        # Parse Datetime
        post_datetime = dt.strptime(post["date"], "%Y-%m-%d %H:%M:%S %Z")

        # Parse HTML body as text.
        # NOTE(review): once paging reaches older posts, entries without a
        # "body"/"title" field (e.g. non-text post types) may appear — confirm
        # against the Tumblr API docs. They are kept with an empty body / null
        # title instead of aborting the whole download with a KeyError.
        html_body = post.get("body") or ""
        body = BeautifulSoup(html_body, "html.parser").get_text()

        list_posts.append(
            {
                "Blog": post["blog_name"],
                "Datetime": post_datetime,
                "URL": post["post_url"],
                "Title": post.get("title"),
                "Body": body,
            }
        )

# Save dataset as CSV
file_csv = f"tumblr-{blog_name}.csv"
df = pd.DataFrame(list_posts)
df.to_csv(file_csv, index = False)
print(f"Number of entries: {len(df)}\n")
df.info()
display(df)

Number of entries: 1050

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1050 entries, 0 to 1049
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   Blog      1050 non-null   object        
 1   Datetime  1050 non-null   datetime64[ns]
 2   URL       1050 non-null   object        
 3   Title     1008 non-null   object        
 4   Body      1050 non-null   object        
dtypes: datetime64[ns](1), object(4)
memory usage: 41.1+ KB


Unnamed: 0,Blog,Datetime,URL,Title,Body
0,redglassbird,2024-06-03 14:57:47,https://www.tumblr.com/blog/view/redglassbird/...,,abalidoth:veil-of-exordia:confusing Odysseus a...
1,redglassbird,2024-06-03 02:54:21,https://www.tumblr.com/blog/view/redglassbird/...,,hello sharks today I am asking for five billio...
2,redglassbird,2024-06-01 12:11:44,https://www.tumblr.com/blog/view/redglassbird/...,,mandyseley:i-am-an-adult-i-swear:Sometimes the...
3,redglassbird,2024-05-26 12:06:26,https://www.tumblr.com/blog/view/redglassbird/...,,I think orthodontists made up retainers to pic...
4,redglassbird,2024-05-25 20:07:13,https://www.tumblr.com/blog/view/redglassbird/...,,I take my whiskey neat (I dont drink) my coffe...
...,...,...,...,...,...
1045,redglassbird,2023-10-30 17:18:57,https://www.tumblr.com/blog/view/redglassbird/...,,i think there should be an IQ test but for ele...
1046,redglassbird,2023-10-20 03:17:09,https://www.tumblr.com/blog/view/redglassbird/...,,The whores persist (impression of myself when ...
1047,redglassbird,2023-10-19 17:15:58,https://www.tumblr.com/blog/view/redglassbird/...,,I think we should as a society utilize the fir...
1048,redglassbird,2023-10-16 22:43:19,https://www.tumblr.com/blog/view/redglassbird/...,,you’re telling me this salesman is selling sna...
