# Harvest data from Mastodon

In [39]:
from mastodon import Mastodon
import json, time
import pandas as pd
import requests

# For initialization, some server/feature/operation does not need token, while for others, apply for token
m = Mastodon(
    api_base_url=f'https://mastodon.au'
    # use the url of the server such as https://aus.social for https://aus.social/explore
)

# Use timer or loop to achieve streaming harvester, be ware of the frequency
lastid = m.timeline(timeline='public', since_id=None, limit=1, remote=True)[0]['id']
time.sleep(10)
mastodon_data = m.timeline(timeline='public', since_id=lastid, remote=True)
for i in range(len(mastodon_data)):
    print(f"{mastodon_data[i].account.acct}: {mastodon_data[i].id}")

anandamide@dataare.cool: 112421508232889124
Galehawk@toot.community: 112421507979733193
dachengzi@go5.dev: 112421507878887151


# Harvest data from BoM

In [40]:
bom_data = requests.get('http://reg.bom.gov.au/fwo/IDV60901/IDV60901.95936.json').json()['observations']['data']
for i in range(len(bom_data)):
    print(f"{bom_data[i]['local_date_time']} air_temp: {bom_data[i]['air_temp']},apparent_t: {bom_data[i]['apparent_t']},vis_km:{bom_data[i]['vis_km']}, rain_trace:{bom_data[i]['rain_trace']}, rel_hum:{bom_data[i]['rel_hum']}, wind_spd_kmh: {bom_data[i]['wind_spd_kmh']} ")

11/06:00pm air_temp: 16.1,apparent_t: 15.1,vis_km:10, rain_trace:0.0, rel_hum:69, wind_spd_kmh: 6 
11/05:30pm air_temp: 16.5,apparent_t: 15.0,vis_km:10, rain_trace:0.0, rel_hum:67, wind_spd_kmh: 9 
11/05:00pm air_temp: 16.9,apparent_t: 16.0,vis_km:10, rain_trace:0.0, rel_hum:66, wind_spd_kmh: 6 
11/04:30pm air_temp: 17.3,apparent_t: 15.4,vis_km:10, rain_trace:0.0, rel_hum:64, wind_spd_kmh: 11 
11/04:00pm air_temp: 17.7,apparent_t: 16.1,vis_km:10, rain_trace:0.0, rel_hum:62, wind_spd_kmh: 9 
11/03:30pm air_temp: 17.8,apparent_t: 15.5,vis_km:10, rain_trace:0.0, rel_hum:62, wind_spd_kmh: 13 
11/03:00pm air_temp: 18.1,apparent_t: 15.8,vis_km:10, rain_trace:0.0, rel_hum:61, wind_spd_kmh: 13 
11/02:30pm air_temp: 18.5,apparent_t: 16.3,vis_km:10, rain_trace:0.0, rel_hum:61, wind_spd_kmh: 13 
11/02:00pm air_temp: 18.6,apparent_t: 16.8,vis_km:10, rain_trace:0.0, rel_hum:61, wind_spd_kmh: 11 
11/01:30pm air_temp: 18.7,apparent_t: 16.6,vis_km:10, rain_trace:0.0, rel_hum:62, wind_spd_kmh: 13 
11/0

# Data preprocessing

In [41]:
mas_df = pd.DataFrame(mastodon_data)

In [42]:
import geopandas as gpd
import random
from shapely.geometry import Point


melbourne_area = gpd.read_file("melbourne.geojson")
melbourne_polygon = melbourne_area.geometry.unary_union

def generate_random_points_in_polygon(poly, num_points):
    minx, miny, maxx, maxy = poly.bounds
    points = []
    while len(points) < num_points:
        p = Point(random.uniform(minx, maxx), random.uniform(miny, maxy))
        if poly.contains(p):
            points.append(p)
    return points

num = len(mas_df)
random_points = generate_random_points_in_polygon(melbourne_polygon, num)

x_coords = [point.x for point in random_points]
y_coords = [point.y for point in random_points]

mas_df['longitude'] = x_coords
mas_df['latitude'] = y_coords

In [43]:
mas_df = mas_df[['created_at', 'content', 'longitude', 'latitude']]
mas_df

Unnamed: 0,created_at,content,longitude,latitude
0,2024-05-11 08:27:53+00:00,<p>Today feels like a good day to see Challeng...,145.013062,-37.902176
1,2024-05-11 08:27:55+00:00,"<p><a href=""https://www.youtube.com/watch?v=jD...",144.952333,-37.83072
2,2024-05-11 08:27:40+00:00,<p>手机里看到的。肉眼能看到光在跳舞，人类的眼睛还是挺瞎的:aru_0160:</p>,145.032113,-37.742463


In [44]:
from textblob import TextBlob
import re

def preprocess_text(text):
    text = re.sub(r'href=\S+', '', text)
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'["\']', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    text = text[1:]
    return text


def sentiment_analysis(text):
    blob = TextBlob(text)
    return blob.sentiment.polarity


mas_df['content'] = mas_df['content'].apply(preprocess_text)
mas_df['sentiments'] = mas_df['content'].apply(lambda x: TextBlob(x).sentiment.polarity)
mas_df

Unnamed: 0,created_at,content,longitude,latitude,sentiments
0,2024-05-11 08:27:53+00:00,Today feels like a good day to see Challengersp,145.013062,-37.902176,0.7
1,2024-05-11 08:27:55+00:00,a relnofollow noopener noreferrer translateno...,144.952333,-37.83072,0.0125
2,2024-05-11 08:27:40+00:00,手机里看到的肉眼能看到光在跳舞人类的眼睛还是挺瞎的aru_0160p,145.032113,-37.742463,0.0


In [45]:
weather_df = pd.DataFrame(bom_data)

weather_df = weather_df[['local_date_time_full', 'apparent_t', 'air_temp', 'rain_trace', 'wind_spd_kmh', 'rel_hum', 'vis_km']]

def round_to_nearest_half_hour(dt):
    minute = dt.minute
    if minute < 15:
        return dt.replace(minute=0, second=0)
    elif minute < 45:
        return dt.replace(minute=30, second=0)
    else:
        return dt.replace(minute=0, second=0) + pd.Timedelta(hours=1)
mas_df['created_at'] = pd.to_datetime(mas_df['created_at'])
mas_df['rounded_created_at'] = mas_df['created_at'].apply(round_to_nearest_half_hour)

weather_df['local_date_time_full'] = pd.to_datetime(weather_df['local_date_time_full'], format='%Y%m%d%H%M%S')
weather_df

Unnamed: 0,local_date_time_full,apparent_t,air_temp,rain_trace,wind_spd_kmh,rel_hum,vis_km
0,2024-05-11 18:00:00,15.1,16.1,0.0,6,69,10
1,2024-05-11 17:30:00,15.0,16.5,0.0,9,67,10
2,2024-05-11 17:00:00,16.0,16.9,0.0,6,66,10
3,2024-05-11 16:30:00,15.4,17.3,0.0,11,64,10
4,2024-05-11 16:00:00,16.1,17.7,0.0,9,62,10
...,...,...,...,...,...,...,...
158,2024-05-08 20:30:00,13.2,12.6,0.0,0,96,10
159,2024-05-08 20:00:00,14.0,13.3,0.0,0,93,10
160,2024-05-08 19:30:00,14.3,13.6,0.0,0,91,10
161,2024-05-08 19:00:00,14.5,13.8,0.0,0,90,10


In [46]:
mas_df['rounded_created_at'] = mas_df['rounded_created_at'].dt.tz_localize(None)
mas_df

Unnamed: 0,created_at,content,longitude,latitude,sentiments,rounded_created_at
0,2024-05-11 08:27:53+00:00,Today feels like a good day to see Challengersp,145.013062,-37.902176,0.7,2024-05-11 08:30:00
1,2024-05-11 08:27:55+00:00,a relnofollow noopener noreferrer translateno...,144.952333,-37.83072,0.0125,2024-05-11 08:30:00
2,2024-05-11 08:27:40+00:00,手机里看到的肉眼能看到光在跳舞人类的眼睛还是挺瞎的aru_0160p,145.032113,-37.742463,0.0,2024-05-11 08:30:00


In [47]:
merged_df = pd.merge(mas_df, weather_df, left_on='rounded_created_at', right_on='local_date_time_full', how='inner')

In [48]:
merged_df.drop(columns=['rounded_created_at', 'local_date_time_full'], inplace=True)
merged_df

Unnamed: 0,created_at,content,longitude,latitude,sentiments,apparent_t,air_temp,rain_trace,wind_spd_kmh,rel_hum,vis_km
0,2024-05-11 08:27:53+00:00,Today feels like a good day to see Challengersp,145.013062,-37.902176,0.7,14.7,13.6,1.8,0,100,10
1,2024-05-11 08:27:55+00:00,a relnofollow noopener noreferrer translateno...,144.952333,-37.83072,0.0125,14.7,13.6,1.8,0,100,10
2,2024-05-11 08:27:40+00:00,手机里看到的肉眼能看到光在跳舞人类的眼睛还是挺瞎的aru_0160p,145.032113,-37.742463,0.0,14.7,13.6,1.8,0,100,10
