# Harvest data from Mastodon

In [2]:
from mastodon import Mastodon
import json, time
import pandas as pd
import requests

# For initialization, some server/feature/operation does not need token, while for others, apply for token
m = Mastodon(
    api_base_url=f'https://mastodon.au'
    # use the url of the server such as https://aus.social for https://aus.social/explore
)

# Use timer or loop to achieve streaming harvester, be ware of the frequency
lastid = m.timeline(timeline='public', since_id=None, limit=1, remote=True)[0]['id']
time.sleep(10)
mastodon_data = m.timeline(timeline='public', since_id=lastid, remote=True)
for i in range(len(mastodon_data)):
    print(f"{mastodon_data[i].account.acct}: {mastodon_data[i].id}")

AxiosNews@flipboard.com: 112445135678382726
anne@toot.cat: 112445135578694593
wrkmade@threads.net: 112445135464697060


# Harvest data from BoM

In [3]:
bom_data = requests.get('http://reg.bom.gov.au/fwo/IDV60901/IDV60901.95936.json').json()['observations']['data']
for i in range(len(bom_data)):
    print(f"{bom_data[i]['local_date_time']} air_temp: {bom_data[i]['air_temp']},apparent_t: {bom_data[i]['apparent_t']},vis_km:{bom_data[i]['vis_km']}, rain_trace:{bom_data[i]['rain_trace']}, rel_hum:{bom_data[i]['rel_hum']}, wind_spd_kmh: {bom_data[i]['wind_spd_kmh']} ")

15/10:00pm air_temp: 9.9,apparent_t: 9.7,vis_km:10, rain_trace:0.0, rel_hum:94, wind_spd_kmh: 0 
15/09:30pm air_temp: 10.2,apparent_t: 10.0,vis_km:10, rain_trace:0.0, rel_hum:92, wind_spd_kmh: 0 
15/09:00pm air_temp: 10.0,apparent_t: 9.8,vis_km:10, rain_trace:0.0, rel_hum:95, wind_spd_kmh: 0 
15/08:30pm air_temp: 10.1,apparent_t: 9.9,vis_km:10, rain_trace:0.0, rel_hum:93, wind_spd_kmh: 0 
15/08:00pm air_temp: 10.4,apparent_t: 10.2,vis_km:10, rain_trace:0.0, rel_hum:91, wind_spd_kmh: 0 
15/07:30pm air_temp: 10.9,apparent_t: 10.6,vis_km:10, rain_trace:0.0, rel_hum:87, wind_spd_kmh: 0 
15/07:00pm air_temp: 11.5,apparent_t: 11.2,vis_km:10, rain_trace:0.0, rel_hum:83, wind_spd_kmh: 0 
15/06:30pm air_temp: 12.2,apparent_t: 11.9,vis_km:10, rain_trace:0.0, rel_hum:78, wind_spd_kmh: 0 
15/06:00pm air_temp: 12.2,apparent_t: 11.9,vis_km:10, rain_trace:0.0, rel_hum:79, wind_spd_kmh: 0 
15/05:30pm air_temp: 13.2,apparent_t: 12.8,vis_km:10, rain_trace:0.0, rel_hum:72, wind_spd_kmh: 0 
15/05:00pm air

# Data preprocessing

In [4]:
mas_df = pd.DataFrame(mastodon_data)

In [5]:
import fiona
from shapely.geometry import shape, Point
import random

with fiona.open("geo/melbourne.geojson", "r") as geojson:
    features = [feature for feature in geojson]

melbourne_polygon = shape(features[0]['geometry'])
for feature in features[1:]:
    melbourne_polygon = melbourne_polygon.union(shape(feature['geometry']))

def generate_random_points_in_polygon(poly, num_points):
    minx, miny, maxx, maxy = poly.bounds
    points = []
    while len(points) < num_points:
        p = Point(random.uniform(minx, maxx), random.uniform(miny, maxy))
        if poly.contains(p):
            points.append(p)
    return points

num = len(mas_df)
random_points = generate_random_points_in_polygon(melbourne_polygon, num)

x_coords = [point.x for point in random_points]
y_coords = [point.y for point in random_points]

mas_df['longitude'] = x_coords
mas_df['latitude'] = y_coords

In [6]:
mas_df = mas_df[['created_at', 'content', 'longitude', 'latitude']]
mas_df

Unnamed: 0,created_at,content,longitude,latitude
0,2024-05-15 12:34:50+00:00,<p>Biden challenges Trump to summer presidenti...,144.962823,-37.803731
1,2024-05-15 12:36:32+00:00,<p>Flossing after eating a chia seed bagel is ...,144.956855,-37.820025
2,2024-05-15 12:31:15+00:00,<p>Braun Lectron Hobby Set Radio (Gray Control...,145.028302,-37.990486


In [7]:
from textblob import TextBlob
import re

def preprocess_text(text):
    text = re.sub(r'href=\S+', '', text)
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'["\']', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    text = text[1:]
    return text


def sentiment_analysis(text):
    blob = TextBlob(text)
    return blob.sentiment.polarity


mas_df['content'] = mas_df['content'].apply(preprocess_text)
mas_df['sentiments'] = mas_df['content'].apply(lambda x: TextBlob(x).sentiment.polarity)
mas_df

Unnamed: 0,created_at,content,longitude,latitude,sentiments
0,2024-05-15 12:34:50+00:00,Biden challenges Trump to summer presidential ...,144.962823,-37.803731,0.0
1,2024-05-15 12:36:32+00:00,Flossing after eating a chia seed bagel is lik...,144.956855,-37.820025,0.0
2,2024-05-15 12:31:15+00:00,Braun Lectron Hobby Set Radio Gray Controls D...,145.028302,-37.990486,0.0


In [8]:
weather_df = pd.DataFrame(bom_data)

weather_df = weather_df[['local_date_time_full', 'apparent_t', 'air_temp', 'rain_trace', 'wind_spd_kmh', 'rel_hum', 'vis_km']]

def round_to_nearest_half_hour(dt):
    minute = dt.minute
    if minute < 15:
        return dt.replace(minute=0, second=0)
    elif minute < 45:
        return dt.replace(minute=30, second=0)
    else:
        return dt.replace(minute=0, second=0) + pd.Timedelta(hours=1)
mas_df['created_at'] = pd.to_datetime(mas_df['created_at'])
mas_df['rounded_created_at'] = mas_df['created_at'].apply(round_to_nearest_half_hour)

weather_df['local_date_time_full'] = pd.to_datetime(weather_df['local_date_time_full'], format='%Y%m%d%H%M%S')
weather_df

Unnamed: 0,local_date_time_full,apparent_t,air_temp,rain_trace,wind_spd_kmh,rel_hum,vis_km
0,2024-05-15 22:00:00,9.7,9.9,0.0,0,94,10
1,2024-05-15 21:30:00,10.0,10.2,0.0,0,92,10
2,2024-05-15 21:00:00,9.8,10.0,0.0,0,95,10
3,2024-05-15 20:30:00,9.9,10.1,0.0,0,93,10
4,2024-05-15 20:00:00,10.2,10.4,0.0,0,91,10
...,...,...,...,...,...,...,...
140,2024-05-13 01:00:00,11.2,10.9,0.0,0,100,10
141,2024-05-13 00:30:00,11.3,11.0,0.0,0,100,9
142,2024-05-13 00:00:00,11.7,11.3,0.0,0,100,10
143,2024-05-12 23:30:00,12.5,11.9,0.0,0,100,10


In [9]:
mas_df['rounded_created_at'] = mas_df['rounded_created_at'].dt.tz_localize(None)
mas_df

Unnamed: 0,created_at,content,longitude,latitude,sentiments,rounded_created_at
0,2024-05-15 12:34:50+00:00,Biden challenges Trump to summer presidential ...,144.962823,-37.803731,0.0,2024-05-15 12:30:00
1,2024-05-15 12:36:32+00:00,Flossing after eating a chia seed bagel is lik...,144.956855,-37.820025,0.0,2024-05-15 12:30:00
2,2024-05-15 12:31:15+00:00,Braun Lectron Hobby Set Radio Gray Controls D...,145.028302,-37.990486,0.0,2024-05-15 12:30:00


In [10]:
merged_df = pd.merge(mas_df, weather_df, left_on='rounded_created_at', right_on='local_date_time_full', how='inner')

In [11]:
merged_df.drop(columns=['rounded_created_at', 'local_date_time_full'], inplace=True)
merged_df

Unnamed: 0,created_at,content,longitude,latitude,sentiments,apparent_t,air_temp,rain_trace,wind_spd_kmh,rel_hum,vis_km
0,2024-05-15 12:34:50+00:00,Biden challenges Trump to summer presidential ...,144.962823,-37.803731,0.0,14.5,16.6,0.0,7,52,10
1,2024-05-15 12:36:32+00:00,Flossing after eating a chia seed bagel is lik...,144.956855,-37.820025,0.0,14.5,16.6,0.0,7,52,10
2,2024-05-15 12:31:15+00:00,Braun Lectron Hobby Set Radio Gray Controls D...,145.028302,-37.990486,0.0,14.5,16.6,0.0,7,52,10


In [12]:
import geopandas as gpd
from shapely.geometry import Point

sa2_gdf = gpd.read_file('geo/SA2_2021_AUST_SHP_GDA2020/SA2_2021_AUST_GDA2020.shp')
sa2_gdf = sa2_gdf[['SA2_CODE21', 'geometry','SA2_NAME21']]

geometry = [Point(xy) for xy in zip(merged_df['longitude'], merged_df['latitude'])]
geo_df = gpd.GeoDataFrame(merged_df, geometry=geometry)

geo_df.crs = sa2_gdf.crs

merged_df = gpd.sjoin(geo_df, sa2_gdf, how='left', op='within')

merged_df

  if await self.run_code(code, result, async_=asy):


Unnamed: 0,created_at,content,longitude,latitude,sentiments,apparent_t,air_temp,rain_trace,wind_spd_kmh,rel_hum,vis_km,geometry,index_right,SA2_CODE21,SA2_NAME21
0,2024-05-15 12:34:50+00:00,Biden challenges Trump to summer presidential ...,144.962823,-37.803731,0.0,14.5,16.6,0.0,7,52,10,POINT (144.96282 -37.80373),771,206041117,Carlton
1,2024-05-15 12:36:32+00:00,Flossing after eating a chia seed bagel is lik...,144.956855,-37.820025,0.0,14.5,16.6,0.0,7,52,10,POINT (144.95686 -37.82002),781,206041505,Melbourne CBD - West
2,2024-05-15 12:31:15+00:00,Braun Lectron Hobby Set Radio Gray Controls D...,145.028302,-37.990486,0.0,14.5,16.6,0.0,7,52,10,POINT (145.02830 -37.99049),833,208011168,Beaumaris


In [13]:
import pandas as pd
import json

with open('sa2.json', 'r') as file:
    age = json.load(file)

sa2_data = []
for feature in age['features']:
    sa2_code = feature['properties']['sa2_code_2021']
    median_age = feature['properties']['median_age_persons']
    sa2_data.append({'SA2_CODE21': sa2_code, 'median_age': median_age})

sa2_df = pd.DataFrame(sa2_data)

merged_df = pd.merge(merged_df, sa2_df, on='SA2_CODE21', how='left')

In [14]:
merged_df

Unnamed: 0,created_at,content,longitude,latitude,sentiments,apparent_t,air_temp,rain_trace,wind_spd_kmh,rel_hum,vis_km,geometry,index_right,SA2_CODE21,SA2_NAME21,median_age
0,2024-05-15 12:34:50+00:00,Biden challenges Trump to summer presidential ...,144.962823,-37.803731,0.0,14.5,16.6,0.0,7,52,10,POINT (144.96282 -37.80373),771,206041117,Carlton,27.0
1,2024-05-15 12:36:32+00:00,Flossing after eating a chia seed bagel is lik...,144.956855,-37.820025,0.0,14.5,16.6,0.0,7,52,10,POINT (144.95686 -37.82002),781,206041505,Melbourne CBD - West,29.0
2,2024-05-15 12:31:15+00:00,Braun Lectron Hobby Set Radio Gray Controls D...,145.028302,-37.990486,0.0,14.5,16.6,0.0,7,52,10,POINT (145.02830 -37.99049),833,208011168,Beaumaris,47.0
