# Harvest data from Mastodon

In [1]:
from mastodon import Mastodon
import json, time
import pandas as pd
import requests

# For initialization, some server/feature/operation does not need token, while for others, apply for token
m = Mastodon(
    api_base_url=f'https://mastodon.au'
    # use the url of the server such as https://aus.social for https://aus.social/explore
)

# Use timer or loop to achieve streaming harvester, be ware of the frequency
lastid = m.timeline(timeline='public', since_id=None, limit=1, remote=True)[0]['id']
time.sleep(10)
mastodon_data = m.timeline(timeline='public', since_id=lastid, remote=True)
for i in range(len(mastodon_data)):
    print(f"{mastodon_data[i].account.acct}: {mastodon_data[i].id}")

lovenhl@channels.im: 112443387224783984
GeorgeLThomas@mastodon.world: 112443387189354371
MattHatton@aus.social: 112443387146237830
goon_master@mastodon.online: 112443387100490511
patrickcmiller@infosec.exchange: 112443386899605495
whatifbot@eigenmagic.net: 112443386894417009
news@jb.md: 112443386845585392
LehtoriTuomo@mementomori.social: 112443386817006366


# Harvest data from BoM

In [2]:
bom_data = requests.get('http://reg.bom.gov.au/fwo/IDV60901/IDV60901.95936.json').json()['observations']['data']
for i in range(len(bom_data)):
    print(f"{bom_data[i]['local_date_time']} air_temp: {bom_data[i]['air_temp']},apparent_t: {bom_data[i]['apparent_t']},vis_km:{bom_data[i]['vis_km']}, rain_trace:{bom_data[i]['rain_trace']}, rel_hum:{bom_data[i]['rel_hum']}, wind_spd_kmh: {bom_data[i]['wind_spd_kmh']} ")

15/02:30pm air_temp: 17.0,apparent_t: 15.4,vis_km:10, rain_trace:0.0, rel_hum:55, wind_spd_kmh: 6 
15/02:00pm air_temp: 16.8,apparent_t: 15.1,vis_km:10, rain_trace:0.0, rel_hum:54, wind_spd_kmh: 6 
15/01:30pm air_temp: 16.4,apparent_t: 14.5,vis_km:10, rain_trace:0.0, rel_hum:56, wind_spd_kmh: 7 
15/01:00pm air_temp: 17.0,apparent_t: 15.0,vis_km:10, rain_trace:0.0, rel_hum:52, wind_spd_kmh: 7 
15/12:30pm air_temp: 16.6,apparent_t: 14.5,vis_km:10, rain_trace:0.0, rel_hum:52, wind_spd_kmh: 7 
15/12:00pm air_temp: 16.5,apparent_t: 13.8,vis_km:10, rain_trace:0.0, rel_hum:55, wind_spd_kmh: 11 
15/11:30am air_temp: 15.7,apparent_t: 13.3,vis_km:10, rain_trace:0.0, rel_hum:56, wind_spd_kmh: 9 
15/11:00am air_temp: 15.6,apparent_t: 14.6,vis_km:10, rain_trace:0.0, rel_hum:64, wind_spd_kmh: 4 
15/10:30am air_temp: 14.1,apparent_t: 12.2,vis_km:10, rain_trace:0.0, rel_hum:71, wind_spd_kmh: 9 
15/10:00am air_temp: 12.7,apparent_t: 11.2,vis_km:10, rain_trace:0.0, rel_hum:75, wind_spd_kmh: 6 
15/09:30a

# Data preprocessing

In [18]:
mas_df = pd.DataFrame(mastodon_data)

In [19]:
import geopandas as gpd
import random
from shapely.geometry import Point


melbourne_area = gpd.read_file("geo/melbourne.geojson")
melbourne_polygon = melbourne_area.geometry.unary_union

def generate_random_points_in_polygon(poly, num_points):
    minx, miny, maxx, maxy = poly.bounds
    points = []
    while len(points) < num_points:
        p = Point(random.uniform(minx, maxx), random.uniform(miny, maxy))
        if poly.contains(p):
            points.append(p)
    return points

num = len(mas_df)
random_points = generate_random_points_in_polygon(melbourne_polygon, num)

x_coords = [point.x for point in random_points]
y_coords = [point.y for point in random_points]

mas_df['longitude'] = x_coords
mas_df['latitude'] = y_coords

In [20]:
mas_df = mas_df[['created_at', 'content', 'longitude', 'latitude']]
mas_df

Unnamed: 0,created_at,content,longitude,latitude
0,2024-05-15 05:12:05+00:00,"<p><a href=""https://www.lovenhl.com/1166425/"" ...",144.921835,-37.831435
1,2024-05-15 05:12:05+00:00,<p>Wednesday Weekly Blogging Challenge: Funny ...,144.908716,-37.721948
2,2024-05-15 05:12:05+00:00,<p>Asked IT if was a “a small uh-oh or a big u...,145.016623,-37.816008
3,2024-05-15 05:11:58+00:00,,145.084846,-37.850674
4,2024-05-15 05:12:01+00:00,<p>The US is still falling behind on electroni...,144.870553,-37.729479
5,2024-05-15 05:12:01+00:00,"<p>What if moron, but also regret?</p>",144.947328,-37.819491
6,2024-05-15 05:11:53+00:00,<p>Justine Elliot talks up federal budget<br><...,145.065324,-37.894756
7,2024-05-15 05:12:00+00:00,<p>Tämän vuoden kanditöiden ohjaukset lähestyv...,145.043261,-37.815097


In [21]:
from textblob import TextBlob
import re

def preprocess_text(text):
    text = re.sub(r'href=\S+', '', text)
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'["\']', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    text = text[1:]
    return text


def sentiment_analysis(text):
    blob = TextBlob(text)
    return blob.sentiment.polarity


mas_df['content'] = mas_df['content'].apply(preprocess_text)
mas_df['sentiments'] = mas_df['content'].apply(lambda x: TextBlob(x).sentiment.polarity)
mas_df

Unnamed: 0,created_at,content,longitude,latitude,sentiments
0,2024-05-15 05:12:05+00:00,a relnofollow noopener noreferrer target_blan...,144.921835,-37.831435,0.0
1,2024-05-15 05:12:05+00:00,Wednesday Weekly Blogging Challenge Funny Thin...,144.908716,-37.721948,0.25
2,2024-05-15 05:12:05+00:00,Asked IT if was a a small uhoh or a big uhoh a...,145.016623,-37.816008,-0.216667
3,2024-05-15 05:11:58+00:00,,145.084846,-37.850674,0.0
4,2024-05-15 05:12:01+00:00,The US is still falling behind on electronic w...,144.870553,-37.729479,-0.021429
5,2024-05-15 05:12:01+00:00,What if moron but also regretp,144.947328,-37.819491,-0.8
6,2024-05-15 05:11:53+00:00,Justine Elliot talks up federal budgetbra rel...,145.065324,-37.894756,0.0
7,2024-05-15 05:12:00+00:00,Tämän vuoden kanditöiden ohjaukset lähestyvät ...,145.043261,-37.815097,0.0


In [22]:
weather_df = pd.DataFrame(bom_data)

weather_df = weather_df[['local_date_time_full', 'apparent_t', 'air_temp', 'rain_trace', 'wind_spd_kmh', 'rel_hum', 'vis_km']]

def round_to_nearest_half_hour(dt):
    minute = dt.minute
    if minute < 15:
        return dt.replace(minute=0, second=0)
    elif minute < 45:
        return dt.replace(minute=30, second=0)
    else:
        return dt.replace(minute=0, second=0) + pd.Timedelta(hours=1)
mas_df['created_at'] = pd.to_datetime(mas_df['created_at'])
mas_df['rounded_created_at'] = mas_df['created_at'].apply(round_to_nearest_half_hour)

weather_df['local_date_time_full'] = pd.to_datetime(weather_df['local_date_time_full'], format='%Y%m%d%H%M%S')
weather_df

Unnamed: 0,local_date_time_full,apparent_t,air_temp,rain_trace,wind_spd_kmh,rel_hum,vis_km
0,2024-05-15 14:30:00,15.4,17.0,0.0,6,55,10
1,2024-05-15 14:00:00,15.1,16.8,0.0,6,54,10
2,2024-05-15 13:30:00,14.5,16.4,0.0,7,56,10
3,2024-05-15 13:00:00,15.0,17.0,0.0,7,52,10
4,2024-05-15 12:30:00,14.5,16.6,0.0,7,52,10
...,...,...,...,...,...,...,...
140,2024-05-12 17:30:00,15.0,14.5,0.0,0,82,10
141,2024-05-12 17:00:00,15.3,15.2,0.0,2,78,10
142,2024-05-12 16:30:00,15.1,16.0,0.0,7,74,10
143,2024-05-12 16:00:00,15.8,16.9,0.0,9,72,10


In [23]:
mas_df['rounded_created_at'] = mas_df['rounded_created_at'].dt.tz_localize(None)
mas_df

Unnamed: 0,created_at,content,longitude,latitude,sentiments,rounded_created_at
0,2024-05-15 05:12:05+00:00,a relnofollow noopener noreferrer target_blan...,144.921835,-37.831435,0.0,2024-05-15 05:00:00
1,2024-05-15 05:12:05+00:00,Wednesday Weekly Blogging Challenge Funny Thin...,144.908716,-37.721948,0.25,2024-05-15 05:00:00
2,2024-05-15 05:12:05+00:00,Asked IT if was a a small uhoh or a big uhoh a...,145.016623,-37.816008,-0.216667,2024-05-15 05:00:00
3,2024-05-15 05:11:58+00:00,,145.084846,-37.850674,0.0,2024-05-15 05:00:00
4,2024-05-15 05:12:01+00:00,The US is still falling behind on electronic w...,144.870553,-37.729479,-0.021429,2024-05-15 05:00:00
5,2024-05-15 05:12:01+00:00,What if moron but also regretp,144.947328,-37.819491,-0.8,2024-05-15 05:00:00
6,2024-05-15 05:11:53+00:00,Justine Elliot talks up federal budgetbra rel...,145.065324,-37.894756,0.0,2024-05-15 05:00:00
7,2024-05-15 05:12:00+00:00,Tämän vuoden kanditöiden ohjaukset lähestyvät ...,145.043261,-37.815097,0.0,2024-05-15 05:00:00


In [24]:
merged_df = pd.merge(mas_df, weather_df, left_on='rounded_created_at', right_on='local_date_time_full', how='inner')

In [25]:
merged_df.drop(columns=['rounded_created_at', 'local_date_time_full'], inplace=True)
merged_df

Unnamed: 0,created_at,content,longitude,latitude,sentiments,apparent_t,air_temp,rain_trace,wind_spd_kmh,rel_hum,vis_km
0,2024-05-15 05:12:05+00:00,a relnofollow noopener noreferrer target_blan...,144.921835,-37.831435,0.0,6.3,7.0,0.0,0,100,10
1,2024-05-15 05:12:05+00:00,Wednesday Weekly Blogging Challenge Funny Thin...,144.908716,-37.721948,0.25,6.3,7.0,0.0,0,100,10
2,2024-05-15 05:12:05+00:00,Asked IT if was a a small uhoh or a big uhoh a...,145.016623,-37.816008,-0.216667,6.3,7.0,0.0,0,100,10
3,2024-05-15 05:11:58+00:00,,145.084846,-37.850674,0.0,6.3,7.0,0.0,0,100,10
4,2024-05-15 05:12:01+00:00,The US is still falling behind on electronic w...,144.870553,-37.729479,-0.021429,6.3,7.0,0.0,0,100,10
5,2024-05-15 05:12:01+00:00,What if moron but also regretp,144.947328,-37.819491,-0.8,6.3,7.0,0.0,0,100,10
6,2024-05-15 05:11:53+00:00,Justine Elliot talks up federal budgetbra rel...,145.065324,-37.894756,0.0,6.3,7.0,0.0,0,100,10
7,2024-05-15 05:12:00+00:00,Tämän vuoden kanditöiden ohjaukset lähestyvät ...,145.043261,-37.815097,0.0,6.3,7.0,0.0,0,100,10


In [26]:
import geopandas as gpd
from shapely.geometry import Point

sa2_gdf = gpd.read_file('geo/SA2_2021_AUST_SHP_GDA2020/SA2_2021_AUST_GDA2020.shp')
sa2_gdf = sa2_gdf[['SA2_CODE21', 'geometry','SA2_NAME21']]

geometry = [Point(xy) for xy in zip(merged_df['longitude'], merged_df['latitude'])]
geo_df = gpd.GeoDataFrame(merged_df, geometry=geometry)

geo_df.crs = sa2_gdf.crs

merged_df = gpd.sjoin(geo_df, sa2_gdf, how='left', op='within')

merged_df

  if await self.run_code(code, result, async_=asy):


Unnamed: 0,created_at,content,longitude,latitude,sentiments,apparent_t,air_temp,rain_trace,wind_spd_kmh,rel_hum,vis_km,geometry,index_right,SA2_CODE21,SA2_NAME21
0,2024-05-15 05:12:05+00:00,a relnofollow noopener noreferrer target_blan...,144.921835,-37.831435,0.0,6.3,7.0,0.0,0,100,10,POINT (144.92184 -37.83143),791,206051511,Port Melbourne Industrial
1,2024-05-15 05:12:05+00:00,Wednesday Weekly Blogging Challenge Funny Thin...,144.908716,-37.721948,0.25,6.3,7.0,0.0,0,100,10,POINT (144.90872 -37.72195),908,210011231,Strathmore
2,2024-05-15 05:12:05+00:00,Asked IT if was a a small uhoh or a big uhoh a...,145.016623,-37.816008,-0.216667,6.3,7.0,0.0,0,100,10,POINT (145.01662 -37.81601),816,207011519,Hawthorn - North
3,2024-05-15 05:11:58+00:00,,145.084846,-37.850674,0.0,6.3,7.0,0.0,0,100,10,POINT (145.08485 -37.85067),812,207011150,Glen Iris - East
4,2024-05-15 05:12:01+00:00,The US is still falling behind on electronic w...,144.870553,-37.729479,-0.021429,6.3,7.0,0.0,0,100,10,POINT (144.87055 -37.72948),904,210011226,Airport West
5,2024-05-15 05:12:01+00:00,What if moron but also regretp,144.947328,-37.819491,-0.8,6.3,7.0,0.0,0,100,10,POINT (144.94733 -37.81949),772,206041118,Docklands
6,2024-05-15 05:11:53+00:00,Justine Elliot talks up federal budgetbra rel...,145.065324,-37.894756,0.0,6.3,7.0,0.0,0,100,10,POINT (145.06532 -37.89476),840,208021176,Carnegie
7,2024-05-15 05:12:00+00:00,Tämän vuoden kanditöiden ohjaukset lähestyvät ...,145.043261,-37.815097,0.0,6.3,7.0,0.0,0,100,10,POINT (145.04326 -37.81510),818,207011521,Kew - South


In [29]:
import pandas as pd
import json

with open('sa2.json', 'r') as file:
    age = json.load(file)

sa2_data = []
for feature in age['features']:
    sa2_code = feature['properties']['sa2_code_2021']
    median_age = feature['properties']['median_age_persons']
    sa2_data.append({'SA2_CODE21': sa2_code, 'median_age': median_age})

sa2_df = pd.DataFrame(sa2_data)

merged_df = pd.merge(merged_df, sa2_df, on='SA2_CODE21', how='left')

In [30]:
merged_df

Unnamed: 0,created_at,content,longitude,latitude,sentiments,apparent_t,air_temp,rain_trace,wind_spd_kmh,rel_hum,vis_km,geometry,index_right,SA2_CODE21,SA2_NAME21,median_age
0,2024-05-15 05:12:05+00:00,a relnofollow noopener noreferrer target_blan...,144.921835,-37.831435,0.0,6.3,7.0,0.0,0,100,10,POINT (144.92184 -37.83143),791,206051511,Port Melbourne Industrial,32.0
1,2024-05-15 05:12:05+00:00,Wednesday Weekly Blogging Challenge Funny Thin...,144.908716,-37.721948,0.25,6.3,7.0,0.0,0,100,10,POINT (144.90872 -37.72195),908,210011231,Strathmore,41.0
2,2024-05-15 05:12:05+00:00,Asked IT if was a a small uhoh or a big uhoh a...,145.016623,-37.816008,-0.216667,6.3,7.0,0.0,0,100,10,POINT (145.01662 -37.81601),816,207011519,Hawthorn - North,35.0
3,2024-05-15 05:11:58+00:00,,145.084846,-37.850674,0.0,6.3,7.0,0.0,0,100,10,POINT (145.08485 -37.85067),812,207011150,Glen Iris - East,41.0
4,2024-05-15 05:12:01+00:00,The US is still falling behind on electronic w...,144.870553,-37.729479,-0.021429,6.3,7.0,0.0,0,100,10,POINT (144.87055 -37.72948),904,210011226,Airport West,39.0
5,2024-05-15 05:12:01+00:00,What if moron but also regretp,144.947328,-37.819491,-0.8,6.3,7.0,0.0,0,100,10,POINT (144.94733 -37.81949),772,206041118,Docklands,32.0
6,2024-05-15 05:11:53+00:00,Justine Elliot talks up federal budgetbra rel...,145.065324,-37.894756,0.0,6.3,7.0,0.0,0,100,10,POINT (145.06532 -37.89476),840,208021176,Carnegie,36.0
7,2024-05-15 05:12:00+00:00,Tämän vuoden kanditöiden ohjaukset lähestyvät ...,145.043261,-37.815097,0.0,6.3,7.0,0.0,0,100,10,POINT (145.04326 -37.81510),818,207011521,Kew - South,42.0
