# Harvest data from Mastodon

In [33]:
from mastodon import Mastodon
import json, time
import pandas as pd
import requests

# For initialization, some server/feature/operation does not need token, while for others, apply for token
m = Mastodon(
    api_base_url=f'https://mastodon.au'
    # use the url of the server such as https://aus.social for https://aus.social/explore
)

# Use timer or loop to achieve streaming harvester, be ware of the frequency
lastid = m.timeline(timeline='public', since_id=None, limit=1, remote=True)[0]['id']
time.sleep(10)
mastodon_data = m.timeline(timeline='public', since_id=lastid, remote=True)
for i in range(len(mastodon_data)):
    print(f"{mastodon_data[i].account.acct}: {mastodon_data[i].id}")

50years_music@mastodon.online: 112421117514732576
LunarLioness814@mastodon.social: 112421117472929480
benhuser@mastodon.online: 112421117437049725
kopio@mas.to: 112421117419036708


# Harvest data from BoM

In [34]:
bom_data = requests.get('http://reg.bom.gov.au/fwo/IDV60901/IDV60901.95936.json').json()['observations']['data']
for i in range(len(bom_data)):
    print(f"{bom_data[i]['local_date_time']} air_temp: {bom_data[i]['air_temp']}, wind_spd_kmh: {bom_data[i]['wind_spd_kmh']}")

11/04:30pm air_temp: 17.3, wind_spd_kmh: 11
11/04:00pm air_temp: 17.7, wind_spd_kmh: 9
11/03:30pm air_temp: 17.8, wind_spd_kmh: 13
11/03:00pm air_temp: 18.1, wind_spd_kmh: 13
11/02:30pm air_temp: 18.5, wind_spd_kmh: 13
11/02:00pm air_temp: 18.6, wind_spd_kmh: 11
11/01:30pm air_temp: 18.7, wind_spd_kmh: 13
11/01:00pm air_temp: 19.0, wind_spd_kmh: 7
11/12:30pm air_temp: 18.6, wind_spd_kmh: 7
11/12:00pm air_temp: 18.9, wind_spd_kmh: 7
11/11:30am air_temp: 18.6, wind_spd_kmh: 9
11/11:00am air_temp: 17.4, wind_spd_kmh: 9
11/10:30am air_temp: 16.8, wind_spd_kmh: 9
11/10:00am air_temp: 16.2, wind_spd_kmh: 9
11/09:30am air_temp: 15.6, wind_spd_kmh: 4
11/09:00am air_temp: 14.2, wind_spd_kmh: 0
11/08:30am air_temp: 13.6, wind_spd_kmh: 0
11/08:00am air_temp: 13.2, wind_spd_kmh: 0
11/07:30am air_temp: 13.1, wind_spd_kmh: 0
11/07:06am air_temp: 12.9, wind_spd_kmh: 0
11/07:00am air_temp: 13.0, wind_spd_kmh: 0
11/06:30am air_temp: 13.1, wind_spd_kmh: 0
11/06:00am air_temp: 13.4, wind_spd_kmh: 0
11/05

# Data processing

In [35]:
mas_df = pd.DataFrame(mastodon_data)

In [36]:
import geopandas as gpd
import random
from shapely.geometry import Point


melbourne_area = gpd.read_file("geo/melbourne.geojson")
melbourne_polygon = melbourne_area.geometry.unary_union

def generate_random_points_in_polygon(poly, num_points):
    minx, miny, maxx, maxy = poly.bounds
    points = []
    while len(points) < num_points:
        p = Point(random.uniform(minx, maxx), random.uniform(miny, maxy))
        if poly.contains(p):
            points.append(p)
    return points

num = len(mas_df)
random_points = generate_random_points_in_polygon(melbourne_polygon, num)

x_coords = [point.x for point in random_points]
y_coords = [point.y for point in random_points]

mas_df['longitude'] = x_coords
mas_df['latitude'] = y_coords

In [38]:
mas_df = mas_df[['created_at', 'content', 'longitude', 'latitude']]
mas_df

Unnamed: 0,created_at,content,longitude,latitude
0,2024-05-11 06:48:36+00:00,"<p>Hungry Eyes <br><a href=""https://mastodon.o...",145.056152,-37.909641
1,2024-05-11 06:48:34+00:00,<p>After a hard day of hunting &amp; squawking...,145.016419,-37.793232
2,2024-05-11 06:48:35+00:00,"<p>day 808 of <a href=""https://mastodon.online...",144.944995,-37.738408
3,2024-05-11 06:48:35+00:00,<p>Conversation with A.</p><p>A can't stop boa...,145.020769,-37.915104


In [39]:
from textblob import TextBlob
import re

def preprocess_text(text):
    text = re.sub(r'href=\S+', '', text)
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'["\']', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    return text


def sentiment_analysis(text):
    blob = TextBlob(text)
    return blob.sentiment.polarity


mas_df['content'] = mas_df['content'].apply(preprocess_text)
mas_df['sentiments'] = mas_df['content'].apply(sentiment_analysis)
mas_df


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mas_df['content'] = mas_df['content'].apply(preprocess_text)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mas_df['sentiments'] = mas_df['content'].apply(sentiment_analysis)


Unnamed: 0,created_at,content,longitude,latitude,sentiments
0,2024-05-11 06:48:36+00:00,pHungry Eyes bra classmention hashtag relnofo...,145.056152,-37.909641,0.0
1,2024-05-11 06:48:34+00:00,pAfter a hard day of hunting amp squawking the...,145.016419,-37.793232,-0.395833
2,2024-05-11 06:48:35+00:00,pday 808 of a classmention hashtag relnofollo...,144.944995,-37.738408,0.0
3,2024-05-11 06:48:35+00:00,pConversation with AppA cant stop boasting how...,145.020769,-37.915104,0.4


In [42]:
weather_df = pd.DataFrame(bom_data)

weather_df = weather_df[['local_date_time_full', 'apparent_t', 'air_temp', 'vis_km']]

def round_to_nearest_half_hour(dt):
    minute = dt.minute
    if minute < 15:
        return dt.replace(minute=0, second=0)
    elif minute < 45:
        return dt.replace(minute=30, second=0)
    else:
        return dt.replace(minute=0, second=0) + pd.Timedelta(hours=1)

mas_df['created_at'] = pd.to_datetime(mas_df['created_at'])
mas_df['rounded_created_at'] = mas_df['created_at'].apply(round_to_nearest_half_hour)

weather_df['local_date_time_full'] = pd.to_datetime(weather_df['local_date_time_full'], format='%Y%m%d%H%M%S')


merged_df = pd.merge(mas_df, weather_df, left_on='rounded_created_at', right_on='local_date_time_full', how='left')



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mas_df['created_at'] = pd.to_datetime(mas_df['created_at'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mas_df['rounded_created_at'] = mas_df['created_at'].apply(round_to_nearest_half_hour)


In [43]:
merged_df

Unnamed: 0,created_at,content,longitude,latitude,sentiments,rounded_created_at,local_date_time_full,apparent_t,air_temp,vis_km
0,2024-05-11 06:48:36,pHungry Eyes bra classmention hashtag relnofo...,145.056152,-37.909641,0.0,2024-05-11 07:00:00,2024-05-11 07:00:00,13.9,13.0,9
1,2024-05-11 06:48:34,pAfter a hard day of hunting amp squawking the...,145.016419,-37.793232,-0.395833,2024-05-11 07:00:00,2024-05-11 07:00:00,13.9,13.0,9
2,2024-05-11 06:48:35,pday 808 of a classmention hashtag relnofollo...,144.944995,-37.738408,0.0,2024-05-11 07:00:00,2024-05-11 07:00:00,13.9,13.0,9
3,2024-05-11 06:48:35,pConversation with AppA cant stop boasting how...,145.020769,-37.915104,0.4,2024-05-11 07:00:00,2024-05-11 07:00:00,13.9,13.0,9
