## R_13 Mastadon Data Processing and Bulk Upload

In [22]:
import json
import subprocess

In [2]:
from nltk.sentiment import SentimentIntensityAnalyzer
import numpy as np

def get_sentiment(x):
    """ Helper to extract sentiment """

    sia = SentimentIntensityAnalyzer()

    sia_out = sia.polarity_scores(x)

    neg = sia_out['neg']
    pos = sia_out['pos']
    neu = sia_out['neu']
    compound = sia_out['compound']

    sentiment = np.argmax([sia_out['neg'], sia_out['neu'], sia_out['pos']]) - 1

    return neg, pos, neu, compound, sentiment



In [9]:
import re
def extract(input):
    """ Helper to extract text in betweein paragraph tags """
    
    # Use regular expression to extract the first text between <p> and </p> tags
    match = re.search('<p>(.*?)</p>', input)
    if match:
        # Extract the matched text group
        text = match.group(1)
        # Remove any HTML tags within the matched text
        clean_text = re.sub('<.*?>', '', text)

        return clean_text
    return input

In [10]:
def process_mastadon(doc):
    """ Process Mastadon data """

    out = []

    for toot in doc:

        # skip non english
        if toot['doc']['language'] != 'en':
            continue

        document = {}

        # add in the columns we are interested in
        document['_id'] = toot['id']
        document['author_id'] = str(toot['doc']['account']['id'])
        if 'key_match' in toot['doc']:
            document['partial_match'] = 1
        else:
            document['partial_match'] = 0
        document['date'] = toot['doc']['created_at'][:10]
        document['text'] = extract(toot['doc']['content'])

        # get the sentiment output
        neg, pos, neu, compound, sentiment = get_sentiment(document['text'])

        document['neg_score'] = neg
        document['neu_score'] = neu
        document['pos_score'] = pos
        document['compound_score'] = compound
        document['sentiment'] = sentiment
    
        out.append(document)

    return out

## Download and Process

In [17]:
import requests

processed_mastadon_toots = []

url = 'http://172.26.130.136:5984/mastodon_social_politics/_all_docs'
params = {'include_docs': 'true', 'limit' : 10000}

# Authenticate if needed
auth = ('admin', 'password')

i = 0

# Iteratively download data
while True:
    response = requests.get(url, params=params, auth=auth)
    if response.status_code == 200:
        data = response.json()
        if len(data['rows']) == 0:
            # No more documents
            break
        doc = data['rows']
        
        processed_mastadon_toots.extend(process_mastadon(doc))

        # Process document
        params['skip'] = params.get('skip', 0) + 10000
        
        i+=10000
        print(i)
        
    else:
        print(f'Request failed with status code {response.status_code}')
        break


10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000
200000
210000
220000
230000
240000
250000
260000
270000
280000
290000
300000
310000
320000
330000
340000
350000
360000
370000
380000
390000
400000
410000
420000
430000
440000
450000
460000
470000
480000
490000
500000
510000


In [18]:
reformatted_json = eval(str(processed_mastadon_toots))
with open('./../data/curated/Mastadon_Cleaned/toot_cdb_bulk.json', 'w') as f:
    json.dump(reformatted_json, f)

In [20]:
# check how many toots
len(reformatted_json)

390598

## Move onto CDB

In [23]:
# create new database
subprocess.run(f'curl -X PUT http://172.26.133.251:5984/toot_database -u group9_admin:group9_H1', shell=True)

{"ok":true}


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100    12  100    12    0     0    119      0 --:--:-- --:--:-- --:--:--   120


CompletedProcess(args='curl -X PUT http://172.26.133.251:5984/toot_database -u group9_admin:group9_H1', returncode=0)

In [1]:
# upload toots
for i in range(len(reformatted_json)//10000):
    out = {'docs': reformatted_json[i*10000:(i+1)*10000]}
    json_data = json.dumps(out).encode('utf-8')
    subprocess.run(['curl', '-X', 'POST', f'http://172.26.133.251:5984/toot_database/_bulk_docs', '--header', 'Content-Type: application/json', '--data-binary', '@-', '-u', 'group9_admin:group9_H1'], input=json_data)

out = {'docs': reformatted_json[(i+1)*10000:len(reformatted_json)]}
json_data = json.dumps(out).encode('utf-8')
subprocess.run(['curl', '-X', 'POST', f'http://172.26.133.251:5984/toot_database/_bulk_docs', '--header', 'Content-Type: application/json', '--data-binary', '@-', '-u', 'group9_admin:group9_H1'], input=json_data)
