# Marites Output Transform

Contains logic for the output transformation

In [1]:
import boto3
import tarfile
from io import BytesIO, TextIOWrapper
import json
import re
import pandas as pd
import os
from dotenv import load_dotenv
import pyTigerGraph as tg

load_dotenv()
print("Import successful.")

Import successful.


In [2]:
# constants
output_bucket = 'marites-comprehend-output'
file_path = '8454a2d2-1eee-4e6f-9107-aa3fe249bc80/elonmusk/368767127050-TS-9dc7aaeeb573b83c385ac0cca489f9cf/output/output.tar.gz'

tg_graph = "marites"
tg_host = os.environ.get("TG_HOST")
tg_password = os.environ.get("TG_PASSWORD")

## Extract file from S3 bucket

In [3]:
s3 = boto3.client('s3')
input_tar_file = s3.get_object(Bucket=output_bucket,Key=file_path)
input_tar_content = input_tar_file['Body'].read()

In [4]:
def map_result(data):
    json_data = json.loads(data)
    filename = json_data['File']
    line_num = json_data['Line']
    tag = re.search('(.*)_(.*).txt', filename).group(2)
    
    line_id = '{}-{}'.format(line_num, tag)
    
    results = []
    
    for entity in json_data['Entities']:
        for mention in entity['Mentions']:
            sentiment_data = mention['MentionSentiment']
            sentiment_scores = sentiment_data['SentimentScore']
            
            topic = {
                'line_id': line_id,
                'text': mention['Text'],
                'type': mention['Type'],
                'sentiment': sentiment_data['Sentiment'],
                'positive_score': sentiment_scores['Positive'],
                'negative_score': sentiment_scores['Negative'],
                'neutral_score': sentiment_scores['Neutral'],
                'mixed_score': sentiment_scores['Mixed']
            }

            results.append(topic)

    return results

In [5]:
entities = [];

with tarfile.open(fileobj=BytesIO(input_tar_content)) as tar:
    for tar_resource in tar:
        file = TextIOWrapper(tar.extractfile(tar_resource), encoding='utf-8')
        lines = file.readlines()
        for line in lines:
            results = map_result(line)
            entities.extend(results)


In [6]:
topic_df = pd.DataFrame(entities)

In [10]:
topic_df.head()

Unnamed: 0,line_id,text,type,sentiment,positive_score,negative_score,neutral_score,mixed_score
0,0-04-13-22-elonmusk,69.420%,QUANTITY,NEUTRAL,5e-06,1.9e-05,0.999973,3e-06
1,0-04-13-22-elonmusk,statistics,OTHER,NEUTRAL,0.0,5e-06,0.999994,0.0
2,1-04-13-22-elonmusk,first,QUANTITY,NEUTRAL,1e-06,0.0,0.999998,1e-06
3,2-04-13-22-elonmusk,you,PERSON,POSITIVE,1.0,0.0,0.0,0.0
4,2-04-13-22-elonmusk,everyone,PERSON,POSITIVE,0.999997,1e-06,2e-06,0.0


## Push output data to Tigergraph

In [7]:
conn = tg.TigerGraphConnection(host=tg_host, graphname=tg_graph, password=tg_password)
secret = conn.createSecret()
conn.getToken(secret=secret)

('99da2nc4m9ajplde6eaa5av3bk8bb5jl', 1652515653, '2022-05-14 08:07:33')

In [8]:
# Create topic vertex
conn.upsertVertexDataFrame(
    df=topic_df,
    vertexType='topic',
    v_id='text',
    attributes={ 'text': 'text', 'type': 'type' }
)

5873

In [9]:
# Create post_topic edges
conn.upsertEdgeDataFrame(
    df=topic_df,
    sourceVertexType='post',
    edgeType='topic_sentiment',
    targetVertexType='topic',
    from_id='line_id',
    to_id='text',
    attributes={ 
        'topic': 'text',
        'sentiment': 'sentiment',
        'positive_score': 'positive_score',
        'negative_score': 'negative_score',
        'neutral_score': 'neutral_score',
        'mixed_score': 'mixed_score'
    }
)

13430