# Marites Output Transform

Contains logic for the output transformation

In [91]:
import boto3
import tarfile
from io import BytesIO, TextIOWrapper
import json
import re
import pandas as pd
import os
from dotenv import load_dotenv
import pyTigerGraph as tg

load_dotenv()
print("Import successful.")

Import successful.


In [90]:
# constants
output_bucket = 'marites-comprehend-output'
file_path = '8454a2d2-1eee-4e6f-9107-aa3fe249bc80/elonmusk/368767127050-TS-9dc7aaeeb573b83c385ac0cca489f9cf/output/output.tar.gz'

tg_graph = "marites"
tg_host = os.environ.get("TG_HOST")
tg_password = os.environ.get("TG_PASSWORD")

## Extract file from S3 bucket

In [5]:
s3 = boto3.client('s3')
input_tar_file = s3.get_object(Bucket=output_bucket,Key=file_path)
input_tar_content = input_tar_file['Body'].read()

In [83]:
def map_result(data):
    json_data = json.loads(data)
    filename = json_data['File']
    line_num = json_data['Line']
    tag = re.search('(.*)_(.*).txt', filename).group(2)
    
    line_id = '{}-{}'.format(line_num, tag)
    
    results = []
    
    for entity in json_data['Entities']:
        for mention in entity['Mentions']:
            sentiment_data = mention['MentionSentiment']
            sentiment_scores = sentiment_data['SentimentScore']
            
            topic = {
                'line_id': line_id,
                'text': mention['Text'],
                'type': mention['Type'],
                'sentiment': sentiment_data['Sentiment'],
                'positive_score': sentiment_scores['Positive'],
                'negative_score': sentiment_scores['Negative'],
                'neutral_score': sentiment_scores['Neutral'],
                'mixed_score': sentiment_scores['Mixed']
            }

            results.append(topic)

    return results

In [84]:
entities = [];

with tarfile.open(fileobj=BytesIO(input_tar_content)) as tar:
    for tar_resource in tar:
        file = TextIOWrapper(tar.extractfile(tar_resource), encoding='utf-8')
        lines = file.readlines()
        for line in lines:
            results = map_result(line)
            entities.extend(results)


In [85]:
topic_df = pd.DataFrame(entities)

In [107]:
df = topic_df[['text', 'type']]
df = df.groupby('text')
df.head()

Unnamed: 0,text,type
0,69.420%,QUANTITY
1,statistics,OTHER
2,first,QUANTITY
3,you,PERSON
4,everyone,PERSON
...,...,...
14011,tomorrows,DATE
14017,Webcast,OTHER
14018,at ~7:55 a.m. ET,DATE
14020,"Friday, April 8 at 11:17 a.m.",DATE


## Push output data to Tigergraph

In [92]:
conn = tg.TigerGraphConnection(host=tg_host, graphname=tg_graph, password=tg_password)
secret = conn.createSecret()
conn.getToken(secret=secret)

('g5htpof7jhfnt6tnjt05q55pbfqotnvr', 1652509550, '2022-05-14 06:25:50')

In [95]:
# Create topic vertex
conn.upsertVertexDataFrame(
    df=topic_df,
    vertexType='topic',
    v_id='text',
    attributes={ '' }
)

5873

In [96]:
# Create post_topic edges
conn.upsertEdgeDataFrame(
    df=topic_df,
    sourceVertexType='post',
    edgeType='post_topic',
    targetVertexType='topic',
    from_id='line_id',
    to_id='text',
    attributes={ 'topic': 'text' }
)

KeyboardInterrupt: 