# Marites Input Transform

Code for transforming input CSV from S3 bucket and creating the graph

In [1]:
import pyTigerGraph as tg
from dotenv import load_dotenv
import os
import boto3
import pandas as pd
import tarfile
import re
from io import BytesIO

load_dotenv()
print("Import complete.")

Import complete.


In [3]:
# Constants

tg_graph = "marites"
tg_host = os.environ.get("TG_HOST")
tg_password = os.environ.get("TG_PASSWORD")

input_bucket = 'marites-comprehend-input'
input_file = 'tigergraph/f57f7e2b-a5eb-4e17-ad67-aee466d00d12/elonmusk.tar.gz'

In [4]:
conn = tg.TigerGraphConnection(host=tg_host, graphname=tg_graph, password=tg_password)

In [None]:
print(conn.gsql('ls', options=[]))

## Create graph schema

In [10]:
conn.gsql('use global drop all')

'Dropping all, about 1 minute ...\nAbort all active loading jobs\nTry to abort all loading jobs on graph marites, it may take a while ...\n[ABORT_SUCCESS] No active Loading Job to abort.\nResetting GPE...\nSuccessfully reset GPE and GSE\nStopping GPE GSE\nSuccessfully stopped GPE GSE in 0.004 seconds\nClearing graph store...\nSuccessfully cleared graph store\nStarting GPE GSE RESTPP\nSuccessfully started GPE GSE RESTPP in 0.151 seconds\nEverything is dropped.'

In [24]:
conn.gsql('''

drop edge following
drop edge created_post
drop edge topic_sentiment

drop vertex user
drop vertex post
drop vertex topic

drop graph marites

''')

'Successfully dropped edge types: [following].\nSuccessfully dropped edge types: [created_post].\nSuccessfully dropped edge types: [topic_sentiment].\nSuccessfully dropped vertex types: [user].\nSuccessfully dropped vertex types: [post].\nSuccessfully dropped vertex types: [topic].\nThe graph marites could not be dropped!'

In [11]:
print(conn.gsql('''
use global

create vertex user (primary_id username string, name string, username string)

create vertex post (
    primary_id line_id string,
    line_id string,
    tweet_id int,
    username string,
    text string,
    created_at datetime
)

create vertex topic (
    primary_id text string,
    text string,
    type string
)

create directed edge following (from user, to user, connect_day string)
create undirected edge created_post (from user, to post, created_at datetime)
create undirected edge topic_sentiment (
    from post,
    to topic,
    topic string,
    sentiment string,
    positive_score double,
    negative_score double,
    neutral_score double,
    mixed_score double
)

create graph marites(user, post, topic, following, created_post, topic_sentiment)
'''))

Successfully created vertex types: [user].
Successfully created vertex types: [post].
Successfully created vertex types: [topic].
Successfully created edge types: [following].
Successfully created edge types: [created_post].
Successfully created edge types: [topic_sentiment].
Stopping GPE GSE RESTPP
Successfully stopped GPE GSE RESTPP in 1.243 seconds
Starting GPE GSE RESTPP
Successfully started GPE GSE RESTPP in 0.145 seconds
The graph marites is created.


## Load input data from S3 bucket


In [11]:
filename = 'output/following.csv'
tag = re.search('(.*)/(.*).csv', filename).group(2)
tag

'following'

In [13]:
def get_frames_from_s3(bucket, s3_key):
    s3 = boto3.client('s3')
    input_tar_file = s3.get_object(Bucket=bucket, Key=s3_key)
    input_tar_content = input_tar_file['Body'].read()
    
    tar = tarfile.open(fileobj=BytesIO(input_tar_content))
    
    contents = {}
    
    for tar_resource in tar:
        filename = tar_resource.name
        key = re.search('(.*)/(.*).csv', filename).group(2)
        df = pd.read_csv(tar.extractfile(tar_resource), header=0)
        contents[key] = df
        
    return contents

def get_file_from_s3(bucket, folder, filename):
    s3 = boto3.client('s3')
    s3_key = '{}/{}'.format(folder, filename)
    response = s3.get_object(Bucket=bucket, Key=s3_key)
    status_code = response.get('ResponseMetadata', {}).get("HTTPStatusCode")
    if status_code == 200:
        return pd.read_csv(response.get("Body"))
    else:
        raise Exception("Unsuccessful S3 get object")

In [15]:
input_frames = get_frames_from_s3(input_bucket, input_file)

following = input_frames['following']
users = input_frames['users']
posts = input_frames['posts']

In [16]:
posts.head()

Unnamed: 0,tweet_id,username,created_at,text,line_id
0,1514720245113577473,elonmusk,2022-04-14T21:40:23.000Z,iu,1-04-15-22-elonmusk
1,1514698036760530945,elonmusk,2022-04-14T20:12:08.000Z,Taking Twitter private at $54.20 should be up ...,2-04-15-22-elonmusk
2,1514564966564651008,elonmusk,2022-04-14T11:23:21.000Z,I made an offer,3-04-15-22-elonmusk
3,1514529863855710214,elonmusk,2022-04-14T09:03:52.000Z,Photos from the @space_station of Dragon and t...,4-04-15-22-elonmusk
4,1512886651940491270,elonmusk,2022-04-09T20:14:20.000Z,69.420% of statistics are false,5-04-15-22-elonmusk


In [17]:
secret = conn.createSecret()
conn.getToken(secret=secret)

('3p7ar843fo0koq7fkn8uoml0lsvmgitt', 1652582104, '2022-05-15 02:35:04')

In [18]:
# Create users vertex
conn.upsertVertexDataFrame(
    df=users,
    vertexType='user',
    v_id='username',
    attributes={'name': 'name', 'username': 'username'})

115

In [19]:
# Create posts vertex
conn.upsertVertexDataFrame(df=posts, vertexType='post', v_id='line_id')

2356

In [20]:
# Create following edges
conn.upsertEdgeDataFrame(
    df=following,
    sourceVertexType='user',
    edgeType='following',
    targetVertexType='user',
    from_id='user',
    to_id='following',
    attributes={ 'connect_day': 'date' })

114

In [21]:
# Create tweet edges
conn.upsertEdgeDataFrame(
    df=posts,
    sourceVertexType='user',
    edgeType='created_post',
    targetVertexType='post',
    from_id='username',
    to_id='line_id',
    attributes={ 'created_at': 'created_at' })

2356