# Marites Input Transform

Code for transforming input CSV from S3 bucket and creating the graph

In [1]:
import pyTigerGraph as tg
from dotenv import load_dotenv
import os
import boto3
import pandas as pd

load_dotenv()
print("Import complete.")

Import complete.


In [2]:
# Constants

tg_graph = "marites"
tg_host = os.environ.get("TG_HOST")
tg_password = os.environ.get("TG_PASSWORD")

input_bucket = 'marites-comprehend-input'
input_folder = '8454a2d2-1eee-4e6f-9107-aa3fe249bc80/elonmusk/tigergraph'

In [None]:
conn = tg.TigerGraphConnection(host=tg_host, graphname=tg_graph, password=tg_password)

In [7]:
print(conn.gsql('ls', options=[]))

---- Global vertices, edges, and all graphs
Vertex Types:
- VERTEX person(PRIMARY_ID name STRING, name STRING, age INT, gender STRING, state STRING) WITH STATS="OUTDEGREE_BY_EDGETYPE"
Edge Types:
- UNDIRECTED EDGE friendship(FROM person, TO person, connect_day DATETIME)

Graphs:
- Graph social(person:v, friendship:e)
Jobs:


JSON API version: v2
Syntax version: v2



## Create graph schema

In [None]:
print(conn.gsql('use global drop all'))

In [5]:
print(conn.gsql('''
use global

create vertex user (primary_id username string, name string, username string)

create vertex post (
    primary_id line_id string,
    line_id string,
    tweet_id int,
    username string,
    text string,
    created_at datetime
)

create vertex topic (
    primary_id text string,
    text string,
    type string,
)

create directed edge following (from user, to user, connect_day string)
create undirected edge tweet (from user, to post, created_at datetime)
create undirected edge post_topic (
    from post,
    to topic,
    topic string,
    sentiment string,
    positive_score double,
    negative_score double,
    neutral_score double,
    mixed_score double
)

create graph marites(user, post, topic, following, tweet, post_topic)
'''))

Successfully created vertex types: [user].
Successfully created vertex types: [post].
Successfully created vertex types: [topic].
Successfully created edge types: [following].
Successfully created edge types: [tweet].
Successfully created edge types: [post_topic].
Stopping GPE GSE RESTPP
Successfully stopped GPE GSE RESTPP in 30.306 seconds
Starting GPE GSE RESTPP
Successfully started GPE GSE RESTPP in 0.201 seconds
The graph marites is created.


## Load input data from S3 bucket


In [6]:
def get_file_from_s3(bucket, folder, filename):
    s3 = boto3.client('s3')
    s3_key = '{}/{}'.format(folder, filename)
    response = s3.get_object(Bucket=bucket, Key=s3_key)
    status_code = response.get('ResponseMetadata', {}).get("HTTPStatusCode")
    if status_code == 200:
        return pd.read_csv(response.get("Body"))
    else:
        raise Exception("Unsuccessful S3 get object")

In [7]:
following = get_file_from_s3(input_bucket, input_folder, 'following.csv')
users = get_file_from_s3(input_bucket, input_folder, 'users.csv')
posts = get_file_from_s3(input_bucket, input_folder, 'posts.csv')

In [8]:
posts.head()

Unnamed: 0,tweet_id,username,created_at,text,line_id
0,1512886651940491270,elonmusk,2022-04-09T20:14:20.000Z,69.420% of statistics are false,0-04-13-22-elonmusk
1,1512886157876600833,elonmusk,2022-04-09T20:12:22.000Z,Truth is the first casualty.,1-04-13-22-elonmusk
2,1512813698011836422,elonmusk,2022-04-09T15:24:26.000Z,Thank you to everyone who came out to celebrat...,2-04-13-22-elonmusk
3,1512787864458870787,elonmusk,2022-04-09T13:41:47.000Z,Docking confirmed!,3-04-13-22-elonmusk
4,1512785529712123906,elonmusk,2022-04-09T13:32:31.000Z,TOP 10 most followed Twitter accounts: 1. @...,4-04-13-22-elonmusk


In [9]:
secret = conn.createSecret()
conn.getToken(secret=secret)

('nnab894n9i3h54tcslm4orldu6n99ccp', 1652509248, '2022-05-14 06:20:48')

In [10]:
# Create users vertex
conn.upsertVertexDataFrame(
    df=users,
    vertexType='user',
    v_id='username',
    attributes={'name': 'name', 'username': 'username'})

114

In [11]:
# Create posts vertex
conn.upsertVertexDataFrame(df=posts, vertexType='post', v_id='line_id')

2347

In [12]:
# Create following edges
conn.upsertEdgeDataFrame(
    df=following,
    sourceVertexType='user',
    edgeType='following',
    targetVertexType='user',
    from_id='user',
    to_id='following',
    attributes={ 'connect_day': 'date' })

113

In [13]:
# Create tweet edges
conn.upsertEdgeDataFrame(
    df=posts,
    sourceVertexType='user',
    edgeType='tweet',
    targetVertexType='post',
    from_id='username',
    to_id='line_id',
    attributes={ 'created_at': 'created_at' })

2347