# Marites Analyse

## Overview
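This notebook contains the logic for the analyse function: it pulls recent tweets for a Twitter user and the accounts that user follows, uploads the cleaned data to S3, and starts an Amazon Comprehend targeted sentiment job on the posts.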

In [1]:
# Imports
import os
from dotenv import load_dotenv
import requests
from datetime import datetime
import pandas as pd
import re
import boto3
from io import StringIO
from uuid import uuid4

load_dotenv()
print("Import complete.")

Import complete.


In [2]:
# --- Twitter settings ---
max_twitter_posts = 100  # sent as max_results on the recent-search request
max_following = 250  # sent as max_results on the following request
token = os.environ.get("BEARER_TOKEN")  # bearer token from the environment (None if unset)
test_username = 'elonmusk'  # account used by the test cells below

# --- AWS / Comprehend settings ---
region = 'ap-southeast-1'  # region of the Comprehend client
language_code = 'en'  # LanguageCode passed to the sentiment job
input_bucket = 'marites-comprehend-input'  # bucket receiving both Comprehend and Tigergraph input files
output_bucket = 'marites-comprehend-output'  # bucket the sentiment job writes to
data_access_role_arn = os.environ.get("DATA_ACCESS_ROLE")  # role ARN handed to the job (None if unset)
input_doc_format = 'ONE_DOC_PER_LINE'  # InputFormat passed to the sentiment job

# Key prefixes inside input_bucket
tg_input_folder = 'tigergraph'  # Tigergraph CSV files
comprehend_input_folder = 'comprehend'  # Comprehend text files

# Generated once when this cell runs; used as the top-level folder for every upload in this session.
session_id = uuid4()

In [3]:
# Twitter Functions

# Twitter API v2 endpoints. The `{}` placeholders are filled with str.format by the fetch helpers:
# a user id for following_url, a username for lookup_username_url.
search_url = "https://api.twitter.com/2/tweets/search/recent"
following_url = "https://api.twitter.com/2/users/{}/following"
lookup_username_url = "https://api.twitter.com/2/users/by/username/{}"


def bearer_oauth(r):
    """
    Auth callback for requests: attach the module-level bearer token to the
    outgoing request's Authorization header and hand the request back.
    """
    auth_value = "Bearer {}".format(token)
    r.headers["Authorization"] = auth_value
    return r


def fetch_user_by_username(username):
    """
    Look up one Twitter user by handle.

    Returns the value stored under the response's 'data' key.
    Raises Exception(status_code, body_text) for any non-200 response; a 200
    response without a 'data' key surfaces as a KeyError.
    """
    resp = requests.get(lookup_username_url.format(username), auth=bearer_oauth)
    if resp.status_code == 200:
        return resp.json()['data']
    raise Exception(resp.status_code, resp.text)

def map_tweets_to_post(raw_data):
    """
    Flatten a tweet-search response into a list of post dicts.

    Each post has the keys 'tweet_id', 'username', 'created_at' and 'text'
    (in that order). When a tweet references other tweets that are present
    under includes.tweets, the referenced tweets' text (space-joined) is used
    instead of the tweet's own text.

    Args:
        raw_data: Parsed JSON response. Must contain includes.users whenever
            it contains 'data'.

    Returns:
        A list of post dicts; empty when the response has no 'data' key.

    Raises:
        KeyError / IndexError: if 'data' is present but includes.users is
            missing or empty.
    """
    if 'data' not in raw_data:
        return []

    includes = raw_data['includes']
    users = includes['users']
    # First included user is the fallback author, as before.
    default_username = users[0]['username']
    # Resolve each tweet's own author when the payload carries author ids.
    usernames_by_id = {u['id']: u['username'] for u in users if 'id' in u}
    ref_tweets = {tw['id']: tw['text'] for tw in includes.get('tweets', [])}

    results = []
    for t in raw_data['data']:
        resolved = [
            ref_tweets[rt['id']]
            for rt in t.get('referenced_tweets', [])
            if rt['id'] in ref_tweets
        ]
        # If none of the referenced tweets were expanded, keep the tweet's own
        # text rather than emitting a blank post.
        text = ' '.join(resolved) if resolved else t['text']

        results.append({
            'tweet_id': t['id'],
            'username': usernames_by_id.get(t.get('author_id'), default_username),
            'created_at': t['created_at'],
            'text': text,
        })

    return results

def fetch_tweets_by_username(username):
    """
    Run a recent-search query for tweets authored by `username`, replies
    excluded, and return them as post dicts via map_tweets_to_post.

    Raises Exception(status_code, body_text) for any non-200 response.
    """
    query_params = {
        "query": f"from:{username} -is:reply",
        "max_results": max_twitter_posts,
        # Expansions supply the author and referenced tweets that the mapper reads.
        "expansions": "referenced_tweets.id,author_id",
        "tweet.fields": "created_at",
        "user.fields": "username",
    }
    resp = requests.get(search_url, auth=bearer_oauth, params=query_params)
    if resp.status_code == 200:
        return map_tweets_to_post(resp.json())
    raise Exception(resp.status_code, resp.text)

def fetch_following(user_id):
    """
    Fetch the accounts that `user_id` follows (a single page of up to
    max_following entries).

    Returns:
        The list under the response's 'data' key, or an empty list when the
        response has no 'data' key.
        NOTE(review): this presumably happens when the user follows no one;
        the search mapper in this file guards the same missing-key case.

    Raises:
        Exception(status_code, body_text) for any non-200 response.
    """
    url = following_url.format(user_id)
    params = {
        'max_results': max_following
    }
    response = requests.get(url, auth=bearer_oauth, params=params)
    if response.status_code != 200:
        raise Exception(response.status_code, response.text)
    json_res = response.json()
    # A successful response may omit 'data'; treat that as "no results"
    # instead of failing with a KeyError.
    return json_res.get('data', [])

In [4]:
# Twitter data extraction

def get_user_tweets(users_to_search):
    """
    Fetch posts for every username in `users_to_search`, printing a progress
    line after each one, and return all posts as a single DataFrame.
    """
    total = len(users_to_search)
    collected = []
    for done, name in enumerate(users_to_search, start=1):
        collected.extend(fetch_tweets_by_username(name))
        percent = round((done / total) * 100, 2)
        print("Processed {}/{} users ({}%)".format(done, total, percent))
    return pd.DataFrame(collected)

def get_user_following_map(user, following):
    """
    Build the user -> following edge table.

    Returns a DataFrame with one row per entry in `following` and the columns
    'user' (the root user's username), 'following' (the followed username)
    and 'date' (today's date formatted as mm-dd-yy).
    """
    today = datetime.now().strftime("%m-%d-%y")
    edge_count = len(following)
    followed_names = [entry['username'] for entry in following]

    edges = {
        'user': [user['username']] * edge_count,
        'following': followed_names,
        'date': [today] * edge_count,
    }
    return pd.DataFrame(edges)


def clean_posts(data):
    """
    Return a cleaned copy of the posts DataFrame, ready for line-based upload.

    Steps, in order:
      1. Strip http/https links from 'text' (only the link itself is removed;
         surrounding text is kept).
      2. Cast every column to str and drop all non-ASCII characters (emojis etc.).
      3. Drop rows whose text is blank after stripping whitespace.
      4. Replace newlines / carriage returns in 'text' with spaces so each
         post occupies a single line.

    Args:
        data: DataFrame with at least a 'text' column. It is not modified.

    Returns:
        A new DataFrame with all-string columns. The original index labels of
        the surviving rows are preserved (not renumbered).
    """
    # Work on a copy so the caller's DataFrame is left untouched.
    posts = data.copy()

    # Clean up the links from the text (they're useless to us)
    posts['text'] = posts['text'].apply(lambda t: re.sub(r'https?://\S+', '', str(t)))

    # Remove all emojis (and any other non-ASCII characters)
    posts = posts.astype(str).apply(
        lambda col: col.str.encode('ascii', 'ignore').str.decode('ascii')
    )

    # Remove blank tweets; copy so the assignment below targets a real frame, not a slice
    posts = posts[posts['text'].str.strip().str.len() != 0].copy()

    # Ensure that all text is in a single line
    posts['text'] = (
        posts['text']
        .str.replace('\n', ' ', regex=False)
        .str.replace('\r', ' ', regex=False)
    )

    return posts

def extract_twitter_data(username):
    """
    Collect everything needed for analysis of `username`.

    Looks up the user, fetches who they follow, pulls posts for the user and
    every followed account, and returns a dict with:
      'posts'     - cleaned posts DataFrame
      'following' - user -> following edges
      'users'     - users vertex table (root user first, then followed accounts)
    """
    root_user = fetch_user_by_username(username)
    followed = fetch_following(root_user['id'])

    # Root user goes first, followed accounts after.
    everyone = [root_user, *followed]
    handles = [account['username'] for account in everyone]

    posts_df = get_user_tweets(handles)
    following_df = get_user_following_map(root_user, followed)  # user -> following edges
    users_df = pd.DataFrame(everyone)  # users vertex

    result = {'posts': clean_posts(posts_df)}
    result['following'] = following_df
    result['users'] = users_df
    return result


In [16]:
# Run the full extraction for the test account; one progress line is printed per user searched.
data = extract_twitter_data(test_username)

Processed 1/114 users (0.88%)
Processed 2/114 users (1.75%)
Processed 3/114 users (2.63%)
Processed 4/114 users (3.51%)
Processed 5/114 users (4.39%)
Processed 6/114 users (5.26%)
Processed 7/114 users (6.14%)
Processed 8/114 users (7.02%)
Processed 9/114 users (7.89%)
Processed 10/114 users (8.77%)
Processed 11/114 users (9.65%)
Processed 12/114 users (10.53%)
Processed 13/114 users (11.4%)
Processed 14/114 users (12.28%)
Processed 15/114 users (13.16%)
Processed 16/114 users (14.04%)
Processed 17/114 users (14.91%)
Processed 18/114 users (15.79%)
Processed 19/114 users (16.67%)
Processed 20/114 users (17.54%)
Processed 21/114 users (18.42%)
Processed 22/114 users (19.3%)
Processed 23/114 users (20.18%)
Processed 24/114 users (21.05%)
Processed 25/114 users (21.93%)
Processed 26/114 users (22.81%)
Processed 27/114 users (23.68%)
Processed 28/114 users (24.56%)
Processed 29/114 users (25.44%)
Processed 30/114 users (26.32%)
Processed 31/114 users (27.19%)
Processed 32/114 users (28.07%

In [29]:
# Inspect the cleaned posts, tagging each row with '<test_username>-<index>'.
# NOTE(review): analyse_tweets builds line_id as '<index>-<date>-<username>' instead — confirm which format is intended.
posts = data['posts']
posts['line_id'] = posts.index.map(lambda x: '{}-{}'.format(test_username, x))
posts

Unnamed: 0,tweet_id,username,created_at,text,id,line_id
0,1512886651940491270,elonmusk,2022-04-09T20:14:20.000Z,69.420% of statistics are false,0-elonmusk,elonmusk-0
1,1512886157876600833,elonmusk,2022-04-09T20:12:22.000Z,Truth is the first casualty.,1-elonmusk,elonmusk-1
2,1512813698011836422,elonmusk,2022-04-09T15:24:26.000Z,Thank you to everyone who came out to celebrat...,2-elonmusk,elonmusk-2
3,1512787864458870787,elonmusk,2022-04-09T13:41:47.000Z,Docking confirmed!,3-elonmusk,elonmusk-3
4,1512785529712123906,elonmusk,2022-04-09T13:32:31.000Z,TOP 10 most followed Twitter accounts: 1. @...,4-elonmusk,elonmusk-4
...,...,...,...,...,...,...
2512,1512405019013763076,SpaceX,2022-04-08T12:20:30.000Z,Ax-1 crew arrives at historic Launch Complex 3...,2512-elonmusk,elonmusk-2512
2513,1512398808537186304,SpaceX,2022-04-08T11:55:49.000Z,Watch Falcon 9 launch @Axiom_Spaces Ax-1 missi...,2513-elonmusk,elonmusk-2513
2514,1512360477988634625,SpaceX,2022-04-08T09:23:30.000Z,Targeting 11:17 a.m. ET for todays Falcon 9 la...,2514-elonmusk,elonmusk-2514
2515,1512065240116072466,SpaceX,2022-04-07T13:50:20.000Z,All systems are looking good for tomorrows Fal...,2515-elonmusk,elonmusk-2515


In [5]:
# Comprehend analysis

def upload_text_to_s3(data, bucket_name, file_name):
    """
    Upload the 'text' column of `data` to S3 as '<file_name>.txt', one post
    per line, in row order.

    The lines are written verbatim. Serialising through to_csv with a space
    separator would wrap every text containing a space in double quotes and
    double any embedded quotes, so the uploaded documents would no longer
    match the post text.

    Args:
        data: DataFrame with a 'text' column; each value is expected to be a
            single line already.
        bucket_name: Target S3 bucket.
        file_name: Object key without the '.txt' extension.

    Returns:
        The response of the S3 put call.
    """
    # Missing values become empty lines, matching what the CSV writer produced for them.
    lines = data.text.fillna('').astype(str)
    body = ''.join('{}\n'.format(line) for line in lines)
    s3_resource = boto3.resource('s3')
    return s3_resource.Object(bucket_name, '{}.txt'.format(file_name)).put(Body=body)

def upload_csv_to_s3(data, bucket_name, file_name):
    """
    Serialise `data` as CSV (header included, index omitted) and store it in
    S3 under '<file_name>.csv'. Returns the response of the S3 put call.
    """
    object_key = '{}.csv'.format(file_name)
    # With no target given, to_csv returns the CSV document as a string.
    csv_body = data.to_csv(index=False)
    target = boto3.resource('s3').Object(bucket_name, object_key)
    return target.put(Body=csv_body)

def start_targeted_sentiment_job(input_s3_url, output_s3_url, job_tag):
    """
    Submit a Comprehend targeted sentiment detection job.

    Reads from `input_s3_url` using the module-level input format, writes to
    `output_s3_url`, and names the job 'Targeted_Sentiment_Job_<job_tag>'.
    Returns the client's response to the start call.
    """
    request = {
        'InputDataConfig': {
            'S3Uri': input_s3_url,
            'InputFormat': input_doc_format,
        },
        'OutputDataConfig': {'S3Uri': output_s3_url},
        'DataAccessRoleArn': data_access_role_arn,
        'LanguageCode': language_code,
        'JobName': 'Targeted_Sentiment_Job_{}'.format(job_tag),
    }

    client = boto3.client('comprehend', region_name=region)
    return client.start_targeted_sentiment_detection_job(**request)

def analyse_tweets(username):
    """
    Run the whole pipeline for `username`: extract Twitter data, upload the
    Comprehend and Tigergraph input files to S3, and start a targeted
    sentiment job over the uploaded posts.

    Files are written under '<prefix>/<session_id>/<username>/' in the input
    bucket, where <prefix> is the Comprehend or Tigergraph folder.

    Args:
        username: Twitter handle to analyse.

    Returns:
        The response of the Comprehend start-job call.
    """
    date = datetime.now().strftime("%m-%d-%y")
    tag = "{}-{}".format(date, username)

    twitter_data = extract_twitter_data(username)

    # clean_posts drops blank rows without renumbering, while the text file
    # uploaded below is written row by row. Renumber so each line_id matches
    # the post's 0-based position in that file.
    # NOTE(review): presumably line_id is matched against the line number
    # Comprehend reports per document — confirm against the consumer.
    posts = twitter_data['posts'].reset_index(drop=True)
    posts['line_id'] = posts.index.map(lambda x: '{}-{}'.format(x, tag)) # used for mapping entities

    following = twitter_data['following']
    users = twitter_data['users']

    session_folder = '{}/{}'.format(session_id, username)
    tg_folder = '{}/{}'.format(tg_input_folder, session_folder) # Tigergraph files
    comp_folder = '{}/{}'.format(comprehend_input_folder, session_folder) # Comprehend files

    posts_filename = 'posts'
    following_filename = 'following'
    users_filename = 'users'

    # Upload data to Comprehend input folder
    print("Uploading comprehend input files...")
    upload_text_to_s3(posts, input_bucket, '{}/{}_{}'.format(comp_folder, posts_filename, tag))

    print("Uploading Tigergraph input files...")
    # Upload data to Tigergraph input folder
    upload_csv_to_s3(posts, input_bucket, '{}/{}'.format(tg_folder, posts_filename))
    upload_csv_to_s3(following, input_bucket, '{}/{}'.format(tg_folder, following_filename))
    upload_csv_to_s3(users, input_bucket, '{}/{}'.format(tg_folder, users_filename))

    print("Starting comprehend job...")
    # Start comprehend job; the input URL is the folder prefix holding the posts text file
    input_s3_url = 's3://{}/{}'.format(input_bucket, comp_folder)
    output_s3_url = 's3://{}/{}'.format(output_bucket, session_folder)
    return start_targeted_sentiment_job(input_s3_url, output_s3_url, tag)


In [6]:
# Run the end-to-end pipeline for the test account; the cell's result is the response from starting the Comprehend job.
analyse_tweets(test_username)

Processed 1/114 users (0.88%)
Processed 2/114 users (1.75%)
Processed 3/114 users (2.63%)
Processed 4/114 users (3.51%)
Processed 5/114 users (4.39%)
Processed 6/114 users (5.26%)
Processed 7/114 users (6.14%)
Processed 8/114 users (7.02%)
Processed 9/114 users (7.89%)
Processed 10/114 users (8.77%)
Processed 11/114 users (9.65%)
Processed 12/114 users (10.53%)
Processed 13/114 users (11.4%)
Processed 14/114 users (12.28%)
Processed 15/114 users (13.16%)
Processed 16/114 users (14.04%)
Processed 17/114 users (14.91%)
Processed 18/114 users (15.79%)
Processed 19/114 users (16.67%)
Processed 20/114 users (17.54%)
Processed 21/114 users (18.42%)
Processed 22/114 users (19.3%)
Processed 23/114 users (20.18%)
Processed 24/114 users (21.05%)
Processed 25/114 users (21.93%)
Processed 26/114 users (22.81%)
Processed 27/114 users (23.68%)
Processed 28/114 users (24.56%)
Processed 29/114 users (25.44%)
Processed 30/114 users (26.32%)
Processed 31/114 users (27.19%)
Processed 32/114 users (28.07%

{'JobId': '446eb8136a44259c95fc82dfbddd1e8b',
 'JobArn': 'arn:aws:comprehend:ap-southeast-1:368767127050:targeted-sentiment-detection-job/446eb8136a44259c95fc82dfbddd1e8b',
 'JobStatus': 'SUBMITTED',
 'ResponseMetadata': {'RequestId': '6c634d5b-0744-412c-a2fa-56eed7ab150d',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '6c634d5b-0744-412c-a2fa-56eed7ab150d',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '192',
   'date': 'Thu, 14 Apr 2022 05:10:25 GMT'},
  'RetryAttempts': 0}}