# Marites Proto

## Overview
This notebook contains the core functions used within the Marites API.

## Set up dependencies

The cells below import the dependencies and set up the constants.

In [32]:
# All dependencies
import os
from dotenv import load_dotenv
import requests
import pandas as pd
from datetime import datetime
import re
import boto3
from io import StringIO

load_dotenv()
print('Import complete.')

Import complete.


In [31]:
# --- Twitter settings ---
# Sent as `max_results` when searching a user's recent tweets.
max_twitter_posts = 100
# Sent as `max_results` when listing the accounts a user follows.
max_following = 250
# Bearer token attached to every Twitter request; None when BEARER_TOKEN is unset.
token = os.environ.get("BEARER_TOKEN")
# Sample handle, presumably for manual test runs; not referenced by the functions in this section.
test_username = 'elonmusk'

# --- AWS Comprehend settings ---
# Region used when creating the Comprehend client.
region = 'ap-southeast-1'
# Passed as `LanguageCode` when starting the sentiment job.
language_code = 'en'
# S3 bucket names, presumably for the Comprehend job input and output; TODO confirm where they are consumed.
input_bucket = 'marites-comprehend-input'
output_bucket = 'marites-comprehend-output'
# Passed as `DataAccessRoleArn` when starting the sentiment job; None when DATA_ACCESS_ROLE is unset.
data_access_role_arn = os.environ.get("DATA_ACCESS_ROLE")
# Passed as `InputFormat` in the job's input data config.
input_doc_format = 'ONE_DOC_PER_LINE'

session_id = '1' # TODO: placeholder value; we need to change this

## Lambda Functions

**API Functions**

- POST /analyse : Retrieves tweets, transforms the raw data, uploads it to an S3 bucket, and triggers an AWS Comprehend job
- GET /user : Retrieves a user from TigerGraph
- GET /news : Retrieves news from News APIs

**Internal Functions**

- input_to_graph : Retrieves and parses the CSV from the input S3 bucket and pushes the data to TigerGraph
- output_to_graph : Retrieves and parses the CSV from the output S3 bucket and pushes the data to TigerGraph


## Analyse

In [20]:
# Twitter Functions

# Twitter API v2 endpoints. The `{}` placeholders are filled in with `.format()`:
# a user id for `following_url`, a username for `lookup_username_url`.
search_url = "https://api.twitter.com/2/tweets/search/recent"
following_url = "https://api.twitter.com/2/users/{}/following"
lookup_username_url = "https://api.twitter.com/2/users/by/username/{}"

def bearer_oauth(r):
    """
    Attach the bearer token to an outgoing request.

    Used as the ``auth`` callable for ``requests.get``: sets the
    ``Authorization`` header from the module-level ``token`` and returns
    the same request object it was given.
    """
    authorization = f"Bearer {token}"
    r.headers["Authorization"] = authorization
    return r

def map_tweets_to_user(tweets, username):
    """
    Flatten a tweet search response into one record per tweet.

    A tweet that references other tweets takes the text of those referenced
    tweets (joined with single spaces), looked up in the response's
    ``includes``. When none of its referenced tweets can be found there,
    the tweet's own text is used instead, so the tweet is not reduced to
    an empty string.

    Args:
        tweets: Decoded JSON body of the search response (a dict).
        username: Username to tag every returned record with.

    Returns:
        A list of dicts with the keys ``tweet_id``, ``username`` and
        ``text``; an empty list when the response has no ``data`` key.
    """
    if 'data' not in tweets:
        return []

    # Text of every expanded (referenced) tweet, keyed by tweet id.
    # `.get` keeps this safe when `includes` is absent or has no `tweets` entry.
    ref_tweets = {
        tweet['id']: tweet['text']
        for tweet in tweets.get('includes', {}).get('tweets', [])
    }

    results = []
    for t in tweets['data']:
        referenced_text = [
            ref_tweets[rt['id']]
            for rt in t.get('referenced_tweets', [])
            if rt['id'] in ref_tweets
        ]
        # Fall back to the tweet's own text when nothing could be resolved.
        text = ' '.join(referenced_text) if referenced_text else t['text']
        results.append({'tweet_id': t['id'], 'username': username, 'text': text})

    return results

def fetch_tweets_by_username(username):
    """
    Fetch recent tweets for one user and map them to flat records.

    Sends the query ``from:<username> -is:reply`` to the recent-search
    endpoint, asking for at most ``max_twitter_posts`` results with
    referenced tweets expanded.

    Returns:
        The list produced by ``map_tweets_to_user`` for the response body.

    Raises:
        Exception: ``(status_code, response_text)`` when the response
            status is not 200.
    """
    query = "from:{} -is:reply".format(username)
    response = requests.get(
        search_url,
        auth=bearer_oauth,
        params={
            "query": query,
            "max_results": max_twitter_posts,
            "expansions": "referenced_tweets.id",
        },
    )
    if response.status_code == 200:
        return map_tweets_to_user(response.json(), username)
    raise Exception(response.status_code, response.text)

def fetch_user_by_username(username):
    """
    Look up a single Twitter user by username.

    Returns:
        The ``data`` entry of the lookup response.

    Raises:
        Exception: ``(status_code, response_text)`` when the response
            status is not 200.
    """
    response = requests.get(lookup_username_url.format(username), auth=bearer_oauth)
    if response.status_code == 200:
        return response.json()['data']
    raise Exception(response.status_code, response.text)

def fetch_following_by_username(username):
    """
    Fetch the accounts that a user follows.

    Resolves ``username`` to a user id with ``fetch_user_by_username``,
    then requests up to ``max_following`` entries from the following
    endpoint.

    Returns:
        The ``data`` list of the response, or an empty list when the
        response carries no ``data`` key.

    Raises:
        Exception: ``(status_code, response_text)`` when the response
            status is not 200.
    """
    user = fetch_user_by_username(username)
    url = following_url.format(user['id'])
    params = {
        'max_results': max_following
    }
    response = requests.get(url, auth=bearer_oauth, params=params)
    if response.status_code != 200:
        raise Exception(response.status_code, response.text)
    json_res = response.json()
    # A successful response may omit `data` (the tweet-search path in this
    # notebook guards the same case); treat that as "follows nobody" rather
    # than failing with a KeyError.
    return json_res.get('data', [])

In [29]:
# Twitter data extraction

def get_user_tweets(users_to_search):
    """
    Collect the tweets of every listed user into one DataFrame.

    Calls ``fetch_tweets_by_username`` for each entry in order and prints
    a progress line after each user.

    Args:
        users_to_search: List of usernames.

    Returns:
        A DataFrame built from all fetched tweet records (empty when no
        records were collected).
    """
    total = len(users_to_search)
    collected = []
    for count, name in enumerate(users_to_search, start=1):
        collected.extend(fetch_tweets_by_username(name))
        percent_done = round((count / total) * 100, 2)
        print("Processed {}/{} users ({}%)".format(count, total, percent_done))
    return pd.DataFrame(collected)

def get_user_following_map(user, following):
    """
    Build a DataFrame with one row per followed account.

    Args:
        user: The following user; repeated in column ``user1``.
        following: The followed usernames; stored in column ``user2``.

    Returns:
        A DataFrame with columns ``user1``, ``user2`` and ``date``, where
        ``date`` is the current local date formatted as ``%m-%d-%y``.
    """
    today = datetime.now().strftime("%m-%d-%y")
    row_count = len(following)
    columns = {
        'user1': [user for _ in range(row_count)],
        'user2': following,
        'date': [today for _ in range(row_count)],
    }
    return pd.DataFrame(columns)

def extract_twitter_data(username):
    """
    Gather the tweets and the following list for one user.

    Fetches the accounts ``username`` follows, then fetches tweets for
    each of those accounts followed by ``username`` itself.

    Returns:
        A dict with ``user_tweets`` (DataFrame from ``get_user_tweets``)
        and ``user_following`` (DataFrame from ``get_user_following_map``).
    """
    followed_names = [account['username'] for account in fetch_following_by_username(username)]

    # Followed accounts are searched first, the user themself last.
    search_targets = followed_names + [username]

    return {
        'user_tweets': get_user_tweets(search_targets),
        'user_following': get_user_following_map(username, followed_names),
    }

In [35]:
# Comprehend analysis

def prepare_data(data):
    """
    Clean tweet records so each tweet's text is one plain-ASCII line.

    Steps, in order: cast every column to ``str``, remove links from
    ``text``, drop non-ASCII characters from every column, replace line
    breaks in ``text`` with spaces, and drop rows whose ``text`` is blank.

    The input frame is not modified; a new DataFrame is returned. The
    original index labels of the surviving rows are kept.

    Args:
        data: DataFrame with at least a ``text`` column.

    Returns:
        A new, all-string DataFrame containing only non-blank tweets.
    """
    # astype returns a new frame, so the caller's DataFrame is never mutated.
    user_tweets = data.astype(str)

    # Clean up the links from the text (they're useless to us). Only the link
    # itself is removed; any text that follows it is kept.
    user_tweets['text'] = user_tweets['text'].str.replace(r'https?://\S+', '', regex=True)

    # Remove all emojis (this drops every non-ASCII character, in every column)
    user_tweets = user_tweets.apply(lambda col: col.str.encode('ascii', 'ignore').str.decode('ascii'))

    # Ensure that all text is in a single line
    user_tweets['text'] = user_tweets['text'].str.replace(r'[\r\n]', ' ', regex=True)

    # Remove blank tweets. Filtering last means no assignment is ever made
    # onto a filtered slice (which would trigger SettingWithCopyWarning).
    return user_tweets[user_tweets['text'].str.strip().str.len() != 0]

def upload_to_s3(data, bucket_name, file_name):
    """
    Upload the ``text`` column of ``data`` to S3, one tweet per line.

    Args:
        data: DataFrame with a ``text`` column whose values are already
            single-line strings.
        bucket_name: Name of the target S3 bucket.
        file_name: Object key to write.

    Returns:
        The response of the S3 ``put`` call.
    """
    # Write the lines directly instead of using to_csv(sep=' '): the CSV
    # writer wraps every value that contains the separator (a space) in
    # double quotes and doubles embedded quotes, which would alter the text
    # of nearly every tweet.
    body = ''.join('{}\n'.format(line) for line in data.text)
    s3_resource = boto3.resource('s3')
    return s3_resource.Object(bucket_name, file_name).put(Body=body)


def start_targeted_sentiment_job(input_s3_url, output_s3_url, job_suffix=None):
    """
    Start an AWS Comprehend targeted sentiment detection job.

    Args:
        input_s3_url: S3 URI of the input documents.
        output_s3_url: S3 URI the job output is written to.
        job_suffix: Value appended to the job name. Defaults to the
            notebook-level ``session_id`` when omitted.

    Returns:
        The response of ``start_targeted_sentiment_detection_job``.
    """
    # The job name used to read an undefined `job_suffix` name, which raised
    # NameError on every call; it is now an optional argument with a fallback.
    if job_suffix is None:
        job_suffix = session_id

    input_data_config = {
        'S3Uri': input_s3_url,
        'InputFormat': input_doc_format
    }

    output_data_config = {
        'S3Uri': output_s3_url
    }

    job_name = 'Targeted_Sentiment_Job_{}'.format(job_suffix)

    comprehend = boto3.client('comprehend', region_name=region)
    return comprehend.start_targeted_sentiment_detection_job(InputDataConfig=input_data_config,
                                                             OutputDataConfig=output_data_config,
                                                             DataAccessRoleArn=data_access_role_arn,
                                                             LanguageCode=language_code,
                                                             JobName=job_name)

def analyse_tweets(username):
    # twitter_data <- Extract twitter data

    # users CSV -> S3
    # user_following CSV -> S3
    # user_tweets CSV -> S3    
    
    # user_tweets TEXT -> S3

    # Important!
    #  - each tweet needs to be tagged with its line number