In [1]:
import os
import sys
import time
sys.path.insert(0, '../')
import config as cf
import pandas as pd
import re, json

### Load US-reopen Data:

In [2]:
# Read the US-reopen tweet table from the CSV path configured in the
# project-level config module.
df = pd.read_csv(cf.US_REOPEN_DATA)

# Report (rows, columns) followed by the per-column dtype / non-null summary.
print("Shape = ", (len(df.index), len(df.columns)))
df.info()

Shape =  (17359, 8)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17359 entries, 0 to 17358
Data columns (total 8 columns):
id               17359 non-null int64
created_at       17359 non-null object
original_text    17359 non-null object
clean_text       17359 non-null object
sentiment        17359 non-null object
lang             17359 non-null object
screen_name      17359 non-null object
location         17359 non-null object
dtypes: int64(1), object(7)
memory usage: 1.1+ MB


### IBM Watson™ Tone Analyzer Service:
* python3 -m pip install --upgrade "ibm-watson>=4.4.0"
* https://cloud.ibm.com/apidocs/tone-analyzer?code=python

In [3]:
from ibm_watson import ToneAnalyzerV3
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator

# Tone Analyzer client configuration.
# Credentials are taken from the environment so that real keys never have to be
# written into (and committed with) the notebook. The original placeholder
# literals remain as fallbacks, so behavior is unchanged when the variables are
# not set.
# NOTE(review): the variable names below follow the <SERVICE>_APIKEY / <SERVICE>_URL
# pattern; confirm they match your deployment conventions.
mVERSION = '2017-09-21'  # API version date sent with every request
mAPI_KEY = os.environ.get('TONE_ANALYZER_APIKEY', '<api_key>')
mEND_URL = os.environ.get(
    'TONE_ANALYZER_URL',
    "https://api.us-south.tone-analyzer.watson.cloud.ibm.com/instances/<instance_id>",
)

# Build the IAM-authenticated client and point it at the service instance.
authenticator = IAMAuthenticator(mAPI_KEY)
tone_analyzer = ToneAnalyzerV3(
    version=mVERSION,
    authenticator=authenticator
)
tone_analyzer.set_service_url(mEND_URL)

### Convert each tweet to a single sentence (the meaning may change slightly):

In [4]:
def get_ibm_tone_format_tweet(tweet):
    """Flatten a raw tweet into one capitalized sentence ending in " .".

    Steps, in order:
      1. Blank lines (two consecutive newlines) become "; ".
      2. For a re-tweet (text starting with "RT "), the first two
         whitespace-separated tokens are dropped.
      3. Matches of cf.RX_MENTION, cf.RX_HASHTAG, cf.RX_URL and cf.RX_EMAIL
         are removed.
      4. Every character other than ASCII letters, digits and , ; - _ / is
         replaced by a space, so no sentence-ending punctuation survives
         inside the tweet.
      5. Whitespace runs are collapsed to single spaces.
      6. A single trailing non-alphanumeric character is dropped, " ." is
         appended and the first character is upper-cased.

    Args:
        tweet: Raw tweet text.

    Returns:
        The formatted sentence, or None when the input is not a string or
        fewer than three words remain after cleaning.
    """
    # Missing values (e.g. NaN from read_csv) are not text; skip them.
    if not isinstance(tweet, str):
        return None

    tweet = tweet.replace('\n\n', '; ')  # blank lines
    if tweet.startswith("RT "):
        tweet = ' '.join(tweet.split()[2:])  # drop the first two tokens of a re-tweet

    # Strip mentions, hashtags, URLs and e-mail addresses, in that order.
    for pattern in (cf.RX_MENTION, cf.RX_HASHTAG, cf.RX_URL, cf.RX_EMAIL):
        tweet = re.sub(pattern, '', tweet)

    # The hyphen must be escaped: an unescaped ";-_" is a character *range*
    # that keeps < = > ? @ [ \ ] ^ and drops the literal "-".
    tweet = re.sub(r"[^A-Za-z0-9,;\-_/]", ' ', tweet)
    tweet = ' '.join(tweet.split())  # collapse white space

    if len(tweet.split()) < 3:
        return None  # too short to be analyzed as a sentence

    # Drop one trailing punctuation mark (and the space it may leave behind)
    # before appending the sentence terminator.
    if not tweet[-1].isalnum():
        tweet = tweet[:-1].rstrip()
    tweet = tweet + " ."

    return tweet[0].upper() + tweet[1:]

# Add the single-sentence form of every tweet as a new column, discard rows
# that contain any missing value (tweets too short to format come back as
# None), and persist the result for the emotion-analysis step.
df = df.assign(
    tone_format_tweet=df['original_text'].apply(get_ibm_tone_format_tweet)
).dropna()
df.to_csv(cf.US_REOPEN_EMOTION, index=False)

# Report the resulting (rows, columns) and the per-column summary.
print("Shape = ", df.shape)
df.info()

Shape =  (17359, 9)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 17359 entries, 0 to 17358
Data columns (total 9 columns):
id                   17359 non-null int64
created_at           17359 non-null object
original_text        17359 non-null object
clean_text           17359 non-null object
sentiment            17359 non-null object
lang                 17359 non-null object
screen_name          17359 non-null object
location             17359 non-null object
tone_format_tweet    17359 non-null object
dtypes: int64(1), object(8)
memory usage: 1.3+ MB


### Call the API in batches of max_limit=100 sentences:

In [None]:
# Send the formatted tweets to the Tone Analyzer in batches and store each raw
# JSON response on disk.
#
# call_no   : number of batches already processed; set it > 0 to resume a run.
# max_limit : tweets (one sentence each) per request.
call_no = 0
max_limit = 100
max_idx, curr_idx, next_idx = df.shape[0], (call_no * max_limit + 0), (call_no * max_limit + max_limit)

# The responses are written below this directory; create it up front so the
# first open() cannot fail on a fresh checkout.
os.makedirs("tones", exist_ok=True)

while curr_idx < max_idx:
    call_no = call_no + 1
    df_sub = df.iloc[curr_idx:next_idx, :]

    # One request body = the batch's sentences separated by single spaces.
    tweet_100 = " ".join(df_sub['tone_format_tweet'])
    # Guard the request size once per batch (limit used by this notebook: 127 KB).
    assert len(tweet_100.encode('utf-8')) < 127000 #127 KB

    # call API with format
    # The payload is a JSON-style dict, so it is declared as application/json
    # (the form shown in the API reference); `sentences` is a boolean flag.
    tone_analysis = tone_analyzer.tone(
            tone_input={'text': tweet_100},
            content_type='application/json',
            sentences=True
        ).get_result()

    # save json response
    with open("tones/itr_{}.json".format(call_no), 'w') as fj:
        json.dump(tone_analysis, fj)

    # update for next 100 sentences
    curr_idx = curr_idx + max_limit
    next_idx = next_idx + max_limit

    # delay between consecutive API calls
    time.sleep(1)
    # NOTE(review): the response may omit "sentences_tone" (e.g. when a batch
    # holds a single sentence) - confirm against the API reference; .get()
    # keeps the progress message from crashing the run in that case.
    print("Saved for itr-{}: {} sentences.".format(call_no, len(tone_analysis.get("sentences_tone", []))))