In [1]:
import sys
sys.path.insert(0, '../')
import config as cf
import os, ast, re, glob, json
import pandas as pd
# Path template for the tone JSON files; the loader fills the placeholder
# with the file number.
tone_path = "tones/itr_{}.json"
# Number of tone files to read; the loader iterates file numbers 1..num_files.
num_files = 442

### Load US-reopen-emotion Data:

In [2]:
# Read the CSV found at the path configured as cf.US_REOPEN_EMOTION, then
# report its shape and per-column summary.
df = pd.read_csv(cf.US_REOPEN_EMOTION)
print("Shape = ", df.shape)
df.info()

Shape =  (17359, 9)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17359 entries, 0 to 17358
Data columns (total 9 columns):
id                   17359 non-null int64
created_at           17359 non-null object
original_text        17359 non-null object
clean_text           17359 non-null object
sentiment            17359 non-null object
lang                 17359 non-null object
screen_name          17359 non-null object
location             17359 non-null object
tone_format_tweet    17359 non-null object
dtypes: int64(1), object(8)
memory usage: 1.2+ MB


### Load Tone Dictionary:

In [3]:
# Build tone_dict: sentence text -> list of the distinct tone lists reported
# for that text across all tone files (numbered 1..num_files).
tone_dict = {}
for i in range(1, num_files + 1):
    tone_file = tone_path.format(i)
    # Only the parse needs the open handle; the records are processed after.
    with open(tone_file, 'r') as ftone:
        data = json.load(ftone)
    for sent in data['sentences_tone']:
        key = str(sent['text'])
        val = list(sent['tones'])
        # setdefault creates the bucket on first sight of this text.
        known = tone_dict.setdefault(key, [])
        if val not in known:
            known.append(val)

### Retrieve Emotions:

In [4]:
def get_tones(tweet):
    """Collect the distinct tone entries recorded for a tweet.

    Looks ``tweet`` up in the module-level ``tone_dict`` and flattens the
    lists of tone entries stored under it into a single list, keeping only
    the first occurrence of each entry and preserving first-seen order.

    Parameters
    ----------
    tweet
        Key to look up in ``tone_dict``.

    Returns
    -------
    list
        Unique tone entries in first-seen order; empty when ``tweet`` has no
        entry in ``tone_dict`` (or cannot be used as a dictionary key).
    """
    try:
        tone_groups = tone_dict[tweet]
    except (KeyError, TypeError):
        # Unknown tweet, or an unhashable value: treat as "no tones".
        # Other errors (e.g. tone_dict not built yet) are left to surface.
        return []

    tones = []
    for group in tone_groups:
        # One pass per group is enough; entries are compared by equality
        # because they may be unhashable (e.g. dicts).
        for entry in group:
            if entry not in tones:
                tones.append(entry)
    return tones

# Work on a copy of the loaded frame with an extra "emotions" column holding
# the tone entries looked up for each row's tone_format_tweet value.
df_emos = df.assign(emotions=df["tone_format_tweet"].apply(get_tones))
print("Shape = ", df_emos.shape)
df_emos.info()

Shape =  (17359, 10)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17359 entries, 0 to 17358
Data columns (total 10 columns):
id                   17359 non-null int64
created_at           17359 non-null object
original_text        17359 non-null object
clean_text           17359 non-null object
sentiment            17359 non-null object
lang                 17359 non-null object
screen_name          17359 non-null object
location             17359 non-null object
tone_format_tweet    17359 non-null object
emotions             17359 non-null object
dtypes: int64(1), object(9)
memory usage: 1.3+ MB


### Filter Empty Response:

In [5]:
def has_tone(x):
    """Tell whether a row has at least one entry in its 'emotions' field.

    Parameters
    ----------
    x
        Mapping-like row exposing an 'emotions' value, given either as a
        list or as the string form of one.

    Returns
    -------
    bool
        True when the parsed 'emotions' value is non-empty, else False.
    """
    # Going through str() lets an actual list and its string form be
    # handled the same way.
    parsed = ast.literal_eval(str(x['emotions']))
    return len(parsed) > 0
# Keep only the rows for which has_tone reports at least one emotion entry.
tone_mask = df_emos.apply(has_tone, axis=1)
df_emos = df_emos[tone_mask]
print("Shape = ", df_emos.shape)
df_emos.info()

Shape =  (9957, 10)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 9957 entries, 0 to 17358
Data columns (total 10 columns):
id                   9957 non-null int64
created_at           9957 non-null object
original_text        9957 non-null object
clean_text           9957 non-null object
sentiment            9957 non-null object
lang                 9957 non-null object
screen_name          9957 non-null object
location             9957 non-null object
tone_format_tweet    9957 non-null object
emotions             9957 non-null object
dtypes: int64(1), object(9)
memory usage: 855.7+ KB


### Extract Key Tones:

In [6]:
def get_key_tones(x):
    """Return the distinct tone names found in a list of tone entries.

    Parameters
    ----------
    x
        A list of dicts, or the string form of one. Every entry must carry
        a 'tone_name' key.

    Returns
    -------
    list of str
        Tone names converted to ``str``, without duplicates, in first-seen
        order.
    """
    # Going through str() lets an actual list and its string form be
    # handled the same way.
    entries = ast.literal_eval(str(x))

    tones = []
    for entry in entries:
        # Convert before the membership test so the value that is checked is
        # the same value that gets stored; otherwise non-string names would
        # never be recognised as duplicates.
        name = str(entry['tone_name'])
        if name not in tones:
            tones.append(name)
    return tones

# Derive the list of distinct tone names for each row from its "emotions"
# value and store it in a new "key_tones" column.
df_emos["key_tones"] = df_emos["emotions"].apply(get_key_tones)
# NOTE(review): this writes to cf.US_REOPEN_EMOTION, the same path the
# notebook reads its input from, so the original input file is replaced by
# the filtered/augmented frame and a re-run starts from different data.
# Consider saving to a separate output path — TODO confirm intent.
df_emos.to_csv(cf.US_REOPEN_EMOTION, index=False)
print("Shape = ", df_emos.shape)
df_emos.info()

Shape =  (9957, 11)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 9957 entries, 0 to 17358
Data columns (total 11 columns):
id                   9957 non-null int64
created_at           9957 non-null object
original_text        9957 non-null object
clean_text           9957 non-null object
sentiment            9957 non-null object
lang                 9957 non-null object
screen_name          9957 non-null object
location             9957 non-null object
tone_format_tweet    9957 non-null object
emotions             9957 non-null object
key_tones            9957 non-null object
dtypes: int64(1), object(10)
memory usage: 933.5+ KB
