# ABOUT:
- The FiQA dataset has an ABSA task in which each aspect (target) is assigned a sentiment score from -1 to 1
- This notebook
    - processes the FiQA dataset into the format required to train the SATSA model
        - sentiment scores are converted into classes (negative / neutral / positive)

## FiQA train set
    {
      "1": {
        "sentence": "Royal Mail chairman Donald Brydon set to step down",
        "info": [
          {
            "snippets": "['set to step down']",
            "target": "Royal Mail",
            "sentiment_score": "-0.374",
            "aspects": "['Corporate/Appointment']"
          }
        ]
      },
      "7": {
        "sentence": "Stakes High for AstraZeneca Heart Drug Facing Tough Competition",
        "info": [
          {
            "snippets": "['Facing Tough Competition']",
            "target": "AstraZeneca",
            "sentiment_score": "-0.24",
            "aspects": "['Corporate/Risks']"
          }
        ]
      },

In [1]:
import pandas as pd
import re
import json
import os

In [2]:
def extract_desired_info_MAMS_format(json0, cutoff, cleaning_function = None):
    """Convert one FiQA record into a list of span-level sentiment samples.

    Parameters
    ----------
    json0 : dict
        One FiQA record. Must contain a 'sentence' string and an 'info' list;
        every 'info' entry must provide a 'target' string and a
        'sentiment_score' that float() can parse.
    cutoff : float
        Scores with abs(score) <= cutoff are labelled "neutral"; otherwise the
        sign of the score decides between "negative" and "positive".
    cleaning_function : callable, optional
        Applied to the stripped sentence. When omitted, the stripped sentence
        is used unchanged.

    Returns
    -------
    list of dict
        One {"text", "span", "label"} dict per target found in the text.
        "text" is the (cleaned) sentence with a trailing "." appended, "span"
        is the (start, end) character offsets of the first case-insensitive
        occurrence of the target in "text". Targets that are empty or do not
        occur in the text are skipped.

    Raises
    ------
    KeyError
        If a required key is missing from the record.
    ValueError
        If a sentiment score cannot be parsed as a float.
    """
    text = json0['sentence'].strip()
    if cleaning_function:
        text = cleaning_function(text)
    text += "."

    output = []
    for info in json0['info']:
        aspect = info['target'].strip()

        # convert sentiment score to label
        sentiment_score = float(info['sentiment_score'])
        if abs(sentiment_score) <= cutoff:
            sentiment_label = "neutral"
        elif sentiment_score < 0:
            sentiment_label = "negative"
        else:
            sentiment_label = "positive"

        # An empty target would "match" at (0, 0) and yield a useless span.
        if not aspect:
            continue

        # get start and end index
        # The target is escaped so it is matched literally; characters such as
        # '.', '+', '(' or '$' in company names/tickers are not regex syntax.
        # NOTE: the target itself is not passed through cleaning_function, so a
        # target containing characters the cleaner removes from the sentence
        # may not be found and is then skipped.
        match = re.search(re.escape(aspect), text, re.IGNORECASE)
        if match is None:
            continue
        output.append({"text": text, "span": match.span(), "label": sentiment_label})
    return output

def cleanText(text):
    """Normalise a raw sentence down to ASCII letters, digits and spaces.

    The steps run in this fixed order:
      1. drop every token starting with 'http' (URLs);
      2. delete backslashes, non-breaking spaces, newlines, tabs and carriage
         returns (deleted outright, not replaced by a space);
      3. collapse runs of whitespace into single spaces and trim the ends;
      4. delete every character that is not a-z, A-Z, 0-9 or a space.

    Because step 4 runs after step 3, removing a stand-alone punctuation mark
    can leave two adjacent spaces in the result.
    """
    without_links = re.sub(r'http\S+', '', text)

    without_escapes = without_links
    for unwanted in ('\\', u'\xa0', '\n', '\t', '\r'):
        without_escapes = without_escapes.replace(unwanted, '')

    single_spaced = ' '.join(without_escapes.split())

    return re.sub(r'[^a-zA-Z0-9 ]', '', single_spaced)

### read and process

In [3]:
# Sentiment-score cutoff used when converting scores to classes:
# |score| <= cutoff is neutral, otherwise the sign decides.
threshhold = 0.2     # NEG < -0.2 , -0.2 <= NEU <= 0.2 , POS > 0.2
# Empty frame that the loading cell fills with processed samples.
dataset = pd.DataFrame()
# Integer class ids for the string labels.
label_mapping = {"negative":0,"neutral":1,"positive":2}

In [4]:
# Source files: FiQA task 1 headlines and posts (train split).
# TODO: these are absolute, machine-specific paths; consider a configurable data directory.
paths = [r"C:\Users\tanch\Documents\NTU\URECA - Aspect Based Sentiment Analysis\local\data\FiQA_ABSA_task1\task1_headline_ABSA_train.json",
         r"C:\Users\tanch\Documents\NTU\URECA - Aspect Based Sentiment Analysis\local\data\FiQA_ABSA_task1\task1_post_ABSA_train.json"]

# Collect every sample in a plain list and build the frame once at the end.
# Growing a DataFrame row by row is quadratic, and DataFrame.append no longer
# exists in pandas >= 2.0.
records = []
skipped = []  # (path, record key) of records that could not be processed
for path in paths:
    with open(path,"r", encoding="utf8") as f:
        fiqa = json.load(f)
    for k,v in fiqa.items():
        try:
            records.extend(extract_desired_info_MAMS_format(v, threshhold, cleanText))
        except (KeyError, TypeError, ValueError, AttributeError):
            # Malformed record: skip it. The previous record's samples must not
            # be added a second time, so nothing is appended here.
            skipped.append((path, k))

if skipped:
    print("Skipped {} malformed record(s)".format(len(skipped)))

# Building the frame from scratch keeps this cell idempotent: re-running it
# never stacks new rows on top of already label-mapped ones.
dataset = pd.DataFrame(records, columns=["text", "span", "label"])
dataset["label"] = dataset["label"].map(label_mapping)

### Input for SATSA 
- text: a sentence e.g "The decor is not special at all but their.."
- span: the span of the aspect e.g (4, 9)	
- label: the sentiment of the aspect e.g 0

note: label_mapping = {"negative":0,"neutral":1,"positive":2}

In [5]:
# Preview the first rows of the processed samples (columns: text, span, label).
dataset.head()

Unnamed: 0,text,span,label
0,Royal Mail chairman Donald Brydon set to step ...,"(0, 10)",0
1,Stakes High for AstraZeneca Heart Drug Facing ...,"(16, 27)",0
2,UPDATE 1Dairy Crest loses a third of Morrisons...,"(37, 46)",1
3,Insight hires Avivas David Hillier for multias...,"(0, 7)",1
4,Primark racks up a happy Christmas after stron...,"(0, 7)",2


In [6]:
# Class balance after mapping: 0 = negative, 1 = neutral, 2 = positive.
dataset.label.value_counts()

2    620
0    312
1    206
Name: label, dtype: int64

In [10]:
# Total number of (text, span, label) samples produced.
len(dataset)

1138

### save

In [19]:
# Build the output path from separate components so it resolves correctly on
# any OS (hard-coded backslashes are only directory separators on Windows).
path = os.path.join(os.getcwd(), "data", "FiQA", "train", "FiQA_train.pkl")
# Make sure the target directory exists before writing.
os.makedirs(os.path.dirname(path), exist_ok=True)
dataset.to_pickle(path)

In [21]:
# `fully_show_samples` comes from the project-local `utils` module.
# NOTE(review): presumably it displays a handful of rows without truncating the
# text column - confirm against utils.py.
from utils import fully_show_samples
fully_show_samples(dataset)

Unnamed: 0,text,span,label
296,Sainsburys Asda Tesco and Morrisons will all cut petrol prices as oil falls .,"(0, 9)",0
851,GOOGL Hit With Lawsuit from Russian Search Engine YNDX.,"(50, 54)",1
143,UPDATE 1Nomura RBS must pay 806 mln in mortgage bond caseUS judge.,"(15, 18)",0
130,Tesco share price closes higher as two more directors leave grocer.,"(0, 5)",2
117,Tesco sales rise shows tentative recovery continues.,"(0, 5)",2
113,Royal Dutch Shell to Buy BG Group for Nearly 70 Billion.,"(0, 17)",2
979,CTRP breaking out here on good vol and a parser buy signal noticed this on bob langs comments looking for 48 fast.,"(0, 4)",2
763,AAPL Nice RSI 33 bounce at 958.,"(0, 4)",2
600,GMCR with the way this has been acting it may fall to 65 today horrible action since it hit 70 yesterday.,"(0, 4)",0
919,Amazon has been selling surveillance cameras infected with malware AMZN Amazon.,"(67, 71)",1


# ABOUT:
- The FiQA dataset has an ABSA task in which each aspect (target) is assigned a sentiment score from -1 to 1
- This notebook
    - processes the FiQA dataset into the format required to train the SATSA model
        - sentiment scores are converted into classes (negative / neutral / positive)

## FiQA train set
    {
      "1": {
        "sentence": "Royal Mail chairman Donald Brydon set to step down",
        "info": [
          {
            "snippets": "['set to step down']",
            "target": "Royal Mail",
            "sentiment_score": "-0.374",
            "aspects": "['Corporate/Appointment']"
          }
        ]
      },
      "7": {
        "sentence": "Stakes High for AstraZeneca Heart Drug Facing Tough Competition",
        "info": [
          {
            "snippets": "['Facing Tough Competition']",
            "target": "AstraZeneca",
            "sentiment_score": "-0.24",
            "aspects": "['Corporate/Risks']"
          }
        ]
      },

In [1]:
import pandas as pd
import re
import json
import os

In [2]:
def extract_desired_info_MAMS_format(json0, cutoff, cleaning_function = None):
    """Convert one FiQA record into a list of span-level sentiment samples.

    Parameters
    ----------
    json0 : dict
        One FiQA record. Must contain a 'sentence' string and an 'info' list;
        every 'info' entry must provide a 'target' string and a
        'sentiment_score' that float() can parse.
    cutoff : float
        Scores with abs(score) <= cutoff are labelled "neutral"; otherwise the
        sign of the score decides between "negative" and "positive".
    cleaning_function : callable, optional
        Applied to the stripped sentence. When omitted, the stripped sentence
        is used unchanged.

    Returns
    -------
    list of dict
        One {"text", "span", "label"} dict per target found in the text.
        "text" is the (cleaned) sentence with a trailing "." appended, "span"
        is the (start, end) character offsets of the first case-insensitive
        occurrence of the target in "text". Targets that are empty or do not
        occur in the text are skipped.

    Raises
    ------
    KeyError
        If a required key is missing from the record.
    ValueError
        If a sentiment score cannot be parsed as a float.
    """
    text = json0['sentence'].strip()
    if cleaning_function:
        text = cleaning_function(text)
    text += "."

    output = []
    for info in json0['info']:
        aspect = info['target'].strip()

        # convert sentiment score to label
        sentiment_score = float(info['sentiment_score'])
        if abs(sentiment_score) <= cutoff:
            sentiment_label = "neutral"
        elif sentiment_score < 0:
            sentiment_label = "negative"
        else:
            sentiment_label = "positive"

        # An empty target would "match" at (0, 0) and yield a useless span.
        if not aspect:
            continue

        # get start and end index
        # The target is escaped so it is matched literally; characters such as
        # '.', '+', '(' or '$' in company names/tickers are not regex syntax.
        # NOTE: the target itself is not passed through cleaning_function, so a
        # target containing characters the cleaner removes from the sentence
        # may not be found and is then skipped.
        match = re.search(re.escape(aspect), text, re.IGNORECASE)
        if match is None:
            continue
        output.append({"text": text, "span": match.span(), "label": sentiment_label})
    return output

def cleanText(text):
    """Normalise a raw sentence down to ASCII letters, digits and spaces.

    The steps run in this fixed order:
      1. drop every token starting with 'http' (URLs);
      2. delete backslashes, non-breaking spaces, newlines, tabs and carriage
         returns (deleted outright, not replaced by a space);
      3. collapse runs of whitespace into single spaces and trim the ends;
      4. delete every character that is not a-z, A-Z, 0-9 or a space.

    Because step 4 runs after step 3, removing a stand-alone punctuation mark
    can leave two adjacent spaces in the result.
    """
    without_links = re.sub(r'http\S+', '', text)

    without_escapes = without_links
    for unwanted in ('\\', u'\xa0', '\n', '\t', '\r'):
        without_escapes = without_escapes.replace(unwanted, '')

    single_spaced = ' '.join(without_escapes.split())

    return re.sub(r'[^a-zA-Z0-9 ]', '', single_spaced)

### read and process

In [3]:
# Sentiment-score cutoff used when converting scores to classes:
# |score| <= cutoff is neutral, otherwise the sign decides.
threshhold = 0.2     # NEG < -0.2 , -0.2 <= NEU <= 0.2 , POS > 0.2
# Empty frame that the loading cell fills with processed samples.
dataset = pd.DataFrame()
# Integer class ids for the string labels.
label_mapping = {"negative":0,"neutral":1,"positive":2}

In [4]:
# Source files: FiQA task 1 headlines and posts (train split).
# TODO: these are absolute, machine-specific paths; consider a configurable data directory.
paths = [r"C:\Users\tanch\Documents\NTU\URECA - Aspect Based Sentiment Analysis\local\data\FiQA_ABSA_task1\task1_headline_ABSA_train.json",
         r"C:\Users\tanch\Documents\NTU\URECA - Aspect Based Sentiment Analysis\local\data\FiQA_ABSA_task1\task1_post_ABSA_train.json"]

# Collect every sample in a plain list and build the frame once at the end.
# Growing a DataFrame row by row is quadratic, and DataFrame.append no longer
# exists in pandas >= 2.0.
records = []
skipped = []  # (path, record key) of records that could not be processed
for path in paths:
    with open(path,"r", encoding="utf8") as f:
        fiqa = json.load(f)
    for k,v in fiqa.items():
        try:
            records.extend(extract_desired_info_MAMS_format(v, threshhold, cleanText))
        except (KeyError, TypeError, ValueError, AttributeError):
            # Malformed record: skip it. The previous record's samples must not
            # be added a second time, so nothing is appended here.
            skipped.append((path, k))

if skipped:
    print("Skipped {} malformed record(s)".format(len(skipped)))

# Building the frame from scratch keeps this cell idempotent: re-running it
# never stacks new rows on top of already label-mapped ones.
dataset = pd.DataFrame(records, columns=["text", "span", "label"])
dataset["label"] = dataset["label"].map(label_mapping)

In [5]:
# Record count of the most recently loaded JSON file only
# (`fiqa` is reassigned for each path in the loading loop).
len(fiqa)

675

### Input for SATSA 
- text: a sentence e.g "The decor is not special at all but their.."
- span: the span of the aspect e.g (4, 9)	
- label: the sentiment of the aspect e.g 0

note: label_mapping = {"negative":0,"neutral":1,"positive":2}

In [5]:
# Preview the first rows of the processed samples (columns: text, span, label).
dataset.head()

Unnamed: 0,text,span,label
0,Royal Mail chairman Donald Brydon set to step ...,"(0, 10)",0
1,Stakes High for AstraZeneca Heart Drug Facing ...,"(16, 27)",0
2,UPDATE 1Dairy Crest loses a third of Morrisons...,"(37, 46)",1
3,Insight hires Avivas David Hillier for multias...,"(0, 7)",1
4,Primark racks up a happy Christmas after stron...,"(0, 7)",2


In [6]:
# Class balance after mapping: 0 = negative, 1 = neutral, 2 = positive.
dataset.label.value_counts()

2    620
0    312
1    206
Name: label, dtype: int64

In [10]:
# Total number of (text, span, label) samples produced.
len(dataset)

1138

### save

In [19]:
# Build the output path from separate components so it resolves correctly on
# any OS (hard-coded backslashes are only directory separators on Windows).
path = os.path.join(os.getcwd(), "data", "FiQA", "train", "FiQA_train.pkl")
# Make sure the target directory exists before writing.
os.makedirs(os.path.dirname(path), exist_ok=True)
dataset.to_pickle(path)

In [21]:
# `fully_show_samples` comes from the project-local `utils` module.
# NOTE(review): presumably it displays a handful of rows without truncating the
# text column - confirm against utils.py.
from utils import fully_show_samples
fully_show_samples(dataset)

Unnamed: 0,text,span,label
296,Sainsburys Asda Tesco and Morrisons will all cut petrol prices as oil falls .,"(0, 9)",0
851,GOOGL Hit With Lawsuit from Russian Search Engine YNDX.,"(50, 54)",1
143,UPDATE 1Nomura RBS must pay 806 mln in mortgage bond caseUS judge.,"(15, 18)",0
130,Tesco share price closes higher as two more directors leave grocer.,"(0, 5)",2
117,Tesco sales rise shows tentative recovery continues.,"(0, 5)",2
113,Royal Dutch Shell to Buy BG Group for Nearly 70 Billion.,"(0, 17)",2
979,CTRP breaking out here on good vol and a parser buy signal noticed this on bob langs comments looking for 48 fast.,"(0, 4)",2
763,AAPL Nice RSI 33 bounce at 958.,"(0, 4)",2
600,GMCR with the way this has been acting it may fall to 65 today horrible action since it hit 70 yesterday.,"(0, 4)",0
919,Amazon has been selling surveillance cameras infected with malware AMZN Amazon.,"(67, 71)",1
