**Publicis Sapient**

In [None]:
import xml.etree.ElementTree as ET
import pandas as pd
# Parse the restaurant-review training XML from disk and keep its root element.
# The path is an absolute Google Drive location (presumably a Colab mount — adjust for other environments).
tree = ET.parse('/content/drive/MyDrive/publicis/Restaurants_Train_v2.xml')
root = tree.getroot()
# Alternative when the XML is already held in a string:
# root = ET.fromstring(xml_data)
# Flatten the parsed tree into one DataFrame row per aspect term / aspect category.
def extract_positions(root):
    """Flatten the aspect annotations of a parsed reviews XML tree into a DataFrame.

    Every ``<sentence>`` that is a direct child of ``root`` contributes one row
    per ``<aspectTerm>`` and one row per ``<aspectCategory>``. Sentences with
    neither annotation contribute no rows.

    Args:
        root: ``xml.etree.ElementTree.Element`` whose direct children are
            ``<sentence>`` elements.

    Returns:
        pandas.DataFrame with columns ``sent_id``, ``text``, ``term``,
        ``term_from``, ``term_to``, ``term_polarity``, ``category`` and
        ``category_polarity``. Aspect-term rows leave the two category columns
        as ``""``; category rows use ``""`` for term/term_polarity and ``0`` for
        both offsets, because categories apply to the whole sentence.

    Raises:
        ValueError: if a ``from``/``to`` attribute is present but not an integer.
    """
    columns = ["sent_id", "text", "term", "term_from", "term_to", "term_polarity", "category", "category_polarity"]
    rows = []

    for sentence in root.findall('sentence'):
        sentence_id = sentence.get('id')
        # findtext yields "" for a missing or empty <text> element, so a
        # malformed sentence no longer aborts the whole extraction.
        text = sentence.findtext('text', default="")
        prefix = [sentence_id, text]

        aspect_terms = sentence.find('aspectTerms')
        if aspect_terms is not None:
            for aspect_term in aspect_terms.findall('aspectTerm'):
                rows.append(prefix + [
                    aspect_term.get('term'),
                    # Character offsets of the term within the sentence text; 0 when absent.
                    int(aspect_term.get('from', 0)),
                    int(aspect_term.get('to', 0)),
                    aspect_term.get('polarity'),
                    "",
                    "",
                ])

        aspect_categories = sentence.find('aspectCategories')
        if aspect_categories is not None:
            for aspect_category in aspect_categories.findall('aspectCategory'):
                # Categories carry no character span, hence the 0/0 placeholders.
                rows.append(prefix + [
                    "",
                    0,
                    0,
                    "",
                    aspect_category.get('category'),
                    aspect_category.get('polarity'),
                ])

    return pd.DataFrame(rows, columns=columns)

In [None]:

# Build the flat annotation table: one row per aspect term or aspect category of each sentence.
data = extract_positions(root)

In [None]:
# Preview the first ten rows to sanity-check the flattened layout.
data.head(10)

Unnamed: 0,sent_id,text,term,term_from,term_to,term_polarity,category,category_polarity
0,3121,But the staff was so horrible to us.,staff,8,13,negative,,
1,3121,But the staff was so horrible to us.,,0,0,,service,negative
2,2777,"To be completely fair, the only redeeming fact...",food,57,61,positive,,
3,2777,"To be completely fair, the only redeeming fact...",,0,0,,food,positive
4,2777,"To be completely fair, the only redeeming fact...",,0,0,,anecdotes/miscellaneous,negative
5,1634,"The food is uniformly exceptional, with a very...",food,4,8,positive,,
6,1634,"The food is uniformly exceptional, with a very...",kitchen,55,62,positive,,
7,1634,"The food is uniformly exceptional, with a very...",menu,141,145,neutral,,
8,1634,"The food is uniformly exceptional, with a very...",,0,0,,food,positive
9,2534,Where Gabriela personaly greets you and recomm...,,0,0,,service,positive


In [None]:
# Distribution of aspect-term polarities; the "" bucket counts category rows, which carry an empty term_polarity.
data["term_polarity"].value_counts()

term_polarity
            3713
positive    2164
negative     805
neutral      633
conflict      91
Name: count, dtype: int64

In [None]:
# Distinct aspect categories; the "" entry comes from aspect-term rows, which leave category blank.
list(data["category"].value_counts().keys())

['', 'food', 'anecdotes/miscellaneous', 'service', 'ambience', 'price']

In [None]:
import spacy

# Load spaCy's small English pipeline; in this cell only its part-of-speech tags are used (noun heuristic below).
nlp = spacy.load("en_core_web_sm")

def extract_aspect_terms(text):
    """Return the surface text of every token in ``text`` tagged as a noun.

    A simple baseline for aspect-term candidates: the sentence is run through
    the module-level ``nlp`` pipeline and tokens whose coarse part-of-speech
    tag is ``NOUN`` are collected in document order.
    """
    noun_tokens = []
    for token in nlp(text):
        if token.pos_ == "NOUN":
            noun_tokens.append(token.text)
    return noun_tokens

# Example: noun candidates for the sentence on row 4 of the annotation table.
extract_aspect_terms(data["text"].tolist()[4])


['factor', 'food', 'deficiencies']

In [None]:
# Full text of the sentence on row 4, for comparison with the noun candidates extracted from it.
data["text"].tolist()[4]

"To be completely fair, the only redeeming factor was the food, which was above average, but couldn't make up for all the other deficiencies of Teodora."

In [None]:
# Annotated term on row 4; per the preview above that row is a category annotation, so this is the "" placeholder.
data["term"].tolist()[4]

In [None]:
# Keep only aspect-term rows (category rows carry an empty term); these hold the character spans used below.
aspect_ner_df = data[data["term"] != ""]

In [None]:
from sklearn.model_selection import train_test_split
# Group aspect-term spans by sentence:
#   sent_id -> (text, {"entities": [(start, end, "ASPECT"), ...]}, sent_id)
aspect_ner = {}
for _, record in aspect_ner_df.iterrows():
    sid = record["sent_id"]
    span = (record["term_from"], record["term_to"], "ASPECT")
    # The first row seen for a sentence creates its example; later rows only add spans to it.
    example = aspect_ner.setdefault(sid, (record["text"], {"entities": []}, sid))
    example[1]["entities"].append(span)


In [None]:
# Flatten the per-sentence mapping into a list of (text, {"entities": [...]}, sent_id) examples.
aspect_dataset = list(aspect_ner.values())

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the sentences for testing. Each example is a whole sentence, so all spans of a
# sentence land in the same split; the fixed random_state keeps the split repeatable.
train_data, test_data = train_test_split(aspect_dataset, test_size=0.3, random_state=42)

In [None]:
import spacy
# from spacy.gold import biluo_tags_from_offsets
from spacy.training import offsets_to_biluo_tags

def get_iob_dataset(dataset, nlp=None):
    """Convert span-annotated sentences into a token-level IOB tag table.

    Args:
        dataset: Iterable of ``(text, annotations, sent_id)`` triples, where
            ``annotations["entities"]`` is a list of
            ``(start_char, end_char, label)`` spans over ``text``.
        nlp: Optional spaCy pipeline used for tokenisation. When omitted,
            ``en_core_web_sm`` is loaded; pass a shared instance to avoid
            reloading the model on every call.

    Returns:
        pandas.DataFrame with one row per token and the columns ``sent_id``,
        ``text`` (the token string) and ``tag`` (IOB tag).

    Notes:
        Tags come from ``offsets_to_biluo_tags`` and are mapped BILUO -> IOB by
        rewriting only the tag *prefix* (``U-`` -> ``B-``, ``L-`` -> ``I-``),
        so labels that themselves contain ``U-`` or ``L-`` are left intact.
        spaCy marks tokens of spans that do not align with token boundaries as
        ``"-"``; those tags are passed through unchanged.
    """
    if nlp is None:
        nlp = spacy.load('en_core_web_sm')

    # BILUO prefixes that have no IOB counterpart: unit -> begin, last -> inside.
    to_iob_prefix = {"U-": "B-", "L-": "I-"}

    sent_ids, words, tags = [], [], []
    for text, annot, sid in dataset:
        # Only token boundaries are needed, so run the tokenizer alone rather
        # than the full tagger/parser/NER pipeline.
        doc = nlp.make_doc(text)
        biluo_tags = offsets_to_biluo_tags(doc, annot['entities'])

        sent_ids.extend([sid] * len(biluo_tags))
        # Store plain strings, not Token objects, so the frame does not keep
        # every Doc alive and serialises predictably.
        words.extend(token.text for token in doc)
        tags.extend(to_iob_prefix.get(tag[:2], tag[:2]) + tag[2:] for tag in biluo_tags)

    return pd.DataFrame({"sent_id": sent_ids, "text": words, "tag": tags})

In [None]:
# Tokenise each split separately and build its token-level tag table.
train_df = get_iob_dataset(train_data)
test_df = get_iob_dataset(test_data)



In [None]:
# (rows, columns) of each token table; rows are tokens, not sentences.
train_df.shape, test_df.shape

((24571, 3), (10566, 3))

In [None]:
# Token count per training sentence; the number of distinct ids is the number of training sentences.
train_df["sent_id"].value_counts()

sent_id
3265    79
3512    57
3353    57
2153    54
3139    53
        ..
83       3
2909     3
2098     3
1870     3
379      3
Name: count, Length: 1414, dtype: int64

In [None]:
# Sentence ids were read from XML attributes as strings; convert them to numbers so they can be
# compared with integer literals. The columns are overwritten in place.
train_df["sent_id"] = pd.to_numeric(train_df["sent_id"])
test_df["sent_id"] = pd.to_numeric(test_df["sent_id"])


In [None]:
# Spot-check one short sentence to confirm tokens and tags line up.
train_df[train_df["sent_id"] == 379]

Unnamed: 0,sent_id,text,tag
15159,379,Acceptable,O
15160,379,prices,B-ASPECT
15161,379,.,O


In [None]:
# Persist both token-level tables as CSV files on Drive.
# NOTE(review): to_csv writes the row index as an unnamed first column by default; pass
# index=False if downstream readers should not see it — confirm against consumers first.
train_df.to_csv("/content/drive/MyDrive/publicis/train.csv")
test_df.to_csv("/content/drive/MyDrive/publicis/test.csv")

In [None]:
# Working directory on Drive: /content/drive/MyDrive/publicis