# Extracting data from XML files

In [None]:
from google.colab import drive
import csv
import os
import xml.etree.ElementTree as xml

In [None]:
# Mount Google Drive at /content/drive; every data path used below lives under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


## Concatenate files

Sample (automobile data):

In [None]:
# Load the automobile train and test markup files completely into memory as UTF-8 text.
train_sample_path = '/content/drive/MyDrive/Summarization/automobile data/SentiRuEval_car_markup_train.xml'
test_sample_path = '/content/drive/MyDrive/Summarization/automobile data/SentiRuEval_car_markup_test.xml'

with open(train_sample_path, encoding='utf-8') as train_handle:
    train_text = train_handle.read()

with open(test_sample_path, encoding='utf-8') as test_handle:
    test_text = test_handle.read()

In [None]:
# Cut the last 12 characters off the train text and the first 32 characters off the
# test text (presumably the closing tag and the XML header / opening tag respectively),
# turn space indentation into tabs, and join the two pieces.
train_body = train_text[:-12].replace('        ', '\t')
test_body = test_text[32:].replace('    ', '\t')
text = train_body + test_body

In [None]:
# Check where the train text ends once its last 12 characters are cut off:
# show the final 200 characters of the remainder.
tr = train_text[:-12]
tr[-200:]

'"positive"/>\n                        <category name="Whole" sentiment="positive"/>\n                        <category name="Costs" sentiment="negative"/>\n                </categories>\n        </review>'

In [None]:
# Check where the test text starts once its first 32 characters are skipped:
# show the first 200 characters of the remainder.
te = test_text[32:]
te[:200]

'\n\t<review id="816831">\n\t\t<meta>\n\t\t\t<object>Mazda 6 седан</object>\n\t\t</meta>\n\t\t<text>В принципе машинка не плохая, объемом в 2.0 куба, легкий кузов, дорогу держит не плохо, приятна по салону, сделана в'

In [None]:
# Preview the first 500 characters of the concatenated text.
text[:500]

'<?xml version="1.0" ?>\n<reviews>\n\t<review id="92845">\n\t\t<meta>\n\t\t\t<object>AUDI 100 C4 седан</object>\n\t\t</meta>\n\t\t<text>Недавно купил этот автомобиль. Авто отличное! Двигатель 2,5 литра, турбодизель. Прежний хозяин сказал при продаже, что масло не жрет, солярку тоже, летит как угорелая! Так оно и есть. 140 км/ч нормальная крейсерская скорость. Вообще немцы умеют делать автомобили. Дорогу держит отлично, так как достаточно широкая машина. Тормоза все дисковые. Главное передний привод, по сравнению'

In [None]:
def concatenate(train_path: str, test_path: str, final_path: str) -> None:
    '''
    Concatenate train and test XML files for parsing.

    Both inputs must wrap their reviews in a single ``<reviews>`` root element.
    The train file is kept up to (but not including) its closing ``</reviews>``
    tag, the test file is kept from just after its opening ``<reviews>`` tag,
    and the two pieces are written back to back so that the result has one
    XML header, one opening tag and one closing tag.

    The tag positions are searched for instead of being assumed at fixed
    character offsets, so a missing trailing newline or a slightly different
    XML declaration does not corrupt the merged document.

    Args:
        train_path: path to the train XML file (read as UTF-8).
        test_path: path to the test XML file (read as UTF-8).
        final_path: path of the merged XML file to write (UTF-8, overwritten).

    Raises:
        ValueError: if the train file has no ``</reviews>`` tag or the test
            file has no ``<reviews>`` tag.
    '''
    open_tag = '<reviews>'
    close_tag = '</reviews>'

    with open(train_path, 'r', encoding='utf-8') as ftrain:
        train_text = ftrain.read()

    with open(test_path, 'r', encoding='utf-8') as ftest:
        test_text = ftest.read()

    train_end = train_text.rfind(close_tag)
    if train_end == -1:
        raise ValueError(f'No {close_tag} tag found in {train_path!r}')

    test_start = test_text.find(open_tag)
    if test_start == -1:
        raise ValueError(f'No {open_tag} tag found in {test_path!r}')

    # Trailing whitespace before the closing tag is dropped; the test part
    # brings its own leading newline, so the pieces join on a single line break.
    train_part = train_text[:train_end].rstrip()
    test_part = test_text[test_start + len(open_tag):]

    # Indentation is normalised to tabs: 8-space runs in the train part and
    # 4-space runs in the test part, exactly as before.
    # NOTE(review): these replacements also hit runs of spaces inside review text.
    text = train_part.replace('        ', '\t') + test_part.replace('    ', '\t')

    with open(final_path, 'w', encoding='utf-8') as ffinal:
        ffinal.write(text)

In [None]:
# restaurants: input train/test markup and the merged output file
train_restaurants = '/content/drive/MyDrive/Summarization/restaurant data/SentiRuEval_rest_markup_train.xml'
test_restaurants = '/content/drive/MyDrive/Summarization/restaurant data/SentiRuEval_rest_markup_test.xml'
final_restaurants = '/content/drive/MyDrive/Summarization/restaurant data/train.xml'

# automobiles: input train/test markup and the merged output file
train_automobiles = '/content/drive/MyDrive/Summarization/automobile data/SentiRuEval_car_markup_train.xml'
test_automobiles = '/content/drive/MyDrive/Summarization/automobile data/SentiRuEval_car_markup_test.xml'
final_automobiles = '/content/drive/MyDrive/Summarization/automobile data/train.xml'

# Merge each train/test pair into one XML file, restaurants first.
concatenate(train_path=train_restaurants, test_path=test_restaurants, final_path=final_restaurants)
concatenate(train_path=train_automobiles, test_path=test_automobiles, final_path=final_automobiles)

## Parse XML

In [None]:
# Parse the merged automobile XML file and take its root element.
tree = xml.parse(final_automobiles)
root = tree.getroot()

In [None]:
# Show the root element to confirm that the merged file parsed.
print(root)

<Element 'reviews' at 0x7f4e6f107fb0>


Tables with the following structure need to be created:


*   reviews: text_id, text
*   aspects: text_id, category, mention, start, end, sentiment
*   cats: text_id, category, sentiment



In [None]:
def create_tables(train_path: str, reviews_path: str, aspects_path: str, cats_path: str) -> None:
    '''
    Create tables with the data from reviews.

    Parses the XML file at ``train_path`` and writes three tab-separated files
    (UTF-8, no header row), one row per item:

    * reviews: text_id, text
    * aspects: text_id, category, mention, start, end, sentiment
    * cats: text_id, category, sentiment

    Aspect sentiment ``'both'`` is stored as ``'neutral'``. Missing attributes
    and a missing ``<text>`` element are written as empty fields.

    Args:
        train_path: path to the XML file whose root contains the review elements.
        reviews_path: output path for the reviews table (overwritten).
        aspects_path: output path for the aspects table (overwritten).
        cats_path: output path for the categories table (overwritten).
    '''
    # Parse before opening the outputs so a malformed input does not leave
    # freshly truncated, empty tables behind.
    root = xml.parse(train_path).getroot()

    # newline='' is what the csv module expects for files handed to csv.writer;
    # the explicit encoding keeps the output independent of the locale.
    # The with-statement closes all three files even if a row fails to write.
    with open(reviews_path, 'w', encoding='utf-8', newline='') as reviews_file, \
            open(aspects_path, 'w', encoding='utf-8', newline='') as aspects_file, \
            open(cats_path, 'w', encoding='utf-8', newline='') as cats_file:
        reviews_writer = csv.writer(reviews_file, delimiter='\t')
        aspects_writer = csv.writer(aspects_file, delimiter='\t')
        cats_writer = csv.writer(cats_file, delimiter='\t')

        for child in root:
            text_id = child.attrib.get('id', None)
            # findtext returns None when there is no <text> element instead of
            # failing on a missing node; csv writes None as an empty field.
            text = child.findtext('./text')

            reviews_writer.writerow([text_id, text])

            for aspect in child.findall('./aspects/aspect'):
                category = aspect.attrib.get('category', None)
                mention = aspect.attrib.get('term', None)
                start = aspect.attrib.get('from', None)
                end = aspect.attrib.get('to', None)
                sentiment = aspect.attrib.get('sentiment', None)

                # Mixed sentiment is folded into the neutral class.
                if sentiment == 'both':
                    sentiment = 'neutral'

                aspects_writer.writerow([text_id, category, mention, start, end, sentiment])

            for cat in child.findall('./categories/category'):
                cat_name = cat.attrib.get('name', None)
                cat_sent = cat.attrib.get('sentiment', None)

                cats_writer.writerow([text_id, cat_name, cat_sent])

In [None]:
# Output files for the three restaurant tables (reviews, aspects, categories).
reviews_restaurants = '/content/drive/MyDrive/Summarization/restaurant data/train_reviews.txt'
aspects_restaurants = '/content/drive/MyDrive/Summarization/restaurant data/train_aspects.txt'
cats_restaurants = '/content/drive/MyDrive/Summarization/restaurant data/train_cats.txt'

# Output files for the three automobile tables (reviews, aspects, categories).
reviews_automobiles = '/content/drive/MyDrive/Summarization/automobile data/train_reviews.txt'
aspects_automobiles = '/content/drive/MyDrive/Summarization/automobile data/train_aspects.txt'
cats_automobiles = '/content/drive/MyDrive/Summarization/automobile data/train_cats.txt'

In [None]:
# Build the three tables from the merged restaurant XML.
create_tables(
    train_path=final_restaurants,
    reviews_path=reviews_restaurants,
    aspects_path=aspects_restaurants,
    cats_path=cats_restaurants,
)

# Build the three tables from the merged automobile XML.
create_tables(
    train_path=final_automobiles,
    reviews_path=reviews_automobiles,
    aspects_path=aspects_automobiles,
    cats_path=cats_automobiles,
)