# Getting train and dev split of data

The parsed restaurant and automobile data are divided into train and development (dev) parts.

## Imports

In [1]:
from google.colab import drive
import random
import os
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

In [2]:
# Mount Google Drive so the dataset files under /content/drive can be read and written.
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
def seed_everything(seed=42) -> None:
    '''
    Make the random number sources used in this notebook reproducible.

    Seeds Python's `random` module and NumPy's global RNG with `seed`, and
    stores `str(seed)` in the `PYTHONHASHSEED` environment variable.

    Note: Python reads `PYTHONHASHSEED` at interpreter start-up, so setting it
    here only influences interpreters launched afterwards (e.g. subprocesses).
    '''
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

In [4]:
# Seed the RNGs with the default seed (42) before any data is split.
seed_everything()

## Getting data

In [None]:
def _split_lines_by_id(path: str, train_ids: set, dev_ids: set):
    '''
    Read a tab-separated file and route each line by its first field (the text id).

    Returns a `(train_lines, dev_lines)` pair; lines whose id is in neither
    set are dropped. Lines are returned without their trailing newline.
    '''
    train_lines, dev_lines = [], []
    with open(path, encoding='utf-8') as f:
        for line in f:
            line = line.rstrip('\r\n')
            text_id = line.split('\t', 1)[0]
            if text_id in train_ids:
                train_lines.append(line)
            if text_id in dev_ids:
                dev_lines.append(line)
    return train_lines, dev_lines


def _write_lines(path: str, lines) -> None:
    '''Write every item of `lines` to `path` (UTF-8), one per line.'''
    with open(path, 'w', encoding='utf-8') as f:
        for line in lines:
            print(line, file=f)


def get_train_dev_split(reviews_path: str, aspects_path: str, cats_path: str,
                        train_reviews_path: str, train_aspects_path: str, train_cats_path: str,
                        dev_reviews_path: str, dev_aspects_path: str, dev_cats_path: str,
                        random_state: int = 42):
    '''
    Get train and development split of data.

    The reviews file (`<text_id>\\t<text>` per line) is split with
    `train_test_split`; the aspect and category files are then partitioned so
    that each annotation line follows the review whose id is its first
    tab-separated field.

    Args:
        reviews_path: input file with one `<text_id>\\t<text>` review per line.
        aspects_path: input file with aspect lines, text id in the first field.
        cats_path: input file with category lines, text id in the first field.
        train_reviews_path, train_aspects_path, train_cats_path:
            output files for the train part.
        dev_reviews_path, dev_aspects_path, dev_cats_path:
            output files for the dev part.
        random_state: seed passed to `train_test_split` (default 42).

    Raises:
        ValueError: if a non-empty review line contains no tab separator.
    '''
    texts, ids = [], []
    with open(reviews_path, encoding='utf-8') as f:
        for line in f:
            line = line.rstrip('\r\n')
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            # Split on the first tab only so tabs inside the review text are kept.
            text_id, text = line.split('\t', 1)
            texts.append(text)
            ids.append(text_id)

    train_texts, dev_texts, train_ids, dev_ids = train_test_split(
        texts, ids, random_state=random_state
    )

    # Sets give O(1) membership checks while filtering the annotation files.
    train_id_set, dev_id_set = set(train_ids), set(dev_ids)

    train_aspects, dev_aspects = _split_lines_by_id(aspects_path, train_id_set, dev_id_set)
    train_sentiment, dev_sentiment = _split_lines_by_id(cats_path, train_id_set, dev_id_set)

    _write_lines(train_aspects_path, train_aspects)
    _write_lines(dev_aspects_path, dev_aspects)
    _write_lines(train_reviews_path,
                 (f'{i}\t{t}' for i, t in zip(train_ids, train_texts)))
    _write_lines(dev_reviews_path,
                 (f'{i}\t{t}' for i, t in zip(dev_ids, dev_texts)))
    _write_lines(train_cats_path, train_sentiment)
    _write_lines(dev_cats_path, dev_sentiment)

In [None]:
# Restaurant data paths: the parsed inputs (train_*), followed by the
# train-split outputs (train_split_*) and the dev outputs (dev_*).
(
    reviews_restaurants,
    aspects_restaurants,
    cats_restaurants,
    train_reviews_restaurants,
    train_aspects_restaurants,
    train_cats_restaurants,
    dev_reviews_restaurants,
    dev_aspects_restaurants,
    dev_cats_restaurants,
) = (
    f'/content/drive/MyDrive/Summarization/restaurant data/{prefix}_{kind}.txt'
    for prefix in ('train', 'train_split', 'dev')
    for kind in ('reviews', 'aspects', 'cats')
)

In [None]:
# Automobile data paths: the parsed inputs (train_*), followed by the
# train-split outputs (train_split_*) and the dev outputs (dev_*).
(
    reviews_automobiles,
    aspects_automobiles,
    cats_automobiles,
    train_reviews_automobiles,
    train_aspects_automobiles,
    train_cats_automobiles,
    dev_reviews_automobiles,
    dev_aspects_automobiles,
    dev_cats_automobiles,
) = (
    f'/content/drive/MyDrive/Summarization/automobile data/{prefix}_{kind}.txt'
    for prefix in ('train', 'train_split', 'dev')
    for kind in ('reviews', 'aspects', 'cats')
)

In [None]:
# Split the restaurant data and write the train/dev files to Drive.
get_train_dev_split(
    reviews_path=reviews_restaurants,
    aspects_path=aspects_restaurants,
    cats_path=cats_restaurants,
    train_reviews_path=train_reviews_restaurants,
    train_aspects_path=train_aspects_restaurants,
    train_cats_path=train_cats_restaurants,
    dev_reviews_path=dev_reviews_restaurants,
    dev_aspects_path=dev_aspects_restaurants,
    dev_cats_path=dev_cats_restaurants,
)

In [None]:
# Split the automobile data and write the train/dev files to Drive.
get_train_dev_split(
    reviews_path=reviews_automobiles,
    aspects_path=aspects_automobiles,
    cats_path=cats_automobiles,
    train_reviews_path=train_reviews_automobiles,
    train_aspects_path=train_aspects_automobiles,
    train_cats_path=train_cats_automobiles,
    dev_reviews_path=dev_reviews_automobiles,
    dev_aspects_path=dev_aspects_automobiles,
    dev_cats_path=dev_cats_automobiles,
)

In [5]:
# Load the restaurant train split back from Drive for a quick sanity check.
# Neither file has a header row, so column names are supplied explicitly.
train_asp = pd.read_csv(
    '/content/drive/MyDrive/Summarization/restaurant data/train_split_aspects.txt',
    sep='\t',
    names=['text_id', 'category', 'mention', 'start', 'end', 'sentiment'],
)
train_texts = pd.read_csv(
    '/content/drive/MyDrive/Summarization/restaurant data/train_split_reviews.txt',
    sep='\t',
    names=['text_id', 'text'],
)

In [6]:
# Preview the first rows of the restaurant aspect annotations.
train_asp.head()

Unnamed: 0,text_id,category,mention,start,end,sentiment
0,30808,Whole,ресторане,16,25,neutral
1,30808,Interior,первом этаже,43,55,neutral
2,30808,Whole,руководству ресторана,124,145,positive
3,30808,Service,обслуживающему персоналу,147,171,positive
4,30808,Service,сотрудникам,189,200,positive


In [7]:
# Number of aspect annotation rows in the restaurant train split.
len(train_asp)

7109

In [None]:
# Load the automobile train split back from Drive for a quick sanity check.
# This rebinds train_asp / train_texts, replacing the restaurant frames.
train_asp = pd.read_csv(
    '/content/drive/MyDrive/Summarization/automobile data/train_split_aspects.txt',
    sep='\t',
    names=['text_id', 'category', 'mention', 'start', 'end', 'sentiment'],
)
train_texts = pd.read_csv(
    '/content/drive/MyDrive/Summarization/automobile data/train_split_reviews.txt',
    sep='\t',
    names=['text_id', 'text'],
)

# Preview the first rows of the automobile aspect annotations.
train_asp.head()

Unnamed: 0,text_id,category,mention,start,end,sentiment
0,928724,Whole,машину,13,19,neutral
1,928724,Whole,иномарка,177,185,positive
2,928724,Whole,приора,197,203,negative
3,928724,Whole,иномарку,293,301,negative
4,928724,Driveability,ходовыми характеристиками,345,370,positive


In [8]:
# Number of aspect annotation rows in whichever frame train_asp currently holds.
# NOTE(review): the saved output (7109) is identical to the restaurant count
# shown earlier, although train_asp was reassigned to the automobile data in
# the preceding cell — re-run in order to confirm this figure is not stale.
len(train_asp)

7109