In [3]:
from typing import List

from tqdm.auto import tqdm
import requests
import json
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


# Подготовка датасета

# Загрузка данных

In [49]:
# Load the source tweet dataset; the cells below use its `text` and `target` columns.
df = pd.read_csv('../data/disaster_tweets.csv')

In [50]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [51]:
# Class balance: number of rows per value of the `target` label.
df['target'].value_counts()

target
0    4342
1    3271
Name: count, dtype: int64

В исходном датасете почти нет дисбаланса... ну ладно, не будем усложнять задачу

In [52]:
# Keep only the text and the label, drop exact duplicate rows, and replace the
# hash sign with a space: the text is meant to read like spoken language.
df = (
    df[['text', 'target']]
    .drop_duplicates()
    .assign(text=lambda frame: frame['text'].str.replace('#', ' '))
)

# Переведем на русский

In [89]:
# Seed the global NumPy RNG so that the random sample below is reproducible.
np.random.seed(42)

# Cap the data at 1000 rows, otherwise translation gets expensive
# ($15 per 1M characters).
df_1000 = df.sample(n=1000).reset_index(drop=True)

# Total number of characters in the sampled texts.
df_1000['text'].str.len().sum()

102258

In [79]:
# Read the Yandex Cloud API key from a local secrets file.
# All whitespace is removed, not just "\n": a "\r" left over from a CRLF file or
# a trailing space would otherwise end up inside the Authorization header.
with open('../secrets/yandex-traslate-apikey.txt', 'r') as file:
    apikey = ''.join(file.read().split())

def yandex_translate(texts: List[str]) -> List[str]:
  """Translate a batch of English texts into Russian via the Yandex Translate API.

  Args:
    texts: English source strings; the whole list is sent in a single request.

  Returns:
    The `text` field of every entry of the response's `translations` list,
    in the order returned by the API. An empty input yields an empty list
    without contacting the API.

  Raises:
    requests.HTTPError: if the API answers with a 4xx/5xx status.
  """
  # Nothing to translate: skip the network round-trip entirely.
  if not texts:
    return []

  url = "https://translate.api.cloud.yandex.net/translate/v2/translate"

  payload = {
    "sourceLanguageCode": "en",
    "targetLanguageCode": "ru",
    "format": "PLAIN_TEXT",
    "texts": texts,
    "glossaryConfig": {
      "glossaryData": {
        "glossaryPairs": []
      }
    },
    "speller": True
  }

  headers = {
    'Authorization': f'Api-Key {apikey}',
  }

  # `json=` serialises the payload and labels it as application/json
  # (the body is JSON, so 'text/plain' was the wrong content type).
  # The timeout keeps a stalled connection from hanging the notebook forever.
  response = requests.post(url, headers=headers, json=payload, timeout=60)
  # Fail loudly on API errors instead of a KeyError on 'translations' below.
  response.raise_for_status()

  return [x['text'] for x in response.json()['translations']]

In [94]:
batch_size = 50

# Number of batches, rounded up so that a trailing partial batch is counted too.
n_rows = df_1000.shape[0]
total_batches = n_rows // batch_size + (1 if n_rows % batch_size != 0 else 0)

# Translate the texts in consecutive chunks of `batch_size` rows: one API call
# per chunk, with the result attached as a new `text_ru` column.
list_of_dataframes = []
for batch_idx, batch_df in df_1000.groupby(np.arange(len(df_1000)) // batch_size):
    print(f'{batch_idx+1} of {total_batches}')
    translated = yandex_translate(batch_df['text'].to_list())
    list_of_dataframes.append(batch_df.assign(text_ru=translated))

# Stitch the translated chunks back together and persist the result.
df_ru = pd.concat(list_of_dataframes)
df_ru.to_csv('../data/dataset_1000_ru.csv')

print(df_ru.shape)
df_ru.head()

1 of 20
2 of 20
3 of 20
4 of 20
5 of 20
6 of 20
7 of 20
8 of 20
9 of 20
10 of 20
11 of 20
12 of 20
13 of 20
14 of 20
15 of 20
16 of 20
17 of 20
18 of 20
19 of 20
20 of 20
(1000, 3)


Unnamed: 0,text,target,text_ru
0,Doing Giveaway Music Kit Dren Death's Head Dem...,0,Разыгрываю бесплатный музыкальный набор Dren D...
1,IDFire Parker Ridge Fact Sheet Aug 6 2015 (Pa...,1,Информационный бюллетень ID Fire Parker Ridg...
2,Vacation update: my great aunt just killed a s...,1,Обновление из отпуска: моя двоюродная бабушка ...
3,The Flash And The Thunder by WC Quick on Amazo...,0,"""Вспышка и гром"" от WC Quick на Amazon Kindle ..."
4,Morgan Silver Dollar 1880 S Gem BU DMPL Cameo ...,0,Морган Серебряный доллар 1880-х годов Драгоцен...


# Генерируем аудиофайлы

In [102]:
def yandex_tts(text: str, lang: str = 'ru-RU', voice: str = 'alena', format: str = 'mp3') -> bytes:
    """Synthesize speech for `text` via the Yandex SpeechKit v1 `tts:synthesize` endpoint.

    Args:
        text: The text to voice.
        lang: Value of the `lang` request parameter.
        voice: Value of the `voice` request parameter.
        format: Value of the `format` request parameter (requested audio format).

    Returns:
        The raw bytes of the response body.

    Raises:
        requests.HTTPError: if the API answers with a 4xx/5xx status, so that an
            error payload is never mistaken for audio data.
    """
    url = "https://tts.api.cloud.yandex.net/speech/v1/tts:synthesize"
    # Send the parameters as a form-encoded body so that `requests` escapes them.
    # Interpolating raw text into the query string breaks on characters such as
    # '&', '#', '+' or '?', which truncate or corrupt the request.
    payload = {
        'text': text,
        'lang': lang,
        'voice': voice,
        'format': format,
    }
    headers = {
        'Authorization': f'Api-Key {apikey}',
    }
    # The timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.post(url, headers=headers, data=payload, timeout=60)
    response.raise_for_status()
    return response.content


In [115]:
# Enable tqdm's pandas integration so that `progress_apply` is available below.
tqdm.pandas()

def apply_fn(row):
    """Synthesize speech for one dataset row and save it as an MP3 file.

    Args:
        row: A row exposing `.name` (used as the file stem) and a `'text_ru'`
            item holding the text to voice.

    Returns:
        The name of the written file (without the directory part).
    """
    filename = f'{row.name}.mp3'
    # Fetch the audio *before* opening the target file: opening in 'wb' mode
    # creates/truncates it, so a failed request would leave an empty file behind.
    audio = yandex_tts(row['text_ru'])
    with open(f'../data/speech/{filename}', 'wb') as f:
        f.write(audio)

    return filename

# Run apply_fn on every row (one TTS request and one MP3 file per row) and
# store the returned file name in a new `speech_fn` column.
df_ru['speech_fn'] = df_ru.progress_apply(apply_fn, axis=1)

100%|██████████| 1000/1000 [05:49<00:00,  2.86it/s]


In [116]:
# Re-save the dataset to the same path, now including the `speech_fn` column.
df_ru.to_csv('../data/dataset_1000_ru.csv')

# Разделение на train - test - valid

In [4]:
# Reload the saved dataset from disk; the split cells below only need this file.
# NOTE(review): the CSV was written together with its index, so it presumably
# comes back as an extra unnamed column (the split shapes show 5 columns) —
# confirm whether `index_col=0` is wanted here.
df_ru = pd.read_csv('../data/dataset_1000_ru.csv')

In [5]:
# First split, stratified by label: 60% of the rows become the training set,
# the remaining 40% are held out to be divided into validation and test.
X_train, X_validtest, y_train, y_validtest = train_test_split(
    df_ru,
    df_ru['target'],
    test_size=0.4,
    random_state=42,
    stratify=df_ru['target'],
)

In [6]:
# Second split, stratified by label: halve the held-out part into validation
# and test sets of equal size.
X_valid, X_test, y_valid, y_test = train_test_split(
    X_validtest,
    X_validtest['target'],
    test_size=0.5,
    random_state=42,
    stratify=X_validtest['target'],
)

In [7]:
# Sanity check: (rows, columns) of the training split.
X_train.shape

(600, 5)

In [8]:
# Sanity check: (rows, columns) of the validation split.
X_valid.shape

(200, 5)

In [9]:
# Sanity check: (rows, columns) of the test split.
X_test.shape

(200, 5)

In [10]:
# Persist each split to its own CSV file.
split_outputs = {
    '../data/train.csv': X_train,
    '../data/valid.csv': X_valid,
    '../data/test.csv': X_test,
}
for csv_path, split_frame in split_outputs.items():
    split_frame.to_csv(csv_path)