In [1]:
# Import packages
import pandas as pd
from transformers import pipeline
from functools import partial
import re
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read datasets
story_df = pd.read_csv('story_dataset.csv')
ru_0_df = pd.read_csv('language_inversed_dataset.csv')

# Combine datasets
df = pd.concat([
    story_df,
    ru_0_df
])

# Sample timed dataset
# Sample the dataset by a particular time frame
df['date'] = pd.to_datetime(df['date']).dt.tz_convert('Europe/Kyiv')
df = df.set_index('date').sort_index()


In [3]:
# For each channel sample relative to category
# Get indices of the relavant thi
# Sampling config
random_state = 42

def normalize_loc_dates(df, start_date, end_date):
    start_date = pd.Series([
        pd.to_datetime(start_date).tz_localize('Europe/Kyiv'),
        df.index.min()
    ]).max()
    end_date = pd.Series([
        pd.to_datetime(end_date).tz_localize('Europe/Kyiv'),
        df.index.max()
    ]).min()
    return start_date, end_date

def uniform_sample(df, n):
    return df.sample(min(len(df), n), random_state=random_state)

def time_sampling(df, start_date, end_date, n):
    start_date, end_date = normalize_loc_dates(df, start_date, end_date)
    return df.loc[start_date:end_date].sample(min(len(df.loc[start_date:end_date]), n), random_state=random_state).sort_index()

# Sample for the first month
first_month_message_ids = (df.groupby('channel')
 .apply(partial(time_sampling, start_date='2022-02-24', end_date='2022-05-01', n=200))
.reset_index(drop=True))['message_id']

# Sample for the current time
current_time_ids = (df.groupby('channel')
 .apply(partial(time_sampling, start_date='2023-12-01', end_date='2024-02-08', n=200))
.reset_index(drop=True))['message_id']

# Sample for kherson news subset
def get_rows_that_contain_topic(df, start_date, end_date, n, keywords):
    start_date, end_date = normalize_loc_dates(df, start_date, end_date)
    df = df.loc[start_date:end_date]
    keywords = set(keywords)
    
    def contains_topic(text) -> bool:
        for i in text.lower().split():
            if i in keywords:
                return True
            
        return False
    topic_df = df[df['text'].apply(contains_topic)]
    return topic_df.sample(min(len(topic_df), n), random_state=random_state)

keywords = ["херсон", "херсона", "херсону", "херсон", "херсоном", "херсоні", "херсоне"]

kherson_news_ids  = (df.groupby('channel')
 .apply(partial(get_rows_that_contain_topic, start_date='2023-12-01', end_date='2024-02-08', n=200, 
               keywords=keywords))
.reset_index(drop=True))['message_id']
                           
random_sample_ids = (df.groupby('channel')
 .apply(partial(uniform_sample, n=500)).reset_index(drop=True))['message_id']
                           
subsetted_df = df.assign(
    fist_month_subset = df['message_id'].isin(first_month_message_ids),
    current_time_subset = df['message_id'].isin(current_time_ids),
    kherson_subset = df['message_id'].isin(kherson_news_ids),
    random_sample_subset=df['message_id'].isin(random_sample_ids),
)                       
                           


# Create two columns: time_subset, random_subset
# Samples columns
# Translate to english
# Save as out_of_sample_en.csv

In [4]:
subset_mask = subsetted_df[[
    'fist_month_subset',
    'current_time_subset',
    'kherson_subset',
    'random_sample_subset'
]].any(axis=1)
subsetted_df = subsetted_df[subset_mask].reset_index()

print('Number of rows {}'.format(len(subsetted_df)))
subsetted_df

Number of rows 11805


Unnamed: 0.1,date,message_id,text,views,channel,sourced_lang,label,Unnamed: 0,fist_month_subset,current_time_subset,kherson_subset,random_sample_subset
0,2022-01-01 16:17:51+02:00,21564,❗️ 🇺🇦 С 1 января 2022 года вступают в силу изм...,89897,truexanewsua,uk,0,,False,False,False,True
1,2022-01-01 17:18:29+02:00,21565,😍Шедевр на стадионе 604 мкрн\n\n”Всех с новым ...,85084,truexanewsua,uk,0,,False,False,False,True
2,2022-01-02 16:28:16+02:00,2468,"Прием вопросов от зрителей: по любым темам, кр...",10106,yurasumy,ru,1,,False,False,False,True
3,2022-01-02 16:30:38+02:00,21586,🌳 💥 На Харьковских дивизий огромное дерево тру...,82067,truexanewsua,uk,0,,False,False,False,True
4,2022-01-03 11:56:01+02:00,2469,Чем и когда окончится стратегическое наступлен...,10509,yurasumy,ru,1,,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...
11800,2024-02-08 19:58:47+02:00,10248,"Я думаю гроши будут уже скоро, но то, что они ...",17028,inslandd,ru,0,2082.0,False,False,False,True
11801,2024-02-08 19:59:31+02:00,10249,"Наш бизнес это: дай, дай, дай и бизнес идет хо...",18010,inslandd,ru,0,3976.0,False,False,False,True
11802,2024-02-08 20:36:22+02:00,14329,"Законопроект о помощи Украине, Израилю и Тайва...",179374,ToBeOr_Official,ru,0,,False,False,False,True
11803,2024-02-08 20:44:06+02:00,10250,Хаймарс уничтожил:\n\nСАУ Гиацинт-С ~ 3 млн $\...,18086,inslandd,ru,0,12894.0,False,False,False,True


In [5]:
subsetted_df.to_csv('out_of_dataset_sample.csv',index=False)
assert False

AssertionError: 

In [None]:
# Translate the dataset

uk_pipe = pipeline(model='Helsinki-NLP/opus-mt-uk-en')
ru_pipe = pipeline(model='Helsinki-NLP/opus-mt-ru-en')

def detect_language(text):
    uk_chars = re.compile(r'[іїєґ]')
    if uk_chars.search(text.lower()):
        return 'uk'
    else:
        return 'ru'
    
def translate_rows(df, batch_size):
    df = df.copy()
    uk_lang_texts = df[df['detected_language'] == 'uk']['text'].tolist()
    ru_lang_texts = df[df['detected_language'] == 'ru']['text'].tolist()
    
    if len(uk_lang_texts) > 0:
        uk_en = uk_pipe(uk_lang_texts, truncation=True, batch_size=batch_size)
        df.iloc[df['detected_language'] == 'uk', df.columns.get_loc('text')] = uk_en
    
    if len(ru_lang_texts) > 0:
        ru_en = ru_pipe(ru_lang_texts, truncation=True, batch_size=batch_size)
        df.iloc[df['detected_language'] == 'ru', df.columns.get_loc('text')] = ru_en
        
    return df
    
        
    
    
subsetted_df['detected_language'] = subsetted_df['text'].apply(detect_language)
subsetted_df['text_en'] = 'none'

progress_bar = tqdm(total=len(subsetted_df), miniters=100)
batch_size = 8
for i in range(0, len(subsetted_df),batch_size):
    
    subsetted_df.iloc[i: min(i + batch_size, len(subsetted_df)), subsetted_df.columns.get_loc('text_en')] = \
        translate_rows(subsetted_df.iloc[i: min(i + batch_size, len(subsetted_df))],
                      batch_size)
    progress_bar.update(batch_size)
    
subsetted_df.to_csv('out_of_sample_sample_en.csv',index=False)
