In [None]:
%cd ..

In [38]:
import os, xml, json, spacy
import xml.etree.ElementTree as ET
import pandas as pd
from tqdm import tqdm

In [39]:
nlp = spacy.load("hr_core_news_lg")

## STONE

In [None]:
stone = pd.read_csv("data/stone/stone_3k.csv")[['document_id', 'text', 'aggregated_tone']]

tone_dict = {"Ton - NEG": 0,
             "Ton - NEUTR": 1,
             "Ton - POZ": 2}

stone['label'] = stone['aggregated_tone'].map(tone_dict)
stone = stone.drop(columns=['aggregated_tone'])
stone

In [None]:
stone['label'].value_counts()

In [40]:
import re

class Lemmatizer():
    def __init__(self):
        self.lemmas = {}
        # __init__ should ensure the file is downloaded on import
        self.pattern = re.compile("\w+")
        with open("data/molex/molex14_word2lemma.txt") as fin:
            for line in fin:
                word, lemma = line.split()
                self.lemmas[word] = lemma

    def lemmatize_word(self, word):
        lemma = self.lemmas.get(word.lower())
        if lemma:
            return lemma
        else:
            return word[:5]

    def lemmatize_string(self, string):
        lemmas = []
        for token in re.findall(self.pattern, string.lower()):
            lemmas.append(self.lemmatize_word(token))
        return " ".join(lemmas)
    

  self.pattern = re.compile("\w+")


In [41]:
def parse_word(tagged_word):
    word = tagged_word.find('Word').text
    tag = tagged_word.find('POSTag').text
    lemma = tagged_word.find('Lemma').text
    stem = tagged_word.find('BasicStem').text

    return word, tag, lemma, stem

In [42]:
molex = Lemmatizer()

In [43]:
import string

all_tags = set()
raw_documents = {}

pos_tag_words = {}

for idx, row in tqdm(stone.iterrows(), total=len(stone)):

    doc = nlp(row['text'])
    doc_tokens = []
    
    for token in doc:
        word = token.text
        pos = token.pos_.lower()
        lemma = molex.lemmatize_word(word.lower()) 
        # lemmatize
        doc_tokens.append((lemma, pos))
    
    doc_tokens = [(word, pos) for word, pos in doc_tokens if word not in string.punctuation]
        
    raw_documents[idx] = {
        'text':doc_tokens,
        'label': row['label'],
        'docid': idx,
    }



100%|██████████| 2880/2880 [00:12<00:00, 235.70it/s]


In [None]:
raw_documents

In [None]:
out_name = 'data/stone/stone_clean_tokens.jsonl'

with open(out_name, 'w') as outfile:
    for k, instance in raw_documents.items():
        outfile.write(json.dumps(instance)+"\n")

## 24sata

In [49]:
news = pd.read_csv('/shared/lovorka/internal/retriever/data/retriever_24sata_diachronic_sentiment_dump.csv')
news_sentiment_labels = pd.read_csv('data/24sata_sentiment.tsv', sep='\t')

In [50]:
def transform_url(url):
    url_id = url.split('-')[-1]
    url_domain = url.split('.hr/')[0]  + '.hr/'
    return url_domain + '--' + url_id
    

news['url'] = news['url'].str.replace('https', 'http', regex=False)
news['transformed_url'] = news['url'].apply(transform_url)
news[['url', 'transformed_url']]

Unnamed: 0,url,transformed_url
0,http://24sata.hr/news/inovatori-iz-zagreba-uz-...,http://24sata.hr/--796736
1,http://24sata.hr/news/uciteljica-skidala-tange...,http://24sata.hr/--887394
2,http://24sata.hr/news/protiv-najezde-krvopija-...,http://24sata.hr/--911295
3,http://joomboos.24sata.hr/bez-srama/snimila-se...,http://joomboos.24sata.hr/--11435
4,http://joomboos.24sata.hr/bez-srama/sve-je-pre...,http://joomboos.24sata.hr/--10801
...,...,...
22474,http://24sata.hr/show/najslabija-karika-tko-se...,http://24sata.hr/--173140
22475,http://joomboos.24sata.hr/bez-srama/pametnjako...,http://joomboos.24sata.hr/--5705
22476,http://24sata.hr/show/osim-oca-pravnu-kontrolu...,http://24sata.hr/--261169
22477,http://joomboos.24sata.hr/bez-srama/prvi-tampo...,http://joomboos.24sata.hr/--7452


In [51]:
news_merged = pd.merge(news, news_sentiment_labels, left_on='transformed_url', right_on='url', how='inner')
news_merged['article'] = news_merged['title'] + ' ' + news_merged['body']
news_merged

Unnamed: 0,url_x,title,body,date_published,transformed_url,ID,url_y,label,article
0,http://joomboos.24sata.hr/bez-srama/sve-je-pre...,Sve je prevarila: Pogledajte što radi na Insta...,Ona je savršen primjer koliko je edit genijaln...,2019-01-31 00:00:00,http://joomboos.24sata.hr/--10801,985,http://joomboos.24sata.hr/--10801,neutral,Sve je prevarila: Pogledajte što radi na Insta...
1,http://joomboos.24sata.hr/prvi/beyonce-sa-7-go...,Beyonce sa 7 godina: Blue Ivy je ista mama,Beyonce je usporedila fotku sebe kada je imala...,2019-01-26 00:00:00,http://joomboos.24sata.hr/--10741,1728,http://joomboos.24sata.hr/--10741,neutral,Beyonce sa 7 godina: Blue Ivy je ista mama Bey...
2,http://joomboos.24sata.hr/bez-srama/ovaj-tip-s...,Ovaj tip se fotošopira na sve fotke Kendall Je...,Ovaj frajer osvaja internet svojim urnebesnim ...,2019-01-24 00:00:00,http://joomboos.24sata.hr/--10727,1570,http://joomboos.24sata.hr/--10727,neutral,Ovaj tip se fotošopira na sve fotke Kendall Je...
3,http://joomboos.24sata.hr/prvi/4-ideje-za-zims...,4 ideje za zimski dejt ako si lijenčina,Dok tvoji frendovi maštaju o savršenom dejtu n...,2018-12-26 00:00:00,http://joomboos.24sata.hr/--10327,1788,http://joomboos.24sata.hr/--10327,neutral,4 ideje za zimski dejt ako si lijenčina Dok tv...
4,http://joomboos.24sata.hr/prvi/doris-slavi-okr...,Doris slavi okrugli broj pratitelja na Instagr...,"Doris je tek nedavno pobijedila na Videostaru,...",2019-04-30 00:00:00,http://joomboos.24sata.hr/--11973,1850,http://joomboos.24sata.hr/--11973,positive,Doris slavi okrugli broj pratitelja na Instagr...
...,...,...,...,...,...,...,...,...,...
1757,http://gastro.24sata.hr/spajza/posao-iz-snova-...,Posao iz snova? Bistro iz Đakova traži degusta...,Posao koji u bistro i pizzerija Loora nudi jes...,2019-01-28 00:00:00,http://gastro.24sata.hr/--22977,1841,http://gastro.24sata.hr/--22977,positive,Posao iz snova? Bistro iz Đakova traži degusta...
1758,http://gastro.24sata.hr/spajza/punjena-pileca-...,Punjena pileća prsa na četiri genijalna načina,"Pileća prsa, rekli bi znalci, sama po sebi pop...",2019-02-21 00:00:00,http://gastro.24sata.hr/--23132,880,http://gastro.24sata.hr/--23132,positive,Punjena pileća prsa na četiri genijalna načina...
1759,http://gastro.24sata.hr/spajza/stizu-jagode-ov...,Top 5 recepata za slastice s jagodama,Izabrali smo 5 recepata naših kolumnista u koj...,2024-04-27 00:00:00,http://gastro.24sata.hr/--23481,1988,http://gastro.24sata.hr/--23481,positive,Top 5 recepata za slastice s jagodama Izabrali...
1760,http://joomboos.24sata.hr/bez-srama/ciro-uskor...,Uskoro slavi 84. rođendan: Ovo su Ćirine najbo...,Legenda hrvatskog nogometa i trener svih trene...,2019-02-10 00:00:00,http://joomboos.24sata.hr/--10926,737,http://joomboos.24sata.hr/--10926,positive,Uskoro slavi 84. rođendan: Ovo su Ćirine najbo...


In [52]:
def remove_string_from_column(df, column_name, string_to_remove):
    """
    Removes a specific string from a specified column in a DataFrame if it exists.

    Parameters:
    df (pd.DataFrame): The DataFrame to modify.
    column_name (str): The name of the column to process.
    string_to_remove (str): The string to remove from the column.

    Returns:
    pd.DataFrame: The modified DataFrame.
    """
    df[column_name] = df[column_name].str.replace(string_to_remove, "", regex=False)
    return df

In [53]:
tone_dict = {"negative": 0,
             "neutral": 1,
             "positive": 2}

news_merged['label'] = news_merged['label'].map(tone_dict)
news_merged['label'].value_counts()

label
1    1103
0     386
2     273
Name: count, dtype: int64

In [56]:
string_to_remove = "Igre na sreću mogu izazvati ovisnost. 18+"
news_merged = remove_string_from_column(news_merged, 'article', string_to_remove)

In [61]:
news_merged.to_csv('data/24sata/24sata_date.csv', index=False)

In [57]:
# Determine the minimum count among the labels
min_count = news_merged['label'].value_counts().min()

# Sample an equal number of rows for each label
balanced_sample = news_merged.groupby('label').apply(lambda x: x.sample(min_count)).reset_index(drop=True)

balanced_sample['label'].value_counts()

  balanced_sample = news_merged.groupby('label').apply(lambda x: x.sample(min_count)).reset_index(drop=True)


label
0    273
1    273
2    273
Name: count, dtype: int64

In [58]:
import string

all_tags = set()
raw_documents = {}

pos_tag_words = {}

for idx, row in tqdm(balanced_sample.iterrows(), total=len(balanced_sample)):

    doc = nlp(row['article'])
    doc_tokens = []
    
    for token in doc:
        word = token.text
        pos = token.pos_.lower()
        lemma = molex.lemmatize_word(word.lower()) 
        # lemmatize
        doc_tokens.append((lemma, pos))
    
    doc_tokens = [(word, pos) for word, pos in doc_tokens if word not in string.punctuation]
        
    raw_documents[idx] = {
        'text':doc_tokens,
        'label': row['label'],
        'docid': idx,
    }


100%|██████████| 819/819 [00:19<00:00, 41.82it/s]


In [59]:
out_name = 'data/24sata/24sata_clean_tokens.jsonl'

with open(out_name, 'w') as outfile:
    for k, instance in raw_documents.items():
        outfile.write(json.dumps(instance)+"\n")