In [12]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import train_test_split
import pandas as pd
import spacy
from gensim.models import Word2Vec
import numpy as np
import os
import json
from nltk.tokenize import sent_tokenize
from typing import List, Dict

In [13]:
# De aici: https://en.wiktionary.org/wiki/Category:Romanian_prefixes
romanian_prefixes = [
    # A
    "agro", "alt", "ante", "anti", "aorto", "arhi", "astro",

    # B
    "balano",

    # C
    "cardio", "carpo", "cosmo",

    # D
    "demono", "des", "dez",

    # F
    "franco",

    # G
    "gastro", "germano", "greco",

    # H
    "hecto", "hiper",

    # I
    "în",

    # K
    "kilo",

    # L
    "lexico",

    # M
    "mili", "muzico",

    # N
    "nano", "ne",

    # O
    "ori", "ornito",

    # P
    "pneumo", "pre", "prea", "proto", "pseudo", "psiho",

    # R
    "răs", "re", "rino", "ruso",

    # S
    "stră", "sub",

    # T
    "tehno", "teo", "termo",

    # V
    "vice"
]


In [14]:
def replace_i_prefix(word, prefixes):
  for prefix in prefixes:
    try:
      if word.lower().startswith(prefix) and len(word) > len(prefix) and word[len(prefix):][0] in ["î", "Î"]:
        first_letter = word[len(prefix):][0]
        first_letter = "i" if first_letter == "î" else ("I" if first_letter == "Î" else first_letter)
        word = prefix + first_letter + word[len(prefix) + 1:]

    except:
      print(word)

  word = word.replace("î", "a").replace("Î", "A")

  return word

def no_diacritics(text, prefixes):

  text = replace_i_prefix(text, prefixes)


  text = text.replace("â", "i")
  text = text.replace("Â", "I")
  text = text.replace("ș", "s")
  text = text.replace("ş", "s")
  text = text.replace("Ș", "S")
  text = text.replace("Ş", "S")
  text = text.replace("ț", "t")
  text = text.replace("ţ", "t")
  text = text.replace("Ț", "T")
  text = text.replace("Ţ", "T")

  # If î is the first letter of the word, replace it with i
  if text.startswith("î"):
    text = text.replace("î", "i")
  if text.startswith("Î"):
    text = text.replace("Î", "I")
  # If the last letter of the word is î, replace it with i
  if text.endswith("î"):
    text = text.replace("î", "i")
  if text.endswith("Î"):
    text = text.replace("Î", "I")
  # Else replace î with a
  if "î" in text:
    text = text.replace("î", "a")
  # text = text.replace("î", "i")
  # text = text.replace("Î", "I")
  text = text.replace("ă", "a")
  text = text.replace("Ă", "A")

  return text


# for key in moldavian_texts:
#     for i in range(len(moldavian_texts[key])):
#         moldavian_texts[key][i] = no_diacritics(moldavian_texts[key][i], romanian_prefixes)

# for key in romanian_texts:
#     for i in range(len(romanian_texts[key])):
#         romanian_texts[key][i] = no_diacritics(romanian_texts[key][i], romanian_prefixes)

# print(moldavian_texts["Sport"][0])
# print(romanian_texts['Stiri'][12])

print(no_diacritics("cîțiva", romanian_prefixes))

cativa


In [15]:
romanian=[
    "a", "abia", "acea", "aceasta", "această", "aceea", "aceeasi", "acei",
    "aceia", "acel", "acela", "acelasi", "acele", "acelea", "acest", "acesta",
    "aceste", "acestea", "acestei", "acestia", "acestui", "aceşti", "aceştia",
    "acești", "aceștia", "acolo", "acord", "acum", "adica", "ai", "aia",
    "aibă", "aici", "aiurea", "al", "ala", "alaturi", "ale", "alea", "alt",
    "alta", "altceva", "altcineva", "alte", "altfel", "alti", "altii", "altul",
    "alături", "am", "anume", "apoi", "ar", "are", "as", "asa", "asemenea",
    "asta", "astazi", "astea", "astfel", "astăzi", "asupra", "atare", "atat",
    "atata", "atatea", "atatia", "ati", "atit", "atita", "atitea", "atitia",
    "atunci", "au", "avea", "avem", "aveţi", "aveți", "avut", "azi", "aş",
    "aşadar", "aţi", "aș", "așadar", "ați", "b", "ba", "bine", "bucur", "bună",
    "c", "ca", "cam", "cand", "capat", "care", "careia", "carora", "caruia",
    "cat", "catre", "caut", "ce", "cea", "ceea", "cei", "ceilalti", "cel",
    "cele", "celor", "ceva", "chiar", "ci", "cinci", "cind", "cine", "cineva",
    "cit", "cita", "cite", "citeva", "citi", "câțiva", "conform", "contra",
    "cu", "cui", "cum", "cumva", "curând", "curînd", "când", "cât", "câte",
    "câtva", "câţi", "câți", "cînd", "cît", "cîte", "cîtva", "cîţi", "cîți",
    "că", "căci", "cărei", "căror", "cărui", "către", "d", "da", "daca",
    "dacă", "dar", "dat", "datorită", "dată", "dau", "de", "deasupra", "deci",
    "decit", "degraba", "deja", "deoarece", "departe", "desi", "despre",
    "deşi", "deși", "din", "dinaintea", "dintr", "dintr-", "dintre", "doar",
    "doi", "doilea", "două", "drept", "dupa", "după", "dă", "e", "ea", "ei",
    "el", "ele", "era", "eram", "este", "eu", "exact", "eşti", "ești", "f",
    "face", "fara", "fata", "fel", "fi", "fie", "fiecare", "fii", "fim", "fiu",
    "fiţi", "fiți", "foarte", "fost", "frumos", "fără", "g", "geaba", "graţie",
    "grație", "h", "halbă", "i", "ia", "iar", "ieri", "ii", "il", "imi", "in",
    "inainte", "inapoi", "inca", "incit", "insa", "intr", "intre", "isi",
    "iti", "j", "k", "l", "la", "le", "li", "lor", "lui", "lângă", "lîngă",
    "m", "ma", "mai", "mare", "mea", "mei", "mele", "mereu", "meu", "mi",
    "mie", "mine", "mod", "mult", "multa", "multe", "multi", "multă", "mulţi",
    "mulţumesc", "mulți", "mulțumesc", "mâine", "mîine", "mă", "n", "ne",
    "nevoie", "ni", "nici", "niciodata", "nicăieri", "nimeni", "nimeri",
    "nimic", "niste", "nişte", "niște", "noastre", "noastră", "noi", "noroc",
    "nostri", "nostru", "nou", "noua", "nouă", "noştri", "noștri", "nu",
    "numai", "o", "opt", "or", "ori", "oricare", "orice", "oricine", "oricum",
    "oricând", "oricât", "oricînd", "oricît", "oriunde", "p", "pai", "parca",
    "patra", "patru", "patrulea", "pe", "pentru", "peste", "pic", "pina",
    "plus", "poate", "pot", "prea", "prima", "primul", "prin", "printr-",
    "putini", "puţin", "puţina", "puţină", "puțin", "puțina", "puțină", "până",
    "pînă", "r", "rog", "s", "sa", "sa-mi", "sa-ti", "sai", "sale", "sau",
    "se", "si", "sint", "sintem", "spate", "spre", "sub", "sunt", "suntem",
    "sunteţi", "sunteți", "sus", "sută", "sînt", "sîntem", "sînteţi",
    "sînteți", "să", "săi", "său", "t", "ta", "tale", "te", "ti", "timp",
    "tine", "toata", "toate", "toată", "tocmai", "tot", "toti", "totul",
    "totusi", "totuşi", "totuși", "toţi", "toți", "trei", "treia", "treilea",
    "tu", "tuturor", "tăi", "tău", "u", "ul", "ului", "un", "una", "unde",
    "undeva", "unei", "uneia", "unele", "uneori", "unii", "unor", "unora",
    "unu", "unui", "unuia", "unul", "v", "va", "vi", "voastre", "voastră",
    "voi", "vom", "vor", "vostru", "vouă", "voştri", "voștri", "vreme", "vreo",
    "vreun", "vă", "x", "z", "zece", "zero", "zi", "zice", "îi", "îl", "îmi",
    "împotriva", "în", "înainte", "înaintea", "încotro", "încât", "încît",
    "între", "întrucât", "întrucît", "îţi", "îți", "ăla", "ălea", "ăsta",
    "ăstea", "ăştia", "ăștia", "şapte", "şase", "şi", "ştiu", "ţi", "ţie",
    "șapte", "șase", "și", "știu", "ți", "ție"
]

In [16]:
# # Get all the words from the stop words list and apply the same transformation
stop_words = romanian
for i in range(len(stop_words)):
    stop_words[i] = no_diacritics(stop_words[i], romanian_prefixes)

stop_words = list(set(stop_words))

In [17]:
def analyze_label_distribution(dataset_folder):
    json_files = [f for f in os.listdir(dataset_folder) if f.endswith('.json')]
    
    dataframes = []
    
    for filename in json_files:
        filepath = os.path.join(dataset_folder, filename)
        
        with open(filepath, 'r', encoding='utf-8') as f:
            data = json.load(f)
        
        if 200 <= len(data) <= 1000:
            df = pd.DataFrame(data)
            dataframes.append(df)
        elif len(data) > 1000:
            df = pd.DataFrame(data).sample(n=1000, random_state=42)
            dataframes.append(df)

    if dataframes:
        final_dataframe = pd.concat(dataframes, ignore_index=True)
        
        # Count label distribution
        label_counts = final_dataframe['label'].value_counts()
        
        print("Label Distribution:")
        for label, count in label_counts.items():
            print(f"{label}: {count}")
        
        return final_dataframe
    else:
        print("No files met the criteria.")
        return None

dataset_folder = 'Dataset'
data = analyze_label_distribution(dataset_folder)

Label Distribution:
Banat: 1000
Ungheni: 1000
Oltenia: 1000
Moldova: 1000
Serbia: 1000
Muntenia: 1000
Ucraina: 1000
Ardeal: 1000
Dobrogea: 965
Balti: 948
Sangerei: 775
Spania: 723
Maramures: 656
Canada_EN: 641
Crisana: 579
Orhei: 512
Calarasi: 511
Criuleni: 509
Ialoveni: 504
Cahul: 504
Soroca: 504
Germania: 500
UK: 499
Bucovina: 428
Causeni: 321


In [18]:
def get_features_1(train_text, validation_text, num_features):
    tfidf = TfidfVectorizer(max_features=num_features)

    train_text_tfidf = tfidf.fit_transform(train_text)

    validation_text_tfidf = tfidf.transform(validation_text)

    return train_text_tfidf.toarray(), validation_text_tfidf.toarray()

In [19]:
nlp = spacy.load("ro_core_news_sm")

def preprocess(text):
    text = no_diacritics(text, romanian_prefixes)
    text = ' '.join(word for word in text.split() if word.lower() in stop_words)
    doc = nlp(text)
    text = ' '.join(token.lemma_ for token in doc if not token.is_stop)
    
    return text

In [25]:
print(data.head())

data = data[data['text'].notna() | data['content'].notna()]

data['text'] = data['text'].combine_first(data['content'])

texts = data['text']
labels = data['label']

train_texts, validation_texts, train_labels, validation_labels = train_test_split(texts,labels, test_size=0.2, random_state=42)

train_texts.apply(preprocess)
validation_texts.apply(preprocess)

                                               title  \
0  Miercuri și vineri, veniți la un spectacol des...   
1  Verginia Cetulean: „Vreau o Moldovă modernă, u...   
2  Ești din raionul Soroca și cauți un loc de luc...   
3  Vremea la Soroca: prognoza pentru joi, 28 mart...   
4  Violeta Boțoc: „Toată copilăria mea am petrecu...   

                                             content  \
0  Teatrul „Veniamin Apostol” din Soroca vă invit...   
1  Verginia Cetulean, originară din raionul Soroc...   
2  Miercuri, 27 noiembrie, la Soroca va fi organi...   
3  Potrivit meteorologilor, astăzi vremea va fi f...   
4  Toată copilăria mea am petrecut-o într-o avent...   

                                      metadata   label text  
0  {'original_file': 'Observatorul_0123.html'}  Soroca  NaN  
1  {'original_file': 'Observatorul_0089.html'}  Soroca  NaN  
2  {'original_file': 'Observatorul_0066.html'}  Soroca  NaN  
3  {'original_file': 'Observatorul_0436.html'}  Soroca  NaN  
4  {'original_fi

18633    doi an an an an an putin suta an Antre lingă d...
115                                an an an an an an an an
3314     an an an an an an an an an an an Sunteti an an an
12281    doi an an an an an an an dată an an an an ling...
13876                                            lingă doi
                               ...                        
7541     an doi an doi an an an an an doi an an an an a...
6586                                  an lingă an an an an
16262                        antre an putin an an an an an
16721                                   an noastra miin an
14001                                an an an an pană aiba
Name: text, Length: 3816, dtype: object

In [26]:
train_texts = train_texts[train_texts.str.strip() != '']
train_texts = train_texts[train_texts.notna()]
validation_texts = validation_texts[validation_texts.str.strip() != '']
validation_texts = validation_texts[validation_texts.notna()]

train_labels = train_labels[train_texts.index]
train_labels = train_labels[train_labels.notna()]
validation_labels = validation_labels[validation_texts.index]
validation_labels = validation_labels[validation_labels.notna()]

final_train_indices = train_texts.index.intersection(train_labels.index)
final_val_indices = validation_texts.index.intersection(validation_labels.index)

train_texts = train_texts[final_train_indices]
train_labels = train_labels[final_train_indices]
validation_texts = validation_texts[final_val_indices]
validation_labels = validation_labels[final_val_indices]

print("\nFinal dataset sizes after cleaning:")
print(f"Final training samples: {len(train_texts)}")
print(f"Final validation samples: {len(validation_texts)}")


Final dataset sizes after cleaning:
Final training samples: 14441
Final validation samples: 3598


In [27]:
print(train_texts[1:20])
print(validation_texts[1:20])

print(train_labels[1:20])
print(validation_labels[1:20])

5460     11 barbati şi o femeie, cetăţeni români, au fo...
1739     Circulația a fost întreruptă complet miercuri ...
14294    Primăria Craiova vine în sprijinul cetățenilor...
7775     În primele 10 luni ale anului 2024, poliția di...
19048    Ministrul Sănătăţii, Nelu Tătaru, a declarat, ...
11056    Orașul Ialoveni se va înfrăți cu alte patru lo...
14684    In nicio guvernare nu au existat atatea masuri...
2447     Și consilierii raionali de Orhei nu mai vor să...
1887             Your email address will not be published.
5858     Ministrul delegat al românilor de pretutindeni...
16038    Alina Carp, originară din Cahul, este un exemp...
10443    Cu temperaturi de până la 36°C, așteptate în a...
7568     Trump: "Voi opri războiul din Ucraina. Voi opr...
8056     Este cu ce împușca și cu ce zbura, dar nu are ...
15388    Spania este campioana Europei, după ce s-a imp...
10780    DOC. Consilierii din Horești, convocați în șed...
13662    Pentru cei din departamentul federal de statis.

In [28]:
f = open("rf-texts-tfidf.txt", "w")

for num_features in [500, 1000, 1500, 2000, 2500]:
    for num_estimators in [100, 150, 200]:
        print("=" * 50)
        f.write("=" * 50 + "\n")

        print(f"Testing for num_features = {num_features}, num_estimators = {num_estimators}")
        f.write(f"Testing for num_features = {num_features}, num_estimators = {num_estimators}\n")

        print("Creating text features...")
        f.write("Creating text features...\n")

        train_texts_featurized, validation_texts_featurized = get_features_1(train_texts, validation_texts, num_features)

        print("Training Random Forest...")
        f.write("Training Random Forest...\n")
        random_forest = RandomForestClassifier(
            n_estimators=num_estimators,
            random_state=42)
        random_forest.fit(train_texts_featurized, train_labels)

        print("Getting predictions...")
        f.write("Getting predictions...\n")
        predictions = random_forest.predict(validation_texts_featurized)
        
        print("Computing accuracy...")
        f.write("Computing accuracy...")
        accuracy = accuracy_score(validation_labels, predictions)

        print("Computing f1...")
        f.write("Computing f1...\n")
        f1 = f1_score(validation_labels, predictions, average='weighted')

        print(f"Accuracy: {accuracy}")
        f.write(f"Accuracy: {accuracy}")

        print(f"F1: {f1}")
        f.write(f"F1: {f1}\n")

Testing for num_features = 500, num_estimators = 100
Creating text features...
Training Random Forest...
Getting predictions...
Computing accuracy...
Computing f1...
Accuracy: 0.6609227348526959
F1: 0.6615604450529123
Testing for num_features = 500, num_estimators = 150
Creating text features...
Training Random Forest...
Getting predictions...
Computing accuracy...
Computing f1...
Accuracy: 0.669816564758199
F1: 0.6707661034391523
Testing for num_features = 500, num_estimators = 200
Creating text features...
Training Random Forest...
Getting predictions...
Computing accuracy...
Computing f1...
Accuracy: 0.6681489716509171
F1: 0.6691854144159909
Testing for num_features = 1000, num_estimators = 100
Creating text features...
Training Random Forest...
Getting predictions...
Computing accuracy...
Computing f1...
Accuracy: 0.7504168982768205
F1: 0.7491143480847187
Testing for num_features = 1000, num_estimators = 150
Creating text features...
Training Random Forest...
Getting predictions...

In [29]:
def get_features_2(train_text, validation_text, num_features=100, window=3, min_count=1, 
                   use_skipgram=True):

    train_tokens = [text.split() for text in train_text]
    validation_tokens = [text.split() for text in validation_text]
    
    model = Word2Vec(sentences=train_tokens, 
                     vector_size=num_features, 
                     window=window, 
                     min_count=min_count,
                     sg=1 if use_skipgram else 0)
    
    def text_to_features(tokens):
        features = []
        for text in tokens:
            text_vectors = [model.wv[word] for word in text if word in model.wv]
            
            if not text_vectors:
                features.append(np.zeros(num_features))
            else:
                features.append(np.mean(text_vectors, axis=0))
        
        return np.array(features)
    
    train_features = text_to_features(train_tokens)
    validation_features = text_to_features(validation_tokens)
    
    return train_features, validation_features

In [30]:
f = open("rf-texts-word2vec.txt", "w")

for num_features in [500, 1000, 1500, 2000, 2500]:
    for num_estimators in [100, 150, 200]:
        print("=" * 50)
        f.write("=" * 50 + "\n")

        print(f"Testing for num_features = {num_features}, num_estimators = {num_estimators}")
        f.write(f"Testing for num_features = {num_features}, num_estimators = {num_estimators}\n")

        print("Creating text features...")
        f.write("Creating text features...\n")

        train_texts_featurized, validation_texts_featurized = get_features_2(train_texts, validation_texts, num_features)

        print("Training Random Forest...")
        f.write("Training Random Forest...\n")
        random_forest = RandomForestClassifier(
            n_estimators=num_estimators,
            random_state=42)
        random_forest.fit(train_texts_featurized, train_labels)

        print("Getting predictions...")
        f.write("Getting predictions...\n")
        predictions = random_forest.predict(validation_texts_featurized)
        
        print("Computing accuracy...")
        f.write("Computing accuracy...\n")
        accuracy = accuracy_score(validation_labels, predictions)

        print("Computing f1...")
        f.write("Computing f1...\n")
        f1 = f1_score(validation_labels, predictions, average='weighted')

        print(f"Accuracy: {accuracy}")
        f.write(f"Accuracy: {accuracy}\n")

        print(f"F1: {f1}")
        f.write(f"F1: {f1}\n")

Testing for num_features = 500, num_estimators = 100
Creating text features...
Training Random Forest...
Getting predictions...
Computing accuracy...
Computing f1...
Accuracy: 0.5494719288493608
F1: 0.5377898041624213
Testing for num_features = 500, num_estimators = 150
Creating text features...
Training Random Forest...
Getting predictions...
Computing accuracy...
Computing f1...
Accuracy: 0.556420233463035
F1: 0.5454083075004874
Testing for num_features = 500, num_estimators = 200
Creating text features...
Training Random Forest...
Getting predictions...
Computing accuracy...
Computing f1...
Accuracy: 0.5630906058921623
F1: 0.5510627233058135
Testing for num_features = 1000, num_estimators = 100
Creating text features...
Training Random Forest...
Getting predictions...
Computing accuracy...
Computing f1...
Accuracy: 0.5561423012784881
F1: 0.5457892313172258
Testing for num_features = 1000, num_estimators = 150
Creating text features...
Training Random Forest...
Getting predictions...