In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import train_test_split
import pandas as pd
import spacy
from gensim.models import Word2Vec
import numpy as np
import os
import json
from nltk.tokenize import sent_tokenize
from typing import List, Dict

In [2]:
# De aici: https://en.wiktionary.org/wiki/Category:Romanian_prefixes
romanian_prefixes = [
    # A
    "agro", "alt", "ante", "anti", "aorto", "arhi", "astro",

    # B
    "balano",

    # C
    "cardio", "carpo", "cosmo",

    # D
    "demono", "des", "dez",

    # F
    "franco",

    # G
    "gastro", "germano", "greco",

    # H
    "hecto", "hiper",

    # I
    "în",

    # K
    "kilo",

    # L
    "lexico",

    # M
    "mili", "muzico",

    # N
    "nano", "ne",

    # O
    "ori", "ornito",

    # P
    "pneumo", "pre", "prea", "proto", "pseudo", "psiho",

    # R
    "răs", "re", "rino", "ruso",

    # S
    "stră", "sub",

    # T
    "tehno", "teo", "termo",

    # V
    "vice"
]


In [3]:
def replace_i_prefix(word, prefixes):
  for prefix in prefixes:
    try:
      if word.lower().startswith(prefix) and len(word) > len(prefix) and word[len(prefix):][0] in ["î", "Î"]:
        first_letter = word[len(prefix):][0]
        first_letter = "i" if first_letter == "î" else ("I" if first_letter == "Î" else first_letter)
        word = prefix + first_letter + word[len(prefix) + 1:]

    except:
      print(word)

  word = word.replace("î", "a").replace("Î", "A")

  return word

def no_diacritics(text, prefixes):

  text = replace_i_prefix(text, prefixes)


  text = text.replace("â", "i")
  text = text.replace("Â", "I")
  text = text.replace("ș", "s")
  text = text.replace("ş", "s")
  text = text.replace("Ș", "S")
  text = text.replace("Ş", "S")
  text = text.replace("ț", "t")
  text = text.replace("ţ", "t")
  text = text.replace("Ț", "T")
  text = text.replace("Ţ", "T")

  # If î is the first letter of the word, replace it with i
  if text.startswith("î"):
    text = text.replace("î", "i")
  if text.startswith("Î"):
    text = text.replace("Î", "I")
  # If the last letter of the word is î, replace it with i
  if text.endswith("î"):
    text = text.replace("î", "i")
  if text.endswith("Î"):
    text = text.replace("Î", "I")
  # Else replace î with a
  if "î" in text:
    text = text.replace("î", "a")
  # text = text.replace("î", "i")
  # text = text.replace("Î", "I")
  text = text.replace("ă", "a")
  text = text.replace("Ă", "A")

  return text


# for key in moldavian_texts:
#     for i in range(len(moldavian_texts[key])):
#         moldavian_texts[key][i] = no_diacritics(moldavian_texts[key][i], romanian_prefixes)

# for key in romanian_texts:
#     for i in range(len(romanian_texts[key])):
#         romanian_texts[key][i] = no_diacritics(romanian_texts[key][i], romanian_prefixes)

# print(moldavian_texts["Sport"][0])
# print(romanian_texts['Stiri'][12])

print(no_diacritics("cîțiva", romanian_prefixes))

cativa


In [4]:
romanian=[
    "a", "abia", "acea", "aceasta", "această", "aceea", "aceeasi", "acei",
    "aceia", "acel", "acela", "acelasi", "acele", "acelea", "acest", "acesta",
    "aceste", "acestea", "acestei", "acestia", "acestui", "aceşti", "aceştia",
    "acești", "aceștia", "acolo", "acord", "acum", "adica", "ai", "aia",
    "aibă", "aici", "aiurea", "al", "ala", "alaturi", "ale", "alea", "alt",
    "alta", "altceva", "altcineva", "alte", "altfel", "alti", "altii", "altul",
    "alături", "am", "anume", "apoi", "ar", "are", "as", "asa", "asemenea",
    "asta", "astazi", "astea", "astfel", "astăzi", "asupra", "atare", "atat",
    "atata", "atatea", "atatia", "ati", "atit", "atita", "atitea", "atitia",
    "atunci", "au", "avea", "avem", "aveţi", "aveți", "avut", "azi", "aş",
    "aşadar", "aţi", "aș", "așadar", "ați", "b", "ba", "bine", "bucur", "bună",
    "c", "ca", "cam", "cand", "capat", "care", "careia", "carora", "caruia",
    "cat", "catre", "caut", "ce", "cea", "ceea", "cei", "ceilalti", "cel",
    "cele", "celor", "ceva", "chiar", "ci", "cinci", "cind", "cine", "cineva",
    "cit", "cita", "cite", "citeva", "citi", "câțiva", "conform", "contra",
    "cu", "cui", "cum", "cumva", "curând", "curînd", "când", "cât", "câte",
    "câtva", "câţi", "câți", "cînd", "cît", "cîte", "cîtva", "cîţi", "cîți",
    "că", "căci", "cărei", "căror", "cărui", "către", "d", "da", "daca",
    "dacă", "dar", "dat", "datorită", "dată", "dau", "de", "deasupra", "deci",
    "decit", "degraba", "deja", "deoarece", "departe", "desi", "despre",
    "deşi", "deși", "din", "dinaintea", "dintr", "dintr-", "dintre", "doar",
    "doi", "doilea", "două", "drept", "dupa", "după", "dă", "e", "ea", "ei",
    "el", "ele", "era", "eram", "este", "eu", "exact", "eşti", "ești", "f",
    "face", "fara", "fata", "fel", "fi", "fie", "fiecare", "fii", "fim", "fiu",
    "fiţi", "fiți", "foarte", "fost", "frumos", "fără", "g", "geaba", "graţie",
    "grație", "h", "halbă", "i", "ia", "iar", "ieri", "ii", "il", "imi", "in",
    "inainte", "inapoi", "inca", "incit", "insa", "intr", "intre", "isi",
    "iti", "j", "k", "l", "la", "le", "li", "lor", "lui", "lângă", "lîngă",
    "m", "ma", "mai", "mare", "mea", "mei", "mele", "mereu", "meu", "mi",
    "mie", "mine", "mod", "mult", "multa", "multe", "multi", "multă", "mulţi",
    "mulţumesc", "mulți", "mulțumesc", "mâine", "mîine", "mă", "n", "ne",
    "nevoie", "ni", "nici", "niciodata", "nicăieri", "nimeni", "nimeri",
    "nimic", "niste", "nişte", "niște", "noastre", "noastră", "noi", "noroc",
    "nostri", "nostru", "nou", "noua", "nouă", "noştri", "noștri", "nu",
    "numai", "o", "opt", "or", "ori", "oricare", "orice", "oricine", "oricum",
    "oricând", "oricât", "oricînd", "oricît", "oriunde", "p", "pai", "parca",
    "patra", "patru", "patrulea", "pe", "pentru", "peste", "pic", "pina",
    "plus", "poate", "pot", "prea", "prima", "primul", "prin", "printr-",
    "putini", "puţin", "puţina", "puţină", "puțin", "puțina", "puțină", "până",
    "pînă", "r", "rog", "s", "sa", "sa-mi", "sa-ti", "sai", "sale", "sau",
    "se", "si", "sint", "sintem", "spate", "spre", "sub", "sunt", "suntem",
    "sunteţi", "sunteți", "sus", "sută", "sînt", "sîntem", "sînteţi",
    "sînteți", "să", "săi", "său", "t", "ta", "tale", "te", "ti", "timp",
    "tine", "toata", "toate", "toată", "tocmai", "tot", "toti", "totul",
    "totusi", "totuşi", "totuși", "toţi", "toți", "trei", "treia", "treilea",
    "tu", "tuturor", "tăi", "tău", "u", "ul", "ului", "un", "una", "unde",
    "undeva", "unei", "uneia", "unele", "uneori", "unii", "unor", "unora",
    "unu", "unui", "unuia", "unul", "v", "va", "vi", "voastre", "voastră",
    "voi", "vom", "vor", "vostru", "vouă", "voştri", "voștri", "vreme", "vreo",
    "vreun", "vă", "x", "z", "zece", "zero", "zi", "zice", "îi", "îl", "îmi",
    "împotriva", "în", "înainte", "înaintea", "încotro", "încât", "încît",
    "între", "întrucât", "întrucît", "îţi", "îți", "ăla", "ălea", "ăsta",
    "ăstea", "ăştia", "ăștia", "şapte", "şase", "şi", "ştiu", "ţi", "ţie",
    "șapte", "șase", "și", "știu", "ți", "ție"
]

In [5]:
# # Get all the words from the stop words list and apply the same transformation
stop_words = romanian
for i in range(len(stop_words)):
    stop_words[i] = no_diacritics(stop_words[i], romanian_prefixes)

stop_words = list(set(stop_words))

In [6]:
def analyze_label_distribution(dataset_folder: str) -> pd.DataFrame:
    json_files = [f for f in os.listdir(dataset_folder) if f.endswith('.json')]
    
    label_data: Dict[str, List[Dict]] = {}
    
    for filename in json_files:
        filepath = os.path.join(dataset_folder, filename)
        
        with open(filepath, 'r', encoding='utf-8') as f:
            try:
                data = json.load(f)
                
                if isinstance(data, dict):
                    df = pd.DataFrame.from_dict(data, orient='index')
                else:
                    df = pd.DataFrame(data)
                
                for _, row in df.iterrows():
                    
                    try:
                        label = row.get('label', row.get('category', row.get('class')))
                        text = row.get('text', row.get('content', row.get('description')))
                        
                        if label is not None and text is not None:
                            if label not in label_data:
                                label_data[label] = []
                            label_data[label].append({'text': text, 'label': label})
                    except AttributeError:
                        print(f"Skipping row in {filename} due to unexpected format")
                        continue
                    
            except json.JSONDecodeError:
                print(f"Error reading {filename}: Invalid JSON format")
                continue
            except Exception as e:
                print(f"Error processing {filename}: {str(e)}")
                continue
    
    if not label_data:
        print("No valid data found in any of the JSON files.")
        return None
    
    label_data = {k: v for k, v in label_data.items() if len(v) >= 200}
    
    if not label_data:
        print("No labels met the minimum criteria of 200 examples.")
        return None
    
    processed_data = []
    
    for label, texts in label_data.items():
        
        all_sentences = []
        for item in texts:
            sentences = sent_tokenize(item['text'])
            all_sentences.extend([(sentence, item['label']) for sentence in sentences])
        
        if len(all_sentences) > 10000:
            selected_indices = np.random.choice(len(all_sentences), 10000, replace=False)
            selected_sentences = [all_sentences[i] for i in selected_indices]
        else:
            selected_sentences = all_sentences
        
        for sentence, label in selected_sentences:
            processed_data.append({
                'text': sentence,
                'label': label
            })

    final_dataframe = pd.DataFrame(processed_data)

    label_counts = final_dataframe['label'].value_counts()
    print("\nFinal Label Distribution:")
    for label, count in label_counts.items():
        print(f"{label}: {count} sentences")
    
    return final_dataframe

dataset_folder = 'Dataset'
data = analyze_label_distribution(dataset_folder)


Final Label Distribution:
Banat: 10000 sentences
Criuleni: 10000 sentences
Oltenia: 10000 sentences
Moldova: 10000 sentences
Serbia: 10000 sentences
Muntenia: 10000 sentences
Ungheni: 10000 sentences
Dobrogea: 10000 sentences
Germania: 10000 sentences
Ardeal: 10000 sentences
Sangerei: 10000 sentences
Ucraina: 10000 sentences
Canada_EN: 9779 sentences
Calarasi: 8187 sentences
Spania: 8014 sentences
Crisana: 7837 sentences
Bucovina: 7031 sentences
Maramures: 7019 sentences
UK: 7008 sentences
Soroca: 6548 sentences
Ialoveni: 5931 sentences
Orhei: 5064 sentences
Cahul: 4114 sentences
Causeni: 3254 sentences
Balti: 1377 sentences


In [7]:
def get_features_1(train_text, validation_text, num_features):
    tfidf = TfidfVectorizer(max_features=num_features)

    train_text_tfidf = tfidf.fit_transform(train_text)

    validation_text_tfidf = tfidf.transform(validation_text)

    return train_text_tfidf.toarray(), validation_text_tfidf.toarray()

In [8]:
nlp = spacy.load("ro_core_news_sm")

def preprocess(text):
    text = no_diacritics(text, romanian_prefixes)
    text = ' '.join(word for word in text.split() if word.lower() in stop_words)
    doc = nlp(text)
    text = ' '.join(token.lemma_ for token in doc if not token.is_stop)
    
    return text

In [9]:
print(data.head())

data = data[data['text'].notna()]

texts = data['text']
labels = data['label']

train_texts, validation_texts, train_labels, validation_labels = train_test_split(texts,labels, test_size=0.2, random_state=42)

train_texts.apply(preprocess)
validation_texts.apply(preprocess)

                                                text  label
0  De asemenea, 74.922 de persoane se află în car...  Banat
1  EEI, firma noastră locală, are și ea un merit ...  Banat
2  Vorbim despre tinere care, adeseori, sunt resp...  Banat
3  Încât, pentru mamă, s-a constatat cum că ea se...  Banat
4  „Dispune eliminarea fizică din dosarul de urmă...  Banat


28113          
73653        an
158800         
194447    an an
43245     an an
          ...  
44446     an an
57363          
90483          
109754         
60052        an
Name: text, Length: 40233, dtype: object

In [10]:
train_texts = train_texts[train_texts.str.strip() != '']
train_texts = train_texts[train_texts.notna()]
validation_texts = validation_texts[validation_texts.str.strip() != '']
validation_texts = validation_texts[validation_texts.notna()]

train_labels = train_labels[train_texts.index]
train_labels = train_labels[train_labels.notna()]
validation_labels = validation_labels[validation_texts.index]
validation_labels = validation_labels[validation_labels.notna()]

final_train_indices = train_texts.index.intersection(train_labels.index)
final_val_indices = validation_texts.index.intersection(validation_labels.index)

train_texts = train_texts[final_train_indices]
train_labels = train_labels[final_train_indices]
validation_texts = validation_texts[final_val_indices]
validation_labels = validation_labels[final_val_indices]

print("\nFinal dataset sizes after cleaning:")
print(f"Final training samples: {len(train_texts)}")
print(f"Final validation samples: {len(validation_texts)}")


Final dataset sizes after cleaning:
Final training samples: 160930
Final validation samples: 40233


In [11]:
print(train_texts[1:20])
print(validation_texts[1:20])

print(train_labels[1:20])
print(validation_labels[1:20])

165076    In timpul spectacolului, mama sa, modelul Maye...
6854      La pagina 16 a cărții găsim un tabel comparati...
49895         Adăugați toate ingredientele într-un blender.
189517    Legătura dintre stres și anxietate este una pr...
188662    Consiliul Local al comunei Bâlteni a aprobat o...
66257     În același timp, Sainsbury’s și Tesco s-au reg...
158341    Femeia, care este pensionară, le-a povestit oa...
134077             La eveniment au participat și invitaţii.
191056    După ce a câștigat alegerile, Dijmărescu a fos...
133134    Conform celor menționate de dânsul, Comunitate...
125895    Deși a îndrăgit foarte mult orașul Timișoara, ...
4878      Petru, ducându-se la cârciuma lui Ioniţă Georg...
43647     Azi pe multe dintre tractoarele prezente la ma...
64910     În Scoția, Yorkshire, Lancashire și Staffordsh...
170803    Un alt obiectiv ce mi-l propun este identifica...
100999    În data de 20.01.2025, o echipă comună formată...
73382     De notat că, deocamdată nici p

In [18]:
f = open("rf-sentences-tfidf.txt", "w")

for num_features in [500, 1000, 1500, 2000, 2500]:
    for num_estimators in [100, 150, 200]:
        print("=" * 50)
        f.write("=" * 50 + "\n")

        print(f"Testing for num_features = {num_features}, num_estimators = {num_estimators}")
        f.write(f"Testing for num_features = {num_features}, num_estimators = {num_estimators}\n")

        print("Creating text features...")
        f.write("Creating text features...\n")

        train_texts_featurized, validation_texts_featurized = get_features_1(train_texts, validation_texts, num_features)

        print("Training Random Forest...")
        f.write("Training Random Forest...\n")
        random_forest = RandomForestClassifier(
            n_estimators=num_estimators,
            random_state=42)
        random_forest.fit(train_texts_featurized, train_labels)

        print("Getting predictions...")
        f.write("Getting predictions...\n")
        predictions = random_forest.predict(validation_texts_featurized)
        
        print("Computing accuracy...")
        f.write("Computing accuracy...")
        accuracy = accuracy_score(validation_labels, predictions)

        print("Computing f1...")
        f.write("Computing f1...\n")
        f1 = f1_score(validation_labels, predictions, average='weighted')

        print(f"Accuracy: {accuracy}")
        f.write(f"Accuracy: {accuracy}")

        print(f"F1: {f1}")
        f.write(f"F1: {f1}\n")

Testing for num_features = 500, num_estimators = 100
Creating text features...
Training Random Forest...
Getting predictions...
Computing accuracy...
Computing f1...
Accuracy: 0.3381303904754803
F1: 0.337371901480755
Testing for num_features = 500, num_estimators = 150
Creating text features...
Training Random Forest...
Getting predictions...
Computing accuracy...
Computing f1...
Accuracy: 0.3411627271145577
F1: 0.33987721353865197
Testing for num_features = 500, num_estimators = 200
Creating text features...
Training Random Forest...
Getting predictions...
Computing accuracy...
Computing f1...
Accuracy: 0.3426540402157433
F1: 0.3407174981508568
Testing for num_features = 1000, num_estimators = 100
Creating text features...
Training Random Forest...
Getting predictions...
Computing accuracy...
Computing f1...
Accuracy: 0.376432281957597
F1: 0.37523337948002033
Testing for num_features = 1000, num_estimators = 150
Creating text features...
Training Random Forest...
Getting predictions..

In [19]:
def get_features_2(train_text, validation_text, num_features=100, window=3, min_count=1, 
                   use_skipgram=True):

    train_tokens = [text.split() for text in train_text]
    validation_tokens = [text.split() for text in validation_text]
    
    model = Word2Vec(sentences=train_tokens, 
                     vector_size=num_features, 
                     window=window, 
                     min_count=min_count,
                     sg=1 if use_skipgram else 0) 
    
    def text_to_features(tokens):
        features = []
        for text in tokens:
            text_vectors = [model.wv[word] for word in text if word in model.wv]
            
            if not text_vectors:
                features.append(np.zeros(num_features))
            else:
                features.append(np.mean(text_vectors, axis=0))
        
        return np.array(features)
    
    train_features = text_to_features(train_tokens)
    validation_features = text_to_features(validation_tokens)
    
    return train_features, validation_features

In [21]:
f = open("rf-sentences-word2vec.txt", "w")

for num_features in [500, 1000, 1500, 2000, 2500]:
    for num_estimators in [100, 150, 200]:
        print("=" * 50)
        f.write("=" * 50 + "\n")

        print(f"Testing for num_features = {num_features}, num_estimators = {num_estimators}")
        f.write(f"Testing for num_features = {num_features}, num_estimators = {num_estimators}\n")

        print("Creating text features...")
        f.write("Creating text features...\n")

        train_texts_featurized, validation_texts_featurized = get_features_2(train_texts, validation_texts, num_features)

        print("Training Random Forest...")
        f.write("Training Random Forest...\n")
        random_forest = RandomForestClassifier(
            n_estimators=num_estimators,
            random_state=42)
        random_forest.fit(train_texts_featurized, train_labels)

        print("Getting predictions...")
        f.write("Getting predictions...\n")
        predictions = random_forest.predict(validation_texts_featurized)
        
        print("Computing accuracy...")
        f.write("Computing accuracy...\n")
        accuracy = accuracy_score(validation_labels, predictions)

        print("Computing f1...")
        f.write("Computing f1...\n")
        f1 = f1_score(validation_labels, predictions, average='weighted')

        print(f"Accuracy: {accuracy}")
        f.write(f"Accuracy: {accuracy}\n")

        print(f"F1: {f1}")
        f.write(f"F1: {f1}\n")

Testing for num_features = 500, num_estimators = 100
Creating text features...
Training Random Forest...
Getting predictions...
Computing accuracy...
Computing f1...
Accuracy: 0.27050431238038424
F1: 0.269897521184604
Testing for num_features = 500, num_estimators = 150
Creating text features...
Training Random Forest...
Getting predictions...
Computing accuracy...
Computing f1...
Accuracy: 0.27293018169164616
F1: 0.271902170830922
Testing for num_features = 500, num_estimators = 200
Creating text features...
Training Random Forest...
Getting predictions...
Computing accuracy...
Computing f1...
Accuracy: 0.27412323217259464
F1: 0.27257399852068544
Testing for num_features = 1000, num_estimators = 100
Creating text features...
Training Random Forest...
Getting predictions...
Computing accuracy...
Computing f1...
Accuracy: 0.3011458255660776
F1: 0.3001867035840163
Testing for num_features = 1000, num_estimators = 150
Creating text features...
Training Random Forest...
Getting predictions