In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import train_test_split
import pandas as pd
import spacy
from gensim.models import Word2Vec
import numpy as np

In [2]:
# De aici: https://en.wiktionary.org/wiki/Category:Romanian_prefixes
romanian_prefixes = [
    # A
    "agro", "alt", "ante", "anti", "aorto", "arhi", "astro",

    # B
    "balano",

    # C
    "cardio", "carpo", "cosmo",

    # D
    "demono", "des", "dez",

    # F
    "franco",

    # G
    "gastro", "germano", "greco",

    # H
    "hecto", "hiper",

    # I
    "în",

    # K
    "kilo",

    # L
    "lexico",

    # M
    "mili", "muzico",

    # N
    "nano", "ne",

    # O
    "ori", "ornito",

    # P
    "pneumo", "pre", "prea", "proto", "pseudo", "psiho",

    # R
    "răs", "re", "rino", "ruso",

    # S
    "stră", "sub",

    # T
    "tehno", "teo", "termo",

    # V
    "vice"
]


In [3]:
def replace_i_prefix(word, prefixes):
  for prefix in prefixes:
    try:
      if word.lower().startswith(prefix) and len(word) > len(prefix) and word[len(prefix):][0] in ["î", "Î"]:
        first_letter = word[len(prefix):][0]
        first_letter = "i" if first_letter == "î" else ("I" if first_letter == "Î" else first_letter)
        word = prefix + first_letter + word[len(prefix) + 1:]

    except:
      print(word)

  word = word.replace("î", "a").replace("Î", "A")

  return word

def no_diacritics(text, prefixes):

  text = replace_i_prefix(text, prefixes)


  text = text.replace("â", "i")
  text = text.replace("Â", "I")
  text = text.replace("ș", "s")
  text = text.replace("ş", "s")
  text = text.replace("Ș", "S")
  text = text.replace("Ş", "S")
  text = text.replace("ț", "t")
  text = text.replace("ţ", "t")
  text = text.replace("Ț", "T")
  text = text.replace("Ţ", "T")

  # If î is the first letter of the word, replace it with i
  if text.startswith("î"):
    text = text.replace("î", "i")
  if text.startswith("Î"):
    text = text.replace("Î", "I")
  # If the last letter of the word is î, replace it with i
  if text.endswith("î"):
    text = text.replace("î", "i")
  if text.endswith("Î"):
    text = text.replace("Î", "I")
  # Else replace î with a
  if "î" in text:
    text = text.replace("î", "a")
  # text = text.replace("î", "i")
  # text = text.replace("Î", "I")
  text = text.replace("ă", "a")
  text = text.replace("Ă", "A")

  return text


# for key in moldavian_texts:
#     for i in range(len(moldavian_texts[key])):
#         moldavian_texts[key][i] = no_diacritics(moldavian_texts[key][i], romanian_prefixes)

# for key in romanian_texts:
#     for i in range(len(romanian_texts[key])):
#         romanian_texts[key][i] = no_diacritics(romanian_texts[key][i], romanian_prefixes)

# print(moldavian_texts["Sport"][0])
# print(romanian_texts['Stiri'][12])

print(no_diacritics("cîțiva", romanian_prefixes))

cativa


In [4]:
romanian=[
    "a", "abia", "acea", "aceasta", "această", "aceea", "aceeasi", "acei",
    "aceia", "acel", "acela", "acelasi", "acele", "acelea", "acest", "acesta",
    "aceste", "acestea", "acestei", "acestia", "acestui", "aceşti", "aceştia",
    "acești", "aceștia", "acolo", "acord", "acum", "adica", "ai", "aia",
    "aibă", "aici", "aiurea", "al", "ala", "alaturi", "ale", "alea", "alt",
    "alta", "altceva", "altcineva", "alte", "altfel", "alti", "altii", "altul",
    "alături", "am", "anume", "apoi", "ar", "are", "as", "asa", "asemenea",
    "asta", "astazi", "astea", "astfel", "astăzi", "asupra", "atare", "atat",
    "atata", "atatea", "atatia", "ati", "atit", "atita", "atitea", "atitia",
    "atunci", "au", "avea", "avem", "aveţi", "aveți", "avut", "azi", "aş",
    "aşadar", "aţi", "aș", "așadar", "ați", "b", "ba", "bine", "bucur", "bună",
    "c", "ca", "cam", "cand", "capat", "care", "careia", "carora", "caruia",
    "cat", "catre", "caut", "ce", "cea", "ceea", "cei", "ceilalti", "cel",
    "cele", "celor", "ceva", "chiar", "ci", "cinci", "cind", "cine", "cineva",
    "cit", "cita", "cite", "citeva", "citi", "câțiva", "conform", "contra",
    "cu", "cui", "cum", "cumva", "curând", "curînd", "când", "cât", "câte",
    "câtva", "câţi", "câți", "cînd", "cît", "cîte", "cîtva", "cîţi", "cîți",
    "că", "căci", "cărei", "căror", "cărui", "către", "d", "da", "daca",
    "dacă", "dar", "dat", "datorită", "dată", "dau", "de", "deasupra", "deci",
    "decit", "degraba", "deja", "deoarece", "departe", "desi", "despre",
    "deşi", "deși", "din", "dinaintea", "dintr", "dintr-", "dintre", "doar",
    "doi", "doilea", "două", "drept", "dupa", "după", "dă", "e", "ea", "ei",
    "el", "ele", "era", "eram", "este", "eu", "exact", "eşti", "ești", "f",
    "face", "fara", "fata", "fel", "fi", "fie", "fiecare", "fii", "fim", "fiu",
    "fiţi", "fiți", "foarte", "fost", "frumos", "fără", "g", "geaba", "graţie",
    "grație", "h", "halbă", "i", "ia", "iar", "ieri", "ii", "il", "imi", "in",
    "inainte", "inapoi", "inca", "incit", "insa", "intr", "intre", "isi",
    "iti", "j", "k", "l", "la", "le", "li", "lor", "lui", "lângă", "lîngă",
    "m", "ma", "mai", "mare", "mea", "mei", "mele", "mereu", "meu", "mi",
    "mie", "mine", "mod", "mult", "multa", "multe", "multi", "multă", "mulţi",
    "mulţumesc", "mulți", "mulțumesc", "mâine", "mîine", "mă", "n", "ne",
    "nevoie", "ni", "nici", "niciodata", "nicăieri", "nimeni", "nimeri",
    "nimic", "niste", "nişte", "niște", "noastre", "noastră", "noi", "noroc",
    "nostri", "nostru", "nou", "noua", "nouă", "noştri", "noștri", "nu",
    "numai", "o", "opt", "or", "ori", "oricare", "orice", "oricine", "oricum",
    "oricând", "oricât", "oricînd", "oricît", "oriunde", "p", "pai", "parca",
    "patra", "patru", "patrulea", "pe", "pentru", "peste", "pic", "pina",
    "plus", "poate", "pot", "prea", "prima", "primul", "prin", "printr-",
    "putini", "puţin", "puţina", "puţină", "puțin", "puțina", "puțină", "până",
    "pînă", "r", "rog", "s", "sa", "sa-mi", "sa-ti", "sai", "sale", "sau",
    "se", "si", "sint", "sintem", "spate", "spre", "sub", "sunt", "suntem",
    "sunteţi", "sunteți", "sus", "sută", "sînt", "sîntem", "sînteţi",
    "sînteți", "să", "săi", "său", "t", "ta", "tale", "te", "ti", "timp",
    "tine", "toata", "toate", "toată", "tocmai", "tot", "toti", "totul",
    "totusi", "totuşi", "totuși", "toţi", "toți", "trei", "treia", "treilea",
    "tu", "tuturor", "tăi", "tău", "u", "ul", "ului", "un", "una", "unde",
    "undeva", "unei", "uneia", "unele", "uneori", "unii", "unor", "unora",
    "unu", "unui", "unuia", "unul", "v", "va", "vi", "voastre", "voastră",
    "voi", "vom", "vor", "vostru", "vouă", "voştri", "voștri", "vreme", "vreo",
    "vreun", "vă", "x", "z", "zece", "zero", "zi", "zice", "îi", "îl", "îmi",
    "împotriva", "în", "înainte", "înaintea", "încotro", "încât", "încît",
    "între", "întrucât", "întrucît", "îţi", "îți", "ăla", "ălea", "ăsta",
    "ăstea", "ăştia", "ăștia", "şapte", "şase", "şi", "ştiu", "ţi", "ţie",
    "șapte", "șase", "și", "știu", "ți", "ție"
]

In [5]:
# # Get all the words from the stop words list and apply the same transformation
stop_words = romanian
for i in range(len(stop_words)):
    stop_words[i] = no_diacritics(stop_words[i], romanian_prefixes)

stop_words = list(set(stop_words))

In [6]:
import os
import json
import pandas as pd

def analyze_label_distribution(dataset_folder):
    json_files = [f for f in os.listdir(dataset_folder) if f.endswith('.json')]
    
    dataframes = []
    
    for filename in json_files:
        filepath = os.path.join(dataset_folder, filename)
        
        with open(filepath, 'r', encoding='utf-8') as f:
            data = json.load(f)
        
        if 200 <= len(data) <= 1000:
            df = pd.DataFrame(data)
            dataframes.append(df)
        elif len(data) > 1000:
            df = pd.DataFrame(data).sample(n=1000, random_state=42)
            dataframes.append(df)
    
    # Combine dataframes
    if dataframes:
        final_dataframe = pd.concat(dataframes, ignore_index=True)
        
        # Count label distribution
        label_counts = final_dataframe['label'].value_counts()
        
        print("Label Distribution:")
        for label, count in label_counts.items():
            print(f"{label}: {count}")
        
        return final_dataframe
    else:
        print("No files met the criteria.")
        return None

# Run the analysis
dataset_folder = 'Dataset'
data = analyze_label_distribution(dataset_folder)

Label Distribution:
Banat: 1000
Ardeal: 1000
Muntenia: 1000
Serbia: 1000
Moldova: 1000
Oltenia: 1000
Ucraina: 1000
Dobrogea: 965
Spania: 723
Maramures: 656
Canada_EN: 641
Crisana: 579
Germania: 500
UK: 499
Bucovina: 428


In [7]:
def get_features_1(train_text, validation_text, num_features):
    tfidf = TfidfVectorizer(max_features=num_features)

    train_text_tfidf = tfidf.fit_transform(train_text)

    validation_text_tfidf = tfidf.transform(validation_text)

    return train_text_tfidf.toarray(), validation_text_tfidf.toarray()

In [8]:
nlp = spacy.load("ro_core_news_sm")

def preprocess(text):
    text = no_diacritics(text, romanian_prefixes)
    text = ' '.join(word for word in text.split() if word.lower() not in stop_words)
    doc = nlp(text)
    text = ' '.join(token.lemma_ for token in doc if not token.is_stop)
    
    return text

In [9]:
print(data.head())

data = data[data['text'].notna() | data['content'].notna()]

data['text'] = data['text'].combine_first(data['content'])

texts = data['text']
labels = data['label']

train_texts, validation_texts, train_labels, validation_labels = train_test_split(texts,labels, test_size=0.2, random_state=42)

train_texts.apply(preprocess)
validation_texts.apply(preprocess)

                                               title  \
0               UPDATE: Ars ca o torță la Caransebeș   
1  OPINIA PERSONALA ,VERSUS …Raed Arafat: Trebuie...   
2  BILANT COVID-19!5.837 cazuri noi de infectare ...   
3  Doi bărbați au fost arestați după ce unul dint...   
4  Bilant COVID -19 la nivelul Judetului Caras-Se...   

                                                text  label content  
0  Crimă prin lovire și incendiere de cadavru pet...  Banat     NaN  
1  Şeful Departamentului pentru Situaţii de Urgen...  Banat     NaN  
2  Comunica ,Grupul de Comunicare Strategică.”Pân...  Banat     NaN  
3  Polițiștii au intervenit, miercuri seara, pent...  Banat     NaN  
4  Pentru data de 27.11.2020 ora 8.30 vă comunică...  Banat     NaN  


396      primar comună Birna , Ovidiu Ignaton , implica...
3098     sarbatore iarnă reprezint perioadă romină , pl...
9322     viată prieten ? fi spune prieten bun pretuiest...
357      lună anunt Primariei timisoara potrivit fost s...
1323     conducere natională partid popular delimitează...
                               ...                        
9313     fost palat comisie european Dunarii , actual s...
10431    moment asteptat profesor parinti sosit . inspe...
3068     anunt istoric Volkswagen - producator masin Ge...
6104        „ mijloc bun apară < < fac rau > > : seman . ”
3785     turn Blackpool luat foc , filmare trecatore ve...
Name: text, Length: 2399, dtype: object

In [10]:
f = open("knn-texts-tfidf.txt", "w")

for num_features in [500, 1000, 1500, 2000, 2500]:
    for num_estimators in [100, 150, 200]:
        print("=" * 50)
        f.write("=" * 50 + "\n")

        print(f"Testing for num_features = {num_features}, num_estimators = {num_estimators}")
        f.write(f"Testing for num_features = {num_features}, num_estimators = {num_estimators}\n")

        print("Creating text features...")
        f.write("Creating text features...\n")

        train_texts_featurized, validation_texts_featurized = get_features_1(train_texts, validation_texts, num_features)

        print("Training Random Forest...")
        f.write("Training Random Forest...\n")
        random_forest = RandomForestClassifier(
            n_estimators=num_estimators,
            random_state=42)
        random_forest.fit(train_texts_featurized, train_labels)

        print("Getting predictions...")
        f.write("Getting predictions...\n")
        predictions = random_forest.predict(validation_texts_featurized)
        
        print("Computing accuracy...")
        f.write("Computing accuracy...")
        accuracy = accuracy_score(validation_labels, predictions)

        print("Computing f1...")
        f.write("Computing f1...\n")
        f1 = f1_score(validation_labels, predictions, average='weighted')

        print(f"Accuracy: {accuracy}")
        f.write(f"Accuracy: {accuracy}")

        print(f"F1: {f1}")
        f.write(f"F1: {f1}\n")

Testing for num_features = 500, num_estimators = 100
Creating text features...
Training Random Forest...
Getting predictions...
Computing accuracy...
Computing f1...
Accuracy: 0.669445602334306
F1: 0.6714777129464439
Testing for num_features = 500, num_estimators = 150
Creating text features...
Training Random Forest...
Getting predictions...
Computing accuracy...
Computing f1...
Accuracy: 0.6706961233847436
F1: 0.674725150729272
Testing for num_features = 500, num_estimators = 200
Creating text features...
Training Random Forest...
Getting predictions...
Computing accuracy...
Computing f1...
Accuracy: 0.677365568987078
F1: 0.6814079834994059
Testing for num_features = 1000, num_estimators = 100
Creating text features...
Training Random Forest...
Getting predictions...
Computing accuracy...
Computing f1...
Accuracy: 0.7590662776156732
F1: 0.7597383193577909
Testing for num_features = 1000, num_estimators = 150
Creating text features...
Training Random Forest...
Getting predictions...
C

In [11]:
def get_features_2(train_text, validation_text, num_features=100, window=3, min_count=1, 
                   use_skipgram=True):

    train_tokens = [text.split() for text in train_text]
    validation_tokens = [text.split() for text in validation_text]
    
    # Train Word2Vec model
    model = Word2Vec(sentences=train_tokens, 
                     vector_size=num_features, 
                     window=window, 
                     min_count=min_count,
                     sg=1 if use_skipgram else 0)  # 1 for skip-gram, 0 for CBOW
    
    # Function to convert text to Word2Vec features
    def text_to_features(tokens):
        features = []
        for text in tokens:
            # Calculate mean word vector for the text
            text_vectors = [model.wv[word] for word in text if word in model.wv]
            
            # If no words found, use zero vector
            if not text_vectors:
                features.append(np.zeros(num_features))
            else:
                features.append(np.mean(text_vectors, axis=0))
        
        return np.array(features)
    
    # Convert train and validation texts to features
    train_features = text_to_features(train_tokens)
    validation_features = text_to_features(validation_tokens)
    
    return train_features, validation_features

In [12]:
f = open("knn-texts-word2vec.txt", "w")

for num_features in [500, 1000, 1500, 2000, 2500]:
    for num_estimators in [100, 150, 200]:
        print("=" * 50)
        f.write("=" * 50 + "\n")

        print(f"Testing for num_features = {num_features}, num_estimators = {num_estimators}")
        f.write(f"Testing for num_features = {num_features}, num_estimators = {num_estimators}\n")

        print("Creating text features...")
        f.write("Creating text features...\n")

        train_texts_featurized, validation_texts_featurized = get_features_2(train_texts, validation_texts, num_features)

        print("Training Random Forest...")
        f.write("Training Random Forest...\n")
        random_forest = RandomForestClassifier(
            n_estimators=num_estimators,
            random_state=42)
        random_forest.fit(train_texts_featurized, train_labels)

        print("Getting predictions...")
        f.write("Getting predictions...\n")
        predictions = random_forest.predict(validation_texts_featurized)
        
        print("Computing accuracy...")
        f.write("Computing accuracy...\n")
        accuracy = accuracy_score(validation_labels, predictions)

        print("Computing f1...")
        f.write("Computing f1...\n")
        f1 = f1_score(validation_labels, predictions, average='weighted')

        print(f"Accuracy: {accuracy}")
        f.write(f"Accuracy: {accuracy}\n")

        print(f"F1: {f1}")
        f.write(f"F1: {f1}\n")

Testing for num_features = 500, num_estimators = 100
Creating text features...
Training Random Forest...
Getting predictions...
Computing accuracy...
Computing f1...
Accuracy: 0.554814506044185
F1: 0.5464819506649203
Testing for num_features = 500, num_estimators = 150
Creating text features...
Training Random Forest...
Getting predictions...
Computing accuracy...
Computing f1...
Accuracy: 0.5523134639433097
F1: 0.5427678332931369
Testing for num_features = 500, num_estimators = 200
Creating text features...
Training Random Forest...
Getting predictions...
Computing accuracy...
Computing f1...
Accuracy: 0.5552313463943309
F1: 0.5474993232001415
Testing for num_features = 1000, num_estimators = 100
Creating text features...
Training Random Forest...
Getting predictions...
Computing accuracy...
Computing f1...
Accuracy: 0.5523134639433097
F1: 0.5422254017496816
Testing for num_features = 1000, num_estimators = 150
Creating text features...
Training Random Forest...
Getting predictions...