In [None]:

# Mount Google Drive at /content/drive (Google Colab runtime only); the dataset
# paths used by the later cells live under /content/drive/MyDrive.
from google.colab import drive
drive.mount("/content/drive")




[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hMounted at /content/drive


In [None]:
!pip install torchcodec


Collecting torchcodec
  Downloading torchcodec-0.9.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (11 kB)
Downloading torchcodec-0.9.0-cp312-cp312-manylinux_2_28_x86_64.whl (2.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m28.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torchcodec
Successfully installed torchcodec-0.9.0


In [None]:
import json
import os

# Manifest with one JSON object per line; each object is read for its
# "audio_file" (path relative to AUDIO_BASE_DIR) and "dialect" (label) keys.
AUDIO_JSONL_PATH = "/content/drive/MyDrive/dijalekti/audio.jsonl"
AUDIO_BASE_DIR   = "/content/drive/MyDrive"

# Parallel lists: audio_paths[i] is labelled with audio_dialects[i].
audio_paths = []
audio_dialects = []
# Manifest entries whose audio file is not on disk. They are still skipped,
# but are now recorded and reported instead of being dropped silently.
missing_audio_files = []


with open(AUDIO_JSONL_PATH, "r", encoding="utf-8") as f:
    for line in f:
        # Tolerate blank lines (e.g. a trailing newline) instead of letting
        # json.loads raise on an empty string.
        if not line.strip():
            continue
        obj = json.loads(line)
        # `or ""` maps both a missing key and an explicit null to "".
        rel = (obj.get("audio_file") or "").strip()
        d   = (obj.get("dialect") or "").strip()
        if rel == "" or d == "":
            continue
        full_path = os.path.join(AUDIO_BASE_DIR, rel)
        if os.path.exists(full_path):
            audio_paths.append(full_path)
            audio_dialects.append(d)
        else:
            missing_audio_files.append(full_path)

print("Вкупно аудио примероци (валидни фајлови):", len(audio_paths))
if missing_audio_files:
    print("Недостасуваат фајлови (прескокнати):", len(missing_audio_files))

# Fail with an explicit message rather than an IndexError on audio_paths[0].
if not audio_paths:
    raise RuntimeError("Нема пронајдени валидни аудио фајлови во " + AUDIO_JSONL_PATH)

print("Прво аудио:", audio_paths[0])
print("Дијалект:", audio_dialects[0])


Вкупно аудио примероци (валидни фајлови): 145
Прво аудио: /content/drive/MyDrive/dijalekti/audio_wav/гевгелиско-дојрански/богдански/Жител-на-Богданци-зборува-за-игоорна-група.wav
Дијалект: гевгелиско-дојрански


In [None]:
import librosa
import numpy as np
from tqdm import tqdm
from collections import Counter

# Tunables for this stage.
MAX_PER_DIALECT = 150   # upper bound on recordings kept per dialect
TARGET_SR = 16000       # default `sr` passed to librosa.load
MAX_DURATION = 600.0    # default `duration` passed to librosa.load

# Group the recording paths by dialect label (insertion order is preserved).
by_dialect = {}
for path, d in zip(audio_paths, audio_dialects):
    if d not in by_dialect:
        by_dialect[d] = []
    by_dialect[d].append(path)

print("Дијалекти и број на аудио примероци:")
for d, lst in by_dialect.items():
    print(f"{d} -> {len(lst)}")

# "Balancing" here only caps each dialect at its first MAX_PER_DIALECT paths;
# dialects with fewer recordings are kept as they are.
balanced_paths = []
balanced_labels = []
for d, lst in by_dialect.items():
    subset = lst[:MAX_PER_DIALECT]
    balanced_paths += subset
    balanced_labels += [d] * len(subset)

print("После балансирање:", len(balanced_paths), "примероци")

def extract_mfcc(path, sr=TARGET_SR, max_duration=MAX_DURATION, n_mfcc=20):
    """Return the per-coefficient mean of the MFCCs of one audio file.

    Args:
        path: Audio file to load.
        sr: Sampling rate passed to ``librosa.load``.
        max_duration: Passed to ``librosa.load`` as ``duration``.
        n_mfcc: Number of MFCC coefficients to compute.

    Returns:
        The MFCC matrix averaged over axis 1, i.e. one value per coefficient.
    """
    # Use the rate returned by the loader for the MFCC computation.
    signal, rate = librosa.load(path, sr=sr, mono=True, duration=max_duration)
    coeffs = librosa.feature.mfcc(y=signal, sr=rate, n_mfcc=n_mfcc)
    return coeffs.mean(axis=1)

# Feature vectors and their labels, filled in lockstep so they stay aligned.
X_audio_feats = []
y_audio_labels = []

path_label_pairs = list(zip(balanced_paths, balanced_labels))
for path, d in tqdm(path_label_pairs, desc="Извлекување MFCC"):
    try:
        feat = extract_mfcc(path)
    except Exception as e:
        # Best effort: report the failing file and carry on with the rest.
        print(f"Проблем со {path} -> {e}")
    else:
        X_audio_feats.append(feat)
        y_audio_labels.append(d)

# Stack the per-file vectors into a single array for the later cells.
X_audio_feats = np.array(X_audio_feats)
print("X_audio_feats shape:", X_audio_feats.shape)
print("Примери:", len(y_audio_labels))


Дијалекти и број на аудио примероци:
гевгелиско-дојрански -> 7
малешевско-пирински -> 5
кратовски -> 3
кривопаланечки -> 2
прилепско-битолски -> 31
охридски -> 5
кичевско-поречки -> 2
гостиварски(горнополошки) -> 1
струмичко-радовишки -> 5
штипско-кочански -> 33
овчеполски -> 2
кумановски -> 8
скопскоцрногорски -> 1
скопско-велешки -> 18
тетовски(долнополошки) -> 6
дримколско-голобрдски -> 1
вевчанско-радошки -> 3
тиквешко-мариовски -> 12
После балансирање: 145 примероци


Извлекување MFCC: 100%|██████████| 145/145 [02:11<00:00,  1.10it/s]

X_audio_feats shape: (145, 20)
Примери: 145





In [None]:
from collections import Counter

# Dialects with fewer than MIN_SAMPLES extracted recordings are dropped.
MIN_SAMPLES = 5
cnt = Counter(y_audio_labels)

keep = set()
for d, c in cnt.items():
    if c >= MIN_SAMPLES:
        keep.add(d)

# Select the (feature, label) pairs of the surviving dialects in one pass,
# then split them back into the two parallel lists.
kept_pairs = [
    (x, lbl)
    for x, lbl in zip(X_audio_feats, y_audio_labels)
    if lbl in keep
]
X_filt = [x for x, _ in kept_pairs]
y_filt = [lbl for _, lbl in kept_pairs]

# Both names are rebound to the filtered data as NumPy arrays.
X_audio_feats = np.array(X_filt)
y_audio_labels = np.array(y_filt)

print("После филтрирање:", Counter(y_audio_labels))



После филтрирање: Counter({np.str_('штипско-кочански'): 33, np.str_('прилепско-битолски'): 31, np.str_('скопско-велешки'): 18, np.str_('тиквешко-мариовски'): 12, np.str_('кумановски'): 8, np.str_('гевгелиско-дојрански'): 7, np.str_('тетовски(долнополошки)'): 6, np.str_('малешевско-пирински'): 5, np.str_('охридски'): 5, np.str_('струмичко-радовишки'): 5})


In [None]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import numpy as np

# Encode the dialect names as integer class ids.
le_audio = LabelEncoder()
y_audio = le_audio.fit_transform(y_audio_labels)

# Split BEFORE scaling so the scaler never sees test rows (fitting it on the
# full matrix leaks test-set statistics into training).
# stratify=y_audio keeps every dialect represented in both partitions; without
# it the small classes can end up with zero test samples.
# Stratification needs at least 2 samples per class and a test partition with
# at least as many rows as there are classes, otherwise train_test_split raises.
X_train_raw_a, X_test_raw_a, y_train_a, y_test_a = train_test_split(
    X_audio_feats,
    y_audio,
    test_size=0.15,
    random_state=42,
    stratify=y_audio
)

# Standardisation statistics come from the training partition only.
scaler = StandardScaler()
X_train_a = scaler.fit_transform(X_train_raw_a)
X_test_a = scaler.transform(X_test_raw_a)

# Whole matrix under the train-fitted scaler; name kept for any later cell that
# still refers to X_audio_scaled.
X_audio_scaled = scaler.transform(X_audio_feats)

print("Train аудио:", X_train_a.shape[0], "Test аудио:", X_test_a.shape[0])
print("Број класи (аудио):", len(le_audio.classes_))

# Random forest on the audio features. max_features is not passed, so the
# estimator's default applies.
audio_clf = RandomForestClassifier(
    n_estimators=1000,
    max_depth=20,
    min_samples_leaf=2,
    class_weight="balanced_subsample",
    n_jobs=-1,
    random_state=42,
)
audio_clf.fit(X_train_a, y_train_a)

# Evaluate on the held-out partition.
y_pred_a = audio_clf.predict(X_test_a)
acc_a = accuracy_score(y_test_a, y_pred_a)
print("Accuracy:", acc_a)

print("=== AUDIO MODEL – Test report ===")
# Pass every class id explicitly so classes absent from the test partition
# still get a row in the report.
class_ids = list(range(len(le_audio.classes_)))
report_a = classification_report(
    y_test_a,
    y_pred_a,
    labels=class_ids,
    target_names=le_audio.classes_,
    zero_division=0,
)
print(report_a)


Train аудио: 110 Test аудио: 20
Број класи (аудио): 10
Accuracy: 0.65
=== AUDIO MODEL – Test report ===
                        precision    recall  f1-score   support

  гевгелиско-дојрански       0.00      0.00      0.00         1
            кумановски       0.00      0.00      0.00         1
   малешевско-пирински       0.00      0.00      0.00         0
              охридски       0.00      0.00      0.00         0
    прилепско-битолски       0.83      0.83      0.83         6
       скопско-велешки       0.50      0.50      0.50         4
   струмичко-радовишки       0.00      0.00      0.00         0
тетовски(долнополошки)       1.00      1.00      1.00         1
    тиквешко-мариовски       0.00      0.00      0.00         0
      штипско-кочански       0.83      0.71      0.77         7

              accuracy                           0.65        20
             macro avg       0.32      0.30      0.31        20
          weighted avg       0.69      0.65      0.67        2