In [1]:
import pandas as pd
df_train = pd.read_csv('data/train.csv')

In [2]:
print(df_train.head())

                  ID                            wav  \
0   11_15:19_g6j.wav   audio_files/11_15:19_g6j.wav   
1   80_15:03_vsh.wav   audio_files/80_15:03_vsh.wav   
2  136_01:37_y9b.wav  audio_files/136_01:37_y9b.wav   
3   21_00:23_ss0.wav   audio_files/21_00:23_ss0.wav   
4   43_21:04_cxx.wav   audio_files/43_21:04_cxx.wav   

                                                 wrd  duration  
0  اسمي يمينة واخترت هذه الصورة لاني جزائرية و اق...      8.58  
1  وضحونا أمورنا على الاقل لجيل كامل قادم للتعليق...      8.64  
2                   فماشي حوانيت شاملة قريبة مه هوني      4.26  
3                 فما بارشا عباد في الرحلات السياحية      2.52  
4        شركة الطيران باش تدفعلي شراء حوايجي الشخصية      3.66  


In [5]:
print(df_train.shape)
print(df_train.columns)

(2128, 4)
Index(['ID', 'wav', 'wrd', 'duration'], dtype='object')


In [24]:
print(df_train.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2128 entries, 0 to 2127
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   ID        2128 non-null   object 
 1   wav       2128 non-null   object 
 2   wrd       2128 non-null   object 
 3   duration  2128 non-null   float64
dtypes: float64(1), object(3)
memory usage: 66.6+ KB
None


In [19]:
for col in df_train.columns:
    print(f"Column {col}: Unique Values: {df_train[col].nunique()}")
    print(f"Missing values in {col}:", df_train[col].isna().sum())
    print('==============')

Column ID: Unique Values: 2128
Missing values in ID: 0
Column wav: Unique Values: 2128
Missing values in wav: 0
Column wrd: Unique Values: 2128
Missing values in wrd: 0
Column duration: Unique Values: 156
Missing values in duration: 0


In [25]:
print(df_train.describe())

          duration
count  2128.000000
mean      4.215115
std       2.128777
min       0.240000
25%       2.640000
50%       3.840000
75%       5.580000
max      40.440000


In [28]:
mean_value = df_train['duration'].mean()
values_below_mean = df_train[df_train['duration'] < mean_value]['duration'].count()
total_values = df_train['duration'].count()
percentage_below_mean = (values_below_mean / total_values) * 100
print(f"Percentage of values LESS than the mean in {'duration'}: {percentage_below_mean:.2f}%")
print('==============')
print(f"Percentage of values MORE than the mean in {'duration'}: {(100 - percentage_below_mean):.2f}%")

Percentage of values LESS than the mean in duration: 55.08%
Percentage of values MORE than the mean in duration: 44.92%


In [29]:
mean_value = 30
values_below_mean = df_train[df_train['duration'] < mean_value]['duration'].count()
total_values = df_train['duration'].count()
percentage_below_mean = (values_below_mean / total_values) * 100
print(f"Percentage of values LESS than the mean in {'duration'}: {percentage_below_mean:.2f}%")
print('==============')
print(f"Percentage of values MORE than the mean in {'duration'}: {(100 - percentage_below_mean):.2f}%")

Percentage of values LESS than the mean in duration: 99.95%
Percentage of values MORE than the mean in duration: 0.05%


In [37]:
selected_rows = df_train[df_train['duration'] >= 30]
indices = selected_rows.index.tolist()
print(f'There is {len(indices)} row(s) with duration >= 30, located at: {indices}')

There is 1 row(s) with duration >= 30, located at: [952]


In [45]:
import os
processed_segments_folder = 'data/processed_segments'
transcripts_folder = os.path.join(processed_segments_folder, 'transcripts')

for index, row in df_train.iterrows():
    file_id = str(row['ID']).replace('.wav', '').replace(':', '_')
    
    wrd_content = str(row['wrd'])
    
    txt_file_path = os.path.join(transcripts_folder, f"{file_id}.txt")
    
    with open(txt_file_path, 'w', encoding='utf-8') as f:
        f.write(wrd_content)

file_count = len([f for f in os.listdir(transcripts_folder) if os.path.isfile(os.path.join(transcripts_folder, f))])
print(f"Transcripts saved correctly in {transcripts_folder} and the number of files is: {file_count}")

Transcripts saved correctly in data/processed_segments\transcripts and the number of files is: 2128


In [51]:
audio_files_folder = 'data/processed_segments/audio_files'
file_count = len([f for f in os.listdir(audio_files_folder) if os.path.isfile(os.path.join(audio_files_folder, f))])
print(f"Transcripts saved correctly in {audio_files_folder} and the number of files is: {file_count}")

Transcripts saved correctly in data/processed_segments/audio_files and the number of files is: 2240


In [None]:
import os
import shutil

transcripts_folder = 'data/processed_segments/transcripts'
audio_files_folder = 'data/processed_segments/audio_files'
audio_testing_folder = 'data/audio_testing'

transcript_ids = {f.replace('.txt', '') for f in os.listdir(transcripts_folder) if f.endswith('.txt')}
audio_files = [f for f in os.listdir(audio_files_folder) if f.endswith('.wav')]

training_count = 0
testing_count = 0
for audio_file in audio_files:
    # Get the audio file ID (remove .wav and replace colons with underscores)
    audio_id = audio_file.replace('.wav', '').replace(':', '_')
    
    if audio_id in transcript_ids:
        training_count += 1
    else:      
        source_path = os.path.join(audio_files_folder, audio_file)
        dest_path = os.path.join(audio_testing_folder, audio_file)
        shutil.move(source_path, dest_path)
        testing_count += 1

print(f"Summary:")
print(f"Training audio files kept in {audio_files_folder}: {training_count}")
print(f"Testing audio files moved to {audio_testing_folder}: {testing_count}")

Summary:
Training audio files kept in data/processed_segments/audio_files: 2128
Testing audio files moved to data/audio_testing: 112


In [53]:
dev_df = pd.read_csv('data/dev.csv')

audio_files = [f for f in os.listdir(audio_files_folder) if f.endswith('.wav')] if os.path.exists(audio_files_folder) else []
audio_testing = [f for f in os.listdir(audio_testing_folder) if f.endswith('.wav')] if os.path.exists(audio_testing_folder) else []

audio_files_set = set(audio_files)
audio_testing_set = set(audio_testing)

in_audio_files = 0
in_audio_testing = 0
not_found = 0
results = []
for index, row in dev_df.iterrows():

    audio_id_normalized = str(row['ID']).replace('.wav', '').replace(':', '_')
    audio_id_for_comparison = audio_id_normalized.replace('_', ':') + '.wav'
    
    if audio_id_for_comparison in audio_files_set:
        results.append(f"{audio_id_for_comparison}: Found in {audio_files_folder} (training)")
        in_audio_files += 1
    elif audio_id_for_comparison in audio_testing_set:
        results.append(f"{audio_id_for_comparison}: Found in {audio_testing_folder} (testing)")
        in_audio_testing += 1
    else:
        results.append(f"{audio_id_for_comparison}: Not found in either folder")
        not_found += 1


print(f"Summary:")
print(f"Audio files in {audio_files_folder} (training): {in_audio_files}")
print(f"Audio files in {audio_testing_folder} (testing): {in_audio_testing}")
print(f"Audio files not found in either folder: {not_found}")

Summary:
Audio files in data/processed_segments/audio_files (training): 0
Audio files in data/audio_testing (testing): 0
Audio files not found in either folder: 67


In [57]:
all_text = ' '.join(df_train['wrd'].astype(str))
words = all_text.split()
nan_count = words.count('nan')
print(f"Total nan words: {nan_count}")
if nan_count > 0:
    print(f"Found {nan_count} 'nan' strings in the word list, removing them.")
    words = [word for word in words if word != 'nan']
unique_words = set(words)
print(f"Total number of words (after removing 'nan'): {len(words)}")
print(f"Number of unique words: {len(unique_words)}")
print(f"Unique words: {unique_words}")

Total nan words: 0
Total number of words (after removing 'nan'): 17683
Number of unique words: 6615
Unique words: {'للشباب', 'المكتوب', 'حجزك', 'اثناش', 'وشكون', 'تاتفهم', 'براغتون', 'ثورة', 'القطن', 'الخطر', 'دبر', 'غلة', 'متحدين', 'للاصلاح', 'دراسته', 'الكرم', 'الليسونس', 'موتتها', 'بسوم', 'ضل', 'تهضمهمش', 'الببيت', 'السخانة', 'همه', 'بالمقلوب', 'قريتوهم', 'نكمنديو', 'فقد', 'أني', 'الضاحية', 'لادنبرة', 'متع', 'دورها', 'الملح', 'ذلك', 'جمبو', 'طوا', 'رايكم', 'للبحر', 'بالي', 'الحالي', 'تعدي', 'المطعم', 'تحديدا', 'ساوث', 'بمية', 'تقدمها', 'يفطروا', 'نسهروا', 'مكانش', 'بالحديث', 'وصغارها', 'بقداش', 'نكره', 'تلوات', 'فهم', 'الذيوبة', 'المينو', 'فساد', 'تخاف', 'بالامان', 'نكرهم', 'نحسوه', 'ماشية', 'ملفاتي', 'اذهب', 'غيري', 'مقتبل', 'داخل', 'صطمبالي', 'طحان', 'الوالد', 'الظروف', 'المالية', 'لولاية', 'والكلمة', 'الجزائر', 'التوالات', 'خلي', 'كشكول', 'كويس', 'والثنية', 'أخر', 'قشرتها', 'والتوانسة', 'نتعدا', 'تكسي', 'نساووه', 'هيستيريا', 'غري', 'منه', 'لصحابي', 'حالة', 'سفيرها', 'هنا', 'يستغر