In [1]:
import pandas as pd
import os

In [2]:
def get_noise_list(labels):
    """
    A function that generates a list of tuples that represents the start and end of noises.
    :param labels: the string that represents labels
    :return: a list of tuples that contain (start, end) of each noises.
    """
    starts = list()
    ends = list()
    
    for i in labels:
        starts.append(i['start'])
        ends.append(i['end'])
    
    noises = list()
    noises.append((0, starts[0])) # The start of file to starts[0] is considered a noise

    for i in range(len(ends)):
        try:
            noises.append((ends[i], starts[i + 1]))
        except IndexError:  # If index error, that means there is no more noise left
            pass

    noises.append((ends[len(ends) - 1], -1))  # The last end to the end of the file is considered a noise
    return noises

In [3]:
def generate_noise_df(df):
    """
    A function that generates a pandas.DataFrame that contains all start and ends of noises.
    :param df: a pandas.DataFrame object that represents the original dataframe
    :return: return a DataFrame object that includes all noise starts and ends.
    """
    file_names = list()
    start = list()
    end = list()
    class_name = list()
    
    for i in range(len(df)):
        cur_df = df.loc[i]
        noise_list = get_noise_list(eval(cur_df['labels']))
        cur_file_name = os.path.join(os.getcwd(), cur_df['audio'][1:])
        
        for noise in noise_list:
            start.append(noise[0])
            end.append(noise[1])
            class_name.append("noise")
            file_names.append(cur_file_name)

    modified_df = pd.DataFrame()
    modified_df['file_name'] = file_names
    modified_df['start'] = start
    modified_df['end'] = end
    modified_df['class'] = class_name
    
    return modified_df

In [4]:
def modify_df(file_name):
    """
    A function that reads file in csv format which was exported by LabelStudio.
    This function will transform given csv into dataframe that is more suitable for our sound classification.
    :param file_name: the string object that represents file name
    :returns: pandas.DataFrame object that represents data from the csv turned into our style.
    """
    data = pd.read_csv(file_name)
    
    file_names = list()
    start = list()
    end = list()
    class_name = list()

    for i in range(len(data)):
        instance = data.loc[i]
        cur_file_name = os.path.join(os.getcwd(), instance['audio'][1:])
        cur_class = eval(instance['intent'])  # Each values are recorded as str, so need eval
        cur_time_lines = eval(data.loc[i]['labels'])

        for j in range(len(cur_class)):
            cur_start = float(cur_time_lines[j]['start'])
            cur_end = float(cur_time_lines[j]['end'])

            start.append(cur_start)
            end.append(cur_end)
            class_name.append(cur_class[j])
            file_names.append(cur_file_name)

    modified_df = pd.DataFrame()
    modified_df['file_name'] = file_names
    modified_df['start'] = start
    modified_df['end'] = end
    modified_df['class'] = class_name
    
    noise_df = generate_noise_df(data)
    modified_df = modified_df.append(noise_df)
    
    return modified_df

In [5]:
from PIL import Image
from pydub import AudioSegment
import sqlite3
import string
import os

def generate_db():
    """
    A function that generates training_data/data.db file and initializes the table values.
    :return: None
    """
    conn = sqlite3.connect("./training_data/data.db")
    cur = conn.cursor()
    conn.execute("CREATE TABLE data(data_name TEXT, data_count INTEGER)")
    
    default_data = [(x, 0) for x in string.ascii_lowercase]
    default_data = default_data + [(str(x), 0) for x in range(0, 10, 1)]
    default_data.append(("noise", 0))
    
    cur.executemany("INSERT INTO data VALUES(?, ?)", default_data)
    
    conn.commit()
    conn.close()
    
def get_data_count(data_name):
    """
    A function that gets data count from data.db for specific data_name
    :param data_name: the data_name to look for data counts
    :return: returns count of data_count value for provided data_name, returns -1 if data_name does not exist.
    """
    conn = sqlite3.connect("./training_data/data.db")
    cur = conn.cursor()
    cur.execute("SELECT data_count FROM data WHERE data_name=\"" + data_name + "\"")
    rows = cur.fetchall()
    try:
        return rows[0][0]
    except IndexError:
        return -1
    
    conn.close()
    
def update_data_count(data_name):
    """
    A function that adds 1 to the current data_count for specified data_name
    :param data_name: the data_name to update values into
    :return: None
    """
    before_count = get_data_count(data_name)    
    new_count = before_count + 1
    
    conn = sqlite3.connect("./training_data/data.db")
    cur = conn.cursor()
    cur.execute("UPDATE data SET data_count=" + str(new_count) + " WHERE data_name=\"" + data_name + "\"")
    conn.commit()
    conn.close()

def generate_all_directories():
    """
    A function that generates training_data directories.
    This will generate trainig_data/a~z and 0~9 directories.
    :return: None
    """
    dir_names = [x for x in string.ascii_lowercase]
    dir_names = dir_names + [str(x) for x in range(0, 10, 1)]
    dir_names.append("noise")


    training_data_path = os.path.join(os.getcwd(), "training_data")

    if not os.path.exists(training_data_path):  # generate training_data directory
        os.mkdir(training_data_path)
    
    for i in dir_names:  # generate subdirectories such as a ~ z, 0 ~ 9
        new_dir= os.path.join(training_data_path, i)
        if not os.path.exists(new_dir):
            os.mkdir(new_dir)

def process_df(df):
    """
    A function that splits each sound classifications from dataframe into different files
    :param df: the pandas.DataFrame object that represents single row of information.
    """
    file_name = df['file_name']
    new_audio = AudioSegment.from_wav(file_name)
    
    start = df['start'] * 1000
    
    if df['end'] == -1:
        end = len(new_audio)
    else:
        end = df['end'] * 1000
    class_name = df['class']
    
    tmp_audio = new_audio[start:end]
    
    save_dir = os.path.join(os.getcwd(), "training_data", class_name)
    new_file_name = str(get_data_count(class_name)) + ".wav"
    new_file_name = os.path.join(save_dir, new_file_name)
    update_data_count(class_name)
    tmp_audio.export(new_file_name, format="wav")

In [6]:
df = modify_df("project-3-at-2022-06-23-07-16-c998cdd4.csv")

In [7]:
df

Unnamed: 0,file_name,start,end,class
0,/home/gooday2die/projects/Anti-Captcha-Sound/d...,0.018023,0.702913,5
1,/home/gooday2die/projects/Anti-Captcha-Sound/d...,1.111443,1.730247,t
2,/home/gooday2die/projects/Anti-Captcha-Sound/d...,1.850403,2.517269,j
3,/home/gooday2die/projects/Anti-Captcha-Sound/d...,2.865722,3.640728,t
4,/home/gooday2die/projects/Anti-Captcha-Sound/d...,3.893056,4.620000,n
...,...,...,...,...
1192,/home/gooday2die/projects/Anti-Captcha-Sound/d...,0.560398,1.251121,noise
1193,/home/gooday2die/projects/Anti-Captcha-Sound/d...,1.883198,2.515274,noise
1194,/home/gooday2die/projects/Anti-Captcha-Sound/d...,2.906250,3.440583,noise
1195,/home/gooday2die/projects/Anti-Captcha-Sound/d...,3.948850,4.378923,noise


In [8]:
df

Unnamed: 0,file_name,start,end,class
0,/home/gooday2die/projects/Anti-Captcha-Sound/d...,0.018023,0.702913,5
1,/home/gooday2die/projects/Anti-Captcha-Sound/d...,1.111443,1.730247,t
2,/home/gooday2die/projects/Anti-Captcha-Sound/d...,1.850403,2.517269,j
3,/home/gooday2die/projects/Anti-Captcha-Sound/d...,2.865722,3.640728,t
4,/home/gooday2die/projects/Anti-Captcha-Sound/d...,3.893056,4.620000,n
...,...,...,...,...
1192,/home/gooday2die/projects/Anti-Captcha-Sound/d...,0.560398,1.251121,noise
1193,/home/gooday2die/projects/Anti-Captcha-Sound/d...,1.883198,2.515274,noise
1194,/home/gooday2die/projects/Anti-Captcha-Sound/d...,2.906250,3.440583,noise
1195,/home/gooday2die/projects/Anti-Captcha-Sound/d...,3.948850,4.378923,noise


In [16]:
df.loc[0]

Unnamed: 0,file_name,start,end,class
0,/home/gooday2die/projects/Anti-Captcha-Sound/d...,0.018023,0.702913,5
0,/home/gooday2die/projects/Anti-Captcha-Sound/d...,0.0,0.018023,noise


In [9]:
try:
    generate_db()
    print("[+] DB not found, generating a new one")
except sqlite3.OperationalError:  # when db already exists
    pass
generate_all_directories()

for i in range(len(df)):
    print("[+] Processing file " + str(i + 1) + " / " + str(len(df)) + " : " + df.loc[i]['file_name'])
    process_df(df.loc[i])

0    [+] Processing file 1 / 2194 : /home/gooday2di...
0    [+] Processing file 1 / 2194 : /home/gooday2di...
Name: file_name, dtype: object


AttributeError: 'Series' object has no attribute 'seek'

In [179]:
from IPython.display import Audio 
#Audio("/home/gooday2die/projects/Anti-Captcha/sounddooo/sounds/training_data/j/0.wav", autoplay=True)