# Splitting Wav Files By Channel

## Imports

In [2]:
from scipy.io import wavfile
import re
import os
import pandas as pd
import textgrids
import glob

# Wav Files

## Get the wav files 

In [3]:
def get_wav_files():
    """Return the paths of the ``.wav`` files in ``original_en_diapix_data``.

    The folder is looked up inside the current working directory. The
    result is whatever ``glob.glob`` returns for ``*.wav`` in that folder,
    i.e. an empty list when nothing matches.
    """
    wav_pattern = "{}/original_en_diapix_data/*.wav".format(os.getcwd())
    return glob.glob(wav_pattern)

In [204]:
# Collect the paths of the wav files returned by get_wav_files().
my_wav_files = get_wav_files()

## Get Speaker Names

In [4]:
def get_file_names(file_name):
    r"""Extract the two speaker IDs (one per channel) from a file name.

    Speaker IDs are the substrings matching ``ENF?_\d\d`` (for example
    ``ENF_02``). The first match names channel 1; the first later match
    that differs from it names channel 2.

    Args:
        file_name: File name or path containing both speaker IDs.

    Returns:
        A ``(channel1, channel2)`` tuple of speaker ID strings.

    Raises:
        ValueError: If the name does not contain two distinct speaker IDs.
    """
    speaker_ids = re.findall(r"(ENF?_\d\d)", file_name)
    if not speaker_ids:
        raise ValueError(
            "no speaker ID found in file name: {!r}".format(file_name)
        )
    channel1 = speaker_ids[0]
    # Skip repeats of the first ID instead of leaving channel2 unassigned
    # when the ID immediately following it happens to be identical.
    channel2 = next((sid for sid in speaker_ids[1:] if sid != channel1), None)
    if channel2 is None:
        raise ValueError(
            "no second, distinct speaker ID found in file name: {!r}".format(file_name)
        )
    return channel1, channel2

## Move wav files to a new directory

In [5]:
def create_directory_move_file(file_to_be_moved, subdirectory_name="subdirectory_name"):
    """Move files from the current working directory into a subdirectory.

    The subdirectory is created if it does not exist yet, so the function
    can be called again for the same target without failing.

    Args:
        file_to_be_moved: Iterable of file names (relative to the current
            working directory). A single file name given as a string is
            also accepted.
        subdirectory_name: Name of the destination subdirectory.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(file_to_be_moved, str):
        file_to_be_moved = [file_to_be_moved]
    cwd = os.getcwd()
    os.makedirs(subdirectory_name, exist_ok=True)
    for item in file_to_be_moved:
        os.replace(cwd + '/' + item, cwd + '/' + subdirectory_name + '/' + item)

## Split and save the files to the correct folder

In [6]:
def split_save_wav(wav_file_list):
    """Split each two-channel wav file into one file per channel.

    For every input file the first two channels are written to
    ``split_wav_files_folder`` (inside the current working directory).
    Each output file is named after the speaker ID returned by
    ``get_file_names`` for that channel, with no extension added.

    Args:
        wav_file_list: Iterable of paths to wav files.

    Raises:
        ValueError: If a file does not contain at least two channels.
    """
    # Resolve the destination here rather than relying on a global `cwd`,
    # and make sure the folder exists before anything is written.
    output_dir = os.getcwd() + '/' + "split_wav_files_folder"
    os.makedirs(output_dir, exist_ok=True)
    for item in wav_file_list:
        fs, data = wavfile.read(item)
        # Column indexing below needs a 2-D array with one column per channel.
        if data.ndim != 2 or data.shape[1] < 2:
            raise ValueError(
                "{!r} does not contain two channels and cannot be split".format(item)
            )
        channel1, channel2 = get_file_names(item)
        # Write straight into the destination folder instead of writing to
        # the working directory and moving the file afterwards.
        wavfile.write(output_dir + '/' + channel1, fs, data[:, 0])   # first column = channel 1
        wavfile.write(output_dir + '/' + channel2, fs, data[:, 1])   # second column = channel 2

In [211]:
# Split every collected wav file into per-channel files.
split_save_wav(my_wav_files)

# Textgrids

## Function to get textgrids 

In [7]:
def get_textgrids_for_each_speaker():
    """Return the paths of the ``.TextGrid`` files to process.

    Files are searched for in ``original_en_diapix_data_changed_textgrids``
    inside the current working directory.
    """
    search_pattern = (
        os.getcwd()
        + "/original_en_diapix_data_changed_textgrids"
        + "/*.TextGrid"
    )
    return glob.glob(search_pattern)

In [8]:
# Matches the string form of one interval, e.g.
# <Interval text="word" xmin=0.0 xmax=1.5>; compiled once for both tiers.
_INTERVAL_PATTERN = re.compile(r"<Interval\stext=\"(.*)\"\sxmin=(.+)\sxmax=(.+)>")


def _intervals_into_columns(interval_df, prefix):
    """Replace column ``0`` of *interval_df* with text/xmin/xmax columns.

    Each entry of column ``0`` is converted with ``str()`` and parsed with
    ``_INTERVAL_PATTERN``. The new columns are named ``<prefix>_Text``,
    ``<prefix>_xmin`` and ``<prefix>_xmax``. An empty text becomes NaN.

    The data frame is modified in place and also returned.

    Raises:
        ValueError: If an entry does not look like an interval.
    """
    text = []
    xmin = []
    xmax = []
    for item in interval_df[0]:
        parsed = _INTERVAL_PATTERN.match(str(item))
        if parsed is None:
            raise ValueError("could not parse interval: {!r}".format(str(item)))
        label, start, end = parsed.groups()
        # float("nan") marks a missing label without requiring numpy,
        # which this file never imports.
        text.append(label if label != "" else float("nan"))
        xmin.append(float(start))
        xmax.append(float(end))
    interval_df[prefix + '_Text'] = text
    interval_df[prefix + '_xmin'] = xmin
    interval_df[prefix + '_xmax'] = xmax
    interval_df.drop(columns=[0], inplace=True)
    return interval_df


def make_mixed_into_lists(mixed_df):
    """Expand the word ("mixed") tier intervals into ``Word_*`` columns.

    Args:
        mixed_df: Data frame whose column ``0`` holds the intervals.

    Returns:
        The same data frame with ``Word_Text``, ``Word_xmin`` and
        ``Word_xmax`` columns in place of column ``0``.

    Raises:
        ValueError: If an entry cannot be parsed as an interval.
    """
    return _intervals_into_columns(mixed_df, 'Word')


def make_phone_into_lists(phone_df):
    """Expand the phone tier intervals into ``Phone_*`` columns.

    Args:
        phone_df: Data frame whose column ``0`` holds the intervals.

    Returns:
        The same data frame with ``Phone_Text``, ``Phone_xmin`` and
        ``Phone_xmax`` columns in place of column ``0``.

    Raises:
        ValueError: If an entry cannot be parsed as an interval.
    """
    return _intervals_into_columns(phone_df, 'Phone')

In [9]:
def _combine_word_and_phone_tiers(grid, word_tier, phone_tier):
    """Build one phone-level table for a channel, annotated with word info.

    Every phone row is left-joined to the word whose ``Word_xmin`` equals
    the phone's ``Phone_xmin``. Gaps are then forward-filled so that phones
    inside a word inherit the most recent word's columns.
    """
    word_df = make_mixed_into_lists(pd.DataFrame.from_dict(grid[word_tier]))
    phone_df = make_phone_into_lists(pd.DataFrame.from_dict(grid[phone_tier]))
    combined = phone_df.merge(
        word_df, how='left', left_on='Phone_xmin', right_on='Word_xmin'
    )
    # `fillna(method='ffill')` is deprecated in pandas; `ffill()` is the
    # equivalent call.
    # NOTE(review): the fill covers every column, so an empty Phone_Text
    # is also replaced by the previous phone label - confirm that is wanted.
    return combined.ffill()


def combine_dfs(grid):
    """Return the combined phone/word tables for both channels of *grid*.

    Args:
        grid: Mapping of tier names to interval sequences. The tiers
            ``'mixed'``/``'phone'`` (channel 1) and ``'mixed2'``/``'phone2'``
            (channel 2) are read.

    Returns:
        A ``(channel1_df, channel2_df)`` tuple of data frames with the
        ``Phone_*`` columns followed by the ``Word_*`` columns.
    """
    combined_df = _combine_word_and_phone_tiers(grid, 'mixed', 'phone')
    combined_df2 = _combine_word_and_phone_tiers(grid, 'mixed2', 'phone2')
    return combined_df, combined_df2

## Splitting and naming the textgrids 

In [10]:
def split_and_name_textgrids():
    """Write one CSV per speaker for every TextGrid file.

    Each file returned by ``get_textgrids_for_each_speaker`` is loaded,
    turned into two per-channel tables with ``combine_dfs``, and each table
    is saved in ``split_wav_files_folder`` (inside the current working
    directory) under the name ``<speaker ID>TextGrid``.
    """
    output_prefix = os.getcwd() + '/split_wav_files_folder/'
    for textgrid_path in get_textgrids_for_each_speaker():
        first_id, second_id = get_file_names(textgrid_path)
        first_df, second_df = combine_dfs(textgrids.TextGrid(textgrid_path))
        # Save channel 1 first, then channel 2, each under its speaker ID.
        for speaker_id, frame in ((first_id, first_df), (second_id, second_df)):
            frame.to_csv(output_prefix + speaker_id + "TextGrid")

In [None]:
# Convert every TextGrid file into per-speaker CSV files.
split_and_name_textgrids()

# Same process, but for the corrected_textgrid folder

In [11]:
# Collect the paths of the files ending in "TextGrid" inside the
# "corrected_textgrid" folder of the current working directory.
cwd = os.getcwd()
corrected_textgrid_names_list = glob.glob(cwd + "/corrected_textgrid/*TextGrid")

In [12]:
# Display the collected paths as the cell output.
corrected_textgrid_names_list

['/Users/gregfeliu/Desktop/Flatiron Bootcamp/Vowel Identifier/corrected_textgrid/DP_ENF_02_ENF_06_EN_ENF_02_DP_ENF_02_ENF_06_EN_ENF_06_corrected_both.TextGrid',
 '/Users/gregfeliu/Desktop/Flatiron Bootcamp/Vowel Identifier/corrected_textgrid/DP_ENF_02_ENF_06_EN_ENF_02_DP_ENF_02_ENF_06_EN_ENF_06_corrected_only_sp2.TextGrid']

In [13]:
# Build the per-channel tables for each corrected TextGrid file.
# After the loop, chan1/chan2 hold the tables of the last file only.
for file_name in corrected_textgrid_names_list:
    # Read the speaker IDs from the file being processed, not always from
    # the first entry of the list.
    channel1, channel2 = get_file_names(file_name)
    grid = textgrids.TextGrid(file_name)
    chan1, chan2 = combine_dfs(grid)
    # name_dfs correctly
    # save to csv (left disabled in this cell)
#     chan1.to_csv(cwd + '/split_wav_files_folder/' + channel1 + "TextGrid")
#     chan2.to_csv(cwd + '/split_wav_files_folder/' + channel2 + "TextGrid")

In [14]:
# Display the channel 1 table produced by the last loop iteration.
chan1

Unnamed: 0,Phone_Text,Phone_xmin,Phone_xmax,Word_Text,Word_xmin,Word_xmax
0,,0.000000,2.000000,,0.000000,2.000000
1,!SIL,2.000000,9.491760,!SIL,2.000000,9.457105
2,y,9.491760,9.557000,!SIL,2.000000,9.457105
3,N,9.557000,9.647000,!SIL,2.000000,9.457105
4,DD,9.647000,9.677000,!SIL,2.000000,9.457105
...,...,...,...,...,...,...
1353,K,520.686125,520.726125,!SIL,518.455909,520.326117
1354,EY,520.726125,520.886125,!SIL,518.455909,520.326117
1355,lg,520.886125,521.407062,<LG>,520.886125,521.407062
1356,!SIL,521.407062,525.604172,!SIL,521.407062,525.604172
