# Converting SB Data

Here, we convert the unorganized .txt data into a workable format to feed into our classifier. The .csv files output by this notebook have the format:

`speaker_name, dialect, content`. 

## Important: running the code

This notebook is neither necessary nor designed for end users to run. Rather, it is intended for developers who wish to modify the way in which the data is formatted. The processed data already in the `../data/SBData/dialectsdata` folder is in the proper format. __Only run this notebook if you need to change how the data is structured and stored, and do not push modified data files to master unless the change will be beneficial globally.__ 


In [13]:
import csv
import pandas as pd
import string
import nltk
from nltk import RegexpTokenizer

In [14]:
'''
Convert the SB .trn data to .txt data
'''
# Copy every SBC transcript from the TRN folder to the TXT folder, changing
# only the file extension. File numbers are zero-padded to three digits
# (SBC001 ... SBC059); transcripts 37 and 60 are skipped.
for i in range(1, 61):
    if i in (37, 60):
        continue
    stem = "SBC{:03d}".format(i)
    with open("../data/SBData/TRN/" + stem + ".trn", "r") as src:
        text = src.read()
    # A context manager guarantees the output handle is closed even if the
    # write fails part-way through.
    with open("../data/SBData/TXT/" + stem + ".txt", "w") as dst:
        dst.write(text)

In [10]:
# Reading the metadata CSV (cleanFile() below reads the global `df`, so uncomment this if `df` is not already loaded):
# df = pd.read_csv("../data/SBData/dialectsdata/metadata4a.csv")
# df

# Rewriting the CSV files in a proper format (uncomment if necessary):
# with open('../data/SBData/dialectsdata/metadata4a.csv', 'w') as ma:
#     with open('../data/SBData/dialectsdata/metadata4.csv') as m:
#         for line in m.readlines():
#             write = csv.writer(ma, dialect='excel')
#             write.writerow(line.split(','))

In [24]:
def cleanFile(txt_path='../data/SBData/TXT/SBC054.txt',
              csv_path='../data/SBData/dialectsdata/dialects054.csv',
              states=None, names=None, start=0):
    """Group a transcript's lines into speaker turns and write them to a CSV.

    Every line is tokenised with ``cleanLine``. A line whose first token is a
    known speaker name different from the current speaker starts a new turn;
    all other lines are appended to the current turn. Each finished turn is
    written as one row ``[speaker_name, state, content]``, where ``content``
    is the turn's tokens joined by single spaces.

    Args:
        txt_path: transcript file to read. Pass a different path instead of
            editing the function when moving on to the next transcript.
        csv_path: CSV file to create (an existing file is overwritten).
        states: mapping of speaker name -> geographical origin. Speakers that
            are missing from the mapping are written with 'NA'. Defaults to
            the speakers of SBC054.
        names: iterable of known speaker names. Defaults to the ``NAME``
            column of the notebook-level metadata frame ``df``.
        start: index of the first transcript line to process. Use it to skip
            leading lines spoken by people who are not in the metadata, so
            that processing begins at the first known speaker.
    """
    if states is None:
        states = {'CYNTHIA': 'IL', 'AUD': 'NA', 'MANY': 'NA'}
    if names is None:
        names = df['NAME']
    # Build the lookup once; membership is tested for every transcript line.
    known = set(names)

    with open(txt_path) as f:
        data = f.readlines()

    # newline='' lets the csv module control line endings itself, which avoids
    # blank rows on platforms that translate '\n' on write.
    with open(csv_path, 'w', newline='') as r:
        writer = csv.writer(r, dialect='excel')
        speaker = None  # speaker of the turn currently being collected
        words = []      # tokens of that turn, without the speaker label

        for line in data[start:]:
            tokens = cleanLine(line)
            if not tokens:
                continue  # nothing left after cleaning

            head = tokens[0]
            if speaker is None:
                # The first usable line is the starting point of the dialogue:
                # its first token is taken as the speaker label.
                speaker, words = head, tokens[1:]
            elif head in known and head != speaker:
                # Speaker switch: emit the finished turn BEFORE starting the
                # new one, so the row carries the previous speaker's name.
                writer.writerow([speaker, states.get(speaker, 'NA'), ' '.join(words)])
                speaker, words = head, tokens[1:]
            elif head == speaker:
                # Same speaker labelled again: continue the turn and keep the
                # label out of the content.
                words.extend(tokens[1:])
            else:
                # Continuation line without a speaker label.
                words.extend(tokens)

        # Flush the last turn; no later speaker switch exists to trigger it.
        if speaker is not None:
            writer.writerow([speaker, states.get(speaker, 'NA'), ' '.join(words)])

def formatWord(w):
    """Return ``w`` with numeric characters and transcription marks removed.

    The characters dropped are every character for which ``str.isnumeric()``
    is true, plus ``=``, ``[``, ``]``, ``~``, ``(`` and ``)``. All other
    characters are kept in their original order.
    """
    unwanted = ("=", "[", "]", "~", "(", ")")  # add more edge cases as needed
    kept = (ch for ch in w if not (ch.isnumeric() or ch in unwanted))
    return "".join(kept)

def cleanLine(line):
    """Split one transcript line into cleaned word tokens.

    Steps, in order:
      1. split on whitespace and strip leading/trailing punctuation from
         every token;
      2. drop the first two tokens (the two numbers that start every line,
         e.g. ``0.00 9.21``);
      3. run ``formatWord`` on every token that is not purely alphabetic and
         strip punctuation again, because removing digits/brackets can expose
         punctuation at the edges (``1.2`` -> ``.``);
      4. drop tokens that ended up empty.

    Returns:
        A list of non-empty strings; the speaker label, if the line has one,
        is the first element.
    """
    tokens = [word.strip(string.punctuation) for word in line.split()]
    tokens = tokens[2:]  # remove first two numbers in every line

    cleaned = []
    for token in tokens:
        if token and not token.isalpha():
            token = formatWord(token).strip(string.punctuation)
        # Filter empties AFTER formatting: a token made only of digits or
        # brackets becomes '' here and must not reach the output.
        if token:
            cleaned.append(token)
    return cleaned

In [25]:
# Run the conversion for the transcript currently configured in cleanFile().
cleanFile()

In [17]:
# Sanity check: print the first raw line of SBC001 followed by the token
# list that cleanLine() produces for it.
with open('../data/SBData/TXT/SBC001.txt') as transcript:
    data = transcript.readlines()
first_line = data[0]
print(first_line)
print(cleanLine(first_line))

0.00 9.21	LENORE: 	... So you don't need to go ... borrow equipment from anybody,

['LENORE', 'So', 'you', "don't", 'need', 'to', 'go', 'borrow', 'equipment', 'from', 'anybody']
