# Data Cleaning and Reformatting

In this file, the merged file containing all of the data collected in this project - which can be found at ../csv_files/mass_merge.csv - will be cleaned. The objective of doing so is to make the data easier to read, and to format certain columns with html code so it can be read directly into the website in the ../docs/index.js file. 

In [266]:
import pandas as pd 
import regex as re

In [267]:
mass_data = pd.read_csv('../csv_files/mass_merge.csv', index_col=0)
mass_data = mass_data.dropna(subset = ['iso6393'])
# mass_data 

In [268]:
def formatting(row):
    if str(row) != 'nan':

        # replace any instance within a string of a set of brackets which contain 1 or more digits with an empty string
        row = re.sub(r'\[[0-9]+\]', '', row)
        # replace any instance within a string of a set of brackets which contain 1 or more letters with an empty string
        row = re.sub(r'\[[a-zA-Z]+\]', '', row)
        # replace any instance within a string of a set of characters beginning with either a \ or ? and ending with a newline command with an empty string, including the \ or ? and newline command
        row = re.sub(r'\\n|\?\\n', '', row)
        # replace any instance within a string of a set of characters ', '', ' with a , 
        row = re.sub(r'\', \'\', \'', ', ', row)
        # replace any instance within a string of a set of characters ', ' with a , 
        row = re.sub(r'\'+, \'', ', ', row)
        # replace any instance within a string of a [' with an empty string
        row = re.sub(r'\[\'', '', row)
        # replace any instance within a string of a '] with an empty string
        row = re.sub(r'\'\]', '', row)

        return row
    
mass_data['family'] = mass_data['family'].apply(formatting)
mass_data['dialects'] = mass_data['dialects'].apply(formatting)
# mass_data

In [269]:
def formatting(row):
    if str(row) != 'NaN':

        # replace any instance of spaces with a comma followed by a space
        row = re.sub(r'\s', ', ', str(row))

        return row

mass_data['country_ids'] = mass_data['country_ids'].apply(formatting)
# mass_data['country_ids']

In [270]:
def create_hierarchy(txt):
    txt = str(txt).split(', ')
    newtxt = ''

    for i in range(len(txt)): 
        newtxt = newtxt + txt[i] + '\n' 

    outtxt = ''
    newtxt = newtxt.split('\n')[:-1]
    for i in range(len(newtxt)):
        outtxt = outtxt + f'<ul><li>{newtxt[i]}</li>'
    num_indents = len(newtxt)
    outtxt = outtxt + '</ul>'*len(newtxt)
    
    return outtxt

mass_data['family'] = mass_data['family'].apply(create_hierarchy)
# mass_data

In [274]:
mass_data = mass_data.fillna('Data not available')
mass_data

Unnamed: 0,name,glottocode,iso6393,aes_status,Degree of endangerment,family,family_id,dialects,child_dialect_count,child_family_count,...,type,scope,bookkeeping,description,markup_description,country_ids,parent_id,off_lang,rec_min_lang,Wikipedia_Url
0,Ghotuo,ghot1243,aaa,not endangered,Data not available,<ul><li>Niger–Congo</li><ul><li>Atlantic–Congo...,atla1278,Data not available,0.0,0.0,...,living,individual,False,Data not available,Data not available,NG,afen1234,Data not available,Data not available,https://en.wikipedia.org/wiki/Ghotuo_language
1,Alumu-Tesu,alum1246,aab,not endangered,Data not available,<ul><li>Niger–Congo</li><ul><li>Atlantic–Congo...,Data not available,"Alumu, Tesu",Data not available,Data not available,...,living,individual,Data not available,Data not available,Data not available,,Data not available,Data not available,Data not available,https://en.wikipedia.org/wiki/Alumu_language
2,Ari,arii1243,aac,moribund,Severely endangered,<ul><li>Papuan Gulf</li><ul><li>\xa0</li><ul><...,suki1244,Data not available,0.0,0.0,...,living,individual,False,Data not available,Data not available,PG,ariw1234,Data not available,Data not available,https://en.wikipedia.org/wiki/Ari_language_(Ne...
3,Amal,amal1242,aad,shifting,Data not available,<ul><li>Sepik</li><ul><li>Upper Sepik</li><ul>...,sepi1257,Data not available,0.0,0.0,...,living,individual,False,Data not available,Data not available,PG,sepi1257,Data not available,Data not available,https://en.wikipedia.org/wiki/Amal_language
4,Arbëreshë Albanian,arbe1236,aae,threatened,Data not available,<ul><li>Indo-European</li><ul><li>Albanian</li...,indo1319,"Vaccarizzo Albanian, Palermitan Albanian, Cala...",4.0,0.0,...,living,individual,False,Data not available,Data not available,IT,sout3378,Data not available,Data not available,https://en.wikipedia.org/wiki/Arb%C3%ABresh_la...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7872,Youjiang Zhuang,youj1238,zyj,not endangered,Data not available,<ul><li>Kra–Dai</li><ul><li>Tai</li><ul><li>No...,taik1256,Data not available,0.0,0.0,...,living,individual,False,Data not available,Data not available,CN,nort3189,Data not available,Data not available,https://en.wikipedia.org/wiki/Youjiang_Zhuang
7873,Yongnan Zhuang,yong1275,zyn,not endangered,Data not available,<ul><li>Kra–Dai</li><ul><li>Tai</li><ul><li>va...,taik1256,Data not available,0.0,0.0,...,living,individual,False,Data not available,Data not available,"CN, VN",yong1274,Data not available,Data not available,https://en.wikipedia.org/wiki/Yongnan_languages
7874,Zyphe Chin,zyph1238,zyp,not endangered,Data not available,<ul><li>Sino-Tibetan</li><ul><li>(</li><ul><li...,Data not available,Data not available,Data not available,Data not available,...,living,individual,Data not available,Data not available,Data not available,,Data not available,Data not available,Data not available,https://en.wikipedia.org/wiki/Zyphe_language
7875,Zaza,Data not available,zza,Data not available,Data not available,<ul><li>None</li></ul>,Data not available,Data not available,Data not available,Data not available,...,living,macrolanguage,Data not available,Data not available,Data not available,,Data not available,Data not available,Data not available,Data not available


In [275]:
mass_data.to_csv('../csv_files/master_merge.csv')
mass_data.to_csv('../docs/master_merge.csv')