In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats
import seaborn as sns
from tqdm import tqdm



In [28]:
# Load the raw language-endangerment table and list its column names
# so the available variables are visible before merging.
endangerment_csv_path = 'raw_corpus_data/language_endangerment/raw_data.csv'
endangerment_vars_df = pd.read_csv(endangerment_csv_path)
endangerment_vars_df.columns

Index(['ISO', 'name', 'region', 'AES', 'EGIDS', 'L1_pop', 'L1_pop_prop',
       'island', 'documentation', 'area', 'bordering_language_richness',
       'bordering_language_richness_perkm', 'bordering_language_evenness',
       'language_richness', 'threatened_languages',
       'threatened_prop_languages', 'language_evenness', 'official_status',
       'language_of_education', 'minority_education', 'education_spending',
       'years_of_schooling', 'GDPpc', 'life_expectancy_60', 'urban_change',
       'roads', 'pop_density', 'human_footprint', 'cropland',
       'pop_density_change', 'footprint_change', 'cropland_change', 'pasture',
       'pasture_change', 'built', 'built_change', 'GINI', 'growing_season',
       'temperature', 'temperature_seasonality', 'rainfall_seasonality',
       'threatened_species', 'threatened_prop_species', 'waterways',
       'roughness', 'altitude_range', 'world_language', 'Arabic', 'Malay',
       'English', 'French', 'Hindustani', 'Mandarin', 'Portuguese

In [35]:
# Numeric coding of the adjective/noun order descriptions.
# Any description that is not a key here is mapped to NaN by Series.map,
# which is why the resulting `value` column is float rather than int.
mapping = {
    'Adjective-Noun': 1,
    'No dominant order': 2,
    'Noun-Adjective': 3,
}

word_order_clics_counts_df = (
    pd.read_csv('processed_data/word_order_clics_counts.csv')
    # Overwrite `value` with the numeric code derived from `description`.
    .assign(value=lambda counts: counts['description'].map(mapping))
    # Drop languages whose description is not an adjective/noun ordering.
    # The index is intentionally left as-is (not reset) after filtering.
    .loc[lambda counts: counts['description'].ne('Only internally-headed relative clauses')]
)
word_order_clics_counts_df


Unnamed: 0.1,Unnamed: 0,wals code,name,value,description,latitude,longitude,genus,family,area,...,Country_ID,Source,dataset_ID,variety,Macroarea_y,Family_y,num_unique_noun_words,num_unique_noun_concepts,num_unique_adj_words,num_unique_adj_concepts
0,0,abk,Abkhaz,3.0,Noun-Adjective,43.083333,41.000000,Northwest Caucasian,Northwest Caucasian,Word Order,...,GE,Bybee-et-al-1994 Gecadze-1979 Hewitt-1979 Hewi...,diacl,Abkhaz,Eurasia,Abkhaz-Adyge,594,502,152,100
1,1,abv,Abui,3.0,Noun-Adjective,-8.250000,124.666667,Alor-Pantar,Greater West Bomberai,Word Order,...,ID,Kratochvil-2007,lexirumah,"Abui, Takalelang",Papunesia,Timor-Alor-Pantar,1350,327,288,73
2,2,abu,Abun,3.0,Noun-Adjective,-0.500000,132.500000,Abun,Abun,Word Order,...,ID,Berry-1995a Berry-1995b Berry-and-Berry-1999,transnewguineaorg,Abun,Papunesia,Abun,95,26,6,2
3,3,ace,Acehnese,3.0,Noun-Adjective,5.500000,95.500000,Malayo-Sumbawan,Austronesian,Word Order,...,ID,Aboe-Bakar-et-al-1985 Cowan-1981 Durie-1985 Du...,diacl,Acehnese,Eurasia,Austronesian,88,89,0,0
4,4,acg,Achagua,3.0,Noun-Adjective,4.416667,-72.250000,Japura-Colombia,Arawakan,Word Order,...,CO,Aikhenvald-2007b Huber-and-Reed-1992 Melendez-...,hubercolumbian,Achagua,South America,Arawakan,187,157,56,32
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
497,508,yko,Yukaghir (Kolyma),1.0,Adjective-Noun,65.750000,150.833333,Yukaghir,Yukaghir,Word Order,...,RU,Maslova-1999 Maslova-2003a Nikolaeva-and-Xelim...,northeuralex,Southern Yukaghir,Eurasia,Yukaghir,304,299,90,79
498,509,ytu,Yukaghir (Tundra),1.0,Adjective-Noun,69.000000,155.000000,Yukaghir,Yukaghir,Word Order,...,RU,Krejnovich-1958 Krejnovich-1968c Maslova-2003b,northeuralex,Northern Yukaghir,Eurasia,Yukaghir,372,394,88,85
499,510,yuw,Yuwaalaraay,1.0,Adjective-Noun,-29.500000,148.000000,Southeastern Pama-Nyungan,Pama-Nyungan,Word Order,...,AU,Mushin-1995 Williams-1980a,bowernpny,Gamilaraay,Australia,Pama-Nyungan,229,171,55,41
500,511,zun,Zuni,3.0,Noun-Adjective,35.083333,-108.833333,Zuni,Zuni,Word Order,...,US,Bunzel-1933-1938 Bybee-et-al-1994 Cook-1975 Mi...,ids,Zuni,North America,Zuni,498,499,90,94


In [36]:
# Join the endangerment variables to the word-order/CLICS counts on the
# ISO 639-3 code. The join is an inner join (pandas' default), so languages
# present in only one of the two tables are dropped. Column names that occur
# in both frames receive the default `_x` / `_y` suffixes.
# NOTE(review): consider checking how many rows are lost in the join and
# whether the key is unique on both sides (e.g. `validate=` / `indicator=`).
full_df = endangerment_vars_df.merge(
    word_order_clics_counts_df,
    how='inner',
    left_on='ISO',
    right_on='ISO639P3code',
)

# Persist the merged table for downstream use.
# NOTE(review): the row index is written as the first (unnamed) CSV column,
# which reappears as `Unnamed: 0` when the file is read back; confirm
# downstream readers expect that before switching to `index=False`.
full_df.to_csv('processed_data/language_endangerment.csv')

In [37]:
# Show the merged frame as the cell's output to inspect the join result
# (row count and the `_x` / `_y` suffixed columns).
full_df

Unnamed: 0,ISO,name_x,region,AES,EGIDS,L1_pop,L1_pop_prop,island,documentation,area_x,...,Country_ID,Source,dataset_ID,variety,Macroarea_y,Family_y,num_unique_noun_words,num_unique_noun_concepts,num_unique_adj_words,num_unique_adj_concepts
0,abk,Abkhazian,Arab,threatened,2,105000,3363.641012,0,detailed,1847.401115,...,GE,Bybee-et-al-1994 Gecadze-1979 Hewitt-1979 Hewi...,diacl,Abkhaz,Eurasia,Abkhaz-Adyge,594,502,152,100
1,abt,Ambulas,Oceania,threatened,5,44000,1552.650420,0,detailed,1154.190211,...,PG,Laycock-1965a Nichols-1992 Wilson-1980,transnewguineaorg,Ambulas,Papunesia,Ndu,53,40,5,5
2,abz,Abui,South-Eastern Asia,shifting,6a,16000,215.533744,1,detailed,627.239851,...,ID,Kratochvil-2007,lexirumah,"Abui, Takalelang",Papunesia,Timor-Alor-Pantar,1350,327,288,73
3,aca,Achagua,South America,shifting,6b,250,41.038933,0,detailed,3413.503013,...,CO,Aikhenvald-2007b Huber-and-Reed-1992 Melendez-...,hubercolumbian,Achagua,South America,Arawakan,187,157,56,32
4,ace,Acehnese,South-Eastern Asia,threatened,6b,3500000,35411.014220,0,detailed,17655.564140,...,ID,Aboe-Bakar-et-al-1985 Cowan-1981 Durie-1985 Du...,diacl,Acehnese,Eurasia,Austronesian,88,89,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
470,yux,Southern Yukaghir,Europe,nearly extinct,8b,370,12330.741540,0,detailed,3648.952815,...,RU,Maslova-1999 Maslova-2003a Nikolaeva-and-Xelim...,northeuralex,Southern Yukaghir,Eurasia,Yukaghir,304,299,90,79
471,ywt,Xishanba Lalo,Asia,threatened,7,213000,1773.015389,0,detailed,15267.429170,...,CN,Bjorverud-1998,suntb,Yi (Nanjian),Eurasia,Sino-Tibetan,890,620,192,123
472,zga,Kinga,Africa,not endangered,6a,140000,2198.705850,0,detailed,2079.928842,...,TZ,Wolff-1905,tls,Kinga,Africa,Atlantic-Congo,365,316,52,37
473,zmu,Muruwari,Australia and New Zealand,nearly extinct,10,0,0.000000,0,detailed,60449.858810,...,AU,Mushin-1995 Oates-1988 Trefry-1971,bowernpny,Muruwari,Australia,Pama-Nyungan,163,167,38,38
