In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats
import seaborn as sns
from tqdm import tqdm

## Merge the language endangerment data with the other data

In [2]:
endangerment_vars_df = pd.read_csv('raw_corpus_data/language_endangerment/raw_data.csv')
endangerment_vars_df.columns

Index(['ISO', 'name', 'region', 'AES', 'EGIDS', 'L1_pop', 'L1_pop_prop',
       'island', 'documentation', 'area', 'bordering_language_richness',
       'bordering_language_richness_perkm', 'bordering_language_evenness',
       'language_richness', 'threatened_languages',
       'threatened_prop_languages', 'language_evenness', 'official_status',
       'language_of_education', 'minority_education', 'education_spending',
       'years_of_schooling', 'GDPpc', 'life_expectancy_60', 'urban_change',
       'roads', 'pop_density', 'human_footprint', 'cropland',
       'pop_density_change', 'footprint_change', 'cropland_change', 'pasture',
       'pasture_change', 'built', 'built_change', 'GINI', 'growing_season',
       'temperature', 'temperature_seasonality', 'rainfall_seasonality',
       'threatened_species', 'threatened_prop_species', 'waterways',
       'roughness', 'altitude_range', 'world_language', 'Arabic', 'Malay',
       'English', 'French', 'Hindustani', 'Mandarin', 'Portuguese

In [3]:
word_order_clics_counts_df = pd.read_csv('processed_data/word_order_clics_counts_v2.csv').drop(columns=['Unnamed: 0'], axis=1)

mapping = {'Adjective-Noun': 1, 'No dominant order': 2, 'Noun-Adjective': 3}
word_order_clics_counts_df['adj_noun_order_coding'] = word_order_clics_counts_df['adj_noun_order'].map(mapping)
word_order_clics_counts_df = word_order_clics_counts_df[word_order_clics_counts_df['adj_noun_order'] != 'Only internally-headed relative clauses']
word_order_clics_counts_df


Unnamed: 0,wals code,name,OV_order_coded,OV_order,latitude,longitude,genus,family,area,name_x,...,variety,Macroarea_y,Family_y,num_unique_noun_words,num_unique_noun_concepts,mean_noun_word_len,num_unique_adj_words,num_unique_adj_concepts,mean_adj_word_len,adj_noun_order_coding
0,abk,Abkhaz,1,OV,43.083333,41.000000,Northwest Caucasian,Northwest Caucasian,Word Order,Abkhaz,...,Abkhaz,Eurasia,Abkhaz-Adyge,594,502,7.088785,152,100,7.765823,3.0
1,abv,Abui,1,OV,-8.250000,124.666667,Alor-Pantar,Greater West Bomberai,Word Order,Abui,...,"Abui, Takalelang",Papunesia,Timor-Alor-Pantar,1350,327,6.498814,288,73,6.213235,3.0
2,abu,Abun,2,VO,-0.500000,132.500000,Abun,Abun,Word Order,Abun,...,Abun,Papunesia,Abun,95,26,3.980769,6,2,2.875000,3.0
3,ace,Acehnese,3,No dominant order,5.500000,95.500000,Malayo-Sumbawan,Austronesian,Word Order,Acehnese,...,Acehnese,Eurasia,Austronesian,88,89,5.662921,0,0,,3.0
4,acg,Achagua,2,VO,4.416667,-72.250000,Japura-Colombia,Arawakan,Word Order,Achagua,...,Achagua,South America,Arawakan,187,157,6.137755,56,32,6.857143,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
431,ycn,Yucuna,2,VO,-0.750000,-71.000000,Japura-Colombia,Arawakan,Word Order,Yucuna,...,Yucuna,South America,Arawakan,251,217,7.643678,43,38,8.681818,1.0
432,yko,Yukaghir (Kolyma),1,OV,65.750000,150.833333,Yukaghir,Yukaghir,Word Order,Yukaghir (Kolyma),...,Southern Yukaghir,Eurasia,Yukaghir,304,299,6.448680,90,79,6.525253,1.0
433,ytu,Yukaghir (Tundra),1,OV,69.000000,155.000000,Yukaghir,Yukaghir,Word Order,Yukaghir (Tundra),...,Northern Yukaghir,Eurasia,Yukaghir,372,394,7.889412,88,85,8.715789,1.0
434,yuw,Yuwaalaraay,1,OV,-29.500000,148.000000,Southeastern Pama-Nyungan,Pama-Nyungan,Word Order,Yuwaalaraay,...,Gamilaraay,Australia,Pama-Nyungan,229,171,6.091176,55,41,6.560976,1.0


In [4]:
full_df = pd.merge(endangerment_vars_df, word_order_clics_counts_df, left_on='ISO', right_on='ISO639P3code')
full_df.to_csv('processed_data/language_endangerment_v2.csv')

  full_df = pd.merge(endangerment_vars_df, word_order_clics_counts_df, left_on='ISO', right_on='ISO639P3code')


In [5]:
full_df

Unnamed: 0,ISO,name_x,region,AES,EGIDS,L1_pop,L1_pop_prop,island,documentation,area_x,...,variety,Macroarea_y,Family_y,num_unique_noun_words,num_unique_noun_concepts,mean_noun_word_len,num_unique_adj_words,num_unique_adj_concepts,mean_adj_word_len,adj_noun_order_coding
0,abk,Abkhazian,Arab,threatened,2,105000,3363.641012,0,detailed,1847.401115,...,Abkhaz,Eurasia,Abkhaz-Adyge,594,502,7.088785,152,100,7.765823,3.0
1,abt,Ambulas,Oceania,threatened,5,44000,1552.650420,0,detailed,1154.190211,...,Ambulas,Papunesia,Ndu,53,40,3.827586,5,5,4.200000,1.0
2,abz,Abui,South-Eastern Asia,shifting,6a,16000,215.533744,1,detailed,627.239851,...,"Abui, Takalelang",Papunesia,Timor-Alor-Pantar,1350,327,6.498814,288,73,6.213235,3.0
3,aca,Achagua,South America,shifting,6b,250,41.038933,0,detailed,3413.503013,...,Achagua,South America,Arawakan,187,157,6.137755,56,32,6.857143,3.0
4,ace,Acehnese,South-Eastern Asia,threatened,6b,3500000,35411.014220,0,detailed,17655.564140,...,Acehnese,Eurasia,Austronesian,88,89,5.662921,0,0,,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
407,yux,Southern Yukaghir,Europe,nearly extinct,8b,370,12330.741540,0,detailed,3648.952815,...,Southern Yukaghir,Eurasia,Yukaghir,304,299,6.448680,90,79,6.525253,1.0
408,ywt,Xishanba Lalo,Asia,threatened,7,213000,1773.015389,0,detailed,15267.429170,...,Yi (Nanjian),Eurasia,Sino-Tibetan,890,620,9.386431,192,123,6.144144,3.0
409,zga,Kinga,Africa,not endangered,6a,140000,2198.705850,0,detailed,2079.928842,...,Kinga,Africa,Atlantic-Congo,365,316,7.362720,52,37,6.980769,3.0
410,zmu,Muruwari,Australia and New Zealand,nearly extinct,10,0,0.000000,0,detailed,60449.858810,...,Muruwari,Australia,Pama-Nyungan,163,167,6.335329,38,38,6.947368,1.0
