# (GitHub page-scrape residue, commented out so the file parses as R:)
# Permalink
# Switch branches/tags
# Nothing to show
# Find file
# Fetching contributors…
# Cannot retrieve contributors at this time
# executable file 940 lines (840 sloc) 51.3 KB
#########################################################################################################################################################
#
# Import, process and export in a standardized Newick format various language classifications (currently WALS, Ethnologue, Glottolog and AUTOTYP)
# and add branch length using a variety of methods (none, constant, proportional, grafen, nnls, ga and nj; see script "TreeHelperFunctions.r" for info)
#
# Copyright (C) 2013-2017 Dan Dediu
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
#
##########################################################################################################################################################
######################################################################################
#
# Global parameters controlling the various processing, import and export options:
# tweak as desired (some require lots of computer resources and time)
# More parameters controlling the branch length methods are defined below.
#
######################################################################################
# --- Global switches for the pipeline stages ---
MATCH_CODES <- TRUE                  # compute the equivalences between the ISO, WALS, AUTOTYP and GLOTTOLOG codes and save them to file?
PREOPTIMIZE_DISTS <- TRUE            # pre-optimize the distance matrices for fast loading when required
COMPUTE_GEO_DISTS <- TRUE            # compute the geographic distances between languages?
TRANSFORM_TREES <- TRUE              # transform the trees to the Newick notation (no branch length) and save them?
EXPORT_NEXUS <- TRUE                 # export the trees to a NEXUS file?
EXPORT_NEXUS_TRANSLATE_BLOCK <- TRUE # when exporting NEXUS files, generate a TRANSLATE block (useful for programs such as BayesTraits that have issues parsing complicated taxa names)?
EXPORT_CSV <- TRUE                   # export the trees to a CSV file?
COMPUTE_BRLEN <- TRUE                # apply the various branch length methods to the Newick no-branch-length trees?
CPU_CORES <- 4                       # the number of parallel CPU cores to use overall
quotes <- "'"                        # the quotes to use for the node (language and family) names
#########################
#
# Tree functions
#
#########################
source( "./FamilyTrees.R" );
######################################################################################
#
# Data needed to map the ISO - WALS - Glottolog codes
#
######################################################################################
if( MATCH_CODES )
{
  # Build the table of ISO <-> WALS <-> AUTOTYP <-> Glottolog code equivalences
  # (plus language names and harmonized geographic coordinates) and write it to
  # ../output/code_mappings_iso_wals_autotyp_glottolog.csv; otherwise re-load
  # a previously saved table from that file.

  # The Ethnologue data contains the ISO codes and language names:
  ethn.data <- read.table( "../input/ethnologue/LanguageCodes.tab", header=TRUE, sep="\t", quote="", stringsAsFactors=FALSE );
  ethn.data <- ethn.data[ , c("LangID", "Name") ];
  ethn.data$Name <- sapply( as.character(ethn.data$Name), function( s ){ normalize.language.name(s); } );

  # The WALS data contains the WALS, ISO and Glottolog codes for a subset of languages:
  wals.data <- read.table( "../input/wals/language.csv", header=TRUE, sep=",", quote="\"", stringsAsFactors=FALSE );
  wals.data <- wals.data[ , c("wals_code", "iso_code", "glottocode", "Name", "latitude", "longitude") ];
  wals.data$Name <- sapply( as.character(wals.data$Name), function( s ){ normalize.language.name(s); } );

  # The Glottolog data contains the ISO and Glottolog codes for a much larger set of languages:
  library(rjson);
  glott.json <- fromJSON( file="../input/glottolog/glottocodes2iso.json" );
  glott.json <- glott.json$resources; # select the actual resources and not the properties
  glott.data <- lapply( glott.json, function(x){
    # Look for an ISO code (seq_along() handles a missing/empty identifiers list safely):
    isos <- NULL;
    for( i in seq_along(x$identifiers) )
    {
      if( x$identifiers[[i]]$type == "iso639-3" )
      {
        isos <- c( isos, x$identifiers[[i]]$identifier );
      }
    }
    if( length(isos) > 1 )
    {
      cat( "Multiple ISOs found for glottocode ", x$id, ": packing them separated by '-'!\n" );
      isos <- paste( isos, collapse="-", sep="" );
    }
    return ( data.frame( "Name"=x$name,
                         "Glottocode"=x$id,
                         "ISO"=ifelse( is.null(isos), NA, isos ),
                         "latitude"=ifelse(is.null(x$latitude),NA,x$latitude),
                         # BUGFIX: test x$longitude (the original tested x$latitude, which
                         # could pass NULL to data.frame() when only longitude is missing):
                         "longitude"=ifelse(is.null(x$longitude),NA,x$longitude) ) );
  } );
  glott.data <- do.call( rbind, glott.data );
  glott.data$Name <- sapply( as.character(glott.data$Name), function( s ){ normalize.language.name(s); } );
  rm(glott.json);

  # The AUTOTYP data contains the ISO, Glottolog and AUTOTYP (LID) mappings:
  autotyp.data <- read.table( "../input/autotyp/autotyp-trees.csv", header=TRUE, sep="\t", quote="", stringsAsFactors=FALSE );
  autotyp.data$language <- sapply( as.character(autotyp.data$language), function( s ){ normalize.language.name(s); } );

  # The mappings (full outer joins so that every language from every source survives):
  code.mappings <- merge( glott.data, wals.data, by.x=c("Glottocode","ISO"), by.y=c("glottocode","iso_code"), all=TRUE, suffixes=c(".glot",".wals") );
  code.mappings <- merge( code.mappings, autotyp.data, by.x=c("Glottocode","ISO"), by.y=c("glottolog_LID.2014","ISO639.3.2013"), all=TRUE, suffixes=c(".glot",".auto") );
  code.mappings <- merge( code.mappings, ethn.data, by.x=c("ISO"), by.y=c("LangID"), all=TRUE, suffixes=c("",".ethn") );

  # Combine and harmonize the geographic coordinates:
  # Fill in the missing coordinates in Glottolog from WALS:
  s <- (is.na(code.mappings$latitude.glot) | is.na(code.mappings$longitude.glot));
  code.mappings[s,c("latitude.glot","longitude.glot")] <- code.mappings[s,c("latitude.wals","longitude.wals")];
  # If the Glottolog and WALS coordinates are very different, display the differences and keep the WALS ones.
  # BUGFIX: the original tested the latitude difference twice; test latitude OR longitude:
  s <- (!is.na(code.mappings$latitude.glot) & !is.na(code.mappings$longitude.glot) & !is.na(code.mappings$latitude.wals) & !is.na(code.mappings$longitude.wals) &
        (abs(code.mappings$latitude.glot - code.mappings$latitude.wals) >= 1.0 | abs(code.mappings$longitude.glot - code.mappings$longitude.wals) >= 1.0));
  if( sum(s,na.rm=TRUE) > 0 )
  {
    cat( "There are disagreements larger than 1 degree between Glottolog and WALS for ", nrow(unique(code.mappings[s,c("ISO","wals_code","Glottocode")])), " languages (WALS codes): ", paste( unique(code.mappings$wals_code[s]), collapse=",", sep="" ), "; keeping the WALS coordinates!\n" );
    code.mappings[s,c("latitude.glot","longitude.glot")] <- code.mappings[s,c("latitude.wals","longitude.wals")];
  }
  # See how many languages with ISO codes but no geographic coordinates there are:
  code.mappings$ISO[ is.na(code.mappings$ISO) ] <- "";
  s <- ((is.na(code.mappings$latitude.glot) | is.na(code.mappings$longitude.glot)) & code.mappings$ISO != "");
  if( sum(s,na.rm=TRUE) > 0 ) # explicit comparison instead of numeric truthiness
  {
    missing.geo <- unique(code.mappings$ISO[s]);
    cat( "There are ", length(missing.geo), " languages with ISO codes but no geographic information: I don't know where to get the coordinates from :(\n" );
  }
  # Data cleaning: keep only the columns of interest, rename them, and force consistent types:
  code.mappings <- code.mappings[ , c( "ISO", "wals_code", "LID", "Glottocode", "Name", "Name.wals", "language", "Name.glot", "latitude.glot", "longitude.glot" ) ];
  names(code.mappings) <- c( "ISO", "WALS", "AUTOTYP", "Glottolog", "Name.ethn", "Name.wals", "Name.autotyp", "Name.glottlog", "Latitude", "Longitude" );
  code.mappings[is.na(code.mappings)] <- ""; # missing codes/names become ""; as.numeric("") below turns them back into NA for the coordinates
  code.mappings$ISO <- as.character(code.mappings$ISO);
  code.mappings$WALS <- as.character(code.mappings$WALS);
  code.mappings$AUTOTYP <- as.character(code.mappings$AUTOTYP);
  code.mappings$Glottolog <- as.character(code.mappings$Glottolog);
  code.mappings$Name.ethn <- as.character(code.mappings$Name.ethn);
  code.mappings$Name.wals <- as.character(code.mappings$Name.wals);
  code.mappings$Name.autotyp <- as.character(code.mappings$Name.autotyp);
  code.mappings$Name.glottlog <- as.character(code.mappings$Name.glottlog);
  code.mappings$Latitude <- as.numeric(code.mappings$Latitude);
  code.mappings$Longitude <- as.numeric(code.mappings$Longitude);
  code.mappings <- code.mappings[ order( code.mappings$ISO, code.mappings$WALS, code.mappings$AUTOTYP, code.mappings$Glottolog ), ];
  # Add a Universal Unique Language Identifier using the conventions introduced here:
  code.mappings <- cbind( code.mappings, "UULID"=paste("[i-",code.mappings$ISO,"][w-",code.mappings$WALS,"][a-",code.mappings$AUTOTYP,"][g-",code.mappings$Glottolog,"]",sep="") );
  # Write them to file:
  write.table( code.mappings, "../output/code_mappings_iso_wals_autotyp_glottolog.csv", sep="\t", quote=FALSE, row.names=FALSE );
  code.mappings$AUTOTYP <- as.character(code.mappings$AUTOTYP); code.mappings$AUTOTYP[ is.na(code.mappings$AUTOTYP) ] <- ""; # make sure the AUTOTYP code is treated as character with missing data as ""
} else
{
  # Re-load a previously computed mapping table:
  code.mappings <- read.table( "../output/code_mappings_iso_wals_autotyp_glottolog.csv", sep="\t", quote="", header=TRUE, stringsAsFactors=FALSE );
  code.mappings$AUTOTYP <- as.character(code.mappings$AUTOTYP); code.mappings$AUTOTYP[ is.na(code.mappings$AUTOTYP) ] <- ""; # make sure the AUTOTYP code is treated as character with missing data as ""
}
######################################################################################
#
# Load (and possibly compute) various distance matrices to be used for computing
# the branch lengths.
#
######################################################################################
# Pre-optimize and cache the distance matrices for on-demand loading only when really needed:
if( PREOPTIMIZE_DISTS )
{
# Re-save every distance matrix in its own xz-compressed .RData file under the
# common variable name "d", so later steps can load each matrix on demand.
# Make sure the ../output/preoptimized-distances directory exists:
dir.create("../output/preoptimized-distances", showWarnings=FALSE);
# ASJP16 (the row and column names are the iso codes):
load( "../input/distances/ASJP/asjp16-dists.RData" ); # asjp16.dm
d <- asjp16.dm; save(d, file="../output/preoptimized-distances/asjp16-dm.RData", compress="xz"); rm(asjp16.dm,d); # cache it pre-optimized for on-demand loading
# WALS (the row and column names are the wals codes); gower and manhattan are quite similar; using both NAs and imputation of NAs with the mode (per variable):
load( "../input/distances/WALS/wals-gower-dm.RData" ); wals.gower.dm <- as.matrix(wals.gower.dm); # wals.gower.dm (with missing data)
d <- wals.gower.dm; save(d, file="../output/preoptimized-distances/wals-gower-dm.RData", compress="xz"); rm(wals.gower.dm,d); # cache it pre-optimized for on-demand loading
load( "../input/distances/WALS/wals-gower-mode-dm.RData" ); wals.gower.mode.dm <- as.matrix(wals.gower.mode.dm); # wals.gower.mode.dm (with mode imputation for missing data)
d <- wals.gower.mode.dm; save(d, file="../output/preoptimized-distances/wals-gower-mode-dm.RData", compress="xz"); rm(wals.gower.mode.dm,d); # cache it pre-optimized for on-demand loading
load( "../input/distances/WALS/wals-euclidean-dm.RData" ); wals.euclidean.dm <- as.matrix(wals.euclidean.dm); # wals.euclidean.dm (with missing data)
d <- wals.euclidean.dm; save(d, file="../output/preoptimized-distances/wals-euclidean-dm.RData", compress="xz"); rm(wals.euclidean.dm,d); # cache it pre-optimized for on-demand loading
load( "../input/distances/WALS/wals-euclidean-mode-dm.RData" ); wals.euclidean.mode.dm <- as.matrix(wals.euclidean.mode.dm); # wals.euclidean.mode.dm (with mode imputation for missing data)
d <- wals.euclidean.mode.dm; save(d, file="../output/preoptimized-distances/wals-euclidean-mode-dm.RData", compress="xz"); rm(wals.euclidean.mode.dm,d); # cache it pre-optimized for on-demand loading
# AUTOTYP (the row and column names are the AUTOTYP codes (LID)):
load("../input/distances/AUTOTYP/autotyp-dist.RData"); # autotyp.dm (with missing data, using only features with single values per language, courtesy of Balthasar Bickel)
d <- autotyp.dm; save(d, file="../output/preoptimized-distances/autotyp-dm.RData", compress="xz"); rm(autotyp.dm,d); # cache it pre-optimized for on-demand loading
# Geographic distances (the row and column names use whichever code covers the most languages with coordinates; chosen below):
if( COMPUTE_GEO_DISTS )
{
# Compute the great-circle geographic distances between all languages with geographic coordinates
library(geosphere);
# The languages with geographic coordinates:
s <- (!is.na(code.mappings$Latitude) & !is.na(code.mappings$Longitude));
# of which those that don't have a specific type of code:
s.ISO <- s & (code.mappings$ISO == "");
s.WALS <- s & (code.mappings$WALS == "");
s.AUTOTYP <- s & (code.mappings$AUTOTYP == "");
s.Glottolog <- s & (code.mappings$Glottolog == "");
# Decide which code to use so that as many as possible of the languages with geographic info are covered by the coding scheme:
code.to.use <- c("ISO","WALS","AUTOTYP","Glottolog")[ which.min( c( sum(s.ISO,na.rm=TRUE), sum(s.WALS,na.rm=TRUE), sum(s.AUTOTYP,na.rm=TRUE), sum(s.Glottolog,na.rm=TRUE) ) ) ];
s.use <- s & !(get( paste( "s.", code.to.use, sep="" ) )); # obtain (s & (code != ""))
tmp <- code.mappings[s.use,c(code.to.use,"Longitude","Latitude")];
cat( "For the geographic distances (ellipsoid) the row and column names use the ", code.to.use, " codes.\n" );
# Some codes (with geographic coordinates) are duplicated: keep the first entries only:
tmp <- tmp[!duplicated(tmp[,1]),];
## Plot these languages:
#library(maps);
#X11(10,10);
#map("world");
#points( tmp$Longitude, tmp$Latitude, cex=0.5, col=grey(0.5) );
# Compute the distances (matrix of (lon, lat) pairs, row names = the chosen codes):
m <- as.matrix( tmp[,c("Longitude","Latitude")] ); rownames(m) <- tmp[,1];
system.time( geo.dm <- geosphere::distm( m, fun=distMeeus ) ); # much faster than distVincentyEllipsoid but still a very decent estimate of geographic distance
rownames(geo.dm) <- colnames(geo.dm) <- rownames(m);
rm(m); rm(tmp);
save( geo.dm, file="../input/distances/geo-great-circle-ellipsoid-dists.RData", compress="xz" );
}
load( file="../input/distances/geo-great-circle-ellipsoid-dists.RData" ); # geo.dm
# Normalize it between 0 and 1 by dividing it with the maximum distance (which is very close to the maximum possible distance on Earth of about 21,000km):
geo.dm <- geo.dm / max(as.numeric(geo.dm),na.rm=TRUE);
d <- geo.dm; save(d, file="../output/preoptimized-distances/geo-dm.RData", compress="xz"); rm(geo.dm,d); # cache it pre-optimized for on-demand loading
# The MG2015 [Maurits, L. & Griffiths, T.L. (2015) PNAS 111 (37):13576--13581] distances:
# there's one distance matrix for each of the four classifications (WALS, Ethnologue, Glottolog and AUTOTYP)
# and each has as row & column names the full language names and codes -> we need to replace them by the appropriate codes
# Replace the full "name + codes" row/column labels of a distance matrix by a
# single code per language.
#   m:    a square matrix whose (identical) row and column names encode the
#         language name and its codes (parsed by extract.name.and.codes()).
#   code: which code to extract; one of "iso", "wals", "autotyp", "glottolog".
# Returns the character vector of new names (one per row); stops on a malformed
# matrix. Languages without exactly one such code keep their original label
# (with a warning).
.change.row.col.names.to.codes <- function(m, code=c("iso","wals","autotyp","glottolog"))
{
  code <- match.arg(code); # validate the choice (and make the default vector usable)
  if( is.null(m) || !inherits(m,"matrix") || nrow(m) != ncol(m) || any(rownames(m) != colnames(m)) )
  {
    # Malformed input: not a square matrix with matching row/column names
    # (the original also had an unreachable return(NULL) after this stop()):
    stop("Problems with the distance matrix!\n");
  } else
  {
    # Map every row name to the requested code:
    new.names <- vapply(rownames(m), function(s)
    {
      tmp <- extract.name.and.codes(s);
      if( is.null(tmp[[code]]) || length(tmp[[code]]) != 1 || tmp[[code]]=="" )
      {
        warning(paste0("Language ",s," must have a single ", code, " code\n"));
        return (s); # keep the original (full) label
      } else
      {
        return(tmp[[code]]);
      }
    }, character(1));
    return (new.names);
  }
}
# Load each MG2015 matrix, relabel its rows/columns with the matching code, and cache it pre-optimized:
load("../input/distances/MG2015/MG2015-wals-alpha=0.69.RData"); rownames(MG2015.WALS) <- colnames(MG2015.WALS) <-.change.row.col.names.to.codes(MG2015.WALS, "wals"); # MG2015.WALS
d <- MG2015.WALS; save(d, file="../output/preoptimized-distances/mg2015-wals-dm.RData", compress="xz"); rm(MG2015.WALS,d); # cache it pre-optimized for on-demand loading
load("../input/distances/MG2015/MG2015-ethnologue-alpha=0.69.RData"); rownames(MG2015.Ethnologue) <- colnames(MG2015.Ethnologue) <-.change.row.col.names.to.codes(MG2015.Ethnologue, "iso"); # MG2015.Ethnologue
d <- MG2015.Ethnologue; save(d, file="../output/preoptimized-distances/mg2015-ethnologue-dm.RData", compress="xz"); rm(MG2015.Ethnologue,d); # cache it pre-optimized for on-demand loading
load("../input/distances/MG2015/MG2015-glottolog-alpha=0.69.RData"); rownames(MG2015.Glottolog) <- colnames(MG2015.Glottolog) <-.change.row.col.names.to.codes(MG2015.Glottolog, "glottolog"); # MG2015.Glottolog
d <- MG2015.Glottolog; save(d, file="../output/preoptimized-distances/mg2015-glottolog-dm.RData", compress="xz"); rm(MG2015.Glottolog,d); # cache it pre-optimized for on-demand loading
load("../input/distances/MG2015/MG2015-autotyp-alpha=0.69.RData"); rownames(MG2015.AUTOTYP) <- colnames(MG2015.AUTOTYP) <-.change.row.col.names.to.codes(MG2015.AUTOTYP, "autotyp"); # MG2015.AUTOTYP
d <- MG2015.AUTOTYP; save(d, file="../output/preoptimized-distances/mg2015-autotyp-dm.RData", compress="xz"); rm(MG2015.AUTOTYP,d); # cache it pre-optimized for on-demand loading
gc(); # call the garbage collector to make sure the space is freed after this memory-hungry step
}
######################################################################################
#
# Global parameters controlling the branch length methods to apply
#
######################################################################################
# The classifications to process, and the branch length methods to be used;
# METHODS can be "all", "none", or any subset of {"constant", "proportional", "grafen", "nnls", "ga", "nj"}:
CLASSIFICATIONS <- c("wals", "ethnologue", "glottolog", "autotyp")
METHODS <- c("constant", "proportional", "grafen", "nnls", "nj", "ga")
# The positive constant required by some methods:
CONSTANT <- 1.0
# The pre-optimized distance matrices required by some methods
# (one row per distance: its name, a short name, the cached file, and the code used for row/column names):
DISTS.CODES <- read.table(header=TRUE, sep="", quote="", stringsAsFactors=FALSE, text="Distance ShortName File Code \n
asjp16 asjp ../output/preoptimized-distances/asjp16-dm.RData iso \n
wals(gower) w:g ../output/preoptimized-distances/wals-gower-dm.RData wals \n
wals(gower,mode) w:gm ../output/preoptimized-distances/wals-gower-mode-dm.RData wals \n
wals(euclidean) w:e ../output/preoptimized-distances/wals-euclidean-dm.RData wals \n
wals(euclidean,mode) w:em ../output/preoptimized-distances/wals-euclidean-mode-dm.RData wals \n
autotyp atyp ../output/preoptimized-distances/autotyp-dm.RData autotyp \n
mg2015(wals) mg:w ../output/preoptimized-distances/mg2015-wals-dm.RData wals \n
mg2015(ethnologue) mg:e ../output/preoptimized-distances/mg2015-ethnologue-dm.RData iso \n
mg2015(glottolog) mg:g ../output/preoptimized-distances/mg2015-glottolog-dm.RData glottolog \n
mg2015(autotyp) mg:a ../output/preoptimized-distances/mg2015-autotyp-dm.RData autotyp \n
geo geo ../output/preoptimized-distances/geo-dm.RData glottolog \n")
######################
#
# WALS trees
#
######################
if( TRANSFORM_TREES )
{
  # Build the WALS family > genus > language trees and export them.

  # WALS languages info:
  wals.data <- read.table( "../input/wals/language.csv", header=TRUE, sep=",", quote="\"", stringsAsFactors=FALSE );
  wals.data <- wals.data[, c("wals_code", "iso_code", "glottocode", "Name", "latitude", "longitude", "genus", "family" ) ];
  # Replace troublesome characters in language, genus and family names:
  wals.data$Name <- sapply( as.character(wals.data$Name), function( s ){ normalize.language.name(s); } );
  wals.data$family <- sapply( as.character(wals.data$family), function( s ){ normalize.language.name(s); } );
  wals.data$genus <- sapply( as.character(wals.data$genus), function( s ){ normalize.language.name(s); } );

  # Create the WALS trees, skipping unclassified/creole/mixed entries:
  roots <- NULL;
  for( i in seq_len(nrow(wals.data)) ) # seq_len() is safe for an empty table (1:nrow(...) would not be)
  {
    # Process this entry:
    if( !is.na(wals.data$family[i]) && !(wals.data$family[i] %in% c("other","Unclassified","Creole","Mixed language")) )
    {
      # The AUTOTYP code(s) mapped to this WALS code (there may be none, or several):
      autotypcode <- unique( as.character( code.mappings$AUTOTYP[ code.mappings$WALS == as.character(wals.data$wals_code[i]) ] ) );
      autotypcode <- autotypcode[ autotypcode != "" ]; # get rid of empty codes
      if( length(autotypcode) == 0 )
      {
        # BUGFIX: also covers the "no matching mapping row" case, which used to
        # pass a zero-length character() through instead of "":
        autotypcode <- "";
      } else if( length(autotypcode) > 1 )
      {
        autotypcode <- paste( autotypcode, collapse="-", sep="" ); # in principle it can happen to have more than one AUTOTYP code
      }
      # Add the language under its full family > genus > language path:
      roots <- add.language.to.languageclassification( roots,
                    path=c(create.name.and.codes(wals.data$family[i], iso="", wals="", autotyp="", glottocode="", quotes=quotes),
                           create.name.and.codes(wals.data$genus[i], iso="", wals="", autotyp="", glottocode="", quotes=quotes),
                           create.name.and.codes(wals.data$Name[i],
                                                 iso= as.character(wals.data$iso_code[i]),
                                                 wals= as.character(wals.data$wals_code[i]),
                                                 autotyp= autotypcode,
                                                 glottocode=as.character(wals.data$glottocode[i]),
                                                 quotes) ),
                    brlens=NULL );
    }
  }
  attr(roots, "classif.name") <- "wals";
  # Fix the family and language names (get rid of the codes):
  for( i in seq_along(roots) ) # seq_along() is safe should roots have stayed NULL (1:length(...) would not be)
  {
    roots[[i]] <- .fix.names(roots[[i]], quotes=quotes);
    attr(roots[[i]], "tree.name") <- str_trim(strsplit(strsplit(get.name(roots[[i]]),quotes,fixed=TRUE)[[1]][2],"[",fixed=TRUE)[[1]][1]);
  }
  # Save them to file:
  export.languageclassification( roots, dir.name="../output", classification="wals", export.nexus=EXPORT_NEXUS, nexus.translate.block=EXPORT_NEXUS_TRANSLATE_BLOCK, export.csv=EXPORT_CSV );
  rm(wals.data);
}
######################
#
# Ethnologue trees
#
######################
if( TRANSFORM_TREES )
{
# The Ethnologue trees: we need to download and parse them from the Ethnologue website:
fams.file <- paste("../input/ethnologue/fams.html",sep="");
fams.URL <- "http://www.ethnologue.com/browse/families";
# Download the list-of-families page only if there is no non-empty cached copy:
if( !file.exists(fams.file) || file.info(fams.file)$size < 1 )
{
options(timeout=180); # Ethnologue is a slow site for some reason...
options(HTTPUserAgent="Mozilla/5.0 (X11; Linux x86_64; rv:17.0) Gecko/20130917 Firefox/17.0"); # emulate Firefox
if( download.file( fams.URL, fams.file, quiet=T ) != 0 ) # NOTE(review): prefer TRUE over T (T is reassignable)
{
stop( "Error retrieving the Ethnologue main families page !\n" );
}
}
# Load the file:
HTML.content <- readLines( fams.file, warn=FALSE ); # read the file:
# Find the lines containing links to the families themselves:
fams.links <- HTML.content[ grep( "/subgroups/", HTML.content, fixed=TRUE ) ];
# Extract the links (the URL slug after /subgroups/) from the anchor tags:
fams.links <- sapply( fams.links, function(s){
tmp <- strsplit( s, "<a href=\"/subgroups/", fixed=TRUE )[[1]];
if( length(tmp) < 2 )
{
cat( "Error retrieving the URL for ", s, "\n" );
return (NULL); # NOTE(review): a NULL here makes sapply() return a list rather than a character vector
}
tmp <- strsplit( tmp[2], "\">", fixed=TRUE )[[1]];
if( length(tmp) < 1 )
{
cat( "Error retrieving the URL for ", s, "\n" );
return (NULL);
}
return (tmp[1]); # the family slug, e.g. "indo-european"
} );
# Load and process one language family
fetch.ethnologue.family <- function( family )
{
if( is.na( family ) )
{
return( NULL );
}
else
{
# The base URL for families:
fam.base.URL <- "http://www.ethnologue.com/subgroups/";
fam.URL <- paste( fam.base.URL, family, sep="" );
fam.file <- paste("../input/ethnologue/fam-",family,".html",sep="");
if( !file.exists(fam.file) || file.info(fam.file)$size < 1 )
{
options(timeout=180); # Ethnologue is a slow site for some reason...
options(HTTPUserAgent="Mozilla/5.0 (X11; Linux x86_64; rv:17.0) Gecko/20130917 Firefox/17.0"); # emulate Firefox
if( download.file( fam.URL, fam.file, quiet=T ) != 0 )
{
cat( "Error retrieving family ", family, " !\n" );
return( NULL );
}
}
# Use the downloaded file:
HTML.content <- readLines( fam.file, warn=F ); # read the file:
# Get the family official name:
tag.string <- "<h1 class=\"title\" id=\"page-title\">";
matches <- grep( tag.string, HTML.content, fixed=TRUE );
if( length(matches) != 1 )
{
cat( "Error locating family name for ", family, "\n" )
return ( NULL );
} else
{
# Get the family name:
family.name <- strsplit( strsplit( HTML.content[ matches ], tag.string, fixed=TRUE )[[1]][2], "</h1>", fixed=TRUE )[[1]][1];
}
# The tree in preorder with special symbols ( and ) representing an increse/decrease in level
# the names are "node" for an internal node, "leaf" for a leaf, and "special" for ( and ):
tree <- c("node"=create.name.and.codes( family.name, iso="", wals="", autotyp="", glottocode="", quotes=quotes));
# Preprocess the HTML text to make sure each of the important tags appear only once per line (i.e., insert newlines after each </a> and </ul>):
HTML.content.new <- gsub("</a>", "</a>\n", HTML.content[(matches+1):length(HTML.content)], fixed=TRUE);
HTML.content.new <- gsub("</ul>", "</ul>\n", HTML.content.new, fixed=TRUE);
HTML.content.new <- unlist(strsplit(HTML.content.new, "\n", fixed=TRUE));
# Build the tree by adding the subfamilies and languages as they appear in the file:
for( i in 1:length(HTML.content.new) )
{
if( length( grep( "<a href=\"/subgroups/", HTML.content.new[i], fixed=TRUE ) ) == 1 )
{
# This seems to be a subgroup: extract its name:
subgroup <- strsplit( strsplit( strsplit( HTML.content.new[i], "<a href=\"/subgroups/", fixed=TRUE )[[1]][2], "\">", fixed=TRUE )[[1]][2], "</a>", fixed=TRUE )[[1]][1];
subgroup <- str_trim( strsplit( subgroup, "\\([[:digit:]]+\\)" )[[1]][1] );
if( is.null(subgroup) || subgroup == "" )
{
cat( "Error extracting subgroup from ", HTML.content.new[i], "\n" );
return ( NULL );
}
# Add this to the tree:
tree <- c(tree, "node"=create.name.and.codes(subgroup, iso="", wals="", autotyp="", glottocode="", quotes=quotes));
} else if( length( grep( "<span class=\"field-content\">", HTML.content.new[i], fixed=TRUE ) ) == 1 && length( grep( "<a href=\"/language/", HTML.content.new[i], fixed=TRUE ) ) == 1 )
{
# Seems to be language: extract it and its iso:
lg.name <- strsplit( HTML.content.new[i], "<span class=\"field-content\">", fixed=TRUE )[[1]][2];
lg.name <- str_trim( strsplit( lg.name, "<a href=\"/language/", fixed=TRUE )[[1]][1] );
lg.iso <- strsplit( HTML.content.new[i], "<a href=\"/language/", fixed=TRUE )[[1]][2];
lg.iso <- strsplit( lg.iso, "\">[", fixed=TRUE )[[1]][2];
lg.iso <- strsplit( lg.iso, "]</a>", fixed=TRUE )[[1]][1];
if( is.null(lg.name) || lg.name == "" || is.null(lg.iso) || lg.iso == "" )
{
cat( "Error extracting language from ", HTML.content.new[i], "\n" );
return ( NULL );
}
# Get the other codes as well:
wals <- autotyp <- glottocode <- "";
s <- code.mappings[ code.mappings$ISO == lg.iso, ];
if( !is.null(s) && nrow(s) > 0 )
{
wals <- unique(s$WALS);
autotyp <- unique(s$AUTOTYP);
glottocode <- unique(s$Glottolog);
}
# Add this language to the tree:
tree <- c(tree, "leaf"=create.name.and.codes(lg.name, iso=lg.iso, wals=wals, autotyp=autotyp, glottocode=glottocode, quotes=quotes));
} else if( length( grep( "<ul>", HTML.content.new[i], fixed=TRUE ) ) == 1 )
{
# Subgroup starts:
tree <- c(tree, "special"="(");
} else if( length( grep( "</ul>", HTML.content.new[i], fixed=TRUE ) ) == 1 )
{
# Subgroup ends:
tree <- c(tree, "special"=")");
} else if( length( grep( "</section>", HTML.content.new[i], fixed=TRUE ) ) == 1 )
{
# The end of the tree section
break;
}
}
# Build the tree reusing as much code as possible: simply add the full path for each language:
root <- NULL;
cur.path <- tree[1];
for( i in 2:length(tree) )
{
#cat(paste0("Tree[",i,"]=",tree[i],": "));
if( names(tree)[i] == "special" && tree[i] == "(" )
{
# Simply ignore, we're going up one level
} else if( names(tree)[i] == "special" && tree[i] == ")" )
{
if( i < length(tree) && names(tree)[i+1] == "special" && tree[i+1] == "(" ) next; # artifact of the way languages and sub-families are mixed in the HTML file
# Down one level, discard the last node in the path:
cur.path <- cur.path[ -length(cur.path) ];
#cat("cur.path=",paste0(cur.path,collapse=", "));
} else if( names(tree)[i] == "leaf" )
{
# Add the full path to the tree:
root <- add.tree.path(root, c(cur.path, tree[i]));
#cat(paste0("add.tree.path(",paste0(c(cur.path, tree[i]),collapse=", "),")"));
} else if( names(tree)[i] == "node" )
{
# Add the node to the path:
cur.path <- c(cur.path, tree[i]);
#cat("cur.path=",paste0(cur.path,collapse=", "));
}
#cat("\n");
}
attr(root, "tree.name") <- family.name;
return( root );
}
}
fetch.ethnologue.family <- cmpfun(fetch.ethnologue.family); # byte-compile for speed
# Fetch each Ethnologue family in turn, normalize its names and collect the resulting trees:
roots <- list();
for( fam.link in fams.links )
{
	cat( "Processing Ethnologue family ", fam.link, "...\n" );
	fam.root <- fetch.ethnologue.family( fam.link );
	if( is.null(fam.root) )
	{
		# Download/parsing failed for this family: report it and move on to the next one:
		cat( " >>> Error fetching Ethnologue family ", fam.link, "...\n" );
		next;
	}
	roots[[ length(roots) + 1 ]] <- .fix.names( fam.root, quotes=quotes );
}
# Tag the collection as an Ethnologue language classification and save it to file:
attr(roots, "classif.name") <- "Ethnologue";
class(roots) <- append("languageclassification", class(roots));
export.languageclassification( roots, dir.name="../output", classification="ethnologue", export.nexus=EXPORT_NEXUS, nexus.translate.block=EXPORT_NEXUS_TRANSLATE_BLOCK, export.csv=EXPORT_CSV );
rm(HTML.content); rm(fams.links);
}
######################
#
# Glottolog trees
#
######################
if( TRANSFORM_TREES )
{
# The Glottolog trees: parse them and export them in the Family\tTree format, also updating the convention on the language codes:
# (the input file has one Newick tree per line, one per top-level Glottolog family)
glott.fams <- readLines( "../input/glottolog/tree-glottolog-newick.txt" );
# Keep only the code columns needed to translate the Glottolog naming convention into ours:
mapping <- code.mappings[,c("ISO","WALS","AUTOTYP","Glottolog")]
roots <- NULL;
for( i in 1:length(glott.fams) )
{
# Pre-process the tree by replacing any "\\'" within langauge names by "`", and taking care of names with ":" by replacing it with "-":
# (fixed=TRUE makes the pattern the literal two-character sequence backslash + quote, i.e. an escaped quote inside a name)
gfam <- gsub( "\\'", "`", str_trim(glott.fams[i]), fixed=TRUE );
# Split on the single quotes so quoted-name chunks alternate with structural Newick chunks:
tmp <- strsplit( gfam, "'", fixed=TRUE )[[1]];
# NOTE(review): this matches ANY chunk containing a ":"; it only leaves the Newick structure intact
# if the Glottolog trees carry no branch lengths (where ":" would be structural) -- TODO confirm.
k <- grep( "[^:]+:", tmp );
if( length(k) > 0 )
{
# ":" in names!
for( j in k ) tmp[j] <- gsub( ":", "-", tmp[j], fixed=TRUE );
gfam <- paste( tmp, collapse="'", sep="" );
}
# Make sure the Newick is enclosed between '(' and ');':
# NOTE(review): substr(gfam,1,2) extracts TWO characters, so it can never equal "(" and this condition
# is always TRUE, i.e. every tree gets an extra enclosing pair of parentheses. This looks like an
# off-by-one (substr(gfam,1,1) was probably meant), BUT the family-name extraction below reads the
# name from "the root's only child", which depends on the extra wrapping level -- confirm intent before fixing.
if( substr(gfam,1,2) != "(" || substr(gfam,nchar(gfam)-1,nchar(gfam)) != ");" )
{
# If it already ends with ";", splice the parentheses around the body; otherwise append ");" as well:
if( substr(gfam,nchar(gfam),nchar(gfam)) == ";" ) gfam <- paste0("(",substr(gfam,1,nchar(gfam)-1),");") else gfam <- paste0("(",gfam,");");
}
# Parse the tree:
root <- familytree(gfam);
# Set the family name (the root's only child): take that child's quoted name and strip the trailing "[...]" code block:
attr(root, "tree.name") <- str_trim(strsplit(strsplit(.get.node.name(root,root$edge[root$edge[,1] == find.root(root), 2][1]),"[",fixed=TRUE)[[1]][1],quotes,fixed=TRUE)[[1]][2]);
# Convert the Glottolog convention to our conventions in what concerns the codes:
root <- .convert.glottolog.convention(root, mapping);
# Erase the branch length info:
root <- compute.brlen(root, "none")$tree;
# Fix the names and add it to the collection:
roots[[ length(roots) + 1 ]] <- .fix.names(root, quotes);
}
# Save them to file:
attr(roots, "classif.name") <- "Glottolog";
class(roots) <- append("languageclassification", class(roots));
export.languageclassification( roots, dir.name="../output", classification="glottolog", export.nexus=EXPORT_NEXUS, nexus.translate.block=EXPORT_NEXUS_TRANSLATE_BLOCK, export.csv=EXPORT_CSV );
rm(glott.fams); rm(mapping);
}
######################
#
# AUTOTYP trees
#
######################
if( TRANSFORM_TREES )
{
# AUTOTYP data: one row per language, its genealogical position given by the stock > mbranch > sbranch > ssbranch > lsbranch columns:
autotyp.data <- read.table( "../input/autotyp/autotyp-trees.csv", header=TRUE, sep="\t", quote="", stringsAsFactors=FALSE );
# Replace troublesome characters in language, and branch names (the same normalization applies to all name-bearing columns):
for( name.column in c( "language", "lsbranch", "ssbranch", "sbranch", "mbranch", "stock" ) )
{
	autotyp.data[[name.column]] <- sapply( as.character(autotyp.data[[name.column]]), function( s ){ if( !is.na(s) ) { normalize.language.name(s); } else { NA; } } );
}
# Replace the NAs in Glottolog and ISO codes by "":
autotyp.data$glottolog_LID.2014[ is.na(autotyp.data$glottolog_LID.2014) ] <- "";
autotyp.data$ISO639.3.2013[ is.na(autotyp.data$ISO639.3.2013) ] <- "";
# Create the AUTOTYP trees:
roots <- NULL;
for( i in seq_len(nrow(autotyp.data)) ) # seq_len() is safe even for an empty table
{
	# Process this entry only if the stock (the topmost, always-expected level) is given:
	if( !is.na(autotyp.data$stock[i]) )
	{
		# Build the path; the stock is always given:
		path <- create.name.and.codes(autotyp.data$stock[i], iso="", wals="", autotyp="", glottocode="", quotes=quotes);
		# For the mbranch, sbranch, ssbranch & lsbranch things are a bit more complex in the sense that sometimes an intermediate level is skipped (NA) but deeper levels are given: add them with later tree-wide disambiguation:
		defined.levels <- !is.na(autotyp.data[ i, c( "mbranch", "sbranch", "ssbranch", "lsbranch" ) ]);
		if( any(defined.levels) )
		{
			# There is such a level defined: walk down to the deepest one (reuse defined.levels instead of re-testing for NAs):
			deepest.defined.level <- max(which( defined.levels ));
			mbranch.column <- which(names(autotyp.data)=="mbranch")-1; # column offset so that j+mbranch.column indexes level j
			for( j in 1:deepest.defined.level )
			{
				if( is.na(autotyp.data[ i, j+mbranch.column ]) )
				{
					# Skipped intermediate level: insert a placeholder "GroupN" node:
					path <- c(path, create.name.and.codes(paste0("Group",j), iso="", wals="", autotyp="", glottocode="", quotes=quotes));
				} else
				{
					path <- c(path, create.name.and.codes(autotyp.data[ i, j+mbranch.column ], iso="", wals="", autotyp="", glottocode="", quotes=quotes));
				}
			}
		}
		# Add the language, also finding the other matching code(s):
		s <- which(code.mappings$AUTOTYP == as.character(autotyp.data$LID[i]));
		# BUGFIX: also pass quotes=quotes here -- the original relied on the function's default quoting
		# character, which silently disagrees with the internal nodes above whenever quotes != "'":
		path <- c(path, create.name.and.codes(autotyp.data$language[i],
											  iso=as.character(autotyp.data$ISO639.3.2013[i]),
											  wals=ifelse( length(s) == 0, "", paste(code.mappings$WALS[s],collapse="-",sep="") ),
											  autotyp=as.character(autotyp.data$LID[i]),
											  glottocode=as.character(autotyp.data$glottolog_LID.2014[i]),
											  quotes=quotes));
		roots <- add.language.to.languageclassification( roots,
														 path=path,
														 brlens=NULL );
	}
}
attr(roots, "classif.name") <- "autotyp";
# Fix the family and language names (get rid of the codes):
for( i in seq_along(roots) ) # seq_along() is safe even if no tree was built
{
	roots[[i]] <- .fix.names(roots[[i]], quotes=quotes);
	attr(roots[[i]], "tree.name") <- str_trim(strsplit(strsplit(get.name(roots[[i]]),quotes,fixed=TRUE)[[1]][2],"[",fixed=TRUE)[[1]][1]);
}
# Save them to file:
export.languageclassification( roots, dir.name="../output", classification="autotyp", export.nexus=EXPORT_NEXUS, nexus.translate.block=EXPORT_NEXUS_TRANSLATE_BLOCK, export.csv=EXPORT_CSV );
rm(autotyp.data);
}
#################################
#
# Compute the branch lengths
#
#################################
if( COMPUTE_BRLEN )
{
# Apply various methods for computing branch length to a classification:
# Apply the requested branch-length estimation methods to all the trees of a classification and
# export the results (one set of output files per method x constant/distance combination).
# Returns TRUE on success, FALSE on any error (bad arguments, malformed/empty distance matrix, etc.).
compute.brlength.trees <- function( dir.name, # the directory where the original trees are and where the resulting trees will be saved
									classification, # the classification: there must be a trees file in ./output/classification/classification-newick.csv
									export.nexus=TRUE, # export the resulting trees to a NEXUS file?
									nexus.translate.block=TRUE, # when exporting to NEXUS, also generate a translate block?
									export.csv=TRUE, # export the resulting trees to a CSV file?
									methods="all", # the branch length methods to be used; can be "all", "none", or any subset of {"constant", "proportional", "grafen", "nnls", "ga", "nj"}
									constant=1.0, # the positive constant required by the methods "constant" and "proportional"
									dists.codes=NULL, # the data.frame of distance matrices to be used by "nnls", "ga" and "nj", and the codes used as the distance matrices' row and column names
									replace.NA.brlen.with=NA, # some methods produce NA branch length (if the actual brlen cannot be estimated from the data): replace it by another numeric value?
									restore.collapsed.singles=TRUE, # some standard methods need to collapse singles, but we can restore them
									remove.all.missing.distances=TRUE, # remove languages with missing distance info?
									parallel.mc.cores=4, # number of cores to use for the parallel parts of the export
									quotes="'" # the quoting character used around the full language names
								  )
{
	start.time <- Sys.time(); cat(paste0("Starting processing classification '",classification,"' @ ",start.time,"\n"));
	# Helper function: given a distances matrix with row and column names as language codes and which code to use, generate the corresponding submatrix with row and column full language names:
	dists.submatrix <- function( roots, dist.mat, code=c("iso","wals","autotyp","glottolog") )
	{
		# Sanity checks:
		if( is.null(roots) || length(roots) == 0 || is.null(dist.mat) || nrow(dist.mat) == 0 || ncol(dist.mat) == 0 )
		{
			return (NULL);
		}
		# Collect the languages in these trees and split their full names into the individual codes:
		lgs <- data.frame( "full"=unlist( lapply( roots, function(root){ extract.languages( root ); } ) ), "name"=NA, "iso"=NA, "wals"=NA, "autotyp"=NA, "glottolog"=NA, stringsAsFactors=FALSE );
		for( i in 1:nrow(lgs) )
		{
			tmp <- extract.name.and.codes( lgs$full[i], quotes );
			lgs$name[i] <- normalize.language.name(tmp$name);
			lgs$iso[i] <- paste0(tmp$iso,collapse="-");
			lgs$wals[i] <- paste0(tmp$wals,collapse="-");
			lgs$autotyp[i] <- paste0(tmp$autotyp,collapse="-");
			lgs$glottolog[i] <- paste0(tmp$glottolog,collapse="-");
		}
		# Expand the required code in the long format, keeping only the actually defined ones:
		# BUGFIX: the original used ifelse() here which, being vectorised over a length-1 test, silently
		# kept only the FIRST code of a multi-code language; if/else keeps them all, as intended.
		codes <- lapply( 1:nrow(lgs), function(i){ tmp <- if( lgs[i,code] == "" ) "" else strsplit(lgs[i,code],"-",fixed=TRUE)[[1]]; data.frame( "name"=lgs$full[i], "code"=tmp ); } );
		codes <- do.call( rbind, codes );
		codes <- codes[ codes$code != "", ];
		# Overlap sanity check:
		if( length( intersect( codes$code, rownames(dist.mat) ) ) == 0 )
		{
			return (NULL);
		}
		# The shared codes:
		shared.codes <- codes[ codes$code %in% rownames(dist.mat), ];
		if( is.null(shared.codes) || nrow(shared.codes) == 0 ) # BUGFIX: was length()==0, which counts a data.frame's COLUMNS and so never fired
		{
			return (NULL);
		}
		# Select the submatrix corresponding to these codes:
		dm <- dist.mat[ as.character(shared.codes$code), as.character(shared.codes$code), drop=FALSE ];
		if( is.null(dm) || !is.matrix(dm) || nrow(dm) == 0 || ncol(dm) == 0 )
		{
			return (NULL);
		}
		shared.codes <- shared.codes[ shared.codes$code %in% rownames(dm), , drop=FALSE ];
		if( is.null(shared.codes) || nrow(shared.codes) == 0 )
		{
			return (NULL);
		}
		full.language.names <- as.character(shared.codes$name);
		# For duplicated languages, keep the code that minimizes the matrix-wide amount of missing data:
		duplgs <- duplicated( shared.codes$name );
		if( sum(duplgs,na.rm=TRUE) > 0 )
		{
			# The duplicated language names:
			duplgnames <- unique(shared.codes$name[ duplgs ]);
			# All combinations of one code per duplicated language, plus the resulting number of missing entries in the distances matrix:
			lgcombs <- list();
			for( i in 1:length(duplgnames) )
			{
				lgcombs[[i]] <- shared.codes$code[ shared.codes$name==duplgnames[i] ];
			}
			lgcombs <- expand.grid( lgcombs );
			names(lgcombs) <- as.character(duplgnames); # (names are cosmetic only; columns are accessed by index below)
			lgcombs <- cbind( lgcombs, "NoNAs"=NA );
			# The non-duplicated language codes:
			nondupcodes <- as.character(shared.codes$code[ !(shared.codes$name %in% duplgnames) ]);
			# Compute the number of NAs for each combination:
			for( i in 1:nrow(lgcombs) )
			{
				# CLARITY FIX: the original wrote 1:ncol(lgcombs)-1 (i.e. 0:(ncol-1)), which only works
				# because a 0 index is silently dropped; 1:(ncol(lgcombs)-1) states the intent (all columns but "NoNAs"):
				combcodes <- c( nondupcodes, as.character(unlist(lgcombs[i,1:(ncol(lgcombs)-1)])) );
				d <- dm[ combcodes, combcodes ];
				lgcombs$NoNAs[i] <- sum( is.na(d) );
			}
			if( sum( !is.na(lgcombs$NoNAs) ) > 0 )
			{
				# Pick one that minimizes the number of NAs:
				combcodes <- c( nondupcodes, as.character(unlist(lgcombs[which.min(lgcombs$NoNAs),1:(ncol(lgcombs)-1)])) );
				dm <- dm[ combcodes, combcodes ];
				# And the corresponding language names:
				full.language.names <- c( as.character(shared.codes$name[ !(shared.codes$name %in% duplgnames) ]), as.character(duplgnames) );
			}
		}
		# Make the row and column names the full language names:
		rownames(dm) <- colnames(dm) <- full.language.names;
		# Return this submatrix:
		return (dm);
	}
	# Sanity checks:
	if( constant <= 0 )
	{
		cat( "The constant must be positive\n" );
		return (FALSE);
	}
	# Expand the methods to be used:
	if( length(methods) == 1 && methods == "none" )
	{
		# Nothing to do:
		return (TRUE);
	} else if( length(methods) == 1 && methods == "all" )
	{
		# All methods
		methods = c("constant", "proportional", "grafen", "nj", "nnls", "ga");
	} else if( !all( methods %in% c("constant", "proportional", "grafen", "nj", "nnls", "ga"), na.rm=TRUE ) )
	{
		cat( "Unknown methods\n" );
		return (FALSE);
	}
	# Read the roots from file:
	cat( paste( "Reading the trees for classification '", classification, "'...\n", sep="") );
	roots <- languageclassification( classif.name=classification, csv.file=paste(dir.name,"/",classification,"/",classification,"-newick.csv",sep=""), csv.name.column="Family", csv.tree.column="Tree" );
	if( is.null(roots) || length(roots) == 0 || !inherits(roots,"languageclassification") )
	{
		return (FALSE);
	}
	# Compute the branch lengths and export them as separate files, one per method (by distance or constant):
	for( method in methods )
	{
		# Decide which constant and/or distances to use depending on the method (and store that in the d.c.k data.frame):
		if( method == "grafen" )
		{
			d.c.k <- data.frame("Distance"=NA, "ShortName"=NA, "File"=NA, "Code"=NA, "k"=NA);
		} else if( method %in% c("constant", "proportional") )
		{
			d.c.k <- data.frame("Distance"=NA, "ShortName"=NA, "File"=NA, "Code"=NA, "k"=constant);
		} else if( method %in% c("nnls", "ga", "nj") )
		{
			d.c.k <- cbind(dists.codes, "k"=NA);
		} else
		{
			# Unknown method; how did I get here?
			next;
		}
		# Apply the method for each constant/distance combination:
		for( i in seq_len(nrow(d.c.k)) ) # seq_len() is safe even if d.c.k somehow ends up with 0 rows
		{
			# Print progress message:
			cat( paste( "\t\tProcessing the '", classification, "' trees with method '", as.character(method), "'",
						ifelse( is.na(d.c.k$k[i]), "", paste( " with constant=", sprintf("%.02f",d.c.k$k[i]), sep="" ) ),
						ifelse( is.na(d.c.k$Distance[i]), "", paste( " with distance matrix '", d.c.k$Distance[i], "'", sep="" ) ),
						"...\n", sep="" ) );
			# Load and adapt the distance matrix:
			if( !is.na(d.c.k$Distance[i]) )
			{
				# NOTE(review): load() is assumed to define an object named 'd' in this scope -- confirm against the files' creators.
				# BUGFIX: class(d) != "matrix" fails on R >= 4.0 where class(matrix) is c("matrix","array")
				# (a length-2 condition in ||/if); is.matrix() is the correct test.
				load(d.c.k$File[i]); if( is.null(d) || !is.matrix(d) || nrow(d) != ncol(d) || nrow(d) < 1 ){ cat("Distance ", d.c.k$Distance[i], " is malformed\n"); return (FALSE); }
				d <- dists.submatrix(roots, d, d.c.k$Code[i]);
				if( is.null(d) || !is.matrix(d) || nrow(d) != ncol(d) || nrow(d) < 1 ){ cat("Distance ", d.c.k$Distance[i], " is empty for classification ", classification, "\n"); return (FALSE); }
			} else
			{
				d <- NULL;
			}
			# Apply the method and export the resulting trees:
			export.languageclassification( roots,
										   dir.name=dir.name,
										   classification=classification,
										   export.nexus=export.nexus,
										   nexus.translate.block=nexus.translate.block,
										   export.csv=export.csv,
										   method=as.character(method),
										   constant=d.c.k$k[i],
										   distmatrix=d,
										   replace.NA.brlen.with=replace.NA.brlen.with,
										   restore.collapsed.singles=restore.collapsed.singles,
										   remove.all.missing.distances=remove.all.missing.distances,
										   filename.postfix=paste( "-", method,
																   ifelse( is.na(d.c.k$k[i]), "", paste( "=", sprintf("%.02f",d.c.k$k[i]), sep="") ),
																   ifelse( is.na(d.c.k$Distance[i]), "", paste( "+", d.c.k$Distance[i], sep="") ),
																   sep="" ),
										   quotes=quotes,
										   parallel.mc.cores=parallel.mc.cores );
			# Free up the space:
			rm(d);
		}
	}
	end.time <- Sys.time(); cat(paste0("Ended processing classification '",classification,"' @ ",end.time," (took ",sprintf("%.1f minutes",difftime(end.time,start.time,units="mins")),")","\n"));
	return (TRUE);
}
compute.brlength.trees <- cmpfun(compute.brlength.trees); # byte-compile for speed
# Compute the branch lengths for each classification (one parallel worker per classification):
x <- mclapply( CLASSIFICATIONS, function( classification )
{
	# Apply only the relevant mg2015 distances: drop all the mg2015 rows and put back just the one matching this classification:
	mg2015.rows <- grep("mg2015", DISTS.CODES$Distance, fixed=TRUE);
	if( length(mg2015.rows) > 0 )
	{
		dists.codes <- rbind( DISTS.CODES[ -mg2015.rows, ],
							  DISTS.CODES[ DISTS.CODES$Distance == paste0("mg2015(",classification,")"), ] );
	} else
	{
		dists.codes <- DISTS.CODES;
	}
	# Compute the branch lengths:
	compute.brlength.trees( dir.name="../output", classification=classification,
							export.nexus=EXPORT_NEXUS, nexus.translate.block=EXPORT_NEXUS_TRANSLATE_BLOCK, export.csv=EXPORT_CSV,
							methods=METHODS, constant=CONSTANT, dists.codes=dists.codes, replace.NA.brlen.with=NA, restore.collapsed.singles=TRUE,
							parallel.mc.cores=CPU_CORES, quotes=quotes );
}, mc.cores=length(CLASSIFICATIONS) );
}