In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import seaborn as sns
import numpy as np
import pymysql
import sqlalchemy as alch
from getpass import getpass
import os

# MySQL Queries

### Hypotheses

1. Endangered languages are more concentrated in specific geographic regions.
2. Countries with higher population density tend to have a higher linguistic diversity.
3. Higher levels of urbanization in a country are associated with a shift towards dominant languages and a decline in the use of minority languages.
4. The type of political system in a country is related to the number of languages spoken, with democratic countries having higher linguistic diversity. (My hipothesis is that Authoritarian countries may have the same or even more linguistic diversity but they don't recognise other languages and they don't give them any status, therefore we don't have enough data. On the other side, democratic countries probably recognise and protect more languages, and provide more data related to them).
5. The current number of endangered languages in a country is positively correlated with the historical incidence of language extinction in that country and the presence of a dominant language from the top 5 most spoken languages in the world. 

In [2]:
# I import the dataframes I'm going to use in SQL

extinct_languages = pd.read_csv("/Users/usuari/Desktop/Ironhack/BOOTCAMP/projects/project-4-sql-tableau/data/extinct_languages.csv")
top_languages = pd.read_csv("/Users/usuari/Desktop/Ironhack/BOOTCAMP/projects/project-4-sql-tableau/data/top_languages.csv")
country_lan = pd.read_csv("/Users/usuari/Desktop/Ironhack/BOOTCAMP/projects/project-4-sql-tableau/data/country_lan.csv")

In [3]:
password = getpass()

········


In [4]:
# I have created a database in MySQL called "languages" and now I add my three dataframes as tables: 

In [5]:
dbName = "languages"
connectionData = f"mysql+pymysql://root:{password}@localhost/{dbName}"
engine = alch.create_engine(connectionData)
extinct_languages.to_sql("extinct_languages.csv", if_exists="append", con=engine, index=False)

2722

In [6]:
dbName = "languages"
connectionData = f"mysql+pymysql://root:{password}@localhost/{dbName}"
engine = alch.create_engine(connectionData)
top_languages.to_sql("top_languages.csv", if_exists="append", con=engine, index=False)

45

In [7]:
dbName = "languages"
connectionData = f"mysql+pymysql://root:{password}@localhost/{dbName}"
engine = alch.create_engine(connectionData)
country_lan.to_sql("country_lan.csv", if_exists="append", con=engine, index=False)

194

## 1. Endangered languages are more concentrated in specific geographic regions.

In [23]:
# With this query I want to extract the top 6 countries with more endangered languages

query = '''
WITH endangered_countries AS (
    SELECT countries, country_code, COUNT(countries) AS count_endangered_languages
    FROM `extinct_languages.csv`
    GROUP BY countries, country_code
)

SELECT *
FROM endangered_countries
ORDER BY count_endangered_languages DESC
limit 6; 
'''
pd.read_sql_query(query, engine)

Unnamed: 0,countries,country_code,count_endangered_languages
0,United States of America,USA,216
1,Brazil,BRA,186
2,India,IND,163
3,Indonesia,IDN,149
4,Mexico,MEX,143
5,China,CHN,122


In [105]:
# Now I'm extracting the coordinates and regions of these countries

query = '''
SELECT country, region, continent, latitude, longitude
FROM `country_lan.csv`
WHERE country LIKE '%%united states%%'
   OR country LIKE '%%brazil%%'
   OR country LIKE '%%india%%'
   OR country LIKE '%%indonesia%%'
   OR country LIKE '%%mexico%%'
   OR country LIKE '%%china%%';
'''

pd.read_sql_query(query, engine)

Unnamed: 0,country,region,continent,latitude,longitude
0,Brazil,South America,Americas,-10.0,-55.0
1,China,Eastern Asia,Asia,35.0,105.0
2,India,Southern Asia,Asia,20.0,77.0
3,Indonesia,South-Eastern Asia,Asia,-5.0,120.0
4,Mexico,Central America,Americas,23.0,-102.0
5,United States,Northern America,Americas,38.0,-97.0


## 2. Countries with higher population density tend to have a higher linguistic diversity.

In [35]:
# Here I'm checking which are the 10 countries with more population.

query = '''
SELECT country, latitude, longitude, population, languages_spoken, count_languages
FROM `country_lan.csv`
order by population desc
limit 10;
'''

pd.read_sql_query(query, engine)

Unnamed: 0,country,latitude,longitude,population,languages_spoken,count_languages
0,India,20.0,77.0,1417173173,"Hindi 30%, English, Bengali, Gujarati, Kashmir...",19
1,China,35.0,105.0,1412175000,"Standard Chinese (Mandarin/Putonghua), Yue (Ca...",10
2,United States,38.0,-97.0,333287557,"English 82%, Spanish 11% (2000)",2
3,Indonesia,-5.0,120.0,275501339,"Bahasa Indonesia (official), English, Dutch, J...",7
4,Pakistan,30.0,70.0,235824862,"Urdu 8%, English (both official); Punjabi 48%,...",13
5,Nigeria,10.0,8.0,218541212,"English (official), Hausa, Yoruba, Igbo, Fulan...",7
6,Brazil,-10.0,-55.0,215313498,"Portuguese (official), Spanish, English, French",4
7,Bangladesh,24.0,90.0,171186372,"Bangla (official), English",2
8,Russia,60.0,100.0,143555736,"Russian, others",2
9,Mexico,23.0,-102.0,127504125,"Spanish, various Mayan, Nahuatl, and other reg...",5


In [36]:
# And here I want to extract the countries in which there are more spoken languages. 

query = '''
SELECT country, latitude, longitude, population, languages_spoken, count_languages
FROM `country_lan.csv`
order by count_languages desc
limit 10;
'''

pd.read_sql_query(query, engine)

Unnamed: 0,country,latitude,longitude,population,languages_spoken,count_languages
0,India,20.0,77.0,1417173173,"Hindi 30%, English, Bengali, Gujarati, Kashmir...",19
1,Pakistan,30.0,70.0,235824862,"Urdu 8%, English (both official); Punjabi 48%,...",13
2,Philippines,13.0,122.0,115559009,"Filipino (based on Tagalog), English (both off...",11
3,Palau,7.5,134.5,18055,"Palauan 64.7%, English 9.4%, Sonsoralese, Tobi...",11
4,Singapore,1.366667,103.8,5637022,"Mandarin 35%, English 23%, Malay 14.1%, Hokkie...",10
5,China,35.0,105.0,1412175000,"Standard Chinese (Mandarin/Putonghua), Yue (Ca...",10
6,Iran,32.0,53.0,88550570,"Persian and Persian dialects 58%, Turkic and T...",10
7,Malaysia,2.5,112.5,33938221,"Bahasa Melayu (Malay, official), English, Chin...",10
8,Micronesia,6.916667,158.25,114164,"English (official, common), Chukese, Pohnpeian...",9
9,Zambia,-15.0,30.0,20017675,"English (official); major vernaculars: Bemba, ...",9


In [106]:

query = '''
SELECT country, latitude, longitude, population, languages_spoken, count_languages
FROM `country_lan.csv`
order by population asc
limit 10;
'''

pd.read_sql_query(query, engine)

Unnamed: 0,country,latitude,longitude,population,languages_spoken,count_languages
0,Tuvalu,-8.0,178.0,11312,"Tuvaluan, English, Samoan, Kiribati (on the is...",4
1,Nauru,-0.533333,166.916667,12668,"Nauruan (official), English",2
2,Palau,7.5,134.5,18055,"Palauan 64.7%, English 9.4%, Sonsoralese, Tobi...",11
3,San Marino,43.766667,12.416667,33660,Italian,1
4,Monaco,43.733333,7.4,36469,"French (official), English, Italian, Monégasque",4
5,Liechtenstein,47.266667,9.533333,39327,"German (official), Alemannic dialect",2
6,Marshall Islands,9.0,168.0,41569,Marshallese 98% (two major dialects from the M...,3
7,St. Kitts and Nevis,17.333333,-62.75,47657,English,1
8,Dominica,15.416667,-61.333333,72737,English (official) and French patois,3
9,Andorra,42.5,1.5,79824,"Catalán (official), French, Castilian, Portuguese",4


## 3. Higher levels of urbanization in a country are associated with a shift towards dominant languages and a decline in the use of minority languages.

In [47]:
# I'm checking which are the countries with more urban land. 

query = '''
SELECT country, latitude, longitude, urban_land, languages_spoken, count_languages
FROM `country_lan.csv`
order by urban_land desc
limit 9;
'''

pd.read_sql_query(query, engine)

Unnamed: 0,country,latitude,longitude,urban_land,languages_spoken,count_languages
0,China,35.0,105.0,522345.0,"Standard Chinese (Mandarin/Putonghua), Yue (Ca...",10
1,United States,38.0,-97.0,186573.0,"English 82%, Spanish 11% (2000)",2
2,India,20.0,77.0,171839.0,"Hindi 30%, English, Bengali, Gujarati, Kashmir...",19
3,Indonesia,-5.0,120.0,67755.1,"Bahasa Indonesia (official), English, Dutch, J...",7
4,Bangladesh,24.0,90.0,56970.0,"Bangla (official), English",2
5,Japan,36.0,138.0,53452.1,Japanese,1
6,Russia,60.0,100.0,52887.0,"Russian, others",2
7,Brazil,-10.0,-55.0,45853.3,"Portuguese (official), Spanish, English, French",4
8,Pakistan,30.0,70.0,39082.0,"Urdu 8%, English (both official); Punjabi 48%,...",13


In [46]:
# Here instead I'm extracting the countries with more urban population.

query = '''
SELECT country, latitude, longitude, urban_population, languages_spoken, count_languages
FROM `country_lan.csv`
order by urban_population desc
limit 9;
'''

pd.read_sql_query(query, engine)

Unnamed: 0,country,latitude,longitude,urban_population,languages_spoken,count_languages
0,China,35.0,105.0,897578430,"Standard Chinese (Mandarin/Putonghua), Yue (Ca...",10
1,India,20.0,77.0,508368361,"Hindi 30%, English, Bengali, Gujarati, Kashmir...",19
2,United States,38.0,-97.0,276908634,"English 82%, Spanish 11% (2000)",2
3,Brazil,-10.0,-55.0,188517733,"Portuguese (official), Spanish, English, French",4
4,Indonesia,-5.0,120.0,159608946,"Bahasa Indonesia (official), English, Dutch, J...",7
5,Nigeria,10.0,8.0,116965442,"English (official), Hausa, Yoruba, Igbo, Fulan...",7
6,Japan,36.0,138.0,115058684,Japanese,1
7,Russia,60.0,100.0,107847682,"Russian, others",2
8,Mexico,23.0,-102.0,103660854,"Spanish, various Mayan, Nahuatl, and other reg...",5


## 4. The type of political system in a country is related to the number of languages spoken, with democratic countries having higher linguistic diversity.

In [78]:
# These are the 10 countries with the best democracy_score

query = '''
SELECT country, continent, democracy_score, democracy_type, languages_spoken, count_languages
    FROM
        `country_lan.csv`
    ORDER BY
        democracy_score DESC
    LIMIT 10

'''
pd.read_sql_query(query, engine)

Unnamed: 0,country,continent,democracy_score,democracy_type,languages_spoken,count_languages
0,Norway,Europe,9.87,Full democracy,"Bokmål Norwegian, Nynorsk Norwegian (both offi...",4
1,Iceland,Europe,9.58,Full democracy,"Icelandic, English, Nordic languages, German w...",5
2,Sweden,Europe,9.39,Full democracy,"Swedish, small Sami- and Finnish-speaking mino...",3
3,New Zealand,Oceania,9.26,Full democracy,"English, Maori (both official)",2
4,Denmark,Europe,9.22,Full democracy,"Danish, Faroese, Greenlandic (Inuit dialect), ...",6
5,Canada,Americas,9.15,Full democracy,"English 59.3%, French 23.2% (both official); o...",3
6,Ireland,Europe,9.15,Full democracy,"English, Irish (Gaelic) (both official)",3
7,Finland,Europe,9.14,Full democracy,"Finnish 92%, Swedish 6% (both official); small...",5
8,Australia,Oceania,9.09,Full democracy,"English 79%, native and other languages",3
9,Switzerland,Europe,9.03,Full democracy,"German 64%, French 20%, Italian 7% (all offici...",4


In [76]:
# I'm going to calculate the average of spoken languages in the top ten countries with the best democracy_score. 

query = '''
WITH preselection AS (
    SELECT
        country,
        democracy_score,
        democracy_type,
        languages_spoken,
        count_languages
    FROM
        `country_lan.csv`
    ORDER BY
        democracy_score DESC
    LIMIT 10
)

SELECT
    AVG(count_languages) AS average_count_languages
FROM
    preselection;

'''
pd.read_sql_query(query, engine)

Unnamed: 0,average_count_languages
0,3.8


In [84]:
# These are the 10 countries with the worst democracy_score, authoritarian regimes.

query = '''
SELECT country, continent, democracy_score, democracy_type, languages_spoken, count_languages
    FROM
        `country_lan.csv`
    where democracy_score >= 1
    and count_languages >= 1
    ORDER BY
        democracy_score asc
    LIMIT 10

'''
pd.read_sql_query(query, engine)

Unnamed: 0,country,continent,democracy_score,democracy_type,languages_spoken,count_languages
0,Central African Republic,Africa,1.52,Authoritarian,"French (official), Sangho (lingua franca, nati...",3
1,Chad,Africa,1.61,Authoritarian,"French, Arabic (both official); Sara; more tha...",5
2,Turkmenistan,Asia,1.72,Authoritarian,"Turkmen 72%; Russian 12%; Uzbek 9%, other 7%",4
3,Equatorial Guinea,Africa,1.92,Authoritarian,"Spanish, French (both official); pidgin Englis...",6
4,Saudi Arabia,Asia,1.93,Authoritarian,Arabic,1
5,Tajikistan,Asia,1.93,Authoritarian,"Tajik (official), Russian widely used in gover...",3
6,Yemen,Asia,1.95,Authoritarian,Arabic,1
7,Guinea-Bissau,Africa,1.98,Authoritarian,"Portuguese (official), Criolo, African languages",3
8,Uzbekistan,Asia,2.01,Authoritarian,"Uzbek 74.3%, Russian 14.2%, Tajik 4.4%, other ...",4
9,Sudan,Africa,2.15,Authoritarian,"Arabic (official), Nubian, Ta Bedawie, diverse...",7


In [88]:
# Now I'm going to calculate the average of spoken languages in the top ten countries with the worst democracy_score. 

query = '''
WITH preselection AS (
    SELECT
        country,
        democracy_score,
        democracy_type,
        languages_spoken,
        count_languages
    FROM
        `country_lan.csv`
    where democracy_score >= 1
    and count_languages >= 1
    ORDER BY
        democracy_score asc
    LIMIT 10
)

SELECT
    AVG(count_languages) AS average_count_languages
FROM
    preselection;

'''
pd.read_sql_query(query, engine)

# There really isn't many difference, compared to the average of spoken languages in the countries with best democratic score.

Unnamed: 0,average_count_languages
0,3.7


In [89]:
# Here I'm checking again which are the top 10 countries with more endangered languages. 

query = '''
WITH endangered_countries AS (
    SELECT countries, country_code, COUNT(countries) AS count_endangered_languages
    FROM `extinct_languages.csv`
    GROUP BY countries, country_code
)

SELECT *
FROM endangered_countries
ORDER BY count_endangered_languages DESC
limit 10; 
'''
pd.read_sql_query(query, engine)

Unnamed: 0,countries,country_code,count_endangered_languages
0,United States of America,USA,216
1,Brazil,BRA,186
2,India,IND,163
3,Indonesia,IDN,149
4,Mexico,MEX,143
5,China,CHN,122
6,Russian Federation,RUS,119
7,Australia,AUS,108
8,Papua New Guinea,PNG,98
9,Canada,CAN,82


In [91]:
# And now I want to see which form of governament have the previous countries. 

query = '''
SELECT country, democracy_score, democracy_type, languages_spoken, count_languages
FROM `country_lan.csv`
WHERE country LIKE '%%united states%%'
   OR country LIKE '%%brazil%%'
   OR country LIKE '%%india%%'
   OR country LIKE '%%indonesia%%'
   OR country LIKE '%%mexico%%'
   OR country LIKE '%%china%%'
   OR country LIKE '%%russia%%'
   OR country LIKE '%%australia%%'
    OR country LIKE '%%papua new guinea%%'
     OR country LIKE '%%canada%%';
'''

pd.read_sql_query(query, engine)

Unnamed: 0,country,democracy_score,democracy_type,languages_spoken,count_languages
0,Australia,9.09,Full democracy,"English 79%, native and other languages",3
1,Brazil,6.97,Flawed democracy,"Portuguese (official), Spanish, English, French",4
2,Canada,9.15,Full democracy,"English 59.3%, French 23.2% (both official); o...",3
3,China,3.32,Authoritarian,"Standard Chinese (Mandarin/Putonghua), Yue (Ca...",10
4,India,7.23,Flawed democracy,"Hindi 30%, English, Bengali, Gujarati, Kashmir...",19
5,Indonesia,6.39,Flawed democracy,"Bahasa Indonesia (official), English, Dutch, J...",7
6,Mexico,6.19,Flawed democracy,"Spanish, various Mayan, Nahuatl, and other reg...",5
7,Papua New Guinea,6.03,Flawed democracy,"Tok Pisin (Melanesian Pidgin, the lingua franc...",4
8,Russia,2.94,Authoritarian,"Russian, others",2
9,United States,7.96,Flawed democracy,"English 82%, Spanish 11% (2000)",2


## 5. The current number of endangered languages in a country is correlated with the historical incidence of language extinction in that country and the presence of a dominant language (from the top languages spoken in the world).

In [100]:
# With this query I want to extract the top 10 laguages most spoken in the world. 

query = '''
SELECT language, total_speakers
FROM `top_languages.csv`
order by total_speakers desc
limit 5;
'''
pd.read_sql_query(query, engine)

Unnamed: 0,language,total_speakers
0,English(excl. creole languages),1452000000.0
1,"Mandarin Chinese(incl. Standard Chinese, but e...",1118000000.0
2,Hindi(excl. Urdu),602200000.0
3,Spanish,548300000.0
4,French,274100000.0


In [101]:
# Now I'm checking again which countries have more endangered languages.

query = '''
WITH endangered_countries AS (
    SELECT countries, country_code, COUNT(countries) AS count_endangered_languages
    FROM `extinct_languages.csv`
    GROUP BY countries, country_code
)

SELECT *
FROM endangered_countries
ORDER BY count_endangered_languages DESC
limit 5; 
'''
pd.read_sql_query(query, engine)

Unnamed: 0,countries,country_code,count_endangered_languages
0,United States of America,USA,216
1,Brazil,BRA,186
2,India,IND,163
3,Indonesia,IDN,149
4,Mexico,MEX,143


In [102]:
# I also want to find out which are the countries that have more extinct languages.

query = '''
WITH extinct_countries AS (
    SELECT countries, country_code, COUNT(countries) AS count_endangered_languages
    FROM `extinct_languages.csv`
    where degree_of_endangerment = 'Extinct'
    GROUP BY countries, country_code
)

SELECT *
FROM extinct_countries
ORDER BY count_endangered_languages DESC
limit 5; 
'''
pd.read_sql_query(query, engine)

Unnamed: 0,countries,country_code,count_endangered_languages
0,United States of America,USA,54
1,Russian Federation,RUS,24
2,Indonesia,IDN,12
3,Brazil,BRA,12
4,China,CHN,10


## Conclusions:

1. In these regions, there is a higher concentration of endangered languages: Northern, Central, and Southern America, as well as Eastern and Southern Asia.

2. With the data provided from the dataset of spoken Languages per country, there's not an evident correlation between the count of registered languages and the population of each country. However, according to this data and our visualizations, we can see that there are a lot of unnamed languages and dialects, therefore a higher linguistic diversity, in the countries with the highest population.

3. The countries with the highest urban population are also the ones with more endangered languages. Also, their official languages are among the top 10 spoken languages in the world. 

4. We may think that the type of political system in a country is related to the number of languages spoken, with democratic countries having higher linguistic diversity. However, this analysis has showed that the type of countries where there are more vulnerables languages are flawed democracies, rather than authoritarian regimes.

5. The United States is the nation where the prevalence of a dominant language (English) has posed a threat to and caused the disappearance of the highest number of languages.