In [1]:
# importing modules
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
from pprint import pprint
from scipy import stats
import statsmodels.api as sm 

In [2]:

# Load the country-level mean-height table (one row per country / sex /
# year of birth, as the preview below shows) from the raw-data folder.
# The file is decoded as ISO-8859-1 (Latin-1).
global_file = os.path.join(
    "..", "data", "rawData",
    "NCD_RisC_eLife_2016_height_age18_countries.txt",
)
global_df = pd.read_csv(global_file, encoding="ISO-8859-1")

# Row count of the untouched input, kept for later reference.
raw_count = global_df.shape[0]

# First rows of the raw table.
global_df.head()

Unnamed: 0,Country,ISO,Sex,Year of birth,Mean height (cm),Mean height lower 95% uncertainty interval (cm),Mean height upper 95% uncertainty interval (cm)
0,Afghanistan,AFG,Men,1896,161.164095,154.484285,167.754033
1,Afghanistan,AFG,Men,1897,161.196286,154.571603,167.659618
2,Afghanistan,AFG,Men,1898,161.228297,154.70734,167.602576
3,Afghanistan,AFG,Men,1899,161.260727,154.835644,167.528113
4,Afghanistan,AFG,Men,1900,161.293068,154.95954,167.508077


In [3]:
# Offset added to 'Year of birth' to derive the 'Olympic Year' column.
# NOTE(review): the source file name refers to height at age 18, yet the
# offset used here is 19 — confirm this is intentional.
age_offset = 19

# Centimetres per inch; named so the unit conversion is not a magic number.
cm_per_inch = 2.54

global_df['Olympic Year'] = global_df['Year of birth'] + age_offset

# Vectorised division over the whole column instead of a per-element
# .apply(lambda ...): identical values without a Python call per row.
global_df['Mean height (inches)'] = global_df['Mean height (cm)'] / cm_per_inch

global_df.head()

Unnamed: 0,Country,ISO,Sex,Year of birth,Mean height (cm),Mean height lower 95% uncertainty interval (cm),Mean height upper 95% uncertainty interval (cm),Olympic Year,Mean height (inches)
0,Afghanistan,AFG,Men,1896,161.164095,154.484285,167.754033,1915,63.450431
1,Afghanistan,AFG,Men,1897,161.196286,154.571603,167.659618,1916,63.463105
2,Afghanistan,AFG,Men,1898,161.228297,154.70734,167.602576,1917,63.475707
3,Afghanistan,AFG,Men,1899,161.260727,154.835644,167.528113,1918,63.488475
4,Afghanistan,AFG,Men,1900,161.293068,154.95954,167.508077,1919,63.501208


In [4]:
# Keep only the rows whose 'Sex' value is exactly 'Men'.
globalMen = global_df.loc[global_df['Sex'].eq('Men')]

# Preview the filtered rows.
globalMen.head(5)

Unnamed: 0,Country,ISO,Sex,Year of birth,Mean height (cm),Mean height lower 95% uncertainty interval (cm),Mean height upper 95% uncertainty interval (cm),Olympic Year,Mean height (inches)
0,Afghanistan,AFG,Men,1896,161.164095,154.484285,167.754033,1915,63.450431
1,Afghanistan,AFG,Men,1897,161.196286,154.571603,167.659618,1916,63.463105
2,Afghanistan,AFG,Men,1898,161.228297,154.70734,167.602576,1917,63.475707
3,Afghanistan,AFG,Men,1899,161.260727,154.835644,167.528113,1918,63.488475
4,Afghanistan,AFG,Men,1900,161.293068,154.95954,167.508077,1919,63.501208


In [5]:
# Keep only the three columns needed for the per-country comparison.
# (No intermediate preview here: a bare expression in the middle of a cell
# is evaluated but never rendered; only the cell's last expression is.)
menHeight_df = globalMen[['Country', 'Olympic Year', 'Mean height (inches)']]

# Reshape long -> wide: one row per 'Olympic Year', one column per country,
# each cell holding that country's mean height in inches.
# reset_index() turns 'Olympic Year' back into an ordinary column, so it sits
# alongside the country columns in men_pivot.
men_pivot = menHeight_df.pivot(
    index='Olympic Year',
    columns='Country',
    values='Mean height (inches)',
).reset_index()

men_pivot.head()

Country,Olympic Year,Afghanistan,Albania,Algeria,American Samoa,Andorra,Angola,Antigua and Barbuda,Argentina,Armenia,...,United Kingdom,United States of America,Uruguay,Uzbekistan,Vanuatu,Venezuela,Viet Nam,Yemen,Zambia,Zimbabwe
0,1915,63.450431,64.301236,63.144913,66.817208,64.325861,63.846909,63.666118,63.91231,63.721344,...,65.69775,67.354044,64.491937,63.419485,64.40451,62.977111,61.14088,60.828494,63.861389,64.872834
1,1916,63.463105,64.356862,63.22923,66.830218,64.361269,63.881724,63.732762,63.970173,63.785224,...,65.746774,67.393566,64.546531,63.492362,64.418471,63.0378,61.19148,60.908412,63.90606,64.905676
2,1917,63.475707,64.412639,63.313582,66.843296,64.396825,63.916712,63.799399,64.028191,63.849084,...,65.7958,67.433186,64.601137,63.565366,64.432572,63.098624,61.24209,60.988418,63.950827,64.938636
3,1918,63.488475,64.468465,63.39806,66.856563,64.432556,63.951724,63.865888,64.086284,63.91309,...,65.844868,67.472893,64.655645,63.638335,64.446783,63.159401,61.292734,61.068543,63.995537,64.97169
4,1919,63.501208,64.524351,63.482699,66.86998,64.468263,63.986702,63.932357,64.144393,63.977298,...,65.893934,67.512643,64.710131,63.711416,64.461268,63.220177,61.343342,61.148666,64.040229,65.004718


In [6]:
# Pairwise Pearson correlation between every pair of columns in the pivot.
# 'Olympic Year' is a regular column of men_pivot, so each country is also
# correlated against the year itself (first row/column of the result).
corr_matrix = men_pivot.corr(method="pearson")

# Show the top of the correlation matrix.
corr_matrix.head(n=5)


Country,Olympic Year,Afghanistan,Albania,Algeria,American Samoa,Andorra,Angola,Antigua and Barbuda,Argentina,Armenia,...,United Kingdom,United States of America,Uruguay,Uzbekistan,Vanuatu,Venezuela,Viet Nam,Yemen,Zambia,Zimbabwe
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Olympic Year,1.0,0.980255,0.97132,0.890137,0.978706,0.994131,0.891788,0.964611,0.98718,0.961804,...,0.968892,0.910052,0.96906,0.891277,0.965347,0.986696,0.970485,0.726945,0.822155,0.871913
Afghanistan,0.980255,1.0,0.996816,0.945085,0.967188,0.994113,0.960957,0.986163,0.993912,0.987551,...,0.990222,0.951628,0.990708,0.942472,0.993493,0.99313,0.987259,0.823719,0.907438,0.94734
Albania,0.97132,0.996816,1.0,0.963305,0.955567,0.987049,0.972933,0.992544,0.992791,0.994618,...,0.993625,0.964818,0.994971,0.960823,0.992561,0.993995,0.989833,0.857083,0.929357,0.96295
Algeria,0.890137,0.945085,0.963305,1.0,0.843095,0.912811,0.982902,0.978331,0.95039,0.980694,...,0.973657,0.994933,0.974336,0.999478,0.932766,0.952083,0.964753,0.959769,0.986333,0.980596
American Samoa,0.978706,0.967188,0.955567,0.843095,1.0,0.985041,0.877317,0.924761,0.954893,0.927138,...,0.931896,0.85042,0.933015,0.838554,0.973686,0.957377,0.930455,0.673957,0.786936,0.861719


In [7]:
# A hand-picked set of countries whose men's mean-height series are compared.
anova_countries = ['United States of America', 'China', 'Japan', 'Spain']

# One-way ANOVA. The null hypothesis is that all groups share the same
# population mean; it is a test about means, not about variances. A small
# p-value therefore says at least one country's mean height differs.
# NOTE(review): ANOVA assumes independent observations within each group;
# these columns are consecutive-year series, so that assumption is doubtful
# and the p-value should be read as exploratory only — confirm before relying on it.
(stat, p) = stats.f_oneway(*(men_pivot[country] for country in anova_countries))

print(f"One-way ANOVA: a small p-value means at least one country's mean height differs from the others. P-value = {p}")

I think this is saying that the variance is significant? P-value = 5.0152514677341767e-57
