## Chocolate Consumption, Cognitive Function, and Nobel Laureates

"Chocolate consumption could hypothetically improve cognitive function not only in individuals but in whole populations. Could there be a correlation between a country's level of chocolate consumption and its total number of Nobel laureates per capita?"

http://www.nejm.org/doi/full/10.1056/NEJMon1211064


In [None]:
# Background: Alfred Nobel, the Swedish scientist, established the prizes in 1895.
# Prizes are awarded roughly every year in six categories:
# physics, chemistry, physiology or medicine, literature, peace, economic sciences.
#
# The figures used here were inspired by the article above, taken from
# Wikipedia for 2017, and stored as a comma-separated file.
file = 'Data/ChocNobels2017small.csv'

In [None]:
# Pandas is an open source library providing easy-to-use data structures and data analysis tools for Python.
# Import library
import pandas as pd

# Import plotting library from matplotlib
import matplotlib.pyplot as plt

In [None]:
# Load the comma-separated values (csv) file into a pandas DataFrame
data = pd.read_csv(filepath_or_buffer=file)

In [None]:
# Look at the data: a bare expression on the last line of a notebook cell
# is displayed as that cell's output
data

In [None]:
# Show the first 5 rows (the head of the table)
data.head(n=5)

In [None]:
# Show the last 5 rows (the tail of the table)
data.tail(n=5)

In [None]:
# Print a concise summary of the DataFrame (DataFrame.info)
data.info()

In [None]:
# Use the 'Country' column as the row index (modifies `data` in place)
data.set_index(keys='Country', inplace=True)

In [None]:
# Show the first 5 rows again to see the new index
data.head(n=5)

In [None]:
# Rename column headings to easier names.
# The assignment is positional: it assumes exactly three columns remain after
# 'Country' became the index, in this order - TODO confirm against the CSV header.
data.columns = ['Population','Chocolate','Nobels']

In [None]:
# Show the first 5 rows to check the new column headings
data.head(n=5)

In [None]:
# There are numerous libraries for plotting and data visualisation
# Inline means your matplotlib graphs will be included in your notebook
%matplotlib inline
# runtime configuration (rc) parameters allow customization
from matplotlib import rcParams
rcParams['figure.figsize'] = 12, 6

In [None]:
# Pull out the 'Nobels' column as a Series; it keeps the DataFrame's index
nobels = data['Nobels']

In [None]:
# Default (line) plot of the Nobel laureates column, one point per country
nobels.plot.line()

In [None]:
# Countries are distinct categories, so a bar graph is the better fit
nobels.plot.bar()

In [None]:
# Same bar graph, now with a title and one colour for every bar
nobels.plot.bar(title='Nobel Laureates per Country', color='orange')

In [None]:
# Keep a handle on the plot's axes so it can be customised afterwards
bar_plot_nobels = nobels.plot(kind='bar', color='orange')

# Title first, then the y-axis label
bar_plot_nobels.set_title('Nobel Laureates per Country')
bar_plot_nobels.set_ylabel('Nobel laureates per 10 million population')

In [None]:
# Sort the rows by the 'Nobels' column in descending order, then display the result
data_sort_nobels = data.sort_values(by='Nobels', ascending=False)
data_sort_nobels

In [None]:
# Draw the sorted 'Nobels' column as horizontal bars (kind='barh')
sorted_nobels = data_sort_nobels['Nobels']
barh_plot_nobels = sorted_nobels.plot(kind='barh', color='orange')

In [None]:
# Base font size shared by the labels in the plots that follow
fsize = 15

# Vertical bar plot of the sorted values
bar_plot_nobels = sorted_nobels.plot(kind='bar', color='orange')

# Title is 10 points larger than the axis label
bar_plot_nobels.set_title('Nobel Laureates per Country', fontsize=fsize + 10)
bar_plot_nobels.set_ylabel('Nobel laureates per 10 million population', fontsize=fsize)

In [None]:
# Now the other variable: chocolate consumption
chocolate = data['Chocolate']

bar_plot_choc = chocolate.plot(kind='bar', color='darkblue')

# Title is 5 points larger than the axis label
bar_plot_choc.set_title('Chocolate consumption per country', fontsize=fsize + 5)
bar_plot_choc.set_ylabel('Chocolate consumption (kg/year/capita)', fontsize=fsize)


In [None]:
# Display the chocolate Series (values labelled by the DataFrame's index)
chocolate

In [None]:
# Rows are counted from position 0. 'Country' was set as the index earlier,
# so position 0 of the index gives the label of the first row, not the number 0
chocolate.index[0]

In [None]:
# The fourth row sits at position 3; this returns its index label
# (the index was set to 'Country' earlier), not the number 3
chocolate.index[3]

In [None]:
# One bar per country; the country names on the index become the x positions
plt.bar(x=chocolate.index, height=chocolate, color="darkblue")

plt.title('Chocolate Consumption for different countries', fontsize=fsize+8)
plt.xlabel('Country', fontsize=fsize)
plt.ylabel('Chocolate consumption (kg/year/capita)', fontsize=fsize)

# The x-axis ticks are the country names.
# NOTE: rcParams is global configuration, so this setting persists for later plots
rcParams['xtick.labelsize'] = fsize
plt.xticks(rotation=70)

# Adjust the layout so the axis labels fit inside the figure area
plt.tight_layout()

# Save before show()
plt.savefig("Chocolate.png")
plt.show()


In [None]:
# One bar per country for the Nobel laureate figures
plt.bar(x=nobels.index, height=nobels, color="orange")

plt.title('Number of Nobel Laureates from different countries per 10 million population', fontsize=fsize+3)
plt.xlabel('Country', fontsize=fsize)
plt.ylabel('Nobel Laureates per 10 million population', fontsize=fsize)
plt.xticks(rotation=70)

# Adjust the layout so the axis labels fit
plt.tight_layout()

# Save before show()
plt.savefig("Nobels.png")
plt.show()


In [None]:
# Scatter plot: chocolate consumption on x, Nobel laureates on y
plt.scatter(x=chocolate, y=nobels)
plt.title('Chocolate Consumption vs Nobel Laureates', fontsize=14)
plt.xlabel('Chocolate consumption (kg/year/capita)')
plt.ylabel('Nobel laureates per 10 million population')

plt.show()

In [None]:
# Already defined variables
# nobels = data['Nobels']
# chocolate = data['Chocolate']

# Print each country followed by its Nobel and chocolate values.
# Both Series are labelled by country name, so an integer key such as
# nobels[i] relies on pandas' deprecated positional fallback. Walking the
# index and the two Series in lockstep avoids integer keys altogether.
number_entries = len(data)  # kept: later cells may read this name
for country, nobel_value, chocolate_value in zip(data.index, nobels, chocolate):
    print(country, nobel_value, chocolate_value)

In [None]:
# Scatter plot with axis labels, a title, and each point labelled by country
plt.scatter(chocolate, nobels)
plt.ylabel('Nobel laureates per 10 million population')
plt.xlabel('Chocolate consumption (kg/year/capita)')
plt.title('Chocolate Consumption vs Nobel Laureates', fontsize=14)

number_entries = len(data)  # kept: later cells may read this name

# Iterate the country labels and both Series together. Integer keys such as
# chocolate[i] on a label-indexed Series use a deprecated positional fallback.
for country, x, y in zip(data.index, chocolate, nobels):
    # Put the label text exactly at the data point
    plt.annotate(country, xy=(x, y), xytext=(x, y))

plt.show()

In [None]:
# Scatter plot with axis labels, a title, and country labels nudged so that
# overlapping text becomes readable
plt.scatter(chocolate, nobels, color='green', edgecolor='darkblue')
plt.ylabel('Nobel laureates per 10 million population')
plt.xlabel('Chocolate consumption (kg/year/capita)')
plt.title('Chocolate Consumption vs Nobel Laureates', fontsize=14)

# Countries whose label overlaps a neighbour: move the text up or down by 1
labels_moved_up = {'France', 'Netherlands', 'Norway'}
labels_moved_down = {'Austria', 'USA', 'Finland'}

# Every label is shifted slightly to the right of its marker
offset = 0.1

# Iterate the country labels and both Series together. This avoids integer
# keys on a label-indexed Series (deprecated positional fallback) and does not
# depend on a row count computed in another cell.
for country, x, y in zip(data.index, chocolate, nobels):
    xtext = x + offset
    ytext = y
    if country in labels_moved_up:
        ytext = y + 1
    elif country in labels_moved_down:
        ytext = y - 1

    plt.annotate(country, xy=(x, y), xytext=(xtext, ytext))

plt.show()

In [None]:
# Make some aesthetic improvements
# Increase size of figure (global rc setting, so it also applies to later figures)
rcParams['figure.figsize'] = 15, 8

# Set axes limits as [xmin, xmax, ymin, ymax]
plt.axis([0, 12, -1, 35])

# Increase size of marker
plt.scatter(chocolate, nobels, color='green', edgecolor='darkblue', s=100)

# Both axis labels use the same font size
plt.ylabel('Nobel laureates per 10 million population', fontsize=14)
plt.xlabel('Chocolate consumption (kg/year/capita)', fontsize=14)
plt.title('Nobel Laureates vs Chocolate Consumption', fontsize=18)

# Countries whose label overlaps a neighbour: move the text up or down by 1
labels_moved_up = {'France', 'Netherlands', 'Norway'}
labels_moved_down = {'Austria', 'USA'}

# Label each point with its country name. The index and the two columns are
# iterated together instead of using chained lookups like data['Chocolate'][ind].
for country, x, y in zip(data.index, data['Chocolate'], data['Nobels']):
    ytext = y
    if country in labels_moved_up:
        ytext = y + 1
    elif country in labels_moved_down:
        ytext = y - 1

    # Text sits slightly to the right of the marker
    plt.annotate(country, xy=(x, y), xytext=(x + 0.1, ytext))

plt.show()

In [None]:
# Add regression line
from scipy import stats
import numpy as np

print('R-squared is a statistical measure of how close the data are to the fitted regression line.')
print('A statistically significant result is one where the observed p-value is less than 5%\n')

plt.scatter(chocolate, nobels, color='darkgreen', edgecolor='lightgreen', s=100)

# Least-squares straight-line fit of nobels against chocolate
slope, intercept, r_value, p_value, std_err = stats.linregress(chocolate, nobels)

# linregress returns the correlation coefficient r; R-squared is its square
print('R-squared value=', r_value**2, " p value=", p_value)

if p_value < 0.05:
    print('Statistically significant result!!!')

# Draw the fitted line. The colour is given once, by keyword; the previous
# extra 'r' format string asked for red and conflicted with color='darkgreen'.
plt.plot(chocolate, intercept + slope*chocolate, label='fit', color='darkgreen')

plt.ylabel('Nobel laureates per 10 million population', fontsize=14)
plt.xlabel('Chocolate consumption (kg/year/capita)', fontsize=14)
plt.title('Nobel Laureates vs Chocolate Consumption', fontsize=18)

# Countries whose label overlaps a neighbour: move the text up or down by 1
labels_moved_up = {'France', 'Netherlands', 'Norway'}
labels_moved_down = {'Austria', 'USA'}

# Label each point with its country name
for country, x, y in zip(data.index, data['Chocolate'], data['Nobels']):
    ytext = y
    if country in labels_moved_up:
        ytext = y + 1
    elif country in labels_moved_down:
        ytext = y - 1

    # Text sits slightly to the right of the marker
    plt.annotate(country, xy=(x, y), xytext=(x + 0.1, ytext))

# Save before show()
plt.savefig("ChocolateNobels.png")

plt.show()

In [None]:
# Use seaborn, a Python visualization library based on matplotlib.
# It provides a high-level interface for drawing attractive statistical graphics.
# You may have to ! pip install seaborn
import seaborn as sb
sb.set_style('whitegrid')

# Set axes limits as [xmin, xmax, ymin, ymax]
plt.axis([0, 12, -1, 35])

# Regplot: plot data and a linear regression model fit (with a confidence
# interval band). x and y are passed by keyword because recent seaborn
# releases no longer accept them positionally.
sb.regplot(x=chocolate, y=nobels, color='red')

plt.ylabel('Nobel laureates per 10 million population', fontsize=14)
plt.xlabel('Chocolate consumption (kg/year/capita)', fontsize=14)
plt.title('Nobel Laureates vs Chocolate Consumption', fontsize=18)

# Countries whose label overlaps a neighbour: move the text up or down by 1.
# The other cells spell the country 'Netherlands' while this one used
# 'The Netherlands'; both are accepted here - TODO confirm against the CSV.
labels_moved_up = {'France', 'Netherlands', 'The Netherlands', 'Norway'}
labels_moved_down = {'Austria', 'USA'}

# Label each point with its country name
for country, x, y in zip(data.index, data['Chocolate'], data['Nobels']):
    ytext = y
    if country in labels_moved_up:
        ytext = y + 1
    elif country in labels_moved_down:
        ytext = y - 1

    # Text sits slightly to the right of the marker
    plt.annotate(country, xy=(x, y), xytext=(x + 0.1, ytext))

# No file extension is given here, so matplotlib picks the output format
plt.savefig("ChocNobels")
plt.show()
