In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

%matplotlib inline

In [2]:
df = pd.read_csv('drinks.csv')
df.head()

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol
0,Afghanistan,0,0,0,0.0
1,Albania,89,132,54,4.9
2,Algeria,25,0,14,0.7
3,Andorra,245,138,312,12.4
4,Angola,217,57,45,5.9


In [3]:
df.describe()

Unnamed: 0,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol
count,193.0,193.0,193.0,193.0
mean,106.160622,80.994819,49.450777,4.717098
std,101.143103,88.284312,79.697598,3.773298
min,0.0,0.0,0.0,0.0
25%,20.0,4.0,1.0,1.3
50%,76.0,56.0,8.0,4.2
75%,188.0,128.0,59.0,7.2
max,376.0,438.0,370.0,14.4


In [4]:
df.median()

beer_servings                   76.0
spirit_servings                 56.0
wine_servings                    8.0
total_litres_of_pure_alcohol     4.2
dtype: float64

In [5]:
sample = df[df['beer_servings'] != 0].sample(60)
sample

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol
133,Peru,163,160,21,6.1
51,Dominican Republic,193,147,9,6.2
55,Equatorial Guinea,92,0,233,5.8
53,Egypt,6,4,1,0.2
131,Papua New Guinea,44,39,1,1.5
136,Portugal,194,67,339,11.0
178,Tuvalu,6,41,9,1.0
29,Cabo Verde,144,56,16,4.0
41,Costa Rica,149,87,11,4.4
9,Austria,279,75,191,9.7


In [76]:
N = sample['beer_servings'].count() # número de elementos
K = int(1 + 3.3 * np.log(N)) # número de classes, usando a regra de Sturges
AT = max(sample['beer_servings']) - min(sample['beer_servings']) # Amplitude total
H = int(AT / K) # Amplitude de classe

In [8]:
# Funções de cálculo de acúmulo de frequência
def accumulatedFrequencySimple(mass):
    acc = 0
    f = []
    for n in mass:
        acc += n
        f.append(acc)
    return f

def accumulatedFrequencyPerc(mass):
    acc = 0
    f = []
    for n in mass:
        acc += n * 100
        f.append(acc)
    return f

In [25]:
freqs = sample.get('beer_servings').value_counts()
values = freqs.values
size = sample.get('beer_servings').size
accRelFreqs = accumulatedFrequencySimple(values)
relFreqs = [n / size for n in values]
percFreqs = [f'{x * 100}' for x in relFreqs]
accPercFreqs = accumulatedFrequencyPerc(relFreqs)

## Tabela de distribuição

In [28]:
distribution_table = pd.DataFrame({'xi': freqs.keys(), 'fi': values, 'Fi': accRelFreqs, 'fi%': percFreqs, 'Fi%': accPercFreqs})
distribution_table

Unnamed: 0,xi,fi,Fi,fi%,Fi%
0,8,3,3,5.0,5.0
1,149,2,5,3.333333333333333,8.333333
2,6,2,7,3.333333333333333,11.666667
3,16,1,8,1.6666666666666667,13.333333
4,44,1,9,1.6666666666666667,15.0
5,42,1,10,1.6666666666666667,16.666667
6,297,1,11,1.6666666666666667,18.333333
7,163,1,12,1.6666666666666667,20.0
8,147,1,13,1.6666666666666667,21.666667
9,285,1,14,1.6666666666666667,23.333333


## Tabela de distribuição agrupada em classes

In [140]:
#Calculando rol e classes

rol = sorted(freqs.keys())
classes = [(rol[0], rol[0]+H)]
for i in range(rol[0]+H, rol[-1], H):
    classes.append((i, i+H))

if len(classes) > K:
    classes = classes[:-1]
    cl = classes[-1]
    classes[-1] = (cl[0], rol[-1])


freqsClasses = []
def calcClassFreqs(mass, classes):
    f = []
    for cl in classes:
        c = 0
        for n in mass:
            if cl == classes[-1]:
                if cl[0] <= n <= cl[1]:
                    c += 1
            else:
                if cl[0] <= n < cl[1]:
                    c += 1
        f.append(c)
    return f

# Calculando campos de frequência
v = sample.get('beer_servings').values
classFreqs = calcClassFreqs(v, classes)
accClassFreqs = accumulatedFrequencySimple(classFreqs)
percClassFreqs = [n / size for n in classFreqs]
accPercClassFreqs = accumulatedFrequencyPerc(percClassFreqs)
table_d = pd.DataFrame({'Classe': [x+1 for x in range(len(classes))],
                       'Xi': [f'{x[0]} a {x[1]}' for x in classes],
                       'fi': classFreqs,
                       'Fi': accClassFreqs,
                       'fi%': [x*100 for x in percClassFreqs],
                       'Fi%': accPercClassFreqs})
                        
table_d

Unnamed: 0,Classe,Xi,fi,Fi,fi%,Fi%
0,1,1 a 26,14,14,23.333333,23.333333
1,2,26 a 51,3,17,5.0,28.333333
2,3,51 a 76,4,21,6.666667,35.0
3,4,76 a 101,9,30,15.0,50.0
4,5,101 a 126,4,34,6.666667,56.666667
5,6,126 a 151,8,42,13.333333,70.0
6,7,151 a 176,3,45,5.0,75.0
7,8,176 a 201,2,47,3.333333,78.333333
8,9,201 a 226,3,50,5.0,83.333333
9,10,226 a 251,4,54,6.666667,90.0


## Correlação
#### oferta de cerveja e litros de puro álcool

In [103]:
sample.get('beer_servings').corr(sample.get('total_litres_of_pure_alcohol'))
# 0.7 < c < 0.9 = correlação forte

0.7940926903884363