In [1]:
# Descriptive statistics examples with Python

import numpy as np  # arrays and basic numerical statistics
from scipy import stats  # additional statistical functions
import pandas as pd  # labelled tabular data

# Fixed seed so that the random draws can be replicated on every run
SEED = 2131982
np.random.seed(SEED)

In [3]:
# Draw a 5x4 matrix of samples from the standard normal distribution
datos = np.random.standard_normal(size=(5, 4))
datos

array([[ 1.67506019, -0.96206654, -0.56642475,  0.05249989],
       [-1.43075098, -2.61689912,  0.2934247 , -1.57887913],
       [ 0.12762303, -0.43609918,  0.87183416,  0.8429677 ],
       [-1.68163145,  2.28013096,  0.48126528, -0.15789303],
       [ 1.6143065 , -1.75129997,  0.86925338,  1.04526093]])

In [5]:
# Arithmetic mean
datos.mean() # Arithmetic mean of all the elements of the array (a single scalar)

-0.05141587089184916

In [6]:
np.mean(datos) # Same result, using the NumPy function instead of the array method

-0.05141587089184916

In [7]:
datos.mean(axis=1) # arithmetic mean of each row (one value per row)

array([ 0.0497672 , -1.33327613,  0.35158143,  0.23046794,  0.44438021])

In [8]:
datos.mean(axis=0) # arithmetic mean of each column (one value per column)

array([ 0.06092146, -0.69724677,  0.38987056,  0.04079127])

In [9]:
# Median of all the elements of the array (a single scalar)
np.median(datos)

0.09006146190938408

In [11]:
np.median(datos, axis=0)  # median of each column

array([ 0.12762303, -0.96206654,  0.48126528,  0.05249989])

In [14]:
np.median(datos, axis=1)  # median of each row

array([-0.25696243, -1.50481505,  0.48529537,  0.16168613,  0.95725716])

In [16]:
 # Standard deviation of all the elements of the array.
 # NumPy's default is the population form (ddof=0), which is why these values
 # are smaller than the "std" row reported by dataframe.describe() further down.
np.std(datos)

1.2837071941389726

In [17]:
np.std(datos, axis=0)  # standard deviation of each column

array([1.43418794, 1.66127656, 0.52806293, 0.92902003])

In [19]:
np.std(datos, axis=1)  # standard deviation of each row

array([1.00561981, 1.04446945, 0.543765  , 1.42047229, 1.29723878])

In [21]:
# Variance of all the elements of the array (a single scalar)
np.var(datos)

1.647904160284154

In [22]:
np.var(datos, axis=0)  # variance of each column

array([2.05689506, 2.75983981, 0.27885046, 0.86307822])

In [25]:
np.var(datos, axis=1)  # variance of each row

array([1.0112712 , 1.09091644, 0.29568037, 2.01774153, 1.68282846])

In [26]:
# Mode of each column.
# axis=0 and keepdims=True are passed explicitly: relying on the implicit
# defaults makes SciPy emit a warning, and keepdims=True keeps the 2-D
# (1 x n_columns) shape of the result.
# The second array of the result holds the frequency of each mode. With
# continuous random data every value occurs once, so every count is 1 and the
# smallest value of each column is the one reported.
stats.mode(datos, axis=0, keepdims=True)

  stats.mode(datos) # Calcula la moda de cada columna


ModeResult(mode=array([[-1.68163145, -2.61689912, -0.56642475, -1.57887913]]), count=array([[1, 1, 1, 1]]))

In [28]:
# A small integer sample with repeated values, where the mode is meaningful
datos2 = np.array([1, 2, 3, 6, 6, 1, 2, 4, 2, 2, 6, 6, 8, 10, 6])
# The mode is 6 because it appears 5 times in the vector.
# keepdims=True is explicit so that the result stays a length-1 array and
# SciPy does not warn about relying on the implicit default.
stats.mode(datos2, keepdims=True)

  stats.mode(datos2) # aqui la moda es el 6 porque aparece 5 veces en el vector.


ModeResult(mode=array([6]), count=array([5]))

In [30]:
# Correlation matrix.
# Each row of the array is treated as one variable (rowvar=True, the default),
# so the 5x4 input yields a 5x5 matrix.
np.corrcoef(datos, rowvar=True)

array([[ 1.        ,  0.03269343,  0.05201478, -0.93409317,  0.74030068],
       [ 0.03269343,  1.        ,  0.77407231, -0.32711073,  0.59928548],
       [ 0.05201478,  0.77407231,  1.        , -0.40348308,  0.70307819],
       [-0.93409317, -0.32711073, -0.40348308,  1.        , -0.9312764 ],
       [ 0.74030068,  0.59928548,  0.70307819, -0.9312764 ,  1.        ]])

In [31]:
# Correlation between two vectors (the first two rows of the array);
# the off-diagonal entry of the 2x2 result is their correlation coefficient.
np.corrcoef(datos[0], datos[1])

array([[1.        , 0.03269343],
       [0.03269343, 1.        ]])

In [32]:
# Covariance matrix.
# Each row of the array is treated as one variable (rowvar=True, the default),
# so the result is 5x5. The diagonal holds the sample variances (N - 1
# normalisation), which is why it differs from np.var(datos, 1).
np.cov(datos, rowvar=True)

array([[ 1.3483616 ,  0.04578559,  0.03792369, -1.77908017,  1.2876583 ],
       [ 0.04578559,  1.45455525,  0.58617495, -0.64708604,  1.0826502 ],
       [ 0.03792369,  0.58617495,  0.39424049, -0.41553545,  0.66126195],
       [-1.77908017, -0.64708604, -0.41553545,  2.69032204, -2.28807379],
       [ 1.2876583 ,  1.0826502 ,  0.66126195, -2.28807379,  2.24377128]])

In [33]:
# Covariance of two vectors (the first two rows of the array);
# the off-diagonal entry of the 2x2 result is their covariance.
np.cov(datos[0], datos[1])

array([[1.3483616 , 0.04578559],
       [0.04578559, 1.45455525]])

In [36]:
# Using pandas: wrap the array in a DataFrame with labelled rows and columns
dataframe = pd.DataFrame(
    data=datos,
    index=['a', 'b', 'c', 'd', 'e'],
    columns=['col1', 'col2', 'col3', 'col4'],
)
dataframe

Unnamed: 0,col1,col2,col3,col4
a,1.67506,-0.962067,-0.566425,0.0525
b,-1.430751,-2.616899,0.293425,-1.578879
c,0.127623,-0.436099,0.871834,0.842968
d,-1.681631,2.280131,0.481265,-0.157893
e,1.614306,-1.7513,0.869253,1.045261


In [37]:
# Statistical summary with pandas: count, mean, std, min, quartiles and max
# of each column.
dataframe.describe()

Unnamed: 0,col1,col2,col3,col4
count,5.0,5.0,5.0,5.0
mean,0.060921,-0.697247,0.389871,0.040791
std,1.603471,1.857364,0.590392,1.038676
min,-1.681631,-2.616899,-0.566425,-1.578879
25%,-1.430751,-1.7513,0.293425,-0.157893
50%,0.127623,-0.962067,0.481265,0.0525
75%,1.614306,-0.436099,0.869253,0.842968
max,1.67506,2.280131,0.871834,1.045261


In [42]:
# Sum of each column
dataframe.sum(axis=0)
# Shorter equivalent form: dataframe.sum()  (axis=0 is the default)

col1    0.304607
col2   -3.486234
col3    1.949353
col4    0.203956
dtype: float64

In [43]:
# Sum of each row
dataframe.sum(axis=1)

a    0.199069
b   -5.333105
c    1.406326
d    0.921872
e    1.777521
dtype: float64

In [44]:
dataframe.cumsum() # cumulative sums, accumulated down each column

Unnamed: 0,col1,col2,col3,col4
a,1.67506,-0.962067,-0.566425,0.0525
b,0.244309,-3.578966,-0.273,-1.526379
c,0.371932,-4.015065,0.598834,-0.683412
d,-1.309699,-1.734934,1.080099,-0.841305
e,0.304607,-3.486234,1.949353,0.203956


In [46]:
# Arithmetic mean of each column with pandas
dataframe.mean()

col1    0.060921
col2   -0.697247
col3    0.389871
col4    0.040791
dtype: float64

In [47]:
# Arithmetic mean of each row with pandas
dataframe.mean(axis=1)

a    0.049767
b   -1.333276
c    0.351581
d    0.230468
e    0.444380
dtype: float64