## Agregar datos por categoría

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Category labels that the synthetic rows are sampled from.
gender = ["Male", "Female"]
# NOTE(review): "Midle Class" looks like a typo for "Middle Class"; it is left
# untouched here because the label ends up as a group key in the outputs.
income_status = ["Poor", "Midle Class", "Rich"]

In [None]:
# Number of synthetic rows to generate.
n = 500

# Draw all n labels with one vectorised call per column instead of calling
# np.random.choice once per row inside a Python loop. Wrapping the result in
# list(...) keeps both variables plain Python lists, as they were before.
gender_data = list(np.random.choice(gender, size=n))
income_data = list(np.random.choice(income_status, size=n))

In [None]:
# Draw every numeric column from a normal distribution, written as
# mean + std * N(0, 1). Nothing clips the samples, so implausible values
# (for example a negative age or weight) can appear in the table.
height = 160 + 30 * np.random.randn(n)
weight = 65 + 25 * np.random.randn(n)
age = 30 + 12 * np.random.randn(n)
income = 18000 + 3500 * np.random.randn(n)

In [None]:
# Assemble the synthetic sample into one table: each column name is paired
# with the sequence of values generated for it, in display order.
data = pd.DataFrame(
    dict(
        zip(
            ["Gender", "Economic Status", "Height", "Weight", "Age", "Income"],
            [gender_data, income_data, height, weight, age, income],
        )
    )
)

In [None]:
# Dimensions of the table as (rows, columns).
data.shape

In [None]:
# Display the generated table.
data

## Agrupación de datos

In [None]:
# Group the rows by the value of the "Gender" column.
grouped_gender = data.groupby("Gender")

In [None]:
# Mapping from each group key to the index labels of the rows in that group.
grouped_gender.groups

In [None]:
# Walk over the groups, printing each key on its own line followed by the
# rows that belong to it.
for names, groups in grouped_gender:
    print(names, groups, sep="\n")

In [None]:
# Extract only the rows whose group key is "Female".
grouped_gender.get_group("Female")

In [None]:
# Group by every (Gender, Economic Status) combination present in the data.
double_group = data.groupby(["Gender", "Economic Status"])

In [None]:
# Number of distinct groups.
len(double_group)

In [None]:
# Print each (Gender, Economic Status) key on its own line, followed by the
# rows that fall into that group.
for names, groups in double_group:
    print(names, groups, sep="\n")

## Operaciones sobre datos agrupados

In [None]:
# Per-group sum of every non-key column.
double_group.sum()

In [None]:
# Per-group mean of every non-key column.
double_group.mean()

In [None]:
# Number of rows in each group.
double_group.size()

In [None]:
# Summary statistics per group for every non-key column.
double_group.describe()

In [None]:
# Restrict the grouped data to the "Income" column only.
grouped_income = double_group["Income"]

In [None]:
# Summary statistics of income within each group.
grouped_income.describe()

In [None]:
# Aggregate a different statistic for each column.
# The statistics are named by their pandas string aliases instead of being
# passed as np.sum / np.mean / np.std: pandas already substitutes its own
# groupby methods for those NumPy callables (so "std" is the sample standard
# deviation, ddof=1), and newer pandas releases emit a FutureWarning when the
# callables are passed directly. The aliases pin that behaviour explicitly.
double_group.aggregate({
    "Income": "sum",
    "Age": "mean",
    "Height": "std",
})

In [None]:
# Mean age per group, plus the ratio of mean to standard deviation of height
# computed by a custom function.
# "mean" is the pandas string alias for the groupby method that np.mean was
# being mapped to; passing the alias avoids the FutureWarning that newer
# pandas releases raise for NumPy callables.
double_group.aggregate({
    "Age": "mean",
    "Height": lambda h: np.mean(h) / np.std(h),
})

In [None]:
# Apply several aggregations to every non-key column at once; the result has
# one sub-column per statistic. String aliases replace np.sum / np.mean /
# np.std: the column labels ("sum", "mean", "std") and the values stay the
# same, while the FutureWarning pandas raises for NumPy callables goes away.
double_group.aggregate(["sum", "mean", "std"])

In [None]:
# Apply one custom aggregation (mean divided by standard deviation) to every
# non-key column. The function is left as an anonymous lambda because the
# resulting sub-column label is derived from the function's name.
double_group.aggregate([lambda x:np.mean(x) / np.std(x)])

## Filtrado de datos

In [None]:
# Keep only the Age values that belong to groups whose ages add up to more
# than 2400; values from the other groups are dropped.
double_group["Age"].filter(lambda x: x.sum() > 2400)

## Transformación de variables

In [None]:
def zscore(x):
    """Standardise *x*: subtract its mean and divide by its standard deviation.

    *x* is expected to offer ``mean()`` and ``std()`` and to support
    element-wise arithmetic (for example a pandas Series). The result has the
    same shape as the input.
    """
    return (x - x.mean()) / x.std()

In [None]:
# Standardise each non-key column within its own (Gender, Economic Status)
# group; the result has one row per original row.
zgroup = double_group.transform(zscore)

In [None]:
# Histogram of the group-wise standardised ages.
plt.hist(zgroup["Age"])

In [None]:
def fill_na_mean(x):
    """Return a copy of *x* with missing values replaced by the mean of *x*.

    *x* is expected to offer ``mean()`` and ``fillna()`` (for example a pandas
    Series); the input itself is not modified.
    """
    return x.fillna(x.mean())

In [None]:
# Inject a missing value into the row labelled 0 so the NaN-filling step has
# something to fill. A single .loc assignment is used instead of the chained
# zgroup["Height"][0] = ... form, which triggers SettingWithCopyWarning and
# does not modify zgroup at all when pandas Copy-on-Write is enabled.
zgroup.loc[0, "Height"] = float("NaN")

In [None]:
# Display the standardised table, now containing the injected NaN.
zgroup

In [None]:
# Show the table with missing values replaced by the column mean.
# NOTE(review): zgroup is a plain DataFrame, so the fill uses the mean of the
# whole column rather than a per-group mean, and the result is only displayed;
# it is not stored back into zgroup.
zgroup.transform(fill_na_mean)

In [None]:
# Mean of the standardised Height column.
np.mean(zgroup["Height"])

## Operaciones diversas muy útiles

In [None]:
# First row of each group.
double_group.head(1)

In [None]:
# Last row of each group.
double_group.tail(1)

In [None]:
# Row at position 32 of each group. Every group must have enough rows for
# that position to exist; a group that does not returns nothing.
double_group.nth(32)

In [None]:
# Sort the rows by Age, using Income to order rows with equal Age.
data_sorted = data.sort_values(["Age", "Income"])

In [None]:
# First ten rows of the sorted table.
data_sorted.head(10)

In [None]:
# Group the age-sorted rows by Gender.
age_grouped = data_sorted.groupby("Gender")

In [None]:
# Leading rows of each gender group; because the input was sorted by Age,
# these are the youngest entries of each gender.
age_grouped.head()

In [None]:
# Last row of each gender group; because the input was sorted by Age, this is
# the oldest entry of each gender.
age_grouped.tail(1)

# Conjunto de entrenamiento y de testing

In [None]:
# Load the customer-churn dataset and report how many rows it has.
# NOTE(review): mainpath is an absolute path on one specific machine; point it
# at wherever the datasets folder lives before running this cell.
mainpath = "C:/Users/francisco/Documents/GitHub/python-ml-course/datasets"
# Join with exactly one "/": the previous concatenation produced a doubled
# separator ("datasets//customer-churn-model").
data = pd.read_csv(mainpath + "/customer-churn-model/Customer Churn Model.txt")
len(data)

## Dividir utilizando la distribución normal

In [None]:
# One standard-normal random draw per row of the dataset.
a = np.random.randn(len(data))

In [None]:
# Histogram of the random draws.
plt.hist(a)

In [None]:
# Boolean mask: True where the draw is below 0.75. For standard-normal draws
# that is roughly 77% of the rows, so the split is only approximately 75/25.
check = (a < 0.75)
check

In [None]:
# Bar-style view of how many rows fall on each side of the split (0 = test,
# 1 = training). The mask is cast to int explicitly: histogramming a boolean
# array makes NumPy coerce it implicitly and emit a RuntimeWarning.
plt.hist(check.astype(int))

In [None]:
# Rows flagged True in the mask form the training set; all remaining rows
# form the testing set.
training, testing = data[check], data[np.logical_not(check)]

In [None]:
# Number of rows in the training set.
len(training)

In [None]:
# Number of rows in the testing set.
len(testing)

## Con la librería sklearn

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Split the dataset, holding out 20% of the rows for testing.
# No random_state is passed, so the split differs from run to run.
train, test = train_test_split(data, test_size = 0.2)

In [None]:
# Number of rows in the training part.
len(train)

In [None]:
# Number of rows in the testing part.
len(test)

## Usando una función de shuffle

In [None]:
import sklearn

In [None]:
# Replace the dataset with a randomly reordered copy of its rows.
data = sklearn.utils.shuffle(data)

In [None]:
# Position at which to cut the shuffled rows: 75% of the row count,
# truncated to an integer.
cut_id = int(0.75 * len(data))
cut_id

In [None]:
# Split the shuffled rows at cut_id: the first cut_id rows train the model and
# every remaining row is kept for testing. The test slice starts at cut_id
# itself; starting at cut_id + 1 left the row at position cut_id out of both
# sets.
train_data = data[:cut_id]
test_data = data[cut_id:]

In [None]:
# Number of rows in the training slice.
len(train_data)

In [None]:
# Number of rows in the testing slice.
len(test_data)