In [80]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import random_split, TensorDataset
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [81]:
# Leemos el dataset
df = pd.read_csv('inpatientCharges.csv')
df.sample(frac=0.65)

Unnamed: 0,DRG Definition,Provider Id,Provider Name,Provider Street Address,Provider City,Provider State,Provider Zip Code,Hospital Referral Region Description,Total Discharges,Average Covered Charges,Average Total Payments,Average Medicare Payments
60415,291 - HEART FAILURE & SHOCK W MCC,10035,CULLMAN REGIONAL MEDICAL CENTER,1912 ALABAMA HIGHWAY 157,CULLMAN,AL,35058,AL - Birmingham,72,$26413.80,$7930.87,$7187.05
24449,191 - CHRONIC OBSTRUCTIVE PULMONARY DISEASE W CC,150004,FRANCISCAN ST MARGARET HEALTH - HAMMOND,5454 HOHMAN AVE,HAMMOND,IN,46320,IN - Munster,103,$17252.59,$6700.47,$5637.79
45147,244 - PERMANENT CARDIAC PACEMAKER IMPLANT W/O ...,140186,RIVERSIDE MEDICAL CENTER,350 N WALL ST,KANKAKEE,IL,60901,IL - Joliet,20,$65392.50,$13652.40,$12696.60
82613,313 - CHEST PAIN,100038,MEMORIAL REGIONAL HOSPITAL,3501 JOHNSON ST,HOLLYWOOD,FL,33021,FL - Miami,121,$19761.35,$3887.56,$3168.34
81116,312 - SYNCOPE & COLLAPSE,260085,ST JOSEPH MEDICAL CENTER,1000 CARONDELET DR,KANSAS CITY,MO,64114,MO - Kansas City,61,$17686.59,$3889.67,$3060.36
...,...,...,...,...,...,...,...,...,...,...,...,...
115887,481 - HIP & FEMUR PROCEDURES EXCEPT MAJOR JOIN...,40039,ARKANSAS METHODIST MEDICAL CENTER,900 WEST KINGSHIGHWAY,PARAGOULD,AR,72450,AR - Jonesboro,22,$24916.63,$10996.72,$10180.72
128059,638 - DIABETES W CC,130002,ST LUKES MAGIC VALLEY RMC,801 POLE LINE ROAD WEST,TWIN FALLS,ID,83301,ID - Boise,14,$9376.71,$5557.57,$3669.71
85048,314 - OTHER CIRCULATORY SYSTEM DIAGNOSES W MCC,180067,UNIVERSITY OF KENTUCKY HOSPITAL,HOSPITAL ADMINISTRATION,LEXINGTON,KY,40536,KY - Lexington,43,$36821.25,$19155.00,$13923.30
138137,683 - RENAL FAILURE W CC,390180,CROZER CHESTER MEDICAL CENTER,ONE MEDICAL CENTER BOULEVARD,UPLAND,PA,19013,PA - Philadelphia,123,$95174.52,$8012.94,$7282.45


In [82]:
# Limpiamos el dataset
# Eliminar columnas "object" con más de 10 tipos de valores distintos
object_columns = df.select_dtypes(include=['object']).columns

for column in object_columns:
    unique_values = df[column].nunique()
    if unique_values > 10:
        df = df.drop(column, axis=1)

# Obtener todas las columnas que no sean numéricas
non_numeric_columns = [col for col in df.columns if not np.issubdtype(df[col].dtype, np.number)]
diccionario = {}

# Mapeamos cada columna no numérica a un valor numérico
for column in non_numeric_columns:
    diccionario[column] = {}
    unique_values = df[column].unique()
    for i, value in enumerate(unique_values):
        diccionario[column][value] = i
    df[column] = df[column].map(diccionario[column])

# Rellenar valores nulos con la media de cada columna
df = df.fillna(df.mean())

# Eliminar columna con identificación
column_id = df.columns[(df.columns.str.startswith('0')) & (df.columns.str.endswith(str(df.shape[0])))]
df = df.drop(column_id, axis=1)

In [83]:
df.sample(frac=0.65)

Unnamed: 0,Provider Id,Provider Zip Code,Total Discharges
140784,300017,3038,13
118323,210004,20910,16
136855,130002,83301,48
95407,520028,53566,27
16034,310070,8901,32
...,...,...,...
41177,140077,62207,16
16241,360052,45406,32
64497,250059,39090,22
47245,140088,60637,50


In [84]:
# Dividimos el dataset en train y test
x = df.drop('Provider Zip Code', axis=1)
y = df['Provider Zip Code']

In [85]:
# Hacemos el train test split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [86]:
print("x train: {}, x test: {}, y train: {}, y test: {}".format(x_train.shape, x_test.shape, y_train.shape, y_test.shape))

x train: (130452, 2), x test: (32613, 2), y train: (130452,), y test: (32613,)


In [87]:
x

Unnamed: 0,Provider Id,Total Discharges
0,10001,91
1,10005,14
2,10006,24
3,10011,25
4,10016,18
...,...,...
163060,670041,23
163061,670055,11
163062,670056,19
163063,670060,11


In [88]:
y

0         36301
1         35957
2         35631
3         35235
4         35007
          ...  
163060    78664
163061    78258
163062    78640
163063    75182
163064    75028
Name: Provider Zip Code, Length: 163065, dtype: int64

In [89]:
n_entries = x_train.shape[1]

In [90]:
# Ahora escalamos los datos
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)


In [91]:
# Preparamos los datos en la red neuronal para que sean tensores
x_train = torch.from_numpy(x_train).float().to('cpu')
x_test = torch.from_numpy(x_test).float().to('cpu')
y_train = torch.from_numpy(y_train.values).long().to('cpu')
y_test = torch.from_numpy(y_test.values).long().to('cpu')


In [92]:
# Creamos el modelo de la red neuronal
class Net(nn.Module):
    def __init__(self, n_entries):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(n_entries, 15)
        self.fc2 = nn.Linear(15, 10)
        self.fc3 = nn.Linear(10, 2)

    def forward(self, xc):
        result_1 = torch.relu(self.fc1(xc))
        result_2 = torch.relu(self.fc2(result_1))
        final_result = F.softmax(self.fc3(result_2), dim=1)
        return final_result

In [None]:
# Entrenamos el modelo
learning_rate = 0.01
epochs = 1000
batch_size = 100

model = Net(n_entries=n_entries)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
print("Modelo: {}".format(model))

res = pd.DataFrame()
# Ahora entrenamos el modelo
for epoch in range(1, epochs + 1):
    y_pred = model(x_train)
    loss = loss_fn(input=y_pred, target=y_train)
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

    if epoch % batch_size == 0:
        print('Epoch: {}/{}, Loss: {:.3f}'.format(epoch, epochs, loss.item()))

# Evaluamos el modelo
with torch.no_grad():
    y_pred = model(x_test)
    __, predicted = torch.max(y_pred, 1)
    correct = (predicted == y_test).sum().item()
    accuracy = 100 * correct / float(len(y_test))

    if epoch % batch_size == 0:
        print('Accuracy: {:.3f}'.format(accuracy))

df_tmp = pd.DataFrame({
    'Epoch': epoch,
    'Loss' : round(loss.item(),4),
    'Accuracy' : round(accuracy,4)  
}, index=[0])
res = pd.concat(objs=[res, df_tmp], ignore_index=True, sort=False)

print("Accuracy: {:.3f}".format(accuracy))






In [None]:
plt.figure(figsize=(10, 10))
plt.plot(res['Epoch'], res['Loss'], label='Loss')
plt.title('Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.grid()
plt.show()

In [None]:
plt.figure(figsize=(10, 10))
plt.plot(res['Epoch'], res['Accuracy'], label='Accuracy')
plt.title('Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.grid()
plt.show()