In [1]:
#
# Carga del dataset
#
import pandas as pd

dataset = pd.read_csv("auto_mpg.csv")
dataset.head()

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin
0,18.0,8,307.0,130.0,3504.0,12.0,70,1
1,15.0,8,350.0,165.0,3693.0,11.5,70,1
2,18.0,8,318.0,150.0,3436.0,11.0,70,1
3,16.0,8,304.0,150.0,3433.0,12.0,70,1
4,17.0,8,302.0,140.0,3449.0,10.5,70,1


In [3]:
#
# Tamaño del dataset
#
dataset.shape

(398, 8)

In [4]:
#
# Búsqueda de valores nulos
#
dataset.isna().sum()

MPG             0
Cylinders       0
Displacement    0
Horsepower      6
Weight          0
Acceleration    0
Model Year      0
Origin          0
dtype: int64

In [5]:
#
# Se eliminan los registros nulos
#
dataset = dataset.dropna()
dataset.isna().sum()

MPG             0
Cylinders       0
Displacement    0
Horsepower      0
Weight          0
Acceleration    0
Model Year      0
Origin          0
dtype: int64

In [6]:
#
# Columna Origin
# Nota:  1) USA
#        2) Europe
#        3) Japan
dataset.Origin.value_counts()

Origin
1    245
3     79
2     68
Name: count, dtype: int64

In [7]:
#
# Convierte la columna a categorias
# Nota. Realmente no se debería hacer asi para aplicaciones
# en productivo
#
dataset["Origin"] = dataset["Origin"].map(
    {
        1: "USA", 
        2: "Europe",
        3: "Japan"
    },
)
dataset.Origin.value_counts()

Origin
USA       245
Japan      79
Europe     68
Name: count, dtype: int64

In [8]:
#
# Genera variables dummy para indicar la procedencia
#
dataset = pd.get_dummies(dataset, columns=["Origin"], prefix="", prefix_sep="")
dataset.head()


Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Europe,Japan,USA
0,18.0,8,307.0,130.0,3504.0,12.0,70,False,False,True
1,15.0,8,350.0,165.0,3693.0,11.5,70,False,False,True
2,18.0,8,318.0,150.0,3436.0,11.0,70,False,False,True
3,16.0,8,304.0,150.0,3433.0,12.0,70,False,False,True
4,17.0,8,302.0,140.0,3449.0,10.5,70,False,False,True


In [9]:
#
# Note que aca no se usa train_test_split
#
train_dataset = dataset.sample(frac=0.8, random_state=0)
test_dataset = dataset.drop(train_dataset.index)

In [13]:
#
# Las millas por galon (MPG) son función de las demas variables.
#
import seaborn as sns

sns.pairplot(
    train_dataset[["MPG", "Cylinders", "Displacement", "Weight"]], diag_kind="kde"
)

ModuleNotFoundError: No module named 'seaborn'

In [10]:
#
# Cálculo de algunas estadísticas generales
#
train_dataset.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
MPG,314.0,23.31051,7.728652,10.0,17.0,22.0,28.95,46.6
Cylinders,314.0,5.477707,1.699788,3.0,4.0,4.0,8.0,8.0
Displacement,314.0,195.318471,104.331589,68.0,105.5,151.0,265.75,455.0
Horsepower,314.0,104.869427,38.096214,46.0,76.25,94.5,128.0,225.0
Weight,314.0,2990.251592,843.898596,1649.0,2256.5,2822.5,3608.0,5140.0
Acceleration,314.0,15.559236,2.78923,8.0,13.8,15.5,17.2,24.8
Model Year,314.0,75.898089,3.675642,70.0,73.0,76.0,79.0,82.0


In [14]:
#
# Separación de las X y la y en el modelo de regresión
#
train_features = train_dataset.copy()
test_features = test_dataset.copy()

train_labels = train_features.pop("MPG")
test_labels = test_features.pop("MPG")

In [15]:
#
# Transformación de los datos
#
train_dataset.describe().transpose()[["mean", "std"]]

Unnamed: 0,mean,std
MPG,23.31051,7.728652
Cylinders,5.477707,1.699788
Displacement,195.318471,104.331589
Horsepower,104.869427,38.096214
Weight,2990.251592,843.898596
Acceleration,15.559236,2.78923
Model Year,75.898089,3.675642
