In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
from torch.nn.functional import one_hot

In [11]:
# Load the Auto MPG data, clean it, encode the categorical columns,
# split it into train/test sets and standardize the numeric features.
url = 'http://archive.ics.uci.edu/ml/' \
      'machine-learning-databases/auto-mpg/auto-mpg.data'

column_names = ['MPG', 'Cylinders', 'Displacement', 'Horsepower',
        'Weight', 'Acceleration', 'Model Year', 'Origin']

# Columns that are standardized (zero mean, unit variance) further down.
numerical_columns = ['Cylinders', 'Displacement', 'Horsepower',
        'Weight', 'Acceleration']

# The file is space-separated and uses "?" for missing values.
# comment='\t' makes pandas ignore the rest of each line from the first tab on.
df = pd.read_csv(url, names=column_names, sep=' ', comment='\t',
                 na_values="?", skipinitialspace=True)

# Drop the rows with missing values and renumber the index from 0.
# Chained (rather than inplace=True) so the statement builds a fresh frame.
df = df.dropna().reset_index(drop=True)

# Convert the 'Model Year' column into ordinal buckets.
# With right=True a value equal to a boundary lands in the bucket above it:
#   year < 73 -> 0, 73..75 -> 1, 76..78 -> 2, year >= 79 -> 3
# (the names `boundries` and `v` are kept as-is in case later cells use them).
boundries = torch.tensor([73, 76, 79])
v = torch.tensor(df['Model Year'].values)
# .numpy() hands pandas a plain ndarray instead of relying on implicit
# Tensor -> column conversion.
df['Model Year'] = torch.bucketize(v, boundries, right=True).numpy()

# One-hot encoding for the 'Origin' column.
# Origin codes are shifted by -1 to be zero-based; the first indicator column
# is dropped ([:, 1:]) and the remaining two are named 'Europe' and 'Japan'.
origin_encoded = one_hot(torch.tensor((df['Origin'] - 1).values),
                         num_classes=df['Origin'].nunique())[:, 1:]
data = torch.cat([torch.tensor(df.drop(columns='Origin').values),
                  origin_encoded], dim=1)
column_names_encoded = ['MPG', 'Cylinders', 'Displacement', 'Horsepower',
        'Weight', 'Acceleration', 'Model Year', 'Europe', 'Japan']
# Explicit ndarray conversion keeps the DataFrame construction independent of
# how pandas happens to interpret a torch.Tensor.
df = pd.DataFrame(data.numpy(), columns=column_names_encoded)

# Split our data.
# NOTE(review): test_size=0.8 leaves only 20% of the rows for training.
# Confirm this is intended and not a slip for train_size=0.8.
df_train, df_test = train_test_split(df, test_size=0.8, random_state=123)
# Take explicit copies so the column assignments below write to independent
# frames and do not raise SettingWithCopyWarning.
df_train, df_test = df_train.copy(), df_test.copy()

# Standardize the numeric columns using statistics from the training set only,
# so that no information from the test set leaks into the transformation.
for col in numerical_columns:
    mean = df_train[col].mean()
    std = df_train[col].std()
    df_train[col] = (df_train[col] - mean) / std
    df_test[col] = (df_test[col] - mean) / std

df_train.head()

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Europe,Japan
3,16.0,1.629454,1.192495,1.373011,0.629638,-1.470879,0.0,0.0,0.0
56,24.0,-0.779976,-0.748803,-0.226722,-0.812878,-0.04896,0.0,0.0,1.0
304,28.8,0.424739,-0.138971,0.354999,-0.416967,-1.755263,3.0,0.0,0.0
233,26.0,-0.779976,-0.911425,-0.808443,-0.829115,1.047949,2.0,0.0,1.0
51,30.0,-0.779976,-1.0029,-0.779357,-1.078901,-0.455222,0.0,1.0,0.0


In [12]:
# Sanity check: list the distinct values present in the 'Europe' indicator column.
pd.unique(df['Europe'])

array([0., 1.])