# Demo Data Preprocessing

In [106]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [107]:
df = sns.load_dataset('mpg')

In [108]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    392 non-null    float64
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
 6   model_year    398 non-null    int64  
 7   origin        398 non-null    object 
 8   name          398 non-null    object 
dtypes: float64(4), int64(3), object(2)
memory usage: 28.1+ KB


In [109]:
df

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
0,18.0,8,307.0,130.0,3504,12.0,70,usa,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693,11.5,70,usa,buick skylark 320
2,18.0,8,318.0,150.0,3436,11.0,70,usa,plymouth satellite
3,16.0,8,304.0,150.0,3433,12.0,70,usa,amc rebel sst
4,17.0,8,302.0,140.0,3449,10.5,70,usa,ford torino
...,...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86.0,2790,15.6,82,usa,ford mustang gl
394,44.0,4,97.0,52.0,2130,24.6,82,europe,vw pickup
395,32.0,4,135.0,84.0,2295,11.6,82,usa,dodge rampage
396,28.0,4,120.0,79.0,2625,18.6,82,usa,ford ranger


In [110]:
df.dropna(inplace=True)

In [111]:
df['origin'].value_counts()

usa       245
japan      79
europe     68
Name: origin, dtype: int64

### Determine target

In [130]:
target = 'mpg'

In [131]:
features = [col for col in df.columns if col not in [target, 'name']]
features

['cylinders',
 'displacement',
 'horsepower',
 'weight',
 'acceleration',
 'model_year',
 'origin']

In [132]:
df_target = df[[target]]
df_features = df[features]

### One-hot encoding

In [113]:
df_encoded = pd.get_dummies(df[features], drop_first=True, dtype='int')
df_encoded

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin_japan,origin_usa
0,18.0,8,307.0,130.0,3504,12.0,70,0,1
1,15.0,8,350.0,165.0,3693,11.5,70,0,1
2,18.0,8,318.0,150.0,3436,11.0,70,0,1
3,16.0,8,304.0,150.0,3433,12.0,70,0,1
4,17.0,8,302.0,140.0,3449,10.5,70,0,1
...,...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86.0,2790,15.6,82,0,1
394,44.0,4,97.0,52.0,2130,24.6,82,0,0
395,32.0,4,135.0,84.0,2295,11.6,82,0,1
396,28.0,4,120.0,79.0,2625,18.6,82,0,1


### Scaling

In [114]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [115]:
scaler = MinMaxScaler()
# scaler = StandardScaler()

scaler.fit(df_encoded)

In [116]:
scaler.scale_.round(3)

array([0.027, 0.2  , 0.003, 0.005, 0.   , 0.06 , 0.083, 1.   , 1.   ])

In [117]:
df_scaled = pd.DataFrame(scaler.transform(df_encoded), columns=df_encoded.columns)
df_scaled

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin_japan,origin_usa
0,0.239362,1.0,0.617571,0.456522,0.536150,0.238095,0.0,0.0,1.0
1,0.159574,1.0,0.728682,0.646739,0.589736,0.208333,0.0,0.0,1.0
2,0.239362,1.0,0.645995,0.565217,0.516870,0.178571,0.0,0.0,1.0
3,0.186170,1.0,0.609819,0.565217,0.516019,0.238095,0.0,0.0,1.0
4,0.212766,1.0,0.604651,0.510870,0.520556,0.148810,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...
387,0.478723,0.2,0.186047,0.217391,0.333711,0.452381,1.0,0.0,1.0
388,0.930851,0.2,0.074935,0.032609,0.146583,0.988095,1.0,0.0,0.0
389,0.611702,0.2,0.173127,0.206522,0.193365,0.214286,1.0,0.0,1.0
390,0.505319,0.2,0.134367,0.179348,0.286929,0.630952,1.0,0.0,1.0


### Split in Training en Test dataset

In [118]:
from sklearn.model_selection import train_test_split

In [119]:
df_train, df_test = train_test_split(df_scaled, test_size=0.3)

In [120]:
print('train dataset', df_train.shape)
print('test dataset', df_test.shape)

train dataset (274, 9)
test dataset (118, 9)


# Define target en features

In [121]:
target = 'mpg'
features = [col for col in df_train.columns if col != target]

print('target', target)
print('features', features)

target mpg
features ['cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'model_year', 'origin_japan', 'origin_usa']


In [122]:
df_train_target = df_train[target].copy()
df_train_features = df_train[features].copy()
df_test_target = df_test[target].copy()
df_test_features = df_test[features].copy()

In [123]:
print('df_train_target', df_train_target.shape)
print('df_train_features', df_train_features.shape)
print('df_test_target', df_test_target.shape)
print('df_test_features', df_test_features.shape)

df_train_target (274,)
df_train_features (274, 8)
df_test_target (118,)
df_test_features (118, 8)


In [124]:
from sklearn.linear_model import LinearRegression

In [125]:
regressor = LinearRegression()

In [126]:
regressor.fit(df_train_features, df_train_target)

In [103]:
predicted = regressor.predict(df_test_features)

In [104]:
df_test_target['predicted_linear_regression'] = predicted

In [127]:
df_test_target

97     0.186170
391    0.585106
256    0.308511
19     0.452128
310    0.750000
         ...   
127    0.585106
129    0.611702
99     0.239362
12     0.159574
203    0.505319
Name: mpg, Length: 118, dtype: float64