### Import Dependencies

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

### 2. Basic Data Processing

In [2]:
df = pd.read_csv('../data/raw/Country-data.csv')
df.head()

Unnamed: 0,country,child_mort,exports,health,imports,income,inflation,life_expec,total_fer,gdpp
0,Afghanistan,90.2,10.0,7.58,44.9,1610,9.44,56.2,5.82,553
1,Albania,16.6,28.0,6.55,48.6,9930,4.49,76.3,1.65,4090
2,Algeria,27.3,38.4,4.17,31.4,12900,16.1,76.5,2.89,4460
3,Angola,119.0,62.3,2.85,42.9,5900,22.4,60.1,6.16,3530
4,Antigua and Barbuda,10.3,45.5,6.03,58.9,19100,1.44,76.8,2.13,12200


In [3]:
df = df.drop(columns=['country'])
df.head()

Unnamed: 0,child_mort,exports,health,imports,income,inflation,life_expec,total_fer,gdpp
0,90.2,10.0,7.58,44.9,1610,9.44,56.2,5.82,553
1,16.6,28.0,6.55,48.6,9930,4.49,76.3,1.65,4090
2,27.3,38.4,4.17,31.4,12900,16.1,76.5,2.89,4460
3,119.0,62.3,2.85,42.9,5900,22.4,60.1,6.16,3530
4,10.3,45.5,6.03,58.9,19100,1.44,76.8,2.13,12200


### 3. Handling Missing Values

In [4]:
df.isnull().sum()

child_mort    0
exports       0
health        0
imports       0
income        0
inflation     0
life_expec    0
total_fer     0
gdpp          0
dtype: int64

##### No need to perform handling missing values strategy since there are no missing values in the data set

### 4. Encode categorical variables

##### No need to perform encoding strategy since All columns are in numerical format

### 5. Standardization

In [5]:
df[:] = np.log1p(df.clip(lower=0))

# Standardize each column (Z-score)
df[:] = (df - df.mean()) / (df.std() + 1e-8)

1      9.203416
2      9.465060
3      8.682877
4      9.857496
         ...   
162    7.989899
163    9.711176
164    8.409831
165    8.407602
166    8.095904
Name: income, Length: 167, dtype: float64' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  df[:] = np.log1p(df.clip(lower=0))
1      8.316545
2      8.403128
3      8.169336
4      9.409273
         ...   
162    7.996654
163    9.510519
164    7.178545
165    7.178545
166    7.286876
Name: gdpp, Length: 167, dtype: float64' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  df[:] = np.log1p(df.clip(lower=0))


In [6]:
df.head()

Unnamed: 0,child_mort,exports,health,imports,income,inflation,life_expec,total_fer,gdpp
0,1.27418,-1.759023,0.433342,0.147424,-1.408798,0.634392,-1.576856,1.729616,-1.455922
1,-0.238989,-0.283021,0.076251,0.290959,0.071445,-0.051222,0.628958,-0.941505,-0.122467
2,0.197869,0.183608,-0.981105,-0.49745,0.284385,1.160761,0.647886,0.143133,-0.064721
3,1.526599,0.9055,-1.804261,0.06494,-0.352198,1.495353,-1.093722,1.867087,-0.220645
4,-0.646536,0.435882,-0.123007,0.640302,0.60377,-0.916278,0.676185,-0.471103,0.606311


In [7]:
df.to_csv('../data/processed/CountryData_Preprocessed.csv', index=False)