In [1]:
# Step 1: Import libraries and load dataset 
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, MaxAbsScaler, LabelEncoder
from sklearn.model_selection import train_test_split

df = pd.read_csv("Data.csv") 
print("Original Data (first 5 rows):\n", df.head(), "\n")

# Step 2: Explore dataset 
print("Dataset Shape:", df.shape)
print("\nDataset Info:")
print(df.info())
print("\nSummary Statistics:\n", df.describe(include="all"))

# Step 3: Handle Missing Values 
for col in df.select_dtypes(include=[np.number]).columns:
    df[col].fillna(df[col].mean(), inplace=True)

for col in df.select_dtypes(include=['object']).columns:
    df[col].fillna(df[col].mode()[0], inplace=True)

print("\nAfter Handling Missing Values:\n", df.head())

# Step 4: Encode Categorical Variables
encoder = LabelEncoder()
for col in df.select_dtypes(include=['object']).columns:
    df[col] = encoder.fit_transform(df[col])

print("\nAfter Encoding Categorical Data:\n", df.head())

# Step 5: Feature Scaling 
numeric_cols = df.select_dtypes(include=[np.number]).columns

scalers = {
    "Min-Max Scaler": MinMaxScaler(),
    "Standard Scaler": StandardScaler(),
    "Robust Scaler": RobustScaler(),
    "Max-Abs Scaler": MaxAbsScaler()
}

for name, scaler in scalers.items():
    scaled = scaler.fit_transform(df[numeric_cols])
    scaled_df = pd.DataFrame(scaled, columns=numeric_cols)
    print(f"\n{name} Result (first 5 rows):\n", scaled_df.head())

# Step 6: Handle Outliers 
Q1 = df[numeric_cols].quantile(0.25)
Q3 = df[numeric_cols].quantile(0.75)
IQR = Q3 - Q1
df_outlier_removed = df[~((df[numeric_cols] < (Q1 - 1.5 * IQR)) | (df[numeric_cols] > (Q3 + 1.5 * IQR))).any(axis=1)]

print("\nDataset after removing outliers (shape):", df_outlier_removed.shape)

# Step 7 : Split Dataset Into Training, Evaluation and Validation Sets 
X = df_outlier_removed.drop(df_outlier_removed.columns[-1], axis=1)   # features
y = df_outlier_removed[df_outlier_removed.columns[-1]]               
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("\nTraining set shape:", X_train.shape)
print("Testing set shape:", X_test.shape)


Original Data (first 5 rows):
    Country   Age   Salary Purchased
0   France  44.0  72000.0        No
1    Spain  27.0  48000.0       Yes
2  Germany  30.0  54000.0        No
3    Spain  38.0  61000.0        No
4  Germany  40.0      NaN       Yes 

Dataset Shape: (10, 4)

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Country    10 non-null     object 
 1   Age        9 non-null      float64
 2   Salary     9 non-null      float64
 3   Purchased  10 non-null     object 
dtypes: float64(2), object(2)
memory usage: 452.0+ bytes
None

Summary Statistics:
        Country        Age        Salary Purchased
count       10   9.000000      9.000000        10
unique       3        NaN           NaN         2
top     France        NaN           NaN        No
freq         4        NaN           NaN         5
mean       NaN  38.777778  63777.77777

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)
