# Practise - Fixed & Robust Completed Notebook

In [7]:
# Imports & package check
import pandas as pd, numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
print('Packages imported successfully.')

Packages imported successfully.


In [8]:
# Create dataset (same random seed for reproducibility)
np.random.seed(123)
num1 = np.random.randn(500)
num2 = np.random.uniform(10, 50, size=500)
num3 = np.random.randint(1, 6, size=500).astype('float')

num1[np.random.choice(500, 60, replace=False)] = np.nan
num2[np.random.choice(500, 60, replace=False)] = np.nan
num3[np.random.choice(500, 60, replace=False)] = np.nan

cat1 = np.random.choice(['apple', 'banana', 'grape', np.nan], size=500, p=[0.3, 0.3, 0.3, 0.1])
cat2 = np.random.choice(['A', 'B', 'C', 'D'], size=500)
cat3 = np.random.choice(['Yes', 'No', np.nan], size=500, p=[0.45, 0.45, 0.10])

df_untidy = pd.DataFrame({
    'Score': num1,
    'Height_cm': num2,
    'Rating': num3,
    'Fruit': cat1,
    'Group': cat2,
    'IsActive': cat3
})

# Inject messy strings correctly
height_nonnull_idx = df_untidy[df_untidy['Height_cm'].notnull()].sample(frac=0.15, random_state=1).index
df_untidy.loc[height_nonnull_idx, 'Height_cm'] = df_untidy.loc[height_nonnull_idx, 'Height_cm'].astype(str) + 'cm'

rating_nonnull_idx = df_untidy[df_untidy['Rating'].notnull()].sample(frac=0.15, random_state=2).index
df_untidy.loc[rating_nonnull_idx, 'Rating'] = 'Rating: ' + df_untidy.loc[rating_nonnull_idx, 'Rating'].astype(str)

df_untidy.head()

 '10.509502547356245cm' '12.862404497353292cm' '20.64105976628817cm'
 '48.09955471157855cm' '25.60501244185592cm' '30.646638278877738cm'
 '25.652659700690286cm' '40.68277095705241cm' '24.249592498611463cm'
 '15.558953548579764cm' '17.85236020487941cm' '10.607906366550996cm'
 '28.1338454756111cm' '41.01459051370767cm' '38.03485359867271cm'
 '18.85190425075482cm' '12.04505202146747cm' '40.75834779859456cm'
 '36.253727839177266cm' '22.29011157439592cm' '30.664652412123345cm'
 '24.573667182988366cm' '29.40597551292187cm' '13.92735994485868cm'
 '38.88652968658706cm' '43.905879808939105cm' '12.547634372794828cm'
 '40.128174130939186cm' '49.46319556401813cm' '27.7438520778363cm'
 '34.93076339424511cm' '20.724977212451382cm' '43.413724035210656cm'
 '42.21306753116133cm' '17.41287645034994cm' '42.34504458902001cm'
 '22.962576174780793cm' '26.918929348570554cm' '32.02118761834812cm'
 '16.59992021683937cm' '38.750633419167cm' '20.720844671384647cm'
 '48.66317503549566cm' '23.563969326496185cm' '2

Unnamed: 0,Score,Height_cm,Rating,Fruit,Group,IsActive
0,-1.085631,,2.0,banana,D,Yes
1,0.997345,16.480034,5.0,apple,A,No
2,0.282978,49.244711,,banana,B,No
3,-1.506295,,Rating: 3.0,grape,D,
4,-0.5786,31.599083,,banana,C,No


### Q1. Handle Missing Values

In [10]:
from sklearn.impute import SimpleImputer

# Mean imputation (numeric)
imputer_mean = SimpleImputer(strategy='mean')
df_untidy['Score_mean_filled'] = imputer_mean.fit_transform(df_untidy[['Score']]).ravel()

# Median imputation (numeric)
imputer_median = SimpleImputer(strategy='median')
df_untidy['Score_median_filled'] = imputer_median.fit_transform(df_untidy[['Score']]).ravel()

# Mode imputation (categorical)
imputer_mode = SimpleImputer(strategy='most_frequent')
df_untidy['Fruit_mode_filled'] = imputer_mode.fit_transform(df_untidy[['Fruit']]).ravel()

df_untidy[['Score','Score_mean_filled','Score_median_filled','Fruit','Fruit_mode_filled']].head()


Unnamed: 0,Score,Score_mean_filled,Score_median_filled,Fruit,Fruit_mode_filled
0,-1.085631,-1.085631,-1.085631,banana,banana
1,0.997345,0.997345,0.997345,apple,apple
2,0.282978,0.282978,0.282978,banana,banana
3,-1.506295,-1.506295,-1.506295,grape,grape
4,-0.5786,-0.5786,-0.5786,banana,banana


### Q2. Encode Categorical Columns

In [11]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df_untidy['Group_encoded'] = le.fit_transform(df_untidy['Group'].fillna('Unknown'))

# One-hot encoding for Fruit
df_untidy = pd.get_dummies(df_untidy, columns=['Fruit_mode_filled'], prefix='Fruit')
df_untidy.head()

Unnamed: 0,Score,Height_cm,Rating,Fruit,Group,IsActive,Score_mean_filled,Score_median_filled,Group_encoded,Fruit_apple,Fruit_banana,Fruit_grape,Fruit_nan
0,-1.085631,,2.0,banana,D,Yes,-1.085631,-1.085631,3,False,True,False,False
1,0.997345,16.480034,5.0,apple,A,No,0.997345,0.997345,0,True,False,False,False
2,0.282978,49.244711,,banana,B,No,0.282978,0.282978,1,False,True,False,False
3,-1.506295,,Rating: 3.0,grape,D,,-1.506295,-1.506295,3,False,False,True,False
4,-0.5786,31.599083,,banana,C,No,-0.5786,-0.5786,2,False,True,False,False


### Q3. Fix Mixed Data Types

In [12]:
# Clean height column
df_untidy['Height_cm_clean'] = (
    df_untidy['Height_cm']
    .astype(str)
    .str.replace('cm','',regex=False)
)
df_untidy['Height_cm_clean'] = pd.to_numeric(df_untidy['Height_cm_clean'], errors='coerce')

# Clean rating column
df_untidy['Rating_clean'] = (
    df_untidy['Rating']
    .astype(str)
    .str.replace('Rating: ','',regex=False)
)
df_untidy['Rating_clean'] = pd.to_numeric(df_untidy['Rating_clean'], errors='coerce')

### Q4. Scaling and Normalization

In [13]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

scaler_minmax = MinMaxScaler()
scaler_std = StandardScaler()

df_untidy['Score_minmax'] = scaler_minmax.fit_transform(df_untidy[['Score_mean_filled']]).ravel()
df_untidy['Score_standard'] = scaler_std.fit_transform(df_untidy[['Score_mean_filled']]).ravel()

df_untidy[['Score','Score_mean_filled','Score_minmax','Score_standard']].head()

Unnamed: 0,Score,Score_mean_filled,Score_minmax,Score_standard
0,-1.085631,-1.085631,0.346613,-1.145411
1,0.997345,0.997345,0.683137,1.088461
2,0.282978,0.282978,0.567725,0.322344
3,-1.506295,-1.506295,0.278651,-1.596549
4,-0.5786,-0.5786,0.428529,-0.60165


### Q5. Validation Function

In [14]:
def validate_cleaning(df):
    results = {}

    # Check missing values
    results['missing_values'] = df.isnull().sum().to_dict()

    # Check data types
    results['data_types'] = df.dtypes.astype(str).to_dict()

    # Check outliers in rating (must be 1–5)
    if 'Rating_clean' in df.columns:
        outliers = df[(df['Rating_clean'] < 1) | (df['Rating_clean'] > 5)]
        results['rating_outliers'] = len(outliers)
    else:
        results['rating_outliers'] = None

    return results

validate_cleaning(df_untidy)

{'missing_values': {'Score': 60,
  'Height_cm': 60,
  'Rating': 60,
  'Fruit': 0,
  'Group': 0,
  'IsActive': 0,
  'Score_mean_filled': 0,
  'Score_median_filled': 0,
  'Group_encoded': 0,
  'Fruit_apple': 0,
  'Fruit_banana': 0,
  'Fruit_grape': 0,
  'Fruit_nan': 0,
  'Height_cm_clean': 60,
  'Rating_clean': 60,
  'Score_minmax': 0,
  'Score_standard': 0},
 'data_types': {'Score': 'float64',
  'Height_cm': 'object',
  'Rating': 'object',
  'Fruit': 'object',
  'Group': 'object',
  'IsActive': 'object',
  'Score_mean_filled': 'float64',
  'Score_median_filled': 'float64',
  'Group_encoded': 'int64',
  'Fruit_apple': 'bool',
  'Fruit_banana': 'bool',
  'Fruit_grape': 'bool',
  'Fruit_nan': 'bool',
  'Height_cm_clean': 'float64',
  'Rating_clean': 'float64',
  'Score_minmax': 'float64',
  'Score_standard': 'float64'},
 'rating_outliers': 0}