In [2]:
from google.colab import files
import pandas as pd
import numpy as np

# 1. Upload Test.csv from your computer
uploaded = files.upload()

# 2. Load Test.csv
df = pd.read_csv('Test.csv')

# 3. Show basic info
print("Initial shape:", df.shape)
print("Initial missing values:\n", df.isnull().sum())

# 4. Data Cleaning Techniques

# Technique 1: Fill numeric missing values with median
df.fillna(df.median(numeric_only=True), inplace=True)

# Technique 2: Fill categorical missing values with mode
for col in df.select_dtypes(include='object').columns:
    df[col].fillna(df[col].mode()[0], inplace=True)

# 5. Handle outliers using IQR (remove extreme outliers)
numeric_cols = df.select_dtypes(include=np.number).columns
for col in numeric_cols:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    df[col] = np.where(df[col] < lower_bound, lower_bound, df[col])
    df[col] = np.where(df[col] > upper_bound, upper_bound, df[col])

# 6. Final info
print("Shape after cleaning:", df.shape)
print("Missing values after cleaning:\n", df.isnull().sum())

# 7. Save cleaned data
df.to_csv('Cleaned_Test.csv', index=False)
print("Cleaned_Test.csv saved successfully!")


Saving Test.csv to Test.csv


  df = pd.read_csv('Test.csv')


Initial shape: (929615, 24)
Initial missing values:
 fecha_dato                    0
ncodpers                      0
ind_empleado                  0
pais_residencia               0
sexo                          5
age                           0
fecha_alta                    0
ind_nuevo                     0
antiguedad                    0
indrel                        0
ult_fec_cli_1t           927932
indrel_1mes                  23
tiprel_1mes                  23
indresi                       0
indext                        0
conyuemp                 929511
canal_entrada              2081
indfall                       0
tipodom                       0
cod_prov                   3996
nomprov                    3996
ind_actividad_cliente         0
renta                         0
segmento                   2248
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)


Shape after cleaning: (929615, 24)
Missing values after cleaning:
 fecha_dato               0
ncodpers                 0
ind_empleado             0
pais_residencia          0
sexo                     0
age                      0
fecha_alta               0
ind_nuevo                0
antiguedad               0
indrel                   0
ult_fec_cli_1t           0
indrel_1mes              0
tiprel_1mes              0
indresi                  0
indext                   0
conyuemp                 0
canal_entrada            0
indfall                  0
tipodom                  0
cod_prov                 0
nomprov                  0
ind_actividad_cliente    0
renta                    0
segmento                 0
dtype: int64
Cleaned_Test.csv saved successfully!
