***How do you handle missing or corrupted data in a dataset?***

Techniques:

    1. Drop missing values
    2. Impute with mean/median/mode
    3. Predict missing values using a model
    4. Use domain-specific constants

In [2]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer

df = pd.DataFrame({
    'age': [25, np.nan, 30, 22],
    'gender': ['M', 'F', None, 'M']
})

# Drop rows with any missing values
df_dropped = df.dropna()

# Impute nuerical with median
df['age'] = df['age'].fillna(df['age'].median())

# Impute categorical with mode
df['gender'] = df['gender'].fillna(df['gender'].mode()[0])

df = pd.DataFrame({
    'age': [25, np.nan, 30, 22],
    'gender': ['M', 'F', None, 'M']
})

num_cols = ['age']
cat_cols = ['gender']

# Impute numerical with median
num_imputer = SimpleImputer(strategy='median')
df[num_cols] = num_imputer.fit_transform(df[num_cols])

# Impute categorical with most frequent (mode)
cat_imputer = SimpleImputer(strategy='most_frequent')
df[cat_cols] = cat_imputer.fit_transform(df[cat_cols])

print(df)


    age gender
0  25.0      M
1  25.0      F
2  30.0   None
3  22.0      M


***How would you handle an imbalanced dataset?***

Techniques:

1. Resampling (undersample majority / oversample minority)
2. SMOTE (Synthetic Minority Oversampling)
3. Class weights
3. Use appropriate metrics (AUC, F1, Precision-Recall)

In [3]:
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE
from collections import Counter

df = pd.DataFrame({
    'feature1': np.random.randn(100),
    'feature2': np.random.randn(100),
    'label': [0]*90 + [1]*10   # Imbalanced: 90 class 0, 10 class 1
})
majority = df[df.label == 0]
minority = df[df.label == 1]

minority_upsampled = resample(
    minority,
    replace=True,
    n_samples=len(majority),
    random_state=42
)
df_balanced = pd.concat([majority, minority_upsampled])
df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

# Step 6: Check new class distribution
print(df_balanced['label'].value_counts())

# Optional: display a few rows
print(df_balanced.head())

X = df[['feature1', 'feature2']]
y = df['label']
print("Original class distribution:", Counter(y))
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X,y)

df_balanced = pd.DataFrame(X_resampled, columns=['feature1', 'feature2'])
df_balanced['label'] = y_resampled

# Step 6: Check new distribution
print("After SMOTE class distribution:", Counter(y_resampled))

# Optional: preview the balanced dataset
print(df_balanced.head())

label
0    90
1    90
Name: count, dtype: int64
   feature1  feature2  label
0 -1.701414  0.273864      0
1 -0.404602  1.084066      0
2  1.056552  0.290671      1
3  0.806050 -1.999391      0
4  0.978726  0.673321      1
Original class distribution: Counter({0: 90, 1: 10})
After SMOTE class distribution: Counter({0: 90, 1: 90})
   feature1  feature2  label
0 -0.177564  1.574800      0
1  0.021793 -1.695312      0
2 -0.313299  3.204328      0
3 -1.648307 -0.395977      0
4  0.746839  1.502410      0


***How do you handle categorical variables in your dataset?***

Techniques:

1. One-hot encoding
2. Label encoding
3. Binary encoding
4. Embeddings (for high-cardinality)

In [None]:
df = pd.DataFrame({'color': ['red', 'blue', 'green']})

# One-hot encoding
pd.get_dummies(df, columns=['color'])

# Label conding
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['color_encoded'] = le.fit_transform(df['color'])


import torch
import torch.nn as nn

#  3 categories → embedding dim 2
embedding = nn.Embedding(num_embeddings=3, embedding_dim=2)

# Assume 'red'=2, 'blue'=0, 'green'=1 as per LabelEncoder
category_ids = torch.LongTensor([2, 0, 1, 0, 2])
embedded = embedding(category_ids)
print(embedded)

tensor([[ 1.0547, -0.4482],
        [ 0.9830, -0.6250],
        [ 1.8742,  0.2865],
        [ 0.9830, -0.6250],
        [ 1.0547, -0.4482]], grad_fn=<EmbeddingBackward0>)


 ***How do filtering and wrapper methods work in feature selection?***
 1. Filter Method
 2. Wrapper Method

In [6]:
from sklearn.datasets import load_breast_cancer
from sklearn.feature_selection import SelectKBest, f_classif
data = load_breast_cancer()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = data.target

selector = SelectKBest(score_func=f_classif, k=5)
X_selected = selector.fit_transform(X,y)

selected_features = X.columns[selector.get_support()]
print("Selected features (Filter method):")
print(selected_features)

Selected features (Filter method):
Index(['mean perimeter', 'mean concave points', 'worst radius',
       'worst perimeter', 'worst concave points'],
      dtype='object')


In [7]:
# Wrapper Method
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
rfe = RFE(estimator=RandomForestClassifier(n_estimators=100), n_features_to_select=5)
X_rfe = rfe.fit_transform(X, y)

In [8]:
selected_rfe = X.columns[rfe.support_]
print("\nSelected features (Wrapper method):")
print(selected_rfe)


Selected features (Wrapper method):
Index(['mean concave points', 'worst radius', 'worst perimeter', 'worst area',
       'worst concave points'],
      dtype='object')


***How do you handle time-based features in a machine learning model?***

Techniques:

1. Extract year, month, day, hour, weekday
2. Time differences (e.g., time since last event)
3. Use rolling averages or lag features
4. Encode cyclical features (e.g., hour of day)

In [9]:
df = pd.DataFrame({
    'user_id': [1, 2, 1, 3, 2],
    'event': ['login', 'logout', 'purchase', 'login', 'purchase'],
    'timestamp': [
        '2024-07-20 08:15:27',
        '2024-07-20 09:45:00',
        '2024-07-21 12:30:15',
        '2024-07-22 23:10:10',
        '2024-07-23 00:05:05'
    ]
})

df['timestamp'] = pd.to_datetime(df['timestamp'])
print(df.head())

   user_id     event           timestamp
0        1     login 2024-07-20 08:15:27
1        2    logout 2024-07-20 09:45:00
2        1  purchase 2024-07-21 12:30:15
3        3     login 2024-07-22 23:10:10
4        2  purchase 2024-07-23 00:05:05


In [10]:
df['year'] = df['timestamp'].dt.year
df['month'] = df['timestamp'].dt.month
df['day'] = df['timestamp'].dt.day
df['hour'] = df['timestamp'].dt.hour
df['weekday'] = df['timestamp'].dt.weekday

In [11]:
df['hour_sin'] = np.sin(2*np.pi * df['hour'] / 24)
df['hour_cos'] = np.cos(2*np.pi * df['hour'] / 24)
df['weekday_sin'] = np.sin(2 * np.pi * df['weekday'] / 7)
df['weekday_cos'] = np.cos(2 * np.pi * df['weekday'] / 7)

In [13]:
df = df.sort_values(['user_id', 'timestamp'])
df['time_since_last_event'] = df.groupby('user_id')['timestamp'].diff().dt.total_seconds() / 3600

***What is feature hashing? When would you use it?***

Feature Hashing:
Maps categorical variables to a fixed-length hash space — useful for high-cardinality features.

When:
1. Text classification
2. Large-scale categorical data (e.g., user IDs, URLs)

In [14]:
from sklearn.feature_extraction import FeatureHasher
h = FeatureHasher(n_features=10, input_type='string')
hashed_features = h.transform([{'user': '123'}, {'user': '456'}])
print(hashed_features)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 2 stored elements and shape (2, 10)>
  Coords	Values
  (0, 5)	-1.0
  (1, 5)	-1.0


***How do you handle hierarchical categorical variables?***

Strategies:

1. Encode each level separately
2. Combine levels: "Country|City"
3. Use embeddings for each level

In [15]:
from sklearn.preprocessing import OneHotEncoder
df = pd.DataFrame({
    'Country': ['US', 'US', 'DE', 'DE'],
    'State': ['CA', 'NY', 'BE', 'BW'],
    'City': ['LA', 'NYC', 'Berlin', 'Stuttgart']
})
df_encoded = pd.get_dummies(df, columns=['Country', 'State', 'City'])
print(df_encoded)

   Country_DE  Country_US  State_BE  State_BW  State_CA  State_NY  \
0       False        True     False     False      True     False   
1       False        True     False     False     False      True   
2        True       False      True     False     False     False   
3        True       False     False      True     False     False   

   City_Berlin  City_LA  City_NYC  City_Stuttgart  
0        False     True     False           False  
1        False    False      True           False  
2         True    False     False           False  
3        False    False     False            True  


In [16]:
df['Country_City'] = df['Country'] + '|' + df['City']
df_encoded = pd.get_dummies(df[['Country_City']])
print(df_encoded.head())

   Country_City_DE|Berlin  Country_City_DE|Stuttgart  Country_City_US|LA  \
0                   False                      False                True   
1                   False                      False               False   
2                    True                      False               False   
3                   False                       True               False   

   Country_City_US|NYC  
0                False  
1                 True  
2                False  
3                False  


In [18]:
# target = pd.Series([1, 0, 1, 0])
# df['city_target_mean'] = df['City'].map(df.groupby('City')[target.name].mean())

In [19]:
import torch.nn as nn

# Suppose: Country=3 unique, State=10, City=50
country_emb = nn.Embedding(num_embeddings=3, embedding_dim=2)
state_emb = nn.Embedding(num_embeddings=10, embedding_dim=4)
city_emb = nn.Embedding(num_embeddings=50, embedding_dim=8)

# During training, you pass ID tensors through these layers

***Explain strategies for handling outliers in different ML algorithms***

General Strategies:

1. Remove outliers using Z-score or IQR
2. Cap/floor values (Winsorization)
3. Use robust models (e.g., tree-based)
4. Transform data (log, sqrt)

In [20]:
from scipy.stats import zscore
df = pd.DataFrame({
    'amount': [10, 12, 11, 13, 12, 11, 1000]  # 1000 is an outlier
})
z_scores = np.abs(zscore(df['amount']))
# Keep only data where z < 3
df_z_filtered = df[z_scores < 3]

print("After Z-score filtering:")
print(df_z_filtered)

After Z-score filtering:
   amount
0      10
1      12
2      11
3      13
4      12
5      11
6    1000


In [21]:
Q1 = df['amount'].quantile(0.25)
Q3 = df['amount'].quantile(0.75)
IQR = Q3 - Q1

# Define bounds
lower = Q1 - 1.5 * IQR
upper = Q3 + 1.5 * IQR

# Filter
df_iqr_filtered = df[(df['amount'] >= lower) & (df['amount'] <= upper)]

print("\nAfter IQR filtering:")
print(df_iqr_filtered)


After IQR filtering:
   amount
0      10
1      12
2      11
3      13
4      12
5      11


In [22]:
from scipy.stats.mstats import winsorize

# Cap lowest and highest 10% values
df['amount_winsor'] = winsorize(df['amount'], limits=[0.1, 0.1])

print("\nAfter Winsorization:")
print(df['amount_winsor'])


After Winsorization:
0      10
1      12
2      11
3      13
4      12
5      11
6    1000
Name: amount_winsor, dtype: int64


In [24]:
df['amount_log'] = np.log1p(df['amount'])  # log(1 + x) to handle 0s

print("\nAfter Log Transformation:")
print(df['amount_log'])


After Log Transformation:
0    2.397895
1    2.564949
2    2.484907
3    2.639057
4    2.564949
5    2.484907
6    6.908755
Name: amount_log, dtype: float64
