# Explanation
**Load the Data: Load the training and testing datasets.**


**Custom Transformer for Hashed Feature**: Use the hashlib library with sha256 to build custom transformer.

**Identify and Transform Features:** Separate features into numeric, categorical, and boolean types, and transform them appropriately.

**Model Building:** Combine all features into a pipeline, build a multi-label classifier using RandomForestClassifier, and train the model.

**Prediction and Submission:** Preprocess the test set, make predictions, and format the results for submission.

This approach ensures that the hashed feature transformation is handled correctly within the pipeline, and the test set is prepared properly for prediction without dropping the hashed column prematurely.

The final step involves making predictions on the test set and preparing the submission file in the required format.

In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction import FeatureHasher
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import ClassifierChain
from sklearn.metrics import accuracy_score, hamming_loss, precision_score, recall_score, f1_score

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import warnings
warnings.filterwarnings("ignore")

In [4]:
train_df = pd.read_csv('/content/drive/My Drive/AIqod_Assignment/train.csv')
test_df = pd.read_csv('/content/drive/My Drive/AIqod_Assignment/test.csv')
train_labels_df = pd.read_csv('/content/drive/My Drive/AIqod_Assignment/trainLabels.csv')

In [5]:
train_df.head()

Unnamed: 0,id,x1,x2,x3,x4,x5,x6,x7,x8,x9,...,x136,x137,x138,x139,x140,x141,x142,x143,x144,x145
0,1,NO,NO,dqOiM6yBYgnVSezBRiQXs9bvOFnRqrtIoXRIElxD7g8=,GNjrXXA3SxbgD0dTRblAPO9jFJ7AIaZnu/f48g5XSUk=,0.576561,0.073139,0.481394,0.115697,0.472474,...,0.0,0.81,3306,4676,YES,NO,YES,2,0.375535,0.46461
1,2,,,,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.51,4678,3306,YES,NO,YES,4,0.741682,0.59363
2,3,NO,NO,ib4VpsEsqJHzDiyL0dZLQ+xQzDPrkxE+9T3mx5fv2wI=,X6dDAI/DZOWvu0Dg6gCgRoNr2vTUz/mc4SdHTNUPS38=,1.341803,0.051422,0.935572,0.04144,0.50171,...,0.0,0.85,4678,3306,NO,NO,NO,1,0.776467,0.493159
3,4,YES,NO,BfrqME7vdLw3suQp6YAT16W2piNUmpKhMzuDrVrFQ4w=,YGCdISifn4fLao/ASKdZFhGIq23oqzfSbUVb6px1pig=,0.653912,0.041471,0.940787,0.090851,0.556564,...,0.0,0.945,3306,4678,NO,NO,YES,3,0.168234,0.546582
4,5,NO,NO,RTjsrrR8DTlJyaIP9Q3Z8s0zseqlVQTrlSe97GCWfbk=,3yK2OPj1uYDsoMgsxsjY1FxXkOllD8Xfh20VYGqT+nU=,1.415919,0.0,1.0,0.0,0.375297,...,0.0,1.0,1263,892,NO,NO,NO,1,0.246637,0.361045


In [6]:
test_df.head()

Unnamed: 0,1698001,NO,NO.1,5KaYd5siHnBD/IjH8BF1fPz5zrCADHZia/Lrhlyxkvc=,FzMc/XY2ETaomhy8gPc9UL8LRkEnQA56+/wVF1fogk8=,1.41479820627803,0,1,0.1,0.202060221870048,...,0.30,1.8,1262.3,892.3,NO.36,NO.37,NO.38,0.31,0.0896860986547085,0.193343898573693.1
0,1698002,NO,NO,9ACcuXc7MMm9V7jZSr3P3VxAKyMvLAtsdwPKwgncc+k=,WV5vAHFyqkeuyFB5KVNGFOBuwjkUGKYc8wh9QfpVzAA=,0.832679,0.049834,0.945938,0.317427,0.482021,...,1.0,0.866667,4672,3311,NO,NO,NO,5,0.945032,0.471318
1,1698003,NO,NO,MeBJ/ZzEIXfNKat4w1oeDxiMNKrAeY0PH41i00hpYDo=,tnLDGLnpYhzsik5+X+WPo4KQJoQA0TfWRlmEtQ3XNJQ=,1.415919,0.0,1.0,0.0,0.703088,...,-1.0,1.0,1263,892,NO,NO,NO,8,0.557175,0.693587
2,1698004,,,,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.87,4672,3306,YES,NO,YES,0,0.870538,0.405822
3,1698005,NO,NO,uduY7XWJ8eFgTltv5P0rPh5GW6KwBu+tPFH13uQRN+0=,0L7+hNDV8S57etySgdljbm2AK1zQuLP77lGk2hyEmCo=,1.129212,0.08702,0.81424,1.112804,0.874318,...,0.0,0.87,4400,3413,YES,NO,YES,2,0.224729,0.870909
4,1698006,NO,NO,kM4KU87XvnvKRvf4dN3Tu4zQYq8fpcqhDTFADWdfCg8=,4LhhvTzxwvh2SnFtcpaRasyvph66a3YDIQCshAfyS2o=,1.415919,0.0,1.0,0.0,0.232779,...,0.0,1.0,1263,892,YES,NO,YES,6,0.536996,0.223278


In [7]:
# Create a DataFrame with column names as the first row
a = pd.DataFrame([test_df.columns], columns=test_df.columns)


# Append the original DataFrame
test_df = pd.concat([a, test_df], ignore_index=True)

In [8]:
train_labels_df.head()

Unnamed: 0,id,y1,y2,y3,y4,y5,y6,y7,y8,y9,...,y24,y25,y26,y27,y28,y29,y30,y31,y32,y33
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,3,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [9]:
test_df.columns = train_df.columns

In [10]:
test_df.head()

Unnamed: 0,id,x1,x2,x3,x4,x5,x6,x7,x8,x9,...,x136,x137,x138,x139,x140,x141,x142,x143,x144,x145
0,1698001,NO,NO.1,5KaYd5siHnBD/IjH8BF1fPz5zrCADHZia/Lrhlyxkvc=,FzMc/XY2ETaomhy8gPc9UL8LRkEnQA56+/wVF1fogk8=,1.41479820627803,0.0,1.0,0.1,0.202060221870048,...,0.3,1.8,1262.3,892.3,NO.36,NO.37,NO.38,0.31,0.0896860986547085,0.193343898573693.1
1,1698002,NO,NO,9ACcuXc7MMm9V7jZSr3P3VxAKyMvLAtsdwPKwgncc+k=,WV5vAHFyqkeuyFB5KVNGFOBuwjkUGKYc8wh9QfpVzAA=,0.832679,0.049834,0.945938,0.317427,0.482021,...,1.0,0.866667,4672.0,3311.0,NO,NO,NO,5.0,0.945032,0.471318
2,1698003,NO,NO,MeBJ/ZzEIXfNKat4w1oeDxiMNKrAeY0PH41i00hpYDo=,tnLDGLnpYhzsik5+X+WPo4KQJoQA0TfWRlmEtQ3XNJQ=,1.415919,0.0,1.0,0.0,0.703088,...,-1.0,1.0,1263.0,892.0,NO,NO,NO,8.0,0.557175,0.693587
3,1698004,,,,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.87,4672.0,3306.0,YES,NO,YES,0.0,0.870538,0.405822
4,1698005,NO,NO,uduY7XWJ8eFgTltv5P0rPh5GW6KwBu+tPFH13uQRN+0=,0L7+hNDV8S57etySgdljbm2AK1zQuLP77lGk2hyEmCo=,1.129212,0.08702,0.81424,1.112804,0.874318,...,0.0,0.87,4400.0,3413.0,YES,NO,YES,2.0,0.224729,0.870909


In [11]:
print(train_df.shape)
print(test_df.shape)
print(train_labels_df.shape)

(9999, 146)
(2000, 146)
(49999, 34)


In [12]:
X_train = train_df.drop(columns=['id'])
y_train = train_labels_df.loc[:9998, :]
X_test = test_df.drop(columns=['id'])
y_train = y_train.drop(columns=['id'])

In [13]:
X_train.head()

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,...,x136,x137,x138,x139,x140,x141,x142,x143,x144,x145
0,NO,NO,dqOiM6yBYgnVSezBRiQXs9bvOFnRqrtIoXRIElxD7g8=,GNjrXXA3SxbgD0dTRblAPO9jFJ7AIaZnu/f48g5XSUk=,0.576561,0.073139,0.481394,0.115697,0.472474,YES,...,0.0,0.81,3306,4676,YES,NO,YES,2,0.375535,0.46461
1,,,,,0.0,0.0,0.0,0.0,0.0,,...,0.0,0.51,4678,3306,YES,NO,YES,4,0.741682,0.59363
2,NO,NO,ib4VpsEsqJHzDiyL0dZLQ+xQzDPrkxE+9T3mx5fv2wI=,X6dDAI/DZOWvu0Dg6gCgRoNr2vTUz/mc4SdHTNUPS38=,1.341803,0.051422,0.935572,0.04144,0.50171,NO,...,0.0,0.85,4678,3306,NO,NO,NO,1,0.776467,0.493159
3,YES,NO,BfrqME7vdLw3suQp6YAT16W2piNUmpKhMzuDrVrFQ4w=,YGCdISifn4fLao/ASKdZFhGIq23oqzfSbUVb6px1pig=,0.653912,0.041471,0.940787,0.090851,0.556564,YES,...,0.0,0.945,3306,4678,NO,NO,YES,3,0.168234,0.546582
4,NO,NO,RTjsrrR8DTlJyaIP9Q3Z8s0zseqlVQTrlSe97GCWfbk=,3yK2OPj1uYDsoMgsxsjY1FxXkOllD8Xfh20VYGqT+nU=,1.415919,0.0,1.0,0.0,0.375297,NO,...,0.0,1.0,1263,892,NO,NO,NO,1,0.246637,0.361045


In [14]:
X_test.head()

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,...,x136,x137,x138,x139,x140,x141,x142,x143,x144,x145
0,NO,NO.1,5KaYd5siHnBD/IjH8BF1fPz5zrCADHZia/Lrhlyxkvc=,FzMc/XY2ETaomhy8gPc9UL8LRkEnQA56+/wVF1fogk8=,1.41479820627803,0.0,1.0,0.1,0.202060221870048,NO.2,...,0.3,1.8,1262.3,892.3,NO.36,NO.37,NO.38,0.31,0.0896860986547085,0.193343898573693.1
1,NO,NO,9ACcuXc7MMm9V7jZSr3P3VxAKyMvLAtsdwPKwgncc+k=,WV5vAHFyqkeuyFB5KVNGFOBuwjkUGKYc8wh9QfpVzAA=,0.832679,0.049834,0.945938,0.317427,0.482021,YES,...,1.0,0.866667,4672.0,3311.0,NO,NO,NO,5.0,0.945032,0.471318
2,NO,NO,MeBJ/ZzEIXfNKat4w1oeDxiMNKrAeY0PH41i00hpYDo=,tnLDGLnpYhzsik5+X+WPo4KQJoQA0TfWRlmEtQ3XNJQ=,1.415919,0.0,1.0,0.0,0.703088,NO,...,-1.0,1.0,1263.0,892.0,NO,NO,NO,8.0,0.557175,0.693587
3,,,,,0.0,0.0,0.0,0.0,0.0,,...,0.0,0.87,4672.0,3306.0,YES,NO,YES,0.0,0.870538,0.405822
4,NO,NO,uduY7XWJ8eFgTltv5P0rPh5GW6KwBu+tPFH13uQRN+0=,0L7+hNDV8S57etySgdljbm2AK1zQuLP77lGk2hyEmCo=,1.129212,0.08702,0.81424,1.112804,0.874318,NO,...,0.0,0.87,4400.0,3413.0,YES,NO,YES,2.0,0.224729,0.870909


In [15]:
y_train.head()

Unnamed: 0,y1,y2,y3,y4,y5,y6,y7,y8,y9,y10,...,y24,y25,y26,y27,y28,y29,y30,y31,y32,y33
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [16]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)

(9999, 145)
(2000, 145)
(9999, 33)


In [17]:
X_train.dtypes

x1       object
x2       object
x3       object
x4       object
x5      float64
         ...   
x141     object
x142     object
x143      int64
x144    float64
x145    float64
Length: 145, dtype: object

In [18]:

X_test.dtypes

x1      object
x2      object
x3      object
x4      object
x5      object
         ...  
x141    object
x142    object
x143    object
x144    object
x145    object
Length: 145, dtype: object

In [19]:
numeric_features = X_train.select_dtypes(include=['float64', 'int64']).columns.tolist()
categorical_features = X_train.select_dtypes(include=['object']).columns.tolist()

In [26]:
import re
import numpy as np

def clean_and_convert_to_float(value):
    # Handle non-string values
    if not isinstance(value, str):
        try:
            return float(value)
        except ValueError:
            return np.nan

    # Check for problematic pattern (for string values)
    if re.match(r'^\d+\.\d+\.\d+', value):
        # Split the value at the second period
        parts = value.split('.')
        corrected_value = f"{parts[0]}.{parts[1]}"
        return float(corrected_value)
    try:
        return float(value)
    except ValueError:
        return np.nan  # Return NaN for non-convertible values


In [27]:
# Apply the function to the columns
def clean_columns(df, columns):
    for col in columns:
        df[col] = df[col].apply(clean_and_convert_to_float)
    return df

In [28]:
clean_columns(X_test, numeric_features)

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,...,x136,x137,x138,x139,x140,x141,x142,x143,x144,x145
0,NO,NO.1,5KaYd5siHnBD/IjH8BF1fPz5zrCADHZia/Lrhlyxkvc=,FzMc/XY2ETaomhy8gPc9UL8LRkEnQA56+/wVF1fogk8=,1.414798,0.000000,1.000000,0.100000,0.202060,NO.2,...,0.3,1.800000,1262.3,892.3,NO.36,NO.37,NO.38,0.31,0.089686,0.193344
1,NO,NO,9ACcuXc7MMm9V7jZSr3P3VxAKyMvLAtsdwPKwgncc+k=,WV5vAHFyqkeuyFB5KVNGFOBuwjkUGKYc8wh9QfpVzAA=,0.832679,0.049834,0.945938,0.317427,0.482021,YES,...,1.0,0.866667,4672.0,3311.0,NO,NO,NO,5.00,0.945032,0.471318
2,NO,NO,MeBJ/ZzEIXfNKat4w1oeDxiMNKrAeY0PH41i00hpYDo=,tnLDGLnpYhzsik5+X+WPo4KQJoQA0TfWRlmEtQ3XNJQ=,1.415919,0.000000,1.000000,0.000000,0.703088,NO,...,-1.0,1.000000,1263.0,892.0,NO,NO,NO,8.00,0.557175,0.693587
3,,,,,0.000000,0.000000,0.000000,0.000000,0.000000,,...,0.0,0.870000,4672.0,3306.0,YES,NO,YES,0.00,0.870538,0.405822
4,NO,NO,uduY7XWJ8eFgTltv5P0rPh5GW6KwBu+tPFH13uQRN+0=,0L7+hNDV8S57etySgdljbm2AK1zQuLP77lGk2hyEmCo=,1.129212,0.087020,0.814240,1.112804,0.874318,NO,...,0.0,0.870000,4400.0,3413.0,YES,NO,YES,2.00,0.224729,0.870909
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,NO,NO,Z6vucL/W0MPoFsgu2ewNXrvNCAQFiKzUJTYuqh6lP28=,yhI9Bw5Q8l1vEll4sw/Tem/jojpE9KwjKvQQIyrAqgU=,1.294118,0.000000,1.000000,0.000000,0.164141,YES,...,0.0,1.000000,1188.0,918.0,YES,NO,YES,2.00,0.198257,0.155724
1996,NO,NO,LKQ9Uh6tQ3ZrIxAKaPaDEuiYFunnK/2d+oKAfpN9tuY=,h0cPLYjd7nmw9FJsQA+KUsnChH0SajbHjNdfMk47k9o=,1.020217,0.583944,0.625842,1.003516,0.791136,YES,...,0.0,0.720000,4400.0,3413.0,YES,NO,YES,0.00,0.582479,0.778864
1997,NO,NO,/tuZYGMsFx4A/Ou+jSol6t/TpLRkSl8Ku+1tnQPvwww=,aLEeZ8ZFKt2jQfkG5e9Nmad+QJlfpPmSfQS3CHlL6Ik=,0.354706,0.550882,0.930882,0.207941,0.207500,NO,...,0.0,0.845000,4400.0,3400.0,NO,NO,NO,5.00,0.930588,0.201591
1998,NO,NO,uMIU2KDOxlgzhYToCFCa3nMxIOPV0WqCnKWfooGaw+8=,4LhhvTzxwvh2SnFtcpaRasyvph66a3YDIQCshAfyS2o=,1.220588,0.102059,0.326176,1.213824,0.942955,NO,...,1.5,0.676667,4400.0,3400.0,NO,NO,NO,2.00,0.256471,0.938182


In [29]:
X_test.dtypes

x1       object
x2       object
x3       object
x4       object
x5      float64
         ...   
x141     object
x142     object
x143    float64
x144    float64
x145    float64
Length: 145, dtype: object

In [30]:
# Identify the hashed columns
hashed_columns = [col for col in X_train.columns if X_train[col].astype(str).str.endswith('=').any()]

In [31]:
hashed_columns

['x3', 'x4', 'x34', 'x35', 'x61', 'x64', 'x65', 'x91', 'x94', 'x95']

In [32]:
boolean_features = [col for col in X_train.columns if col not in hashed_columns and col not in numeric_features]

In [33]:
# Impute missing values for numeric columns
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

In [34]:
for col in boolean_features:
    X_train[col] = X_train[col].apply(lambda x: 1 if x == 'YES' else 0)
    X_test[col] = X_test[col].apply(lambda x: 1 if x == 'YES' else 0)

In [35]:
# Impute missing values for hashed columns
# Define a custom transformer for hashed columns
import hashlib
class HashedColumnTransformer:
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X_transformed = np.array([[self.hash_to_int(value) for value in row] for row in X])
        return X_transformed

    @staticmethod
    def hash_to_int(value):
        if value == 'missing':
            return 0
        return int(hashlib.sha256(value.encode('utf-8')).hexdigest(), 16) % 10**8  # Limit to 8 digits

hashed_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('converter', FunctionTransformer(HashedColumnTransformer().transform))
])

In [37]:
len(numeric_features)+len(boolean_features)+len(hashed_columns)

145

In [38]:
# Combine transformations
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('hash', hashed_transformer, hashed_columns)
    ],
    remainder='passthrough'
)

# Apply preprocessing to all columns
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

In [39]:
# Function to remove labels with only one class
def remove_single_class_labels(y):
    return y.loc[:, (y != y.iloc[0]).any()]

# Remove labels with only one class from the entire dataset
y_train_filtered = remove_single_class_labels(y_train)

In [40]:
# Split the dataset while ensuring labels have at least two classes
def balanced_train_test_split(X, y, test_size=0.2, random_state=42):
    # Split the dataset
    X_train_split, X_val, y_train_split, y_val = train_test_split(X, y, test_size=test_size, random_state=random_state)


    # Ensure y_val has the same columns as y_train_split
    y_val = y_val[y_train_split.columns]
    return X_train_split, X_val, y_train_split, y_val

In [41]:
# Perform the balanced train-test split
X_train_split, X_val, y_train_split, y_val = balanced_train_test_split(X_train_processed, y_train_filtered)


In [49]:
import xgboost as xgb
from sklearn.multioutput import MultiOutputClassifier

# Build the model
model = MultiOutputClassifier(xgb.XGBClassifier())

# Train the model
model.fit(X_train_split, y_train_split)

# Evaluate the model
y_val_pred = model.predict(X_val)

accuracy = accuracy_score(y_val, y_val_pred)
hamming = hamming_loss(y_val, y_val_pred)
precision = precision_score(y_val, y_val_pred, average='samples')
recall = recall_score(y_val, y_val_pred, average='samples')
f1 = f1_score(y_val, y_val_pred, average='samples')

print(f'Validation Accuracy: {accuracy * 100:.2f}%')
print(f'Hamming Loss: {hamming:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1 Score: {f1:.4f}')

Validation Accuracy: 77.45%
Hamming Loss: 0.0106
Precision: 0.8149
Recall: 0.8170
F1 Score: 0.8108


In [57]:
y_test_pred = model.predict(X_test_processed)

In [58]:
# Prepare submission DataFrame
submission2 = pd.DataFrame(columns=['id_label', 'probability'])

# Fill submission DataFrame
rows = []
for i in range(y_test_pred.shape[0]) :
    for j in range(y_test_pred.shape[1]):
        rows.append({
            'id_label': f'{test_df.iloc[i]["id"]}_y{j+1}',
            'pred': y_test_pred[i][j]  # Probability of positive class (assuming this is what's needed)
        })

submission2 = pd.DataFrame(rows)


In [59]:
submission2

Unnamed: 0,id_label,pred
0,1698001_y1,0
1,1698001_y2,0
2,1698001_y3,0
3,1698001_y4,0
4,1698001_y5,0
...,...,...
63995,1700000_y28,0
63996,1700000_y29,0
63997,1700000_y30,0
63998,1700000_y31,0


In [61]:
# Save submission to CSV file
submission2.to_csv('sumission2.csv',index=None)

# **Since there are no direct text features available, traditional text vectorization techniques like Bag of Words, TF-IDF, and word embeddings cannot be directly applied. Instead, the focus is on effectively using the given features to build a robust model.**