In [None]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

import warnings
# Suppress every Python warning for the rest of the kernel session: the 'ignore'
# action is registered without a category or module restriction.
# NOTE(review): this also hides pandas/scikit-learn/xgboost deprecation and
# data-quality warnings — consider narrowing the filter.
warnings.filterwarnings('ignore')

# Import Data

In [None]:
# Read the training sessions and the test sessions from CSV files located
# relative to the notebook's working directory.
train, test = (
    pd.read_csv(csv_name)
    for csv_name in ("train_8wry4cB.csv", "test_Yix80N0.csv")
)

# Training

### Preprocess Training Data

In [None]:
# Reshape the raw training sessions into one all-numeric row per viewed product.

# Step 1: keep the session keys in the index, then split the ';'-separated
# product string so that every product of a session lands on its own row.
train_v1 = (train.set_index(['session_id', 'startTime', 'endTime', 'gender'])
               .stack()
               .str.split(';', expand=True)
               .stack()
               .unstack(-2)
               .reset_index(-1, drop=True)
               .reset_index()
            )

# Step 2: break each product path on '/' into its hierarchy levels. The split is
# expected to yield exactly five pieces; the fifth is parked in the throwaway
# '/' column and discarded by the column selection below.
product_levels = pd.DataFrame(train_v1['ProductList'].str.split('/').values.tolist())
train_v1[['category', 'sub category', 'sub-sub category', 'product', '/']] = product_levels

# Step 3: absolute session duration in minutes (timestamps are parsed only once).
session_span = np.abs(pd.to_datetime(train_v1['endTime']) - pd.to_datetime(train_v1['startTime']))
train_v1['MinsTimeDiff'] = session_span.dt.total_seconds() / 60

# Step 4: keep only the modelling columns, in the order the feature-selection
# cell relies on (features first, target last). `.copy()` gives an independent
# frame so the assignments below never write into a view of the wider frame.
train_v1 = train_v1[['session_id', 'category', 'sub category', 'sub-sub category',
                     'product', 'MinsTimeDiff', 'gender']].copy()

# Step 5: encode the target as integers; `le` is reused later to map predictions
# back to the original labels.
le = LabelEncoder()
train_v1['gender'] = le.fit_transform(train_v1['gender'])

# Step 6: strip alphabetic characters from the remaining string cells and convert
# every column to a numeric dtype.
train_v1 = train_v1.replace('([A-Za-z]+)', '', regex=True)
train_v1 = train_v1.apply(pd.to_numeric)

# Step 7: shuffle the rows. The shuffle is seeded (same seed as the estimators)
# so that Restart & Run All reproduces the same training order and models.
train_v1 = train_v1.sample(frac=1, random_state=123).reset_index(drop=True)
train_v1.head()

### Feature Selection

In [None]:
# Three nested feature sets built from the first 2, 3 and 4 columns of train_v1.
# NOTE(review): the first column is session_id, so the identifier itself is fed
# to the models — confirm that this is intended.
train_X1, train_X2, train_X3 = (
    train_v1.iloc[:, :n_cols] for n_cols in (2, 3, 4)
)
# The target is the last column of train_v1.
train_y = train_v1.iloc[:, -1]

### Model Training

In [None]:
# Instantiate the seven classifiers that later vote on the prediction. Each
# family shares one configuration; the numeric suffix names the feature set
# (train_X1 / train_X2 / train_X3) the model is fitted on.
rf_settings = dict(n_estimators=100, bootstrap=True, max_features='sqrt', random_state=123)
model_RF1, model_RF2 = (RandomForestClassifier(**rf_settings) for _ in range(2))
model_XGB1, model_XGB2, model_XGB3 = (XGBClassifier(random_state=123) for _ in range(3))
model_SVM1, model_SVM2 = (SVC(random_state=123) for _ in range(2))

# Fit every model on its feature set, in the same order as before.
for estimator, features in (
    (model_RF1, train_X1),
    (model_RF2, train_X2),
    (model_XGB1, train_X1),
    (model_XGB2, train_X2),
    (model_XGB3, train_X3),
    (model_SVM1, train_X1),
    (model_SVM2, train_X2),
):
    estimator.fit(features, train_y)

# Show the last fitted estimator as the cell output.
model_SVM2

# Testing

### Preprocess Testing Data

In [None]:
# Reshape the raw test sessions the same way as the training data: one row per
# viewed product, then an all-numeric copy for the models.

# Step 1: keep the session keys in the index, then split the ';'-separated
# product string so that every product of a session lands on its own row.
test_v1 = (test.set_index(['session_id', 'startTime', 'endTime'])
               .stack()
               .str.split(';', expand=True)
               .stack()
               .unstack(-2)
               .reset_index(-1, drop=True)
               .reset_index()
            )

# Step 2: break each product path on '/' into its hierarchy levels. The split is
# expected to yield exactly five pieces; the fifth goes to the throwaway '/'
# column that is dropped below.
product_levels = pd.DataFrame(test_v1['ProductList'].str.split('/').values.tolist())
test_v1[['category', 'sub category', 'sub-sub category', 'product', '/']] = product_levels

# Step 3: absolute session duration in minutes (timestamps are parsed only once).
session_span = np.abs(pd.to_datetime(test_v1['endTime']) - pd.to_datetime(test_v1['startTime']))
test_v1['MinsTimeDiff'] = session_span.dt.total_seconds() / 60

# Step 4: drop the raw columns. test_v1 keeps the original (string) session_id,
# which the prediction cell uses to label its output.
test_v1 = test_v1.drop(columns=['startTime', 'endTime', 'ProductList', '/'])

# Step 5: numeric copy for the models — alphabetic characters are stripped from
# string cells and every column is converted to a numeric dtype.
test_v2 = test_v1.replace('([A-Za-z]+)', '', regex=True)
test_v2 = test_v2.apply(pd.to_numeric)
test_v2.head()

### Feature Selection

In [None]:
# Test-time feature sets: the first 2, 3 and 4 columns of test_v2, mirroring the
# column counts used for the training feature sets.
test_X1, test_X2, test_X3 = (
    test_v2.iloc[:, :n_cols] for n_cols in (2, 3, 4)
)

### Model Testing

In [None]:
# Ensemble prediction: every fitted model votes on each test row and the most
# frequent label wins.

# Each vote column is paired with the model that produces it and the feature set
# that model was trained on.
voters = {
    'gender_RF1': (model_RF1, test_X1),
    'gender_RF2': (model_RF2, test_X2),
    'gender_XGB1': (model_XGB1, test_X1),
    'gender_XGB2': (model_XGB2, test_X2),
    'gender_XGB3': (model_XGB3, test_X3),
    'gender_SVM1': (model_SVM1, test_X1),
    'gender_SVM2': (model_SVM2, test_X2),
}

# Predicted class indices are cast to int (they are already whole numbers) and
# mapped back to the original labels with the encoder fitted on the training data.
votes = pd.DataFrame(
    {
        column: le.inverse_transform(np.asarray(model.predict(features)).astype(int))
        for column, (model, features) in voters.items()
    },
    index=test_v1.index,
)

result = pd.DataFrame({'session_id': test_v1['session_id']})
# DataFrame.mode(axis=1) returns a DataFrame with one column per tied mode, so
# column 0 is selected explicitly; on a tie this is the first label in sorted order.
result['gender'] = votes.mode(axis=1)[0]

# One row per session: the vote of the first product row of each session is kept.
# NOTE(review): votes from the session's other product rows are discarded here.
result = result.drop_duplicates(subset=['session_id'])

# Export Result

In [None]:
# Write the result frame to disk without the DataFrame index column.
result.to_csv(
    "final.csv",
    index=False,
)