In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
# Seeding is currently disabled; uncomment the next line if random numbers are
# drawn later and the results must be reproducible.
# np.random.seed(123)
# Use seaborn's "whitegrid" style for the figures produced below.
sns.set_style("whitegrid")

In [2]:
# Global plotting defaults for the notebook.
# Optional: uncomment for larger fonts.
# sns.set_context("poster")
from matplotlib.pylab import rcParams

# Horizontal spacing between subplots (fraction of the average axes width).
plt.rc('figure.subplot', wspace=0.33)
# Default figure size as (width, height) in inches.
rcParams['figure.figsize'] = (10, 4)

In [3]:
# Toy dataset: two train rows followed by two test rows, with features f0..f5.
# One column per line so the constant / duplicated columns are easy to spot.
traintest = pd.DataFrame(
    {
        'is_train': [True, True, False, False],
        'f0': [13, 13, 13, 13],
        'f1': ['H', 'H', 'H', 'G'],
        'f2': [1.2, 36.6, 0, -14],
        'f3': [1.2, 36.6, 0, -14],
        'f4': ['A', 'B', 'A', 'C'],
        'f5': ['V1', 'V2', 'V1', 'V3'],
    }
)
traintest

Unnamed: 0,is_train,f0,f1,f2,f3,f4,f5
0,True,13,H,1.2,1.2,A,V1
1,True,13,H,36.6,36.6,B,V2
2,False,13,H,0.0,0.0,A,V1
3,False,13,G,-14.0,-14.0,C,V3


## Duplicated and constant features

### f0

Some features are constant across the whole dataset (here `f0` is always 13); they carry no information and can be removed.

In [4]:
# Flag columns that hold exactly one distinct value over all rows (train and test together).
# Note: `nunique` ignores NaN by default.
traintest.nunique(axis=0).eq(1)

is_train    False
f0           True
f1          False
f2          False
f3          False
f4          False
f5          False
dtype: bool

### f1

A feature can be constant in the train set but not in the test set, like `f1` here (always `H` in train, while `G` appears in test).

- Then **remove it**
- Or create a separate model for the objects that have the new feature values

### f2 and f3

These are duplicated features, so one of them can be removed.

In [5]:
# Drop every column whose values repeat an earlier column (f3 repeats f2).
# Only the duplicate mask is computed on the transposed frame; the columns are
# then selected from the original frame so each one keeps its dtype. The round
# trip `.T.drop_duplicates().T` would turn all columns of this mixed-type frame
# into `object`.
# `.copy()` detaches the result so later cells can assign columns into it
# without a SettingWithCopyWarning.
traintest = traintest.loc[:, ~traintest.T.duplicated()].copy()
traintest

Unnamed: 0,is_train,f0,f1,f2,f4,f5
0,True,13,H,1.2,A,V1
1,True,13,H,36.6,B,V2
2,False,13,H,0.0,A,V1
3,False,13,G,-14.0,C,V3


### f4 and f5

These are duplicated categorical features: the labels differ, but they encode the same grouping. This can be seen only after factorizing (label-encoding) them.

In [6]:
# Swap the string labels of f4 and f5 for integer codes (numbered in order of
# first appearance) so the two columns can be compared value by value.
for name in ('f4', 'f5'):
    codes = pd.factorize(traintest[name])[0]
    traintest[name] = codes
traintest

Unnamed: 0,is_train,f0,f1,f2,f4,f5
0,True,13,H,1.2,0,0
1,True,13,H,36.6,1,1
2,False,13,H,0.0,0,0
3,False,13,G,-14.0,2,2


In [7]:
# Drop every column whose values repeat an earlier column (after factorizing,
# f5 repeats f4).
# Only the duplicate mask is computed on the transposed frame; the columns are
# then selected from the original frame so each one keeps its dtype. The round
# trip `.T.drop_duplicates().T` would turn the surviving columns into `object`.
# `.copy()` detaches the result so it can be modified later without a
# SettingWithCopyWarning.
traintest = traintest.loc[:, ~traintest.T.duplicated()].copy()
traintest

Unnamed: 0,is_train,f0,f1,f2,f4
0,True,13,H,1.2,0
1,True,13,H,36.6,1
2,False,13,H,0.0,0
3,False,13,G,-14.0,2


## Duplicated rows

- Check whether identical rows have the same label. If they have different labels, then this randomness has to be considered.
- Understand why the duplicates occur.
- Test and train can contain the same rows. That can tell us more.

## Check if dataset is shuffled

If it's not, then there is a chance to find data leakage.

![](images/check_shuffle.jpg)