A little exploration of the data

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns
# Activate seaborn's plot theme for all following figures; color_codes=True asks
# seaborn to remap matplotlib's shorthand color codes (such as 'k' or 'r') to its palette.
sns.set(color_codes=True)

In [2]:
# Read the training and test tables from CSV files in the relative ../input directory.
# `train` and `test` are the two frames every later cell works on.
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')

In [3]:
# Show the dtype pandas inferred for every column of the training frame
train.dtypes

In [4]:
# Number of columns in the training frame before any clean-up
len(train.columns)

In [5]:
# Helper function to find duplicate columns, from
# https://stackoverflow.com/questions/14984119/python-pandas-remove-duplicate-columns/32961145#32961145
def duplicate_columns(frame):
    """Return the labels of columns whose values are repeated by a later column.

    Columns are only compared with other columns of the same dtype. A column
    is reported when at least one column that follows it inside its dtype
    group holds exactly the same values (``np.array_equal``). The last member
    of every set of identical columns is therefore never reported, so
    dropping the returned labels keeps one copy of each.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame whose columns are checked.

    Returns
    -------
    list
        Column labels that have an identical later column.
    """
    dtype_groups = frame.columns.to_series().groupby(frame.dtypes).groups
    duplicated = []

    for _dtype, names in dtype_groups.items():
        block = frame[names]
        labels = block.columns
        # Pull every column out as an array once, then compare pairwise.
        arrays = [block.iloc[:, pos].values for pos in range(len(labels))]

        for pos, current in enumerate(arrays):
            # any() stops at the first identical later column, like the
            # early exit of an explicit inner loop.
            if any(np.array_equal(current, later) for later in arrays[pos + 1:]):
                duplicated.append(labels[pos])
    return duplicated

In [6]:
# Find the duplicated columns of the training frame, then print
# how many there are followed by their labels
dups = duplicate_columns(train)
print(len(dups), dups, sep='\n')

In [7]:
# Drop the duplicated columns found in train from both frames so they keep the same layout
train = train.drop(labels=dups, axis='columns')
test = test.drop(labels=dups, axis='columns')

In [8]:
# Replace every object-typed column by integer codes. Each encoder is fitted on
# the train and test values together so both frames share one code table.
cols = train.columns.tolist()
for c in cols:
    if train[c].dtype != 'object':
        continue
    lbl = LabelEncoder().fit(list(train[c].values) + list(test[c].values))
    train[c] = lbl.transform(list(train[c].values))
    test[c] = lbl.transform(list(test[c].values))

In [9]:
# Remove columns that only contain a single entry type
# Remove columns that don't share information between train and test data
#
# The candidate list is rebuilt from the frame instead of removing 'ID' and 'y'
# from a list created in another cell, so this cell can be re-run without
# list.remove raising ValueError.
cols = [c for c in train.columns if c not in ('ID', 'y')]
for col in cols:
    uniquestrain = set(train[col].unique())
    uniquestest = set(test[col].unique())
    overlaptraintest = len(uniquestrain & uniquestest)
    if len(uniquestrain) < 2:
        train = train.drop(col, axis=1)
        test = test.drop(col, axis=1)
        print("Dropping since in train only one type {0}".format(col))
    elif overlaptraintest == 0:
        # elif, not if: a column that is constant in train AND shares no value
        # with test was already dropped above; dropping it again raises KeyError.
        train = train.drop(col, axis=1)
        test = test.drop(col, axis=1)
        print("Dropping since no overlap between train and test on {0}".format(col))

In [10]:
# Number of columns remaining in the training frame after the clean-up steps
len(train.columns)

In [11]:
# What's the distribution of the test time?
%matplotlib inline
sns.distplot(train.y, bins=20, kde=False, rug=True)

In [12]:
# Show the training rows whose y exceeds 200
train[train['y'] > 200]

In [13]:
# Is there similar data in the training set looking at the categorical values?
# Take the first row with y > 200 as reference and keep the training rows that
# agree with it on X0..X5, then compare their y values per X6 level.
row = train.loc[train.y > 200]
subset = train.loc[np.logical_and.reduce(
    [train[c] == row[c].values[0] for c in ('X0', 'X1', 'X2', 'X3', 'X4', 'X5')]
)]
ax = sns.boxplot(x='X6', y="y", data=subset)

In [14]:
# Is there such a value in the test set?
# Select the test rows that match the reference row `row` on X0..X5.
test.loc[np.logical_and.reduce(
    [test[c] == row[c].values[0] for c in ('X0', 'X1', 'X2', 'X3', 'X4', 'X5')]
)]

In [15]:
# Training rows with a fairly large y: above 156 but not above 200
train[(train['y'] > 156) & (train['y'] <= 200)]

In [16]:
# Same histogram as before, restricted to y below 200 so the outlier is left out
sns.distplot(train.y[train.y < 200], kde=False, rug=True, bins=20)

In [17]:
# Zoom in on the range 85 < y < 125
sns.distplot(train.y[(train.y > 85) & (train.y < 125)], kde=False, rug=True, bins=20)

In [18]:
# Assign every training row to one of five equal-frequency bins of y
# (integer bin numbers instead of interval labels); used for stratification
quantiles = pd.qcut(train.y, q=5, labels=False)

In [19]:
# Stratify on quantiles: hold out 15% of the rows as a validation set whose
# y-quantile mix follows that of the full training data
from sklearn.model_selection import StratifiedShuffleSplit
X = train.drop('y', axis=1).values
y = train['y'].values
sss = StratifiedShuffleSplit(n_splits=3, test_size=0.15, random_state=0)
# Three splits are generated, but only the last one is kept.
train_index, test_index = list(sss.split(train, quantiles))[-1]
X_train, X_val = X[train_index], X[test_index]
y_train, y_val = y[train_index], y[test_index]

In [20]:
# How does train and validation set compare
%matplotlib inline
fig, axs = plt.subplots(1,2)
s1 = sns.distplot(y_train, bins=10, kde=False, rug=True, ax=axs[0])
s1.set_xlim(70,160)
s2 = sns.distplot(y_val, bins=10, kde=False, rug=True, ax=axs[1])
s2.set_xlim(70,160)

In [21]:
# Mean and median of y over the whole training frame (constant-prediction baselines)
y_mean, y_median = np.mean(train.y), np.median(train.y)
print(y_mean)
print(y_median)

Mean and median are fairly similar -- the single outlier does not have a large effect.
Now plot the distribution of the test time by each categorical variable.

In [22]:
# Boxplot for X0: distribution of y for every X0 level
plt.figure(figsize=(20, 6))
ax = sns.boxplot(x="X0", y="y", data=train)
# Horizontal reference line at the overall mean of y, spanning the full width
# of the axes without changing the x-limits.
ax.axhline(y_mean, linewidth=2)

In [23]:
# Are train and test set similarly distributed?
# Dedicated names are used so the target array `y` is not overwritten here.
train_codes = train.X0.values
test_codes = test.X0.values
# One unit-wide bin per value (assumes X0 holds non-negative integer codes from
# the label-encoding step). Edges run up to max code + 1 because the last
# histogram bin is closed on the right and would otherwise pool the two
# highest codes into one bar.
bins = np.arange(max(train_codes.max(), test_codes.max()) + 2)
plt.hist(train_codes, bins, alpha=0.8, label='train', color='k')
plt.hist(test_codes, bins, alpha=0.3, label='test', color='r')
plt.legend(loc='upper left')

X0 seems like a valuable variable for prediction. The distributions of train and test are very similar. The similarity between train and test is quite good for all of the categorical variables, as we see below.

In [24]:
# Same for X1: distribution of y for every X1 level
plt.figure(figsize=(20, 6))
ax = sns.boxplot(x="X1", y="y", data=train)
# Horizontal reference line at the overall mean of y, spanning the full width
# of the axes without changing the x-limits.
ax.axhline(y_mean, linewidth=2)

In [25]:
# Are train and test set similarly distributed?
# Dedicated names are used so the target array `y` is not overwritten here.
train_codes = train.X1.values
test_codes = test.X1.values
# One unit-wide bin per value (assumes X1 holds non-negative integer codes from
# the label-encoding step). Edges run up to max code + 1 because the last
# histogram bin is closed on the right and would otherwise pool the two
# highest codes into one bar.
bins = np.arange(max(train_codes.max(), test_codes.max()) + 2)

plt.hist(train_codes, bins, alpha=0.8, label='train', color='k')
plt.hist(test_codes, bins, alpha=0.3, label='test', color='r')
plt.legend(loc='upper right')

In [26]:
# Same for X2: distribution of y for every X2 level
plt.figure(figsize=(20, 6))
ax = sns.boxplot(x="X2", y="y", data=train)
# Horizontal reference line at the overall mean of y, spanning the full width
# of the axes without changing the x-limits.
ax.axhline(y_mean, linewidth=2)

In [27]:
# Are train and test set similarly distributed?
# Dedicated names are used so the target array `y` is not overwritten here.
train_codes = train.X2.values
test_codes = test.X2.values
# One unit-wide bin per value (assumes X2 holds non-negative integer codes from
# the label-encoding step). Edges run up to max code + 1 because the last
# histogram bin is closed on the right and would otherwise pool the two
# highest codes into one bar.
bins = np.arange(max(train_codes.max(), test_codes.max()) + 2)

plt.hist(train_codes, bins, alpha=0.8, label='train', color='k')
plt.hist(test_codes, bins, alpha=0.3, label='test', color='r')
plt.legend(loc='upper right')
# The three most frequent X2 codes in train (value_counts sorts by count, descending)
train.X2.value_counts().head(3)

In [28]:
# Histogram of y restricted to the training rows with X2 code 19
train.y[train.X2 == 19].hist()

X2 also seems like interesting information. The most used value (19) has a wide distribution.

In [29]:
# For X3 use a violin plot to get more details (since there are fewer values)
plt.figure(figsize=(20, 6))
ax = sns.violinplot(x="X3", y="y", data=train)
# Horizontal reference line at the overall mean of y, spanning the full width
# of the axes without changing the x-limits.
ax.axhline(y_mean, linewidth=2)

In [30]:
# Are train and test set similarly distributed?
# Dedicated names are used so the target array `y` is not overwritten here.
train_codes = train.X3.values
test_codes = test.X3.values
# One unit-wide bin per value (assumes X3 holds non-negative integer codes from
# the label-encoding step). Edges run up to max code + 1 because the last
# histogram bin is closed on the right and would otherwise pool the two
# highest codes into one bar.
bins = np.arange(max(train_codes.max(), test_codes.max()) + 2)

plt.hist(train_codes, bins, alpha=0.8, label='train', color='k')
plt.hist(test_codes, bins, alpha=0.3, label='test', color='r')
plt.legend(loc='upper right')

In [31]:
# Same for X4: violin plot of y for every X4 level
plt.figure(figsize=(20, 6))
ax = sns.violinplot(x="X4", y="y", data=train)
# Horizontal reference line at the overall mean of y, spanning the full width
# of the axes without changing the x-limits.
ax.axhline(y_mean, linewidth=2)

In [32]:
# Are train and test set similarly distributed? Use log here
# Dedicated names are used so the target array `y` is not overwritten here.
train_codes = train.X4.values
test_codes = test.X4.values
# One unit-wide bin per value (assumes X4 holds non-negative integer codes from
# the label-encoding step). Edges run up to max code + 1 because the last
# histogram bin is closed on the right and would otherwise pool the two
# highest codes into one bar.
bins = np.arange(max(train_codes.max(), test_codes.max()) + 2)

# Logarithmic count axis so that rarely used codes remain visible
plt.hist(train_codes, bins, alpha=0.8, label='train', color='k', log=True)
plt.hist(test_codes, bins, alpha=0.3, label='test', color='r', log=True)
plt.legend(loc='upper right')

In [33]:
# Boxplot for X5: distribution of y for every X5 level
plt.figure(figsize=(20, 6))
ax = sns.boxplot(x="X5", y="y", data=train)
# Horizontal reference line at the overall mean of y, spanning the full width
# of the axes without changing the x-limits.
ax.axhline(y_mean, linewidth=2)

In [34]:
# Are train and test set similarly distributed? (linear count axis)
# Dedicated names are used so the target array `y` is not overwritten here.
train_codes = train.X5.values
test_codes = test.X5.values
# One unit-wide bin per value (assumes X5 holds non-negative integer codes from
# the label-encoding step). Edges run up to max code + 1 because the last
# histogram bin is closed on the right and would otherwise pool the two
# highest codes into one bar.
bins = np.arange(max(train_codes.max(), test_codes.max()) + 2)

plt.hist(train_codes, bins, alpha=0.8, label='train', color='k')
plt.hist(test_codes, bins, alpha=0.3, label='test', color='r')
plt.legend(loc='upper right')

Only for the values where there is very little data do we see a difference.

In [35]:
# Violin plot of y for every X6 level
plt.figure(figsize=(20, 6))
ax = sns.violinplot(x="X6", y="y", data=train)
# Horizontal reference line at the overall mean of y, spanning the full width
# of the axes without changing the x-limits.
ax.axhline(y_mean, linewidth=2)

In [36]:
# Are train and test set similarly distributed? Use log here
# Dedicated names are used so the target array `y` is not overwritten here.
train_codes = train.X6.values
test_codes = test.X6.values
# One unit-wide bin per value (assumes X6 holds non-negative integer codes from
# the label-encoding step). Edges run up to max code + 1 because the last
# histogram bin is closed on the right and would otherwise pool the two
# highest codes into one bar.
bins = np.arange(max(train_codes.max(), test_codes.max()) + 2)

# Logarithmic count axis so that rarely used codes remain visible
plt.hist(train_codes, bins, alpha=0.8, label='train', color='k', log=True)
plt.hist(test_codes, bins, alpha=0.3, label='test', color='r', log=True)
plt.legend(loc='upper right')