**Test notebook.**

In [None]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

In [None]:
# Load the training and test sets from CSV files in the relative ../input directory.
train_data = pd.read_csv("../input/train.csv")
test_data = pd.read_csv("../input/test.csv")

In [None]:
# Report the (rows, columns) shape of each frame.
print("Train data dimensions: ", train_data.shape)
print("Test data dimensions: ", test_data.shape)

In [None]:
# Preview the first rows of the training frame.
train_data.head()

In [None]:
# Total number of null cells across the whole training frame
# (counted on the underlying boolean array in one pass).
print("Number of missing values", train_data.isnull().values.sum())

**Let's analyze the distribution of the continuous features:**

In [None]:
# Summary statistics for the training frame's columns.
train_data.describe()

All continuous variables are in the range 0-1.

In [None]:
# Build the list of continuous (numeric) column names.
#
# Columns are classified by dtype instead of by inspecting the values of one
# sample row: `Series.iteritems()` no longer exists in pandas >= 2.0, and a
# single-row `str(x).isalpha()` check misclassifies a numeric column whenever
# that row happens to hold NaN ("nan" is alphabetic).
#
# The list contains every numeric column of the frame; the "id" and "loss"
# columns are removed from it in a later cell.
contFeatureslist = train_data.select_dtypes(include=[np.number]).columns.tolist()

In [None]:
# Show the column names collected as continuous features.
print(contFeatureslist)

In [None]:
# Exclude the "id" and "loss" columns from the continuous feature list.
# The list is rebuilt rather than edited with list.remove(), which raises
# ValueError when this cell is executed a second time.
contFeatureslist = [col for col in contFeatureslist if col not in ("id", "loss")]

### Box plots for continuous features

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# One box per continuous feature (wide-form input: each column is a variable).
plt.figure(figsize=(13, 9))
# Pass the frame by keyword: the meaning of the first positional argument of
# sns.boxplot differs between seaborn releases (`x` vs `data`), whereas
# `data=` selects wide-form plotting in all of them.
sns.boxplot(data=train_data[contFeatureslist])

As we can see, some of the features, such as cont13 and cont14, are highly skewed. We might need to normalize these features before running any algorithms.

### Correlation between continuous features

In [None]:
# Include the target variable as well, so the correlation matrix covers
# feature-to-target correlations in addition to feature-to-feature ones.
# The membership check keeps the cell re-runnable: an unconditional append
# would add a duplicate "loss" entry on every execution.
if "loss" not in contFeatureslist:
    contFeatureslist.append("loss")

In [None]:
# Absolute pairwise correlations between the selected columns.
correlationMatrix = train_data[contFeatureslist].corr().abs()

# Draw both heatmap layers on one explicitly created axes.
fig, ax = plt.subplots(figsize=(13, 9))
sns.heatmap(correlationMatrix, annot=True, ax=ax)

# Second layer: cells where the mask is True are not drawn, so only cells
# whose value equals 1 are repainted on top of the annotated heatmap.
# NOTE(review): a threshold of 1 hides every other cell; if the intent was to
# highlight strongly correlated pairs, a lower cut-off is needed - confirm.
sns.heatmap(correlationMatrix, mask=correlationMatrix < 1, cbar=False, ax=ax)
plt.show()

### Analysis of loss feature

In [None]:
# Distribution and box plot of the target, stacked on a shared x-axis.
# Drawing both on the same axes (as before) superimposes a density curve and
# a box that use unrelated y-scales, which makes neither readable.
fig, (axDist, axBox) = plt.subplots(nrows=2, sharex=True, figsize=(13, 9))
# NOTE(review): distplot is deprecated in recent seaborn releases; it is kept
# here so the cell still runs on older installations.
sns.distplot(train_data["loss"], ax=axDist)
# `x=` pins a horizontal box; a positional Series is interpreted differently
# depending on the seaborn version.
sns.boxplot(x=train_data["loss"], ax=axBox)

Let's check out the histogram of the loss feature.

In [None]:
# Histogram with density estimate of the raw target values.
fig, ax = plt.subplots(figsize=(13, 9))
sns.distplot(train_data["loss"], ax=ax)

In [None]:
# Same plot after a log(1 + x) transform of the target.
fig, ax = plt.subplots(figsize=(13, 9))
sns.distplot(np.log1p(train_data["loss"]), ax=ax)

So we get an approximately normal distribution by applying a log transform (`log1p`) to the loss feature.

### Analyzing categorical variables

In [None]:

# Count the columns whose value in the second row (position 1) is purely
# alphabetic when rendered as a string; those are treated as categorical.
secondRow = train_data.iloc[1, :]
catCount = sum(1 for value in secondRow if str(value).isalpha())
print("Number of categories: ",catCount)

Collect the names of the categorical columns so that their values can be converted to numeric values.

In [None]:
# Build the list of categorical (non-numeric) column names.
#
# Columns are selected by dtype instead of by looping over one sample row:
# `Series.iteritems()` no longer exists in pandas >= 2.0, and classifying a
# column from `str(x).isalpha()` on a single row depends on what that one row
# happens to contain.
#
# Must run before the label-encoding step: once the categorical columns have
# been replaced by integer codes they are numeric and would not be selected.
catFeatureslist = train_data.select_dtypes(exclude=[np.number]).columns.tolist()

**Number of unique values for each categorical feature**

In [None]:
# Distinct-value count for every categorical column.
print(train_data[catFeatureslist].apply(lambda column: column.nunique()))

### Convert categorical string values to numeric values

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Replace each categorical column of train_data with integer codes, in place.
# A fresh encoder is used per column; fit_transform learns the column's
# distinct labels and returns the encoded values in a single call.
# NOTE(review): `le` is overwritten on every iteration and only train_data is
# transformed here, so test_data is not encoded by this cell.
for cf1 in catFeatureslist:
    le = LabelEncoder()
    train_data[cf1] = le.fit_transform(train_data[cf1])

In [None]:
# Preview the first five rows after the categorical columns were label-encoded.
train_data.head(5)

In [None]:
# Number of categorical features that have more than two distinct values.
(train_data[catFeatureslist].apply(pd.Series.nunique) > 2).sum()

### Analysis of categorical features with more than 5 levels

In [None]:
# Per-feature flags, in catFeatureslist order: True when the feature has more
# than 5 distinct values.
filterG5 = list(train_data[catFeatureslist].apply(pd.Series.nunique).gt(5))

In [None]:
# Keep the feature names whose flag in filterG5 is truthy.
catFeaturesG5List = [
    featureName
    for featureName, hasManyLevels in zip(catFeatureslist, filterG5)
    if hasManyLevels
]

In [None]:
# How many categorical features have more than 5 distinct values.
len(catFeaturesG5List)

In [None]:
# Names of the categorical features with more than 5 distinct values.
catFeaturesG5List

In [None]:
# One distribution plot per categorical feature with more than 5 levels,
# each on its own 8x4 figure.
# NOTE(review): these columns hold label-encoded integer codes at this point,
# so the x-axis order reflects the encoding, not a natural ordering - confirm
# that a distribution plot (rather than a count plot) is what is wanted.
for x in catFeaturesG5List:
    fig, ax = plt.subplots(figsize=(8, 4))
    sns.distplot(train_data[x], ax=ax)

### Correlation between binary categorical variables and loss

In [None]:
# Flags, in catFeatureslist order: True when the feature has exactly two
# distinct values.
filterG2 = list(train_data[catFeatureslist].apply(pd.Series.nunique).eq(2))
# Names of those two-level features, followed by the "loss" column so it is
# part of the correlation matrix computed from this list.
catFeaturesG2List = [
    featureName
    for featureName, isBinary in zip(catFeatureslist, filterG2)
    if isBinary
] + ["loss"]

In [None]:
# Absolute pairwise correlations between the two-level features and "loss".
corrCatMatrix = train_data[catFeaturesG2List].corr().abs()
# Flatten to a Series indexed by (column, column) pairs and rank it.
s = corrCatMatrix.unstack()
sortedSeries = s.sort_values(ascending=False)

# The flattened matrix lists every pair twice, (a, b) and (b, a), plus each
# column with itself. Keeping only index pairs with first < second retains
# exactly one entry per distinct pair and drops the self-pairs by name.
# The previous `!= 1.0` value filter left the mirrored duplicates in (so the
# top 10 held only 5 distinct pairs) and would also have discarded any pair
# of different columns that is perfectly correlated.
isDistinctPair = np.array([first < second for first, second in sortedSeries.index], dtype=bool)
print(sortedSeries[isDistinctPair].head(10))