In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge

In [None]:
%matplotlib inline

# Display all rows and cols
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [None]:
# Read the train and test CSVs (paths are relative to the working directory)
train_dataset, test_dataset = map(pd.read_csv, ('data/train.csv', 'data/test.csv'))

In [None]:
# Pull the test ids out into their own Series; pop() also removes the
# column from test_dataset in place
ID_TEST = test_dataset.pop('id')

# The train ids are simply discarded
del train_dataset['id']

In [None]:
# Number of train rows; used later to split the combined frame back apart
rows_train = len(train_dataset)

In [None]:
# Preview the first five rows (head() defaults to n=5)
train_dataset.head()

In [None]:
# Print the (rows, columns) shape, then display summary statistics.
# print(...) with a single argument produces the same output on Python 2
# and Python 3, unlike the Python-2-only print statement.
print(train_dataset.shape)
train_dataset.describe()

In [None]:
# Skewness of dataset
# Should be close to 0
# numeric_only=True restricts the computation to numeric columns explicitly;
# without it, recent pandas versions raise on the categorical (object)
# columns instead of silently skipping them.
train_dataset.skew(numeric_only=True)

# loss is most skewed

In [None]:
# Keep only the float64 columns (this still includes the 'loss' target)
numeric_data_train = train_dataset.select_dtypes(include=[np.float64])
numeric_cols = numeric_data_train.columns

In [None]:
# Violin plots for numeric attributes, n_cols plots per figure
n_cols = 3
# Ceiling division: add one extra row only when the last row is partially
# filled. `//` keeps the result an int on both Python 2 and Python 3.
n_rows = (len(numeric_cols) + n_cols - 1) // n_cols

for i in range(n_rows):
    fig, ax = plt.subplots(nrows=1, ncols=n_cols, figsize=(12, 6))
    for j in range(n_cols):
        col_idx = i * n_cols + j
        if col_idx >= len(numeric_cols):
            # Fewer columns than slots in the last row: hide the unused axes
            # instead of indexing past the end of numeric_cols
            ax[j].set_visible(False)
            continue
        sns.violinplot(y=numeric_cols[col_idx], data=numeric_data_train, ax=ax[j])

# loss is heavily skewed

In [None]:
# Skew correction: overwrite the target with log(1 + loss).
# NOTE: the column is replaced in place, so running this cell a second time
# applies the transform twice -- reload the data before re-running.
log_loss = np.log1p(train_dataset['loss'])
train_dataset['loss'] = log_loss

# Distribution of the transformed target
sns.violinplot(data=train_dataset, y='loss')
plt.show()

In [None]:
# Drop 'loss' from continuous data.
# Rebinding (instead of inplace=True) together with errors='ignore' makes
# the cell safe to re-run: a second run no longer raises KeyError because
# the column is already gone.
numeric_data_train = numeric_data_train.drop('loss', axis=1, errors='ignore')

In [None]:
# Pairwise Pearson correlation matrix of the continuous features
corr_train_data = numeric_data_train.corr(method='pearson')
corr_train_data

In [None]:
# Correlation threshold: pairs with |r| at or above this value are reported
corr_threshold = 0.5

n_cont_features = len(corr_train_data.columns)

high_corrs = []

# Find features with correlation more than or equal to the threshold.
# Only the upper triangle (j > i) is scanned, so each pair appears once and
# the diagonal is skipped. range() is used instead of the Python-2-only
# xrange(); the matrix is small enough that the difference is irrelevant.
for i in range(n_cont_features):
    for j in range(i + 1, n_cont_features):
        corr = corr_train_data.iloc[i, j]
        if abs(corr) >= corr_threshold:
            high_corrs.append(((corr_train_data.columns[i], corr_train_data.columns[j]), corr))

# Each entry is ((feature_a, feature_b), r); strongest |r| first
sorted_corrs = sorted(high_corrs, key=lambda x: -abs(x[1]))  # -abs for descending order

# correlations in descending order
# print(...) with one argument behaves identically on Python 2 and 3
for (feature_a, feature_b), corr in sorted_corrs:
    print("{} and {} -------> {:.3f}".format(feature_a, feature_b, corr))

In [None]:
# One scatter plot per highly correlated feature pair, strongest first
for pair_entry in sorted_corrs:
    # Each entry is ((x_feature, y_feature), correlation); only the names are needed
    x_feature, y_feature = pair_entry[0]
    sns.pairplot(data=train_dataset, x_vars=x_feature, y_vars=y_feature, size=6)
plt.show()

In [None]:
# Data preparation

# Categorical variables are the object-dtype columns of the train set
cat_cols = train_dataset.select_dtypes(include=['object']).columns

# For every categorical column, collect the distinct values seen in either
# the train or the test set, so that no category is missed
labels = {
    col: list(set(train_dataset[col].unique()) | set(test_dataset[col].unique()))
    for col in cat_cols
}


In [None]:
# Data preparation

# Stack train on top of test so both receive identical dummy columns.
# pd.concat replaces DataFrame.append, which was deprecated and later
# removed from pandas; concat works on old and new versions alike.
all_dataset = pd.concat([train_dataset, test_dataset])
# The target only exists for the train rows; keep it out of the features
all_dataset = all_dataset.drop('loss', axis=1)

# One-hot encode the categorical (object) columns
all_dataset = pd.get_dummies(all_dataset)

In [None]:
# Split into train and test set

# all_dataset was built with the train rows first, so the first rows_train
# rows (by position) are train and the remainder are test
train = all_dataset.iloc[:rows_train]
test = all_dataset.iloc[rows_train:]

# Split train dataset into train and cross-validation set

# Name the seed before its first use instead of hard-coding the literal in
# the call, so the split and any later model share one value
random_seed = 7

# 80/20 split; the target is the 'loss' column of train_dataset
x_train, x_val, y_train, y_val = train_test_split(
    train, train_dataset['loss'], test_size=0.2, random_state=random_seed
)

In [None]:
# Seed passed as random_state to the Ridge model so its fit is reproducible
random_seed = 7

In [None]:
# Fit an L2-regularised linear model on the training split
# (fit() returns the estimator itself, so the call can be chained)
ridge_model = Ridge(alpha=1.0, random_state=random_seed).fit(x_train, y_train)

# Predictions for the held-out validation rows, on the same scale as y_train
cv_result = ridge_model.predict(x_val)