## 0. Import Libraries

In [13]:
# Lib & Dependencies
import gc
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import scipy
import seaborn as sns
import xgboost as xgb

import crunchdao

## 1. Download data

In [14]:
# Read the API key from the environment so it is never committed with the notebook.
# Falls back to an empty key (the previous hardcoded value) when the variable is unset.
client = crunchdao.Client(apikey=os.environ.get("CRUNCHDAO_API_KEY", ""))
# Choose a file format between parquet and csv
file_format = 'parquet'

# Download the current dataset into the working directory
client.download_data(directory=".", file_format=file_format)

['./X_train.parquet',
 './y_train.parquet',
 './X_test.parquet',
 './example_submission.parquet']

In [15]:
# Pick the pandas reader matching the chosen format and fail loudly on anything
# else, instead of silently leaving the data frames undefined.
readers = {'parquet': pd.read_parquet, 'csv': pd.read_csv}
if file_format not in readers:
    raise ValueError(
        f"Unsupported file_format {file_format!r}; expected 'parquet' or 'csv'"
    )
read_data = readers[file_format]

# Data for training
train_features = read_data(f'./X_train.{file_format}')
# Data for which you will submit your predictions
test_data = read_data(f'./X_test.{file_format}')
# Targets used for your supervised training
train_targets = read_data(f'./y_train.{file_format}')
# Example of an expected submission
example_submission = read_data(f'./example_submission.{file_format}')

In [16]:
# Merge train_features and train_targets for ease of use
# Join each feature row with its targets on the (id, Moons) key
train_data = train_features.merge(train_targets, how='inner', on=['id', 'Moons'])

# The separate frames are no longer needed; drop them and reclaim memory
del train_features
del train_targets
gc.collect()

0

In [17]:
# Get the features columns name and the targets columns name
# Column names containing 'Feature' are model inputs; those containing 'target' are labels
features = list(filter(lambda name: 'Feature' in name, train_data.columns))
targets = list(filter(lambda name: 'target' in name, train_data.columns))

In [18]:
# Preview the first rows of the training set, then of the test set
display(train_data.head(), test_data.head())

Unnamed: 0,id,Moons,wrythm_Feature_1,wrythm_Feature_2,wrythm_Feature_3,wrythm_Feature_4,wrythm_Feature_5,wrythm_Feature_6,wrythm_Feature_7,wrythm_Feature_8,...,3b1-signal_Feature_46,3b1-signal_Feature_47,3b1-signal_Feature_48,3b1-signal_Feature_49,3b1-signal_Feature_50,3b1-signal_Feature_51,target_w,target_r,target_g,target_b
0,58950,0,0.83,0.33,0.33,0.67,0.33,0.0,0.33,0.33,...,0.33,0.67,0.33,0.5,0.83,0.17,0.5,0.33,0.5,0.33
1,72918,0,0.17,0.5,0.5,0.17,0.67,0.5,0.5,0.5,...,0.5,0.33,0.0,0.0,0.67,0.67,0.67,0.33,0.5,0.67
2,99888,0,0.0,0.67,0.33,0.83,0.67,0.0,1.0,0.33,...,0.0,0.5,0.33,0.5,0.67,0.67,0.67,0.67,0.67,0.83
3,205230,0,0.0,0.67,0.5,0.67,0.67,0.5,0.5,0.67,...,0.5,0.5,0.67,0.83,0.17,0.83,0.33,0.5,0.67,0.5
4,121182,0,0.17,0.33,0.17,0.5,1.0,0.33,0.67,0.5,...,0.17,0.67,0.67,0.17,0.33,0.67,0.33,0.17,0.33,0.33


Unnamed: 0,id,Moons,wrythm_Feature_1,wrythm_Feature_2,wrythm_Feature_3,wrythm_Feature_4,wrythm_Feature_5,wrythm_Feature_6,wrythm_Feature_7,wrythm_Feature_8,...,3b1-signal_Feature_42,3b1-signal_Feature_43,3b1-signal_Feature_44,3b1-signal_Feature_45,3b1-signal_Feature_46,3b1-signal_Feature_47,3b1-signal_Feature_48,3b1-signal_Feature_49,3b1-signal_Feature_50,3b1-signal_Feature_51
0,244649,362,0.67,0.33,1.0,0.67,0.67,0.83,0.33,0.5,...,0.17,0.5,0.33,0.67,0.33,1.0,0.33,0.33,1.0,0.33
1,126506,362,0.67,0.33,1.0,0.83,0.33,0.33,0.17,0.67,...,0.0,0.67,0.83,0.67,0.5,0.83,0.33,0.5,0.83,0.33
2,87368,362,0.67,0.67,0.67,0.83,0.83,0.67,0.83,0.83,...,0.5,0.33,0.17,0.5,0.33,0.33,0.67,0.17,0.33,0.5
3,185658,362,0.5,0.17,0.67,0.5,0.5,0.17,0.83,0.5,...,0.5,0.17,0.33,0.33,0.67,0.5,0.83,0.17,0.33,0.33
4,215543,362,0.17,1.0,0.17,0.0,0.5,0.17,1.0,0.33,...,0.5,0.5,0.5,0.33,0.5,0.5,1.0,0.83,0.5,0.83


# Set features

In [None]:
# Feature columns
# Feature columns
# The column names shown in the preview carry a dataset prefix (e.g. 'wrythm_Feature_1'),
# so match 'Feature' anywhere in the name rather than only at the start.
features = train_data.columns[train_data.columns.str.contains('Feature', regex=False)]
features

# Targets

In [None]:
# Targets columns
# Target columns: every column of the merged training frame whose name starts with 'target'
targets = train_data.columns[train_data.columns.str.startswith('target')]
targets

# Observations per Moon

In [None]:
# The 'seaborn' style sheet was renamed 'seaborn-v0_8' in newer matplotlib releases;
# use whichever name this installation provides.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(seaborn_style)

# Number of training rows per moon, sorted by moon number so the line plot
# follows the moon axis in order.
moonCount = train_data['Moons'].value_counts().sort_index().to_frame('Observations')
ax = moonCount.plot(legend=False)
ax.set(xlabel='Moon', ylabel='Observations', title='Observations by Moon')

# Summary statistics of the per-moon row counts
moonCount.describe()

In [None]:
# Summary statistics for every numeric column of the merged training frame
train_data.describe()

# Targets distribution

In [None]:
# Target columns that are supposed to represent different time horizons
train_data[targets].describe()

In [None]:
# Number of observations per target value (one column per target, one row per distinct value)
train_data[targets].apply(lambda column: column.value_counts())

In [None]:
# Identical distribution of target values
# sns.distplot is deprecated; histplot with stat='density' and kde=True is its
# documented replacement. Each curve is labelled so the overlay can be read.
# NOTE(review): only target_r/g/b are plotted, as in the original cell; target_w is left out.
fig, ax = plt.subplots()
for target in ['target_r', 'target_g', 'target_b']:
    sns.histplot(train_data[target], bins=15, kde=True, stat='density', label=target, ax=ax)
ax.set(xlabel='Target Value', ylabel='Density', title='Distribution of Targets')
ax.legend()
plt.show()

# Train / Test objects info

In [None]:
# Print pandas' structural summary of the merged training frame
train_data.info()

In [None]:
# Print pandas' structural summary of the test frame
test_data.info()

# Spearman Correlation Matrix

In [None]:
# Spearman (rank) correlation between every pair of columns in the training frame
fig, ax = plt.subplots(figsize=(12, 10))
cor = train_data.corr(method='spearman')
sns.heatmap(cor, annot=False, cmap=plt.cm.Reds, ax=ax)
plt.show()

In [None]:
# Spearman (rank) correlation between every pair of columns in the test frame
fig, ax = plt.subplots(figsize=(12, 10))
cor = test_data.corr(method='spearman')
sns.heatmap(cor, cmap=plt.cm.Reds, annot=False, ax=ax)
plt.show()

# Feature Correlations

In [None]:
# Pairwise correlation matrix of the feature columns (pandas' default method)
feature_corrs = train_data[features].corr()

In [None]:
# Flatten the matrix into a (row feature, column feature) -> correlation series and peek at it
feature_corrs.stack().head()

In [None]:
# Keep each unordered feature pair once: only entries whose first label sorts
# before the second survive, which also drops the self-correlations.
tdf = feature_corrs.stack().loc[
    lambda pairs: pairs.index.get_level_values(0) < pairs.index.get_level_values(1)
]
# Show the pairs from most negative to most positive correlation
tdf.sort_values()

In [None]:
# Moon identifier of every training row
moons = train_data['Moons']

In [None]:
# Split the training rows into two halves around the median moon number
# (the median is computed once and reused for both masks).
median_moon = train_data['Moons'].median()
df1 = train_data[train_data['Moons'] <= median_moon]
df2 = train_data[train_data['Moons'] > median_moon]

In [None]:
# Feature-pair correlations within the first half, keeping each unordered pair once
corr1 = df1[features].corr().unstack().loc[
    lambda pairs: pairs.index.get_level_values(0) < pairs.index.get_level_values(1)
]

In [None]:
# Feature-pair correlations within the second half, keeping each unordered pair once
corr2 = df2[features].corr().unstack().loc[
    lambda pairs: pairs.index.get_level_values(0) < pairs.index.get_level_values(1)
]

In [None]:
# Put both halves' pair correlations side by side and add how much each pair
# changed from the first half to the second.
tdf = pd.DataFrame({"corr1": corr1, "corr2": corr2}).assign(
    corr_diff=lambda both: both.corr2 - both.corr1
)
# Pairs ordered from largest drop to largest rise in correlation
tdf.sort_values(by="corr_diff")

# Feature Correlation over time (Moons)

In [None]:
# Single features do not work consistently though
# The columns shown in the data preview carry a dataset prefix (e.g. 'wrythm_Feature_2');
# no bare 'Feature_2' column appears there, so use the prefixed name.
feature = 'wrythm_Feature_2'

# Correlation between target_r and the chosen feature, computed separately for each moon
by_moon_correlation = pd.Series({
    moon: np.corrcoef(moon_df['target_r'], moon_df[feature])[0, 1]
    for moon, moon_df in train_data.groupby('Moons')
})
by_moon_correlation.plot()

In [None]:
# With a rolling 2-moon average you can see some trends
# (the first value is NaN because a window of 2 needs two observations)
by_moon_correlation.rolling(2).mean().plot()

# About

Last updated: 2021-02-23

Created by: [Jeremy Berros](https://github.com/jberros)

Greatly inspired by the work from: [Jon Taylor](https://github.com/jonrtaylor)