In [1]:
# CONSTANTS
TECH_FS_WORKDIR = '../'
DATA_PRIMARY = TECH_FS_WORKDIR + 'data/primary/'
DATA_PRIMARY_TRAIN = DATA_PRIMARY + 'train_data.csv'
DATA_PRIMARY_TEST = DATA_PRIMARY + 'test_data.csv'
DATA_PRIMARY_SUBMISSION = DATA_PRIMARY + 'sample_submission.csv'
DATA_PRIMARY_SUBMISSION_PATTERN = DATA_PRIMARY + 'daniels_submission_{dtime}.csv'

DF_TARGET = 'cancer_type'
DF_FEATURES = None

TECH_DATE_FORMAT_DATE_DATASET = '%Y-%m-%d'
TECH_DATE_FORMAT_DATE_STD = '%Y%m%d_%H%M%S'

DF_TRAIN_TEST_SPLIT = 0.3
TECH_RANDOM_SEED = 1337

In [2]:
# IMPORTS
# regular Python imports
import os
import time
from datetime import datetime
from calendar import timegm

import pandas as pd
import numpy as np
import statistics

import lightgbm as lgb
from lightgbm import LGBMRegressor

from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import chart_studio.plotly as py
import matplotlib.pyplot as plt
from matplotlib import pyplot
from plotly import graph_objs

# scikit-learn imports
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest, VarianceThreshold
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import PolynomialFeatures, StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

# TF-specific imports
import tensorflow as tf

from tensorflow.keras.layers import Dense, Flatten, Conv2D
from tensorflow.keras import Model

In [3]:
# STATIC METHODS
def get_timestamp_since_epoch(timestamp_date,
                              pattern=TECH_DATE_FORMAT_DATE_DATASET):
    if None == timestamp_date:
        return None
    
    return timegm(time.strptime(timestamp_date, pattern))

def get_current_time_in_std_format(dtime=None, dateformat=TECH_DATE_FORMAT_DATE_STD):
    d = datetime.utcnow()
    if None != dtime:
        d = dtime
    return d.strftime(dateformat)

In [4]:
pd.set_option('display.max_rows', 512)

init_notebook_mode(connected=True)

In [5]:
d_train = pd.read_csv(DATA_PRIMARY_TRAIN)
d_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 13 columns):
radius_0          398 non-null float64
texture_0         398 non-null float64
perimeter_0       398 non-null float64
radius_1          343 non-null float64
texture_1         398 non-null float64
perimeter_1       264 non-null float64
radius_2          398 non-null float64
texture_2         398 non-null object
perimeter_2       398 non-null float64
age               398 non-null int64
treatment_date    398 non-null object
diagnose_date     398 non-null object
cancer_type       398 non-null int64
dtypes: float64(8), int64(2), object(3)
memory usage: 40.5+ KB


In [6]:
d_train['treatment_date'] = d_train['treatment_date'].apply(lambda d: get_timestamp_since_epoch(d))
d_train['diagnose_date'] = d_train['diagnose_date'].apply(lambda d: get_timestamp_since_epoch(d))

In [7]:
d_train.head()

Unnamed: 0,radius_0,texture_0,perimeter_0,radius_1,texture_1,perimeter_1,radius_2,texture_2,perimeter_2,age,treatment_date,diagnose_date,cancer_type
0,19.858394,27.204437,136.324256,22.68329,32.802578,119.523841,21.477052,27.3070874472,82.366936,44,1149292800,1130025600,0
1,14.182069,15.75473,80.916983,14.043753,30.094704,94.911073,15.012329,17.8551305385,103.078286,59,1077408000,1187568000,1
2,25.380268,21.291553,152.281062,23.852166,46.237931,,28.563252,21.0971528265,143.367792,37,1136505600,1091836800,0
3,11.835961,17.820702,72.178523,11.260258,44.805167,,12.082749,16.4992370844,65.920413,51,1050278400,1118880000,1
4,14.8756,17.534187,98.54583,14.380683,26.190447,89.712492,12.930685,19.8566873539,108.380754,21,1087776000,1038355200,1


In [8]:
if None == DF_FEATURES:
    DF_FEATURES = list(d_train.columns)
    DF_FEATURES.remove(DF_TARGET)
DF_FEATURES

['radius_0',
 'texture_0',
 'perimeter_0',
 'radius_1',
 'texture_1',
 'perimeter_1',
 'radius_2',
 'texture_2',
 'perimeter_2',
 'age',
 'treatment_date',
 'diagnose_date']

In [9]:
d_train.isna().sum()
# almost identical to: d_train.isnull().sum()

radius_0            0
texture_0           0
perimeter_0         0
radius_1           55
texture_1           0
perimeter_1       134
radius_2            0
texture_2           0
perimeter_2         0
age                 0
treatment_date      0
diagnose_date       0
cancer_type         0
dtype: int64

In [10]:
f'Currently, there are {sum(d_train.isna().sum())} NA/missing values in the \"d_train\" DataFrame.'

'Currently, there are 189 NA/missing values in the "d_train" DataFrame.'

In [11]:
d_train['texture_2'] = pd.to_numeric(d_train['texture_2'], errors='coerce')

In [12]:
d_train

Unnamed: 0,radius_0,texture_0,perimeter_0,radius_1,texture_1,perimeter_1,radius_2,texture_2,perimeter_2,age,treatment_date,diagnose_date,cancer_type
0,19.858394,27.204437,136.324256,22.68329,32.802578,119.523841,21.477052,27.307087,82.366936,44,1149292800,1130025600,0
1,14.182069,15.75473,80.916983,14.043753,30.094704,94.911073,15.012329,17.855131,103.078286,59,1077408000,1187568000,1
2,25.380268,21.291553,152.281062,23.852166,46.237931,,28.563252,21.097153,143.367792,37,1136505600,1091836800,0
3,11.835961,17.820702,72.178523,11.260258,44.805167,,12.082749,16.499237,65.920413,51,1050278400,1118880000,1
4,14.8756,17.534187,98.54583,14.380683,26.190447,89.712492,12.930685,19.856687,108.380754,21,1087776000,1038355200,1
5,11.016351,24.013399,72.37356,12.074242,41.714316,71.440328,11.308987,,73.637586,27,1215993600,1051315200,1
6,19.379444,21.850345,107.734027,16.748725,22.265567,,18.089348,20.962623,184.390751,51,1193097600,1127260800,0
7,14.292161,28.430808,81.293588,15.042501,36.480522,78.668608,14.584187,24.05564,126.014415,40,1051401600,1113868800,1
8,13.119916,14.619103,88.293516,13.093215,12.299673,81.907981,13.963749,16.908342,56.235029,43,1110240000,1131321600,1
9,15.100628,7.440004,90.024419,14.645858,11.425323,95.242578,13.57079,11.934938,139.979154,50,956448000,1144368000,1


In [13]:
d_train.isna().sum()
# almost identical to: d_train.isnull().sum()

radius_0            0
texture_0           0
perimeter_0         0
radius_1           55
texture_1           0
perimeter_1       134
radius_2            0
texture_2          16
perimeter_2         0
age                 0
treatment_date      0
diagnose_date       0
cancer_type         0
dtype: int64

In [14]:
f'Currently, there are {sum(d_train.isna().sum())} NA/missing values in the \"d_train\" DataFrame.'

'Currently, there are 205 NA/missing values in the "d_train" DataFrame.'

In [49]:
imputer_simple = SimpleImputer(missing_values=np.nan)
imputer_simple.fit(d_train[DF_FEATURES])
print(imputer_simple)

SimpleImputer(add_indicator=False, copy=True, fill_value=None,
              missing_values=nan, strategy='mean', verbose=0)


In [48]:
dimred_pca = PCA()
dimred_pca.fit(d_train[DF_FEATURES])

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [34]:
X_train, X_val, Y_train, Y_val = train_test_split(d_train[DF_FEATURES],
                                                  d_train[DF_TARGET],
                                                  test_size=DF_TRAIN_TEST_SPLIT,
                                                  random_state=TECH_RANDOM_SEED)

In [36]:
pipeline = Pipeline([('imp', imputer_simple),
                     ('pca', dimred_pca,
                     ('feat_select', SelectKBest()),
                     ('classify', LGBMRegressor())])

parameters = dict(
    imp__strategy = ['mean', 'median', 'most_frequent'],
    pca__n_components = [2, 4, 6, 8, 10],
    feat_select__k = [2]
)   

CV = GridSearchCV(pipeline, parameters, cv = 3, scoring = 'neg_mean_absolute_error', n_jobs = 1)
CV.fit(X_train, Y_train)

Y_forec = CV.predict(X_val)
Y_forec = np.rint(Y_forec)
len(Y_forec)


The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.



120

In [37]:
mae = mean_absolute_error(Y_val, Y_forec)

In [38]:
f'The current MAE is {mae:.4f}.'

'The current MAE is 0.1667.'

In [39]:
d_test = pd.read_csv(DATA_PRIMARY_TEST)
d_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 171 entries, 0 to 170
Data columns (total 12 columns):
radius_0          171 non-null float64
texture_0         171 non-null float64
perimeter_0       171 non-null float64
radius_1          146 non-null float64
texture_1         171 non-null float64
perimeter_1       105 non-null float64
radius_2          171 non-null float64
texture_2         171 non-null object
perimeter_2       171 non-null float64
age               171 non-null int64
treatment_date    171 non-null object
diagnose_date     171 non-null object
dtypes: float64(8), int64(1), object(3)
memory usage: 16.2+ KB


In [40]:
d_test.head()

Unnamed: 0,radius_0,texture_0,perimeter_0,radius_1,texture_1,perimeter_1,radius_2,texture_2,perimeter_2,age,treatment_date,diagnose_date
0,12.567724,13.561447,77.106898,10.773643,45.494416,,12.526989,15.7063580493,123.583682,31,2008-11-19,2003-04-22
1,11.195949,19.693575,81.244301,15.058411,7.909249,86.766622,13.72896,21.485344712,154.164201,18,2001-08-18,2003-07-07
2,15.71272,26.114134,90.977022,13.832857,18.086143,,14.758324,27.0205254475,114.023403,43,2006-11-17,2004-03-06
3,13.428698,26.649458,76.456016,14.837875,6.12295,89.609565,16.279206,29.1837924649,199.756098,57,2001-01-10,2006-02-24
4,179.763472,14.175435,51.125047,,21.116416,52.041704,9.191477,13.5857306814,74.879232,26,2008-07-12,2004-06-21


In [41]:
d_test['texture_2'] = pd.to_numeric(d_test['texture_2'], errors='coerce')

In [42]:
d_test['treatment_date'] = d_test['treatment_date'].apply(lambda d: get_timestamp_since_epoch(d))
d_test['diagnose_date'] = d_test['diagnose_date'].apply(lambda d: get_timestamp_since_epoch(d))

In [43]:
d_test

Unnamed: 0,radius_0,texture_0,perimeter_0,radius_1,texture_1,perimeter_1,radius_2,texture_2,perimeter_2,age,treatment_date,diagnose_date
0,12.567724,13.561447,77.106898,10.773643,45.494416,,12.526989,15.706358,123.583682,31,1227052800,1050969600
1,11.195949,19.693575,81.244301,15.058411,7.909249,86.766622,13.72896,21.485345,154.164201,18,998092800,1057536000
2,15.71272,26.114134,90.977022,13.832857,18.086143,,14.758324,27.020525,114.023403,43,1163721600,1078531200
3,13.428698,26.649458,76.456016,14.837875,6.12295,89.609565,16.279206,29.183792,199.756098,57,979084800,1140739200
4,179.763472,14.175435,51.125047,,21.116416,52.041704,9.191477,13.585731,74.879232,26,1215820800,1087776000
5,19.059032,13.582267,143.925475,19.728102,18.516768,107.619881,20.918542,34.825627,147.172568,47,987724800,1162684800
6,12.999343,18.19063,57.052625,8.370817,7.745115,,11.410154,16.758077,158.297407,58,1186012800,1180656000
7,9.233075,14.743812,57.914909,,4.572889,71.583291,9.386501,18.992244,101.078559,18,1195084800,1035504000
8,5.778161,16.680313,41.982155,8.221302,17.267871,,5.429977,25.91392,150.96756,53,1020384000,1131235200
9,118.24659,16.8934,51.990658,7.922835,24.147166,47.589153,7.227484,6.052493,134.863373,56,1060560000,1158019200


In [44]:
d_test.isna().sum()
# almost identical to: d_test.isnull().sum()

radius_0           0
texture_0          0
perimeter_0        0
radius_1          25
texture_1          0
perimeter_1       66
radius_2           0
texture_2         12
perimeter_2        0
age                0
treatment_date     0
diagnose_date      0
dtype: int64

In [45]:
f'Currently, there are {sum(d_test.isna().sum())} NA/missing values in the \"d_test\" DataFrame.'

'Currently, there are 103 NA/missing values in the "d_test" DataFrame.'

In [46]:
X_test = d_test[DF_FEATURES]

In [47]:
Y_forec = pipeline.predict(X_test)
Y_forec = np.rint(Y_forec)
len(Y_forec)

NotFittedError: This PCA instance is not fitted yet. Call 'fit' with appropriate arguments before using this method.

In [None]:
d_submission = pd.read_csv(DATA_PRIMARY_SUBMISSION)
d_submission.info()

In [None]:
d_submission.head()

In [None]:
f'The average value from the original submission file, before forecasting is {statistics.mean(d_submission.cancer_type):.4f}.'

In [None]:
d_submission[DF_TARGET] = Y_forec
d_submission[DF_TARGET] = np.rint(d_submission[DF_TARGET])
d_submission[DF_TARGET] = d_submission[DF_TARGET].astype(int)

In [None]:
d_submission.head()

In [None]:
f'The average value after forecasting is {statistics.mean(d_submission.cancer_type):.4f}.'

In [None]:
len(d_submission[DF_TARGET])

In [None]:
len(Y_forec)

In [None]:
target_filename = DATA_PRIMARY_SUBMISSION_PATTERN.format(dtime=get_current_time_in_std_format())
f'The resulting file will be saved to \"{target_filename}\".'

In [None]:
d_submission.to_csv(path_or_buf=target_filename, index=False)

In [None]:
d_submission.info()

In [None]:
d_submission.describe()