In [26]:
#scikit learn ensemble workflow for binary probability
import time; start_time = time.time()
import numpy as np
import pandas as pd
from sklearn import ensemble
import xgboost as xgb
from sklearn.metrics import log_loss, make_scorer
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import train_test_split
import random; random.seed(2016)
from math import*
from decimal import Decimal
import os
from scipy import fftpack
import csv

In [24]:
def histogram_intersection(h1, h2, bins):
    """Return the overlapping area of two histograms.

    h1, h2 -- per-bin heights, indexed in the same order as the bins.
    bins   -- bin edges; consecutive differences give each bin's width.

    Each bin contributes the smaller of the two bar areas (width * height),
    and the contributions are added up in bin order.
    """
    widths = np.diff(bins)
    return sum(min(w * h1[k], w * h2[k]) for k, w in enumerate(widths))

def manhattan_distance(x,y):
    """Return the L1 distance: the sum of |a - b| over paired elements.

    Elements are paired with ``zip``, so anything beyond the length of the
    shorter input is ignored.
    """
    total = 0
    for a, b in zip(x, y):
        total += abs(a - b)
    return total
 
def nth_root(value, n_root):
    """Return the ``n_root``-th root of ``value`` rounded to 3 decimal places.

    The exponent 1/n_root is computed as a binary float and then converted to
    ``Decimal``; the power itself is evaluated in ``Decimal`` arithmetic.
    """
    exponent = 1 / float(n_root)
    powered = Decimal(value) ** Decimal(exponent)
    return round(powered, 3)
 
def minkowski_distance(x,y,p_value):
    """Return the Minkowski distance of order ``p_value`` between x and y.

    Adds up ``|a - b| ** p_value`` over the zipped pairs, then delegates to
    ``nth_root`` for the ``p_value``-th root (which rounds to 3 decimals).
    """
    powered_total = 0
    for a, b in zip(x, y):
        # `pow` is whatever the module namespace binds; this file does
        # `from math import *`, so it resolves to math.pow here.
        powered_total += pow(abs(a - b), p_value)
    return nth_root(powered_total, p_value)
    
def square_rooted(x):
    """Return the Euclidean norm of ``x``, rounded to 3 decimal places."""
    sum_of_squares = 0
    for a in x:
        sum_of_squares += a * a
    return round(sqrt(sum_of_squares), 3)
 
def cosine_similarity(x,y):
    """Return the cosine similarity of x and y, rounded to 3 decimal places.

    The dot product is taken over the zipped pairs. Both norms come from
    ``square_rooted``, so each is already rounded to 3 decimals before the
    division.
    """
    dot_product = 0
    for a, b in zip(x, y):
        dot_product += a * b
    norm_product = square_rooted(x) * square_rooted(y)
    return round(dot_product / float(norm_product), 3)

def jaccard_similarity(x,y):
    """Return |X & Y| / |X | Y| for the sets of distinct values in x and y.

    Raises ZeroDivisionError when both inputs are empty (the union is empty).
    """
    set_x, set_y = set(x), set(y)
    shared = set_x & set_y
    combined = set_x | set_y
    return len(shared) / float(len(combined))

def fft(img):
    """Return the 2-D power spectrum of ``img`` with low frequencies centred.

    Steps: 2-D Fourier transform, quadrant shift so the zero-frequency term
    sits in the middle of the array, then the squared magnitude per element.
    """
    centred_spectrum = fftpack.fftshift(fftpack.fft2(img))
    return np.abs(centred_spectrum) ** 2

In [6]:
# Load the raw train and test sets from the JSON files in the working directory.
train = pd.read_json(path_or_buf="./train.json")
test = pd.read_json(path_or_buf="./test.json")

# train.head(3)

In [30]:
# Train data: build one row of band_1-vs-band_2 comparison features per image
# and cache the table in ./train_metrics.csv.
metrics = []
header = 'target', 'id', 'normal_dist','normal_intersec', 'fft_dist','fft_intersec', 'manhattan_dist', 'minkowski_dist', 'cosine_simil', 'jaccard_simil'

if os.path.isfile('./train_metrics.csv'):
    # A cached feature file already exists: leave it untouched. The write
    # below lives in the else-branch on purpose; executed unconditionally it
    # would replace the cache with a header-only file, because `metrics`
    # stays empty on this path.
    print('Already done...')
else:
    for row_label, record in train.iterrows():
        # Each band is stored as a flat sequence that reshapes to 75x75.
        band1 = np.asarray(record['band_1'], dtype=float).reshape(75, 75)
        band2 = np.asarray(record['band_2'], dtype=float).reshape(75, 75)

        # Shift each band by |min| and divide by the shifted maximum.
        band1 = band1 + abs(band1.min())
        band1 = band1 / band1.max()
        band2 = band2 + abs(band2.min())
        band2 = band2 / band2.max()

        fft_band1 = fft(band1)
        fft_band2 = fft(band2)

        # Both histograms of a pair are built on the SAME bin edges so that
        # bin i means the same interval in each of them; the edges belonging
        # to that pair are the ones handed to histogram_intersection.
        band_range = (min(band1.min(), band2.min()), max(band1.max(), band2.max()))
        hband1, band_bins = np.histogram(band1, bins=256, range=band_range, density=True)
        hband2 = np.histogram(band2, bins=256, range=band_range, density=True)[0]

        fft_range = (min(fft_band1.min(), fft_band2.min()), max(fft_band1.max(), fft_band2.max()))
        fft_hband1, fft_bins = np.histogram(fft_band1, bins=256, range=fft_range, density=True)
        fft_hband2 = np.histogram(fft_band2, bins=256, range=fft_range, density=True)[0]

        hist_diff = hband1 - hband2
        fft_hist_diff = fft_hband1 - fft_hband2

        flat1 = band1.ravel()
        flat2 = band2.ravel()

        # Order must match `header`.
        row = [
            record['is_iceberg'],
            record['id'],
            np.sqrt(np.dot(hist_diff, hist_diff)),                    # normal_dist
            histogram_intersection(hband1, hband2, band_bins),        # normal_intersec
            np.sqrt(np.dot(fft_hist_diff, fft_hist_diff)),            # fft_dist
            histogram_intersection(fft_hband1, fft_hband2, fft_bins), # fft_intersec
            manhattan_distance(flat1, flat2),
            # nth_root may hand back a Decimal; store a plain float.
            float(minkowski_distance(flat1, flat2, 3)),
            cosine_similarity(flat1, flat2),
            jaccard_similarity(flat1, flat2),
        ]
        metrics.append(row)

    # Written through pandas so the same code works on Python 2 and 3
    # (csv.writer needs different open() modes on each).
    pd.DataFrame(metrics, columns=list(header)).to_csv('./train_metrics.csv', index=False)

In [31]:
# Test data: build the same band_1-vs-band_2 comparison features for every
# image in `test` and cache the table in ./test_metrics.csv.
metrics = []
header = 'target', 'id', 'normal_dist','normal_intersec', 'fft_dist','fft_intersec', 'manhattan_dist', 'minkowski_dist', 'cosine_simil', 'jaccard_simil'

if os.path.isfile('./test_metrics.csv'):
    # A cached feature file already exists: leave it untouched. The write
    # below lives in the else-branch on purpose; executed unconditionally it
    # would replace the cache with a header-only file, because `metrics`
    # stays empty on this path.
    print('Already done...')
else:
    # Rows, bands and ids all come from `test` (not `train`).
    for row_label, record in test.iterrows():
        # Each band is stored as a flat sequence that reshapes to 75x75.
        band1 = np.asarray(record['band_1'], dtype=float).reshape(75, 75)
        band2 = np.asarray(record['band_2'], dtype=float).reshape(75, 75)

        # Shift each band by |min| and divide by the shifted maximum.
        band1 = band1 + abs(band1.min())
        band1 = band1 / band1.max()
        band2 = band2 + abs(band2.min())
        band2 = band2 / band2.max()

        fft_band1 = fft(band1)
        fft_band2 = fft(band2)

        # Both histograms of a pair are built on the SAME bin edges so that
        # bin i means the same interval in each of them; the edges belonging
        # to that pair are the ones handed to histogram_intersection.
        band_range = (min(band1.min(), band2.min()), max(band1.max(), band2.max()))
        hband1, band_bins = np.histogram(band1, bins=256, range=band_range, density=True)
        hband2 = np.histogram(band2, bins=256, range=band_range, density=True)[0]

        fft_range = (min(fft_band1.min(), fft_band2.min()), max(fft_band1.max(), fft_band2.max()))
        fft_hband1, fft_bins = np.histogram(fft_band1, bins=256, range=fft_range, density=True)
        fft_hband2 = np.histogram(fft_band2, bins=256, range=fft_range, density=True)[0]

        hist_diff = hband1 - hband2
        fft_hist_diff = fft_hband1 - fft_hband2

        flat1 = band1.ravel()
        flat2 = band2.ravel()

        # Order must match `header`.
        row = [
            0,  # placeholder: keeps the column layout equal to the train file
            record['id'],
            np.sqrt(np.dot(hist_diff, hist_diff)),                    # normal_dist
            histogram_intersection(hband1, hband2, band_bins),        # normal_intersec
            np.sqrt(np.dot(fft_hist_diff, fft_hist_diff)),            # fft_dist
            histogram_intersection(fft_hband1, fft_hband2, fft_bins), # fft_intersec
            manhattan_distance(flat1, flat2),
            # nth_root may hand back a Decimal; store a plain float.
            float(minkowski_distance(flat1, flat2, 3)),
            cosine_similarity(flat1, flat2),
            jaccard_similarity(flat1, flat2),
        ]
        metrics.append(row)

    # Written through pandas so the same code works on Python 2 and 3
    # (csv.writer needs different open() modes on each).
    pd.DataFrame(metrics, columns=list(header)).to_csv('./test_metrics.csv', index=False)

# XGBoost

In [None]:
# Rebind `train` / `test` to the per-image metric tables stored as CSV and
# remember how many rows belong to the training part.
train = pd.read_csv(filepath_or_buffer='./train_metrics.csv')
test = pd.read_csv(filepath_or_buffer='./test_metrics.csv')
num_train = len(train)

In [None]:
# The metrics CSVs are written with an 'id' column, while the rest of this
# workflow addresses the identifier as 'ID'. Normalise the name once here
# (a no-op when the column is already called 'ID').
train = train.rename(columns={'id': 'ID'})
test = test.rename(columns={'id': 'ID'})

y_train = train['target']
train = train.drop(['target'],axis=1)
# The test CSV carries a constant placeholder 'target' column; drop it so it
# does not end up as a feature once train and test are concatenated.
test = test.drop(['target'], axis=1, errors='ignore')
id_test = test['ID']

def fill_nan_null(val):
    """Map a null-flag to a float: 1.0 when ``val == True``, otherwise 0.0.

    The comparison is an equality test against ``True`` (not a truthiness
    test), so e.g. 1 maps to 1.0 while 2 maps to 0.0.
    """
    return 1.0 if val == True else 0.0

# Stack train and test so every derived column is built identically for both.
df_all = pd.concat((train, test), axis=0, ignore_index=True)
df_all['null_count'] = df_all.isnull().sum(axis=1).tolist()
# Keep ID as a one-column DataFrame rather than a Series: the later
# `.drop(['ID'], axis=1)` is only valid on a DataFrame, and when no object
# column adds dummies below this variable is never turned into one by concat.
df_all_temp = df_all[['ID']]
df_all = df_all.drop(['ID'],axis=1)
df_data_types = df_all.dtypes[:] #{'object':0,'int64':0,'float64':0,'datetime64':0}
d_col_drops = []

# Snapshot of the columns present before any derived column is appended.
base_columns = [str(name) for name in df_data_types.index]

# One 0.0/1.0 indicator per base column marking where the value was missing,
# then replace the missing values themselves with a sentinel.
for col in base_columns:
    df_all[col + '_nan_'] = df_all[col].isnull().astype(float)
df_all = df_all.fillna(-9999)

# Object (string-like) columns: add an integer code column, schedule the raw
# column for removal, and one-hot encode it when it has fewer than 150
# distinct values.
for col, col_dtype in zip(base_columns, df_data_types.values):
    if str(col_dtype) != 'object':
        continue
    df_u = pd.unique(df_all[col].values)
    print("Column: ", col, " Length: ", len(df_u))
    # Codes start at 1000 and advance by 5 per distinct value.
    d = {str(s): 1000 + 5 * k for k, s in enumerate(df_u)}
    df_all[col + '_vect_'] = df_all[col].map(lambda x: d[str(x)])
    d_col_drops.append(col)
    if len(df_u) < 150:
        dummies = pd.get_dummies(df_all[col]).rename(columns=lambda x: col + '_' + str(x))
        df_all_temp = pd.concat([df_all_temp, dummies], axis=1)

# Only the dummy columns (if any) are merged back; ID itself is not a feature.
df_all_temp = df_all_temp.drop(['ID'],axis=1)
df_all = pd.concat([df_all, df_all_temp], axis=1)
print(len(df_all), len(df_all.columns))
# Split back into the original train / test row ranges.
train = df_all.iloc[:num_train]
test = df_all.iloc[num_train:]
train = train.drop(d_col_drops,axis=1)
test = test.drop(d_col_drops,axis=1)

def flog_loss(ground_truth, predictions):
    """Return ``log_loss(ground_truth, predictions)`` with default settings."""
    return log_loss(ground_truth, predictions)

# Scorer wrapper around flog_loss; greater_is_better=False declares it a
# lower-is-better metric.
LL = make_scorer(flog_loss, greater_is_better=False)

g={'ne':150,'md':6,'mf':80,'rs':2016} #change to g={'ne':500,'md':40,'mf':60,'rs':2016}

# An integer max_features larger than the number of columns is rejected at
# fit time by the scikit-learn releases this notebook targets (the ones that
# still ship sklearn.grid_search), so cap the configured value at the width
# of the feature matrix. The cap has no effect once there are >= g['mf'] columns.
max_features = min(g['mf'], train.shape[1])

# Settings shared by all four scikit-learn forests.
forest_params = dict(
    n_estimators=g['ne'],
    max_depth=g['md'],
    max_features=max_features,
    random_state=g['rs'],
    min_samples_split=4,
    min_samples_leaf=2,
    verbose=0,
    n_jobs=-1,
)
# Settings shared by both XGBoost models.
xgb_params = dict(
    n_estimators=g['ne'],
    max_depth=g['md'],
    seed=g['rs'],
    missing=np.nan,
    learning_rate=0.02,
    subsample=0.9,
    colsample_bytree=0.85,
)

etc = ensemble.ExtraTreesClassifier(criterion='entropy', **forest_params)
etr = ensemble.ExtraTreesRegressor(**forest_params)
rfc = ensemble.RandomForestClassifier(criterion='entropy', **forest_params)
rfr = ensemble.RandomForestRegressor(**forest_params)
xgr = xgb.XGBRegressor(objective='reg:linear', **xgb_params)
xgc = xgb.XGBClassifier(objective='binary:logistic', **xgb_params) #try 'binary:logitraw'

# Key convention used by the training loop: a leading 'x' selects the xgboost
# branch, a trailing 'c' marks a classifier.
#clf = {'etc':etc, 'etr':etr, 'rfc':rfc, 'rfr':rfr, 'xgr':xgr, 'xgc':xgc} # use this line instead
clf = {'etr':etr, 'rfr':rfr, 'xgr':xgr} # removed due to kaggle performance, would prefer less time and more cores than more time and less cores :)

# Fit every configured model, collect one prediction column per model next to
# the test IDs, then write the per-model analysis and the averaged submission.
y_pred=[]
best_score = 0.0
id_results = id_test[:]
for c in clf:
    if c[:1] != "x": #not xgb
        # param_grid={} yields a single candidate, so this is effectively a
        # 2-fold cross-validated fit scored with LL.
        model = GridSearchCV(estimator=clf[c], param_grid={}, n_jobs =-1, cv=2, verbose=0, scoring=LL)
        model.fit(train, y_train.values)
        if c[-1:] != "c": #not classifier
            y_pred = model.predict(test)
            print("Ensemble Model: ", c, " Best CV score: ", model.best_score_, " Time: ", round(((time.time() - start_time)/60),2))
        else: #classifier
            # NOTE: this is the negated log-loss on the training rows, not a
            # cross-validated score, despite the label printed below.
            best_score = (log_loss(y_train.values, model.predict_proba(train)))*-1
            y_pred = model.predict_proba(test)[:,1]
            print("Ensemble Model: ", c, " Best CV score: ", best_score, " Time: ", round(((time.time() - start_time)/60),2))
    else: #xgb
        # Hold out 35% of the training rows as the early-stopping evaluation set.
        X_fit, X_eval, y_fit, y_eval= train_test_split(train, y_train, test_size=0.35, train_size=0.65, random_state=g['rs'])
        model = clf[c]
        model.fit(X_fit, y_fit.values, early_stopping_rounds=20, eval_metric="logloss", eval_set=[(X_eval, y_eval)], verbose=0)
        # As above, best_score is the negated log-loss over all training rows.
        if c == "xgr": #xgb regressor
            best_score = (log_loss(y_train.values, model.predict(train)))*-1
            y_pred = model.predict(test)
        else: #xgb classifier
            best_score = (log_loss(y_train.values, model.predict_proba(train)))*-1
            y_pred = model.predict_proba(test)[:,1]
        print("Ensemble Model: ", c, " Best CV score: ", best_score, " Time: ", round(((time.time() - start_time)/60),2))

    # Regressors can predict outside [0, 1]; clamp to the valid probability range.
    y_pred = np.clip(y_pred, 0.0, 1.0)
    df_in = pd.DataFrame({"ID": id_test, c: y_pred})
    id_results = pd.concat([id_results, df_in[c]], axis=1)

# Row-wise summary over the model columns only (taken before any derived
# column is added, so the statistics never feed on each other).
model_preds = id_results.drop('ID', axis=1)
id_results['avg'] = model_preds.mean(axis=1)
id_results['min'] = model_preds.min(axis=1)
id_results['max'] = model_preds.max(axis=1)
id_results['diff'] = id_results['max'] - id_results['min']
for i in range(10):
    # Float division: with Python 2 integer division every threshold was 0.
    print(i, len(id_results[id_results['diff']>(i/10.0)]))
id_results.to_csv("results_analysis.csv", index=False)
ds = id_results[['ID','avg']]
ds.columns = ['ID','PredictedProb']
ds.to_csv('submission.csv',index=False)