In [45]:
import pandas as pd
import numpy as np
import sklearn.cluster as skc
import sklearn.preprocessing as skp
from collections import defaultdict

%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
from math import floor

In [3]:
# Keys of the clustered rows: the first column of the (header-less) key file,
# materialised as a plain Python list.
data_keys = (
    pd.read_csv('../Processing/FrameContainer/DataRows/Data_keys.csv', header=None)
    .iloc[:, 0]
    .tolist()
)

In [6]:
# Comma-separated numeric matrix stored gzip-compressed on disk.
# Presumably the pairwise DTW distance matrix over the rows named in
# `data_keys` (going by the file name) — TODO confirm.
K = np.loadtxt(fname='../Processing/DTW_Matrix.out.gz', delimiter=',')

In [7]:
## Run Hierarchical clustering on the matrix
# Ward-linkage agglomerative clustering into 10 clusters.
# NOTE(review): K is passed both as the connectivity constraint and, below, as
# the sample matrix whose rows are compared with Euclidean distance — confirm
# this is intended rather than clustering on K as a precomputed distance.
cluster_ward = skc.AgglomerativeClustering(
    n_clusters=10,
    linkage='ward',
    affinity='euclidean',
    connectivity=K,
    compute_full_tree=True,
)
# Alternative linkages tried with otherwise identical settings (disabled):
#   linkage='complete' -> cluster_complete / labels_complete
#   linkage='average'  -> cluster_average  / labels_average

# Predict the class labels
labels_ward = cluster_ward.fit_predict(K)

## Read-in Offline Processed Files

In [13]:
# Lookup table: data row key -> Ward cluster label.
# Keys and labels are paired positionally; on duplicate keys the last label wins.
labeled_data = {row_key: cluster_label for row_key, cluster_label in zip(data_keys, labels_ward)}

In [14]:
# Propagate cluster labels from the clustered data rows to the query rows:
# each query key inherits the label of the data row its search result points to.
dataResult_PATH = '../Processing/FrameContainer/DataRows/Data_Keys.csv'
# NOTE(review): recent pandas releases may reject '\n' as a delimiter — confirm
# against the installed version before re-running on a fresh environment.
data_keys = pd.read_csv(dataResult_PATH, delimiter='\n', header=None)
data_keys = data_keys[0].values.tolist()

# Directory / file-name stems are the same for every chunk, so build them once.
outResult_PATH = '../Processing/FrameContainer/OutRows/'
queryKey_PATH = '../Processing/FrameContainer/QueryRows/'

out_FNAME = 'Out_Rows_'
query_FNAME = 'Query_Keys_'

for idx in range(0, 15):
    print("Starting join process for Query Chunk: %d" % idx)

    queryIndices = pd.read_csv(queryKey_PATH + query_FNAME + str(idx) + '.csv', delimiter='\n', header=None)
    searchResults = pd.read_csv(outResult_PATH + out_FNAME + str(idx) + '.txt', delimiter='\n', header=None)

    queryIndices = queryIndices[0].values.tolist()
    searchResults = searchResults[0].values.tolist()
    # Map search results to corresponding data row index
    mappedResults = [data_keys[i] for i in searchResults]

    # The position within the chunk gets its own name so it no longer
    # overwrites the chunk counter `idx` of the enclosing loop.
    for pos, q in enumerate(queryIndices):
        labeled_data[q] = labeled_data[mappedResults[pos]]

# Invert the key -> label mapping into label -> list of keys.
class_data = defaultdict(list)
for row_key, cluster_label in labeled_data.items():
    class_data[cluster_label].append(row_key)

Starting join process for Query Chunk: 0
Starting join process for Query Chunk: 1
Starting join process for Query Chunk: 2
Starting join process for Query Chunk: 3
Starting join process for Query Chunk: 4
Starting join process for Query Chunk: 5
Starting join process for Query Chunk: 6
Starting join process for Query Chunk: 7
Starting join process for Query Chunk: 8
Starting join process for Query Chunk: 9
Starting join process for Query Chunk: 10
Starting join process for Query Chunk: 11
Starting join process for Query Chunk: 12
Starting join process for Query Chunk: 13
Starting join process for Query Chunk: 14


In [15]:
# Report how many keys ended up under each cluster label.
for cluster_label, members in class_data.items():
    print("Label %d | Class size: %d" % (cluster_label, len(members)))

Label 0 | Class size: 982
Label 1 | Class size: 5908
Label 2 | Class size: 2915
Label 3 | Class size: 2991
Label 4 | Class size: 21433
Label 5 | Class size: 20520
Label 6 | Class size: 89502
Label 7 | Class size: 247
Label 8 | Class size: 186
Label 9 | Class size: 379


## XGBOOST

In [1]:
import xgboost as xgb

'''
alg: XGBClassifier() or XGBRegressor()
dtrain: the entire training dataframe
predictors: list of feature columns in dtrain
target: column we want to classify
'''

def modelfit(alg, dtrain, predictors, target, useTrainCV=True, cv_folds=3, early_stopping_rounds=4):
    """Fit an XGBoost sklearn-style estimator and print a training-set report.

    Parameters
    ----------
    alg : estimator
        XGBoost sklearn wrapper (e.g. ``xgb.XGBClassifier``). It is modified in
        place: ``n_estimators`` may be overwritten and the model is fitted.
    dtrain : pandas.DataFrame
        Training frame containing at least the ``predictors`` columns.
    predictors : list
        Column labels of ``dtrain`` used as features.
    target : array-like
        Labels aligned with the rows of ``dtrain``. The report uses the second
        column of ``predict_proba`` and ROC AUC, so a binary target is assumed.
    useTrainCV : bool, default True
        When true, run ``xgb.cv`` first and set ``n_estimators`` to the number
        of rows in the CV result before fitting.
    cv_folds : int, default 3
        Number of folds handed to ``xgb.cv``.
    early_stopping_rounds : int, default 4
        Early-stopping patience handed to ``xgb.cv``.

    Returns
    -------
    None
        Results are printed and a feature-importance bar chart is drawn on the
        current matplotlib figure.
    """
    # No cell shown imports sklearn.metrics, so pull it in here; without it
    # the report section fails with a NameError on `metrics`.
    from sklearn import metrics

    features = dtrain[predictors]

    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(features.values, label=target)
        cvresult = xgb.cv(xgb_param,
                          xgtrain,
                          num_boost_round=alg.get_params()['n_estimators'],
                          nfold=cv_folds,
                          metrics='logloss',
                          early_stopping_rounds=early_stopping_rounds,
                          verbose_eval=True)

        # One row per boosting round in the CV result.
        alg.set_params(n_estimators=cvresult.shape[0])

    # Fit the algorithm on the data.
    # NOTE(review): newer xgboost releases expect eval_metric on the estimator
    # rather than as a fit() argument — confirm against the installed version.
    alg.fit(features, target, eval_metric='logloss')

    # Predict training set (class labels and positive-class probability).
    dtrain_predictions = alg.predict(features)
    dtrain_predprob = alg.predict_proba(features)[:, 1]

    # Print model report:
    print("\nModel Report")
    print("Accuracy : %.4g" % metrics.accuracy_score(target, dtrain_predictions))
    print("AUC Score (Train): %f" % metrics.roc_auc_score(target, dtrain_predprob))

    # Current wrappers expose the fitted Booster via get_booster(); very old
    # releases only had the booster() method, so fall back to it when needed.
    booster = alg.get_booster() if hasattr(alg, 'get_booster') else alg.booster()
    feat_imp = pd.Series(booster.get_fscore()).sort_values(ascending=False)
    feat_imp.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')



In [None]:
# Baseline binary classifier handed to modelfit(); n_estimators is the
# boosting-round budget when modelfit runs with useTrainCV=True.
xgb1 = xgb.XGBClassifier(
    # task
    objective='binary:logistic',
    scale_pos_weight=1,
    # boosting schedule
    learning_rate=0.1,
    n_estimators=50,
    # tree shape / regularisation
    max_depth=3,
    min_child_weight=1,
    gamma=0,
    # row / column subsampling
    subsample=0.8,
    colsample_bytree=0.8,
    # runtime
    nthread=4,
    seed=27,
)

## Train Indirect Models on Single Class


In [19]:
# Load the raw training table and replace every missing value with 0.0.
rawData_df = pd.read_csv('../input/train_1.csv').fillna(value=0.0)

In [59]:
# Sampling configuration — for now only cluster label 9 is attempted.
FIXED_WIDTH = 25        # feature columns per training window
FIXED_ROW_CAP = 10e3    # approximate cap on generated training rows
CLASS_SIZE = len(class_data[9])
# Windows taken per series so that CLASS_SIZE * SAMPLE_PER_ROW stays under the cap.
SAMPLE_PER_ROW = floor(FIXED_ROW_CAP / CLASS_SIZE)

In [60]:
# Rows of the raw table belonging to cluster label 9, selected by index label.
# `.ix` has been removed from pandas; `.loc` is the label-based equivalent.
# Assumes the keys in class_data are row labels of rawData_df — TODO confirm.
class_slice_df = rawData_df.loc[class_data[9]]

In [115]:
NR, NC = class_slice_df.shape

jointFrame = pd.DataFrame()

# TRAIN A MODEL FOR EACH GAP
for gap in range(0, 60):

    # Create the training samples by sliding a window over the columns of
    # every row. Each window spans FIXED_WIDTH + 1 consecutive columns:
    # FIXED_WIDTH features followed by the column that becomes 'y'.
    # NOTE(review): as written, `gap` only slides the whole window left; the
    # target column is always adjacent to the features. Confirm whether the
    # target was meant to sit `gap` columns after them.
    window_frames = []
    for spridx in range(0, SAMPLE_PER_ROW):
        indices = list(range(NC-2-gap-spridx-FIXED_WIDTH, NC-1-gap-spridx))
        # Copy so that relabelling the columns never touches class_slice_df.
        tmp_slice = class_slice_df.iloc[:, indices].copy()
        tmp_slice.columns = list(range(0, FIXED_WIDTH+1))
        window_frames.append(tmp_slice)

    # Concatenate once instead of growing the frame inside the loop:
    # repeated DataFrame.append is quadratic and no longer exists in pandas 2.
    if window_frames:
        jointFrame = pd.concat(window_frames, ignore_index=True)

    jointFrame = jointFrame.rename(columns={FIXED_WIDTH: 'y'})
    # Work in progress: only the first gap is processed for now.
    break
    
    # XGBoost SVM on the generated training samples
    
    #Choose all predictors except target & IDcols


# NOTE(review): df_X, df_y, SEMANTIC_COL_LABELS and TREEKERNEL_COL_LABELS are
# not defined in any cell visible here — confirm they exist on a fresh kernel,
# or replace them with the training frame this notebook actually builds.
modelfit(xgb1, df_X, SEMANTIC_COL_LABELS + TREEKERNEL_COL_LABELS, df_y.values)
    
    # Map the new datapoints to page-names
    
    # Concatenate the training samples on an empty dataframe