In [1]:
import os
import sys
import random
import collections
from multiprocessing import Pool as ThreadPool

In [2]:
continous_features = range(1, 14)
categorial_features = range(14, 40)

In [3]:
continous_clip = [20, 600, 100, 50, 64000, 500, 100, 50, 500, 10, 10, 10, 50]

In [132]:
class CategoryDictGenerator:
    def __init__(self, num_feature):
        self.dicts = []
        self.num_feature = num_feature
        for i in range(0, num_feature):
            self.dicts.append(collections.defaultdict(int))
    def build(self, datafile, categorial_features, cutoff=0):
        with open(datafile, 'r') as f:
            for line in f:
                features = line.rstrip('\n').split('\t')
                for i in range(0, self.num_feature):
                    if features[categorial_features[i]] != '':
                        self.dicts[i][features[categorial_features[i]]] += 1
        for i in range(0, self.num_feature):
            self.dicts[i] = filter(lambda x: x[1] >= cutoff, self.dicts[i].items())
            self.dicts[i] = sorted(self.dicts[i], key=lambda x: (-x[1], x[0]))
            vocabs, _ = list(zip(*self.dicts[i]))
            self.dicts[i] = dict(zip(vocabs, range(1, len(vocabs) + 1)))
            self.dicts[i]['<unk>'] = 0
            
    def gen(self, idx, key):
        if key not in self.dicts[idx]:
            res = self.dicts[idx]['<unk>']
        else:
            res = self.dicts[idx][key]
        return res
    
    def dicts_sizes(self):
        return list(map(len, self.dicts))

In [28]:
class ContinuousFeatureGenerator:
    def __init__(self, num_feature):
        self.num_feature = num_feature
        self.min = [float("inf")] * num_feature
        self.max = [-float("inf")] * num_feature
    def build(self, datafile, continous_features):
        with open(datafile, 'r') as f:
            for line in f:
                features = line.rstrip('\n').split('\t')
                for i in range(0, self.num_feature):
                    val = features[continous_features[i]]
                    if val != '':
                        val = int(val)
                        if val > continous_clip[i]:
                            val = continous_clip[i]
                        self.min[i] = min(self.min[i], val)
                        self.max[i] = max(self.max[i], val)
                        
    def gen(self, idx, val):
        if val == '':
            return 0.0
        val = float(val)
        return (val - self.min[idx]) / (self.max[idx] - self.min[idx])

In [133]:
dists = ContinuousFeatureGenerator(len(continous_features))
dicts = CategoryDictGenerator(len(categorial_features))

In [134]:
categorial_features

range(14, 40)

In [135]:
dists.build('dac_sample.txt', continous_features)
dicts.build('dac_sample.txt', categorial_features)

In [23]:
dists.max

[20, 600, 100, 50, 64000, 500, 100, 50, 500, 6, 10, 10, 50]

In [63]:
output = open('feature_map', 'w')
for i in continous_features:
    output.write("{0} {1}\n".format('I'+str(i), i))

In [139]:
dicts.dicts[25]['<unk>']

0

In [67]:
dict_sizes = dicts.dicts_sizes()
dict_sizes

[542,
 498,
 43870,
 25184,
 146,
 12,
 7624,
 258,
 4,
 10998,
 3800,
 41312,
 2797,
 27,
 5239,
 34617,
 11,
 2549,
 1303,
 4,
 38618,
 11,
 15,
 12335,
 51,
 9527]

In [64]:
categorial_feature_offset = [dists.num_feature]

In [70]:
for i in range(1, len(categorial_features)+1):
    offset = categorial_feature_offset[i-1] + dict_sizes[i-1]
    categorial_feature_offset.append(offset)
    for key, val in dicts.dicts[i-1].items():
        output.write("{0} {1}\n".format('C' + str(i) + '|' + key, categorial_feature_offset[i-1]+val+1))

In [43]:
C_COLUMNS = ['I' + str(i) for i in range(1,  14)]
D_COLUMNS = ['C' + str(i) for i in range(14, 40)]
LABEL_COLUMN = "is_click"
CSV_COLUMNS = [LABEL_COLUMN] + C_COLUMNS + D_COLUMNS

In [48]:
CSV_COLUMN_DEFAULTS = [[0.0]]
C_COLUMN_DEFAULTS = [[0.0] for i in range(13)]
D_COLUMN_DEFAULTS = [[0] for i in range(26)]
CSV_COLUMN_DEFAULTS = CSV_COLUMN_DEFAULTS + C_COLUMN_DEFAULTS + D_COLUMN_DEFAULTS
print(CSV_COLUMN_DEFAULTS)

[[0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0]]


In [71]:
output.close()

In [72]:
f = open('dac_sample.txt')

In [75]:
lines = f.readlines()

In [78]:
line = lines[0]

In [79]:
features = line.rstrip('\n').split('\t')

In [93]:
f.close()

In [109]:
my_dicts = []

In [110]:
num_feature = dicts.num_feature

In [111]:
for i in range(0, num_feature):
    my_dicts.append(collections.defaultdict(int))

In [112]:
with open('dac_sample.txt', 'r') as f:
    for line in f:
        features = line.rstrip('\n').split('\t')
        for i in range(0, num_feature):
            if features[categorial_features[i]] != '':
                my_dicts[i][features[categorial_features[i]]] += 1

In [125]:
my_dicts[25]

defaultdict(int,
            {'9727dd16': 9,
             '731c3655': 307,
             '9a556cfc': 102,
             '8967c0d2': 4,
             'c43c3f58': 20,
             'aa5f0a15': 1271,
             '9593bba9': 1,
             'af7ece63': 11,
             '49d68486': 5046,
             'cdfe5ab7': 20,
             'a70a038a': 8,
             '47b6f269': 4,
             '09b76f8d': 1,
             'b7d9c3bc': 1299,
             'f159b6cb': 6,
             '987ea0be': 5,
             '491eeeef': 5,
             'a00829e6': 26,
             'a8cf207e': 17,
             'a475662f': 68,
             '303cea07': 1,
             'b1262ddd': 3,
             'bc7f21c2': 1,
             '1219b447': 261,
             '33ced911': 4,
             'ccc71a58': 1,
             'b820b6c5': 13,
             'b9809574': 864,
             '1c7f8927': 9,
             'c84c4aec': 2330,
             '4a6648b5': 14,
             'c23979db': 2,
             'b4a4615f': 84,
             'd9bcfc08': 234,


In [126]:
my_dicts[25] = filter(lambda x: x[1] >= 0, my_dicts[25].items())

In [127]:
my_dicts[25] = sorted(my_dicts[25], key=lambda x: (-x[1], x[0]))

In [128]:
vocabs, _ = list(zip(*my_dicts[25]))

In [129]:
my_dicts[25] = dict(zip(vocabs, range(1, len(vocabs) + 1)))

In [130]:
my_dicts[25]['<unk>'] = 0

In [131]:
my_dicts[25] 

{'49d68486': 1,
 'c84c4aec': 2,
 '2fede552': 3,
 '984e0db0': 4,
 'b7d9c3bc': 5,
 'aa5f0a15': 6,
 'c27f155b': 7,
 '9904c656': 8,
 'b9809574': 9,
 '56be3401': 10,
 'b13f4ade': 11,
 'c986348f': 12,
 'b29c74dc': 13,
 '4e7af834': 14,
 '731c3655': 15,
 '70b6702c': 16,
 'd15c0cc8': 17,
 '6935065e': 18,
 '54ca28ff': 19,
 '8d8eb391': 20,
 '1219b447': 21,
 '938732a0': 22,
 '6c27a535': 23,
 'd9bcfc08': 24,
 'd1d45fc5': 25,
 '074bb89f': 26,
 'cfd96da1': 27,
 '9f6a34e7': 28,
 'f3737bd0': 29,
 'e001324a': 30,
 'e75c9ae9': 31,
 '409c7293': 32,
 '99f4f64c': 33,
 '988b0775': 34,
 '0015d4de': 35,
 '64f08cc6': 36,
 'b1c17344': 37,
 'aa86a675': 38,
 '632dae8a': 39,
 '42c1aa63': 40,
 '1c57d70a': 41,
 'd21d0b82': 42,
 'fa3124de': 43,
 '6ab28812': 44,
 '602f0609': 45,
 '3547c540': 46,
 'd8a062c4': 47,
 '01774abe': 48,
 '71236095': 49,
 'c4304c4b': 50,
 'd67a6f5b': 51,
 'eb9a9610': 52,
 '68a2a837': 53,
 'f8d62db8': 54,
 'e740b2da': 55,
 '8e1ae331': 56,
 'bdf46dce': 57,
 '414c6af0': 58,
 '882f541d': 59,
 '9a55

In [141]:
output.close()

In [142]:
output = open('feature_map', 'w')
for i in continous_features:
    output.write("{0} {1}".format('I' + str(i), i))

In [143]:
dict_sizes = dicts.dicts_sizes()
dict_sizes

[542,
 498,
 43870,
 25184,
 146,
 12,
 7624,
 258,
 4,
 10998,
 3800,
 41312,
 2797,
 27,
 5239,
 34617,
 11,
 2549,
 1303,
 4,
 38618,
 11,
 15,
 12335,
 51,
 9527]

In [144]:
categorial_feature_offset = [dists.num_feature]

In [147]:
categorial_feature_offset[0]+dict_sizes[0]

555

In [149]:
for i in range(1, len(categorial_features)+1):
    offset = categorial_feature_offset[i-1] + dict_sizes[i-1]
    categorial_feature_offset.append(offset)
    for key, val in dicts.dicts[i-1].items():
        output.write("{0}, {1}\n".format('C'+str(i)+'|'+key, categorial_feature_offset[i-1]+val+1))

In [151]:
output.close()

In [154]:
f = open('dac_sample.txt', 'r')
lines  = f.readlines()
line = lines[0]

In [155]:
line

'0\t1\t1\t5\t0\t1382\t4\t15\t2\t181\t1\t2\t\t2\t68fd1e64\t80e26c9b\tfb936136\t7b4723c4\t25c83c98\t7e0ccccf\tde7995b8\t1f89b562\ta73ee510\ta8cd5504\tb2cb9c98\t37c9c164\t2824a5f6\t1adce6ef\t8ba8b39a\t891b62e7\te5ba7672\tf54016b9\t21ddcdc9\tb1252a9d\t07b5194c\t\t3a171ecb\tc5c50484\te8b83407\t9727dd16\n'

In [166]:
features = line.rstrip('\n').split('\t')
features

['0',
 '1',
 '1',
 '5',
 '0',
 '1382',
 '4',
 '15',
 '2',
 '181',
 '1',
 '2',
 '',
 '2',
 '68fd1e64',
 '80e26c9b',
 'fb936136',
 '7b4723c4',
 '25c83c98',
 '7e0ccccf',
 'de7995b8',
 '1f89b562',
 'a73ee510',
 'a8cd5504',
 'b2cb9c98',
 '37c9c164',
 '2824a5f6',
 '1adce6ef',
 '8ba8b39a',
 '891b62e7',
 'e5ba7672',
 'f54016b9',
 '21ddcdc9',
 'b1252a9d',
 '07b5194c',
 '',
 '3a171ecb',
 'c5c50484',
 'e8b83407',
 '9727dd16']

In [172]:
feat_vals = []

In [169]:
val = dists.gen(0, features[continous_features[0]])

In [160]:
features[continous_features[0]]

'1'

In [173]:
feat_vals.append(str(continous_features[0]) + ':' + "{0:.6f}".format(val).rstrip('0').rstrip('.'))

In [174]:
feat_vals

['1:0.05']

In [175]:
val = dicts.gen(0, features[categorial_features[0]]) + categorial_feature_offset[0]

In [167]:
dicts.gen(0, features[categorial_features[0]])

2

In [168]:
categorial_feature_offset[0]

13

In [176]:
feat_vals.append(str(val) + ':1')

In [178]:
feat_vals

['1:0.05', '15:1']