In [15]:
# %load 002_category_property.py
import gc
import numpy as np
import pandas as pd
from utils import BayesianSmoothing, load_pickle, dump_pickle, raw_data_path
from tqdm import tqdm
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
import time


# Build the "002_property_count" feature: for every row of the combined
# train+test frame, count how often each of the TOP_K globally most frequent
# item_property_list tokens occurs in that row, then pickle the train and test
# slices separately.
start = time.time()
path = '../data/'
train_file = 'round1_ijcai_18_train_20180301.txt'
test_file = 'round1_ijcai_18_test_a_20180301.txt'
TOP_K = 10  # number of most frequent property tokens kept as feature columns

# sep=r'\s+' is the documented equivalent of delim_whitespace=True, which is
# deprecated in recent pandas releases.
train = pd.read_table(path + train_file, encoding='utf8', sep=r'\s+')
test = pd.read_table(path + test_file, encoding='utf8', sep=r'\s+')
# Keep the first row per instance_id (reassign rather than mutate in place).
train = train.drop_duplicates('instance_id')
test = test.drop_duplicates('instance_id')

len_train = train.shape[0]
# ignore_index=True gives df a fresh 0..n-1 index, so positional slicing by
# len_train below separates train rows from test rows.
df = pd.concat([train, test], axis=0, ignore_index=True)


# Cast to str once and use the SAME corpus for fitting and transforming; the
# original transformed the raw column, which breaks on any non-string value.
corpus = df.item_property_list.values.astype('U').tolist()
vectorizer = CountVectorizer()
countvector = vectorizer.fit_transform(corpus)

# Total occurrences of every vocabulary token over all rows.
pror_count = np.array(countvector.sum(axis=0))[0]
count = pd.Series(data=pror_count, index=np.arange(0, len(pror_count)))
# Vocabulary column indices of the TOP_K most frequent tokens.
selected_index = list(count.sort_values(ascending=False).iloc[:TOP_K].index)
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back only on old versions that lack it.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
column_name = np.array(feature_names)[selected_index]
column_value = countvector[:, selected_index]
final_feat = pd.DataFrame(column_value.toarray(), columns=column_name)
# Assign positionally: rows of final_feat are in the same order as df.
final_feat['instance_id'] = df['instance_id'].values
assert len(final_feat) == len(df), 'feature rows must match input rows'

train_feat = final_feat.iloc[:len_train, :]
test_feat = final_feat.iloc[len_train:, :]

print('the shape of train {}'.format(train_feat.shape))
print('the shape of test {}'.format(test_feat.shape))
dump_pickle(train_feat, path=raw_data_path + 'train_feature/' + '002_property_count.pkl')
dump_pickle(test_feat, path=raw_data_path + 'test_feature/' + '002_property_count.pkl')

end = time.time()
print('time elapsed {}'.format(end-start))

the shape of train (478087, 11)
the shape of test (18371, 11)
time elapsed 60.010128021240234


In [6]:
# Inspect the per-token totals: column sums of the count matrix, one entry per
# vocabulary token.
pror_count

array([30,  4,  6, ...,  1,  3,  1], dtype=int64)

In [8]:
# Inspect the vocabulary column indices of the 10 tokens with the highest
# total count.
selected_index

[12433, 31061, 61175, 41247, 1919, 27618, 7854, 47641, 5840, 14131]

In [9]:
# Inspect the sparse row-by-token count matrix produced by the vectorizer.
# NOTE(review): the recorded row count below does not match the shapes printed
# by the In [15] run — presumably a stale output from an earlier execution;
# re-run on a fresh kernel to confirm.
countvector

<496509x62368 sparse matrix of type '<class 'numpy.int64'>'
	with 16998124 stored elements in Compressed Sparse Row format>

In [10]:
# Inspect the count matrix restricted to the selected token columns.
column_value

<496509x10 sparse matrix of type '<class 'numpy.int64'>'
	with 2881149 stored elements in Compressed Sparse Row format>

In [11]:
# Shape of the dense feature frame: the selected token-count columns plus
# instance_id.
final_feat.shape

(496509, 11)

In [12]:
# Shape of the train slice of the feature frame.
# NOTE(review): the recorded output below differs from the train shape printed
# by the In [15] run — presumably from an earlier execution; re-run to confirm.
train_feat.shape

(478138, 11)

In [13]:
# Number of distinct instance_id values in the combined train+test frame.
# NOTE(review): the recorded value below is smaller than the recorded row
# count of final_feat, which suggests repeated instance_id values in the frame
# this output came from — verify after a clean re-run.
len(np.unique(df.instance_id))

496455