In [1]:
import tqdm
import os
import argparse
import config
import logging
import numpy as np
from skift import FirstColFtClassifier
from sklearn.externals import joblib
from util import load_data_from_csv, seg_words, get_f1_score, seg_words_multiprocessor

# Configure the root logger at INFO level. Each record is prefixed with the
# timestamp, the level, the process name and the thread name.
logging.basicConfig(
    format='%(asctime)s [%(levelname)s] <%(processName)s> (%(threadName)s) %(message)s',
    level=logging.INFO,
)
# Logger shared by the remaining cells of this notebook.
logger = logging.getLogger(__name__)

In [2]:
# Hyperparameters passed to FirstColFtClassifier in the training cell.
learning_rate, epoch = 1.0, 10
word_ngrams, min_count = 3, 1

# File name of the pickled classifiers; it embeds the four hyperparameters above.
model_name = 'fasttext_model_lr{}_e{}_n{}_c{}.pkl'.format(
    learning_rate,
    epoch,
    word_ngrams,
    min_count,
)

In [3]:
# Load the training and validation frames (in that order) from the CSV paths
# declared in `config`.
train_data_df, validate_data_df = (
    load_data_from_csv(csv_path)
    for csv_path in (config.train_data_path, config.validate_data_path)
)

# The second column of the training frame holds the raw text to classify.
content_train = train_data_df.iloc[:, 1]

In [None]:
# Preview the first rows of the training frame.
train_data_df.head()

In [None]:
logger.info("start seg train data")
# Segment straight from the raw text column instead of from `content_train`
# itself. The previous self-referential form (`content_train = f(content_train)`)
# fed already segmented text back into the segmenter whenever this cell was
# re-run; reading from the DataFrame keeps the cell idempotent.
content_train = seg_words_multiprocessor(train_data_df.iloc[:, 1])
logger.info("complete seg train data")

In [None]:
# Inspect the first entry of the segmented training text.
content_train[0]

In [None]:
logger.info("prepare train format")
# Wrap the segmented texts in an outer list and transpose the resulting array
# (presumably giving one sample per row in a single column -- confirm against
# the output of seg_words_multiprocessor).
train_data_format = np.transpose(np.asarray([content_train]))
logger.info("complete formate train data")

# Cache the array on disk; np.save appends the ".npy" suffix to the name.
np.save(file='train_data_format', arr=train_data_format)

In [6]:
# Restore the cached training array written by np.save('train_data_format', ...).
train_data_format = np.load(file='train_data_format.npy')

In [None]:
# Show the array's shape together with its first row. A cell only renders its
# final expression, so a bare `.shape` line followed by another expression is
# silently discarded; returning both as a tuple makes both visible.
train_data_format.shape, train_data_format[0]

In [8]:
# Column names of the training frame as a plain Python list.
columns = train_data_df.columns.tolist()

In [8]:
# Display the list of column names.
columns

['id',
 'content',
 'location_traffic_convenience',
 'location_distance_from_business_district',
 'location_easy_to_find',
 'service_wait_time',
 'service_waiters_attitude',
 'service_parking_convenience',
 'service_serving_speed',
 'price_level',
 'price_cost_effective',
 'price_discount',
 'environment_decoration',
 'environment_noise',
 'environment_space',
 'environment_cleaness',
 'dish_portion',
 'dish_taste',
 'dish_look',
 'dish_recommendation',
 'others_overall_experience',
 'others_willing_to_consume_again']

In [9]:
# Train one fastText classifier per label column, then pickle them together.
# The first two entries of `columns` are skipped (in this dataset they are
# 'id' and 'content'); every remaining column is treated as a label.
classifier_dict = {}
for column in tqdm.tqdm(columns[2:]):
    train_label = train_data_df[column]
    sk_clf = FirstColFtClassifier(lr=learning_rate, epoch=epoch,
                                  wordNgrams=word_ngrams,
                                  minCount=min_count, verbose=1)
    sk_clf.fit(train_data_format, train_label)
    classifier_dict[column] = sk_clf

model_path = config.model_path
# exist_ok=True replaces the earlier exists()-then-makedirs() pair, which could
# fail if the directory appeared between the check and the creation.
os.makedirs(model_path, exist_ok=True)
# The file name is appended by plain concatenation, so `config.model_path` is
# expected to end with a path separator.
joblib.dump(classifier_dict, model_path + model_name)

100%|██████████| 20/20 [07:48<00:00, 23.04s/it]


['/jet/prs/workspace/fasttext/data/model/fasttext_model_lr1.0_e10_n3_c1.pkl']

In [None]:
# Writes the classifiers to the same path again; the training cell already
# ends with this exact call, so this cell is only needed to re-save manually.
joblib.dump(classifier_dict, model_path + model_name)

In [5]:
# Reload the dictionary of per-column classifiers from disk.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code --
# only load model files that were produced by this project.
model_path = config.model_path
classifier_dict = joblib.load(model_path + model_name)

In [10]:
# Optional memory cleanup. The deletions below are disabled, so the collect
# call only reclaims objects that are already unreachable.
# del train_data_format
# del train_data_df
# del validate_data_df
import gc
gc.collect()

0

In [None]:
# Build the validation input array the same way as the training one:
# take the second column of the frame, segment it, wrap and transpose.
content_validata = validate_data_df.iloc[:, 1]

logger.info("start seg validata data")
content_validata = seg_words(content_validata)
logger.info("complet seg validata data")

logger.info("prepare valid format")
validata_data_format = np.asarray([content_validata]).T
# This message used to read "train data" (copied from the training cell),
# which made the log claim the wrong dataset had been formatted.
logger.info("complete format valid data")

# Cache the array on disk; np.save appends the ".npy" suffix to the name.
np.save('validata_data_format', validata_data_format)

In [6]:
# Restore the cached validation array written by np.save('validata_data_format', ...).
validata_data_format = np.load(file='validata_data_format.npy')

In [9]:
logger.info("start compute f1 score for validata model")

# Score every label column with its own classifier on the validation array.
f1_score_dict = {}
for column in columns[2:]:
    classifier = classifier_dict[column]
    true_label = np.asarray(validate_data_df[column])
    pred_label = classifier.predict(validata_data_format).astype(int)
    f1_score_dict[column] = f1_score = get_f1_score(true_label, pred_label)

# From here on `f1_score` holds the unweighted mean over all label columns.
f1_score = np.mean(list(f1_score_dict.values()))

# Report the per-column scores first, then the overall mean.
for column in columns[2:]:
    logger.info("{:}: {:.2f}".format(column, f1_score_dict[column]))

logger.info("f1_score: %s", f1_score)
logger.info("complete compute f1 score for validate model")

2019-04-01 15:30:58,428 [INFO] <MainProcess> (MainThread) start compute f1 score for validata model
2019-04-01 15:32:12,435 [INFO] <MainProcess> (MainThread) location_traffic_convenience: 0.51
2019-04-01 15:32:12,437 [INFO] <MainProcess> (MainThread) location_distance_from_business_district: 0.41
2019-04-01 15:32:12,438 [INFO] <MainProcess> (MainThread) location_easy_to_find: 0.60
2019-04-01 15:32:12,439 [INFO] <MainProcess> (MainThread) service_wait_time: 0.49
2019-04-01 15:32:12,439 [INFO] <MainProcess> (MainThread) service_waiters_attitude: 0.68
2019-04-01 15:32:12,440 [INFO] <MainProcess> (MainThread) service_parking_convenience: 0.56
2019-04-01 15:32:12,441 [INFO] <MainProcess> (MainThread) service_serving_speed: 0.57
2019-04-01 15:32:12,442 [INFO] <MainProcess> (MainThread) price_level: 0.66
2019-04-01 15:32:12,443 [INFO] <MainProcess> (MainThread) price_cost_effective: 0.60
2019-04-01 15:32:12,444 [INFO] <MainProcess> (MainThread) price_discount: 0.56
2019-04-01 15:32:12,446 [IN