In [1]:
# Domain classification using XGBoost
!free

              total        used        free      shared  buff/cache   available
Mem:      164620976   145280028    10034976      108368     9305972    18224180
Swap:      16777212    16777100         112


In [2]:
import sklearn
import pandas as pd
from io import open as open_unicode
import csv
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.externals import joblib
from sklearn.metrics import accuracy_score
from collections import Counter
from xgboost import XGBClassifier
import time
import os



In [3]:
# Language code; embedded in every input/output path and written to the accuracy report.
MODE = 'vi' # MODIFY
# Month tag (looks like YYYYMM — confirm); embedded in file names and written
# to the 'date' column of the prediction CSV.
MONTH = '202001' # MODIFY
# Written verbatim to the 'tree_version' column of the prediction CSV.
TREE_VERSION = '8'

In [4]:
# Base directory of this pipeline; the input and output paths of the notebook are built from it.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
current_root = '/home/polynote/notebooks/anhdt157/classify-use-ml/end-to-end-final-predict/'
# NOTE(review): not referenced by any of the visible cells.
root_dir = '/home/polynote/notebooks/anhdt157/'

In [5]:
# OUTPUT - two pickles: the fitted classifier and the fitted vectorizer
output_model = ''.join(
    (current_root, 'model/model-each-month/', MODE, '-model-', MONTH, '.pkl'))
output_vectorizer = ''.join(
    (current_root, 'model/model-each-month/', MODE, '-vectorizer-', MONTH, '.pkl'))
# OUTPUT - per-domain prediction results (CSV)
output_result_predict = ''.join(
    (current_root, 'result_predict/result_each_month/', MODE, '-result-accuracy-', MONTH, '.csv'))
# OUTPUT - plain-text accuracy report
output_result_accuracy = ''.join(
    (current_root, 'result_predict/result_each_month/', MODE, '-only-accuracy-', MONTH, '.txt'))

In [6]:
# Train and test splits used to fit the model and to compute accuracy.
# NOTE(review): the test split is also read from the 'train/' directory — confirm this is intended.
input_train_main_use = ''.join(
    (current_root, 'train_test_data/train/', MODE, '/', MODE, '-train-', MONTH, '.csv'))
input_test_main_use = ''.join(
    (current_root, 'train_test_data/train/', MODE, '/', MODE, '-test-', MONTH, '.csv'))

In [7]:
# '|'-delimited file with the IID <-> label mapping for this language and month.
input_dict_IID_use = ''.join(
    (current_root, 'train_test_data/dict_IID_label/', MODE, '-dict-IID-label-', MONTH, '.csv'))

In [8]:
# Load the '|'-delimited train and test splits.
# `sep` is passed by keyword: pandas 2.x accepts only the path positionally.
df_train_main_use = pd.read_csv(input_train_main_use, sep="|")
df_test_main_use = pd.read_csv(input_test_main_use, sep="|")

In [9]:
# Row count of the training frame (sanity check after loading).
len(df_train_main_use)

104021

In [10]:
# Row count of the test frame (sanity check after loading).
len(df_test_main_use)

11613

## BY MODE

In [11]:
# Create a folder (and any missing parents) if it does not exist yet.
def util_mkdir(dir):
    """Ensure that directory ``dir`` exists.

    Missing parent directories are created as well, and a concurrent
    creation of the same directory by another process is tolerated.
    If ``dir`` already exists (as a directory or any other file-system
    entry) nothing is done.

    Args:
        dir: Path of the directory to create.
    """
    if not os.path.exists(dir):
        # exist_ok covers the race where the directory appears between
        # the check above and this call.
        os.makedirs(dir, exist_ok=True)

In [12]:
# Load the IID <-> label mapping file and return both lookup directions.
def read_dict_IID_label_file(path_dict):
    """Read a '|'-delimited file with 'IID' and 'label' columns.

    Args:
        path_dict: Path of the mapping file.

    Returns:
        A tuple ``(dict_label_to_IID, dict_IID_to_label)``. When a key
        occurs on several rows, the last row wins.
    """
    mapping_table = pd.read_csv(path_dict, delimiter='|')
    pairs = [(record['IID'], record['label'])
             for _, record in mapping_table.iterrows()]
    iid_to_label = {iid: label for iid, label in pairs}
    label_to_iid = {label: iid for iid, label in pairs}
    return label_to_iid, iid_to_label

In [13]:
# Module-level lookups (label -> IID and IID -> label) read by the helper functions and the classifier class in this notebook.
dict_label_to_IID_use, dict_IID_to_label_use = read_dict_IID_label_file(input_dict_IID_use)

In [14]:
def keep_top_value_in_dict(input_dict, number=5):
    """Return a new dict holding the ``number`` entries with the largest values.

    Entries are ordered by value, largest first; entries with equal values
    keep their original relative order.
    """
    ranked = list(input_dict.items())
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return dict(ranked[:number])

In [15]:
def create_dict_domain_ID(df_inner_test):
    """Map each row position (0-based, in iteration order) to its domain info.

    Args:
        df_inner_test: DataFrame with 'domain_id' and 'domain' columns.

    Returns:
        A tuple ``(position -> domain_id, position -> domain)``. Positions
        are counted from 0 and are independent of the frame's index labels.
    """
    position_to_id = {}
    position_to_name = {}
    for position, (_, record) in enumerate(df_inner_test.iterrows()):
        position_to_id[position] = record['domain_id']
        position_to_name[position] = record['domain']
    return position_to_id, position_to_name

In [16]:
# Row position in df_test_main_use -> domain id / domain name; used when writing the prediction CSV.
dict_index_with_domain_ID_use, dict_index_with_domain_clean_use = create_dict_domain_ID(df_test_main_use)

In [17]:
# Build a dict that maps every distinct value of the `type_` column
# ('IID', 'level_1', 'level_2' or 'level_3') to the list of
# row['title_desc_preprocess'] texts that carry that value.
# Example: {'A999999': ['abc', 'xyz', 'mnp'], 'B010202': ['ppp', 'mmm']}
def generate_list_des_domain(dfx, type_):
    """Group the preprocessed texts of ``dfx`` by the ``type_`` column.

    Args:
        dfx: DataFrame with a ``type_`` column and a
            'title_desc_preprocess' column.
        type_: Name of the column whose values become the dict keys.

    Returns:
        Dict mapping each key (in order of first appearance) to the list
        of its texts, in row order. Also prints the number of rows.
    """
    dict_tokenized_vi = {}
    for _, row in dfx.iterrows():
        # setdefault gives a constant-time membership test on the dict
        # instead of scanning a separate list of keys for every row.
        dict_tokenized_vi.setdefault(row[type_], []).append(row['title_desc_preprocess'])
    print('length-df = ', len(dfx))
    return dict_tokenized_vi

In [18]:
# Return the selected labels and their IIDs for one prediction.
def get_list_IID_label(dict_str):
    """Pick the labels to report from a ``label -> probability`` dict.

    The first entry of ``dict_str`` is treated as the best candidate, so
    the dict is presumably ordered by descending probability — verify
    against the caller.

    * If the first entry's probability is below 0.1, only that label is
      returned.
    * Otherwise the labels with probability >= 0.1 are returned in dict
      order, at most three of them.

    Returns:
        A tuple ``(labels, IIDs)`` of equal-length lists; IIDs are looked
        up in the module-level ``dict_label_to_IID_use``.
    """
    first_label = list(dict_str)[0]

    if dict_str[first_label] < 0.1:
        chosen_labels = [first_label]
    else:
        chosen_labels = []
        for candidate, probability in dict_str.items():
            if len(chosen_labels) >= 3:
                break
            if probability >= 0.1:
                chosen_labels.append(candidate)

    chosen_iids = [dict_label_to_IID_use[candidate] for candidate in chosen_labels]
    return chosen_labels, chosen_iids

In [19]:
# Wall-clock reference taken when this cell runs; process_and_save_prob_data reports elapsed time relative to it.
start_time = time.time()
class Classification:
    """TF-IDF + XGBoost text classifier with train / evaluate / persist helpers.

    The class reads its data frames, lookup dictionaries and output paths
    from notebook-level globals (``df_train_main_use``, ``df_test_main_use``,
    ``dict_IID_to_label_use``, ``dict_index_with_domain_ID_use``,
    ``output_model`` ...), so the cells defining them must run before
    ``run()`` is called.
    """

    def __init__(self, root_dir='.'):
        """Create an untrained classifier.

        Args:
            root_dir: Base directory; only used to derive ``self.result``.
        """
        self.model = None
        self.vectorizer = None
        self.root_dir = root_dir
        self.result = os.path.join(self.root_dir, 'result')
        self.list_test_domains = []

    def evaluation(self, x, y):
        """Predict on ``x``, print the accuracy and write the text report.

        Args:
            x: Vectorized feature matrix (one row per sample).
            y: True labels, aligned with the rows of ``x``.

        Returns:
            Tuple ``(y, y_pred, y_prob)``: the true labels, the predicted
            labels and the per-class probabilities.
        """
        count = Counter(y)
        print('count', count)
        # Number of samples (rows), not the number of matrix dimensions.
        print('len-x=', x.shape[0])
        print('len-y=', len(y))
        y_pred = self.model.predict(x)
        y_prob = self.model.predict_proba(x)
        accuracy = accuracy_score(y, y_pred)
        print('accuracy score = %.5f' % (accuracy))

        # Writing report file
        print("writing report file...", output_result_accuracy)
        with open_unicode(output_result_accuracy, "w", encoding='utf-8') as f_file:
            f_file.write('LANGUAGE: ' + MODE + '\n')
            f_file.write('RESULT - MONTH: ' + MONTH + '\n')
            f_file.write('REPORT - accuracy score = ' + str(accuracy) + '\n')
            f_file.write('length-train = ' + str(len(df_train_main_use)) + '\n')
            f_file.write('length-test = ' + str(len(df_test_main_use)) + '\n')
        print("=======================")
        return y, y_pred, y_prob

    def save(self, model, path):
        """Serialize ``model`` to ``path`` with joblib (compressed).

        The parent directory of ``path`` is created when missing so the
        dump does not fail on a fresh output tree.
        """
        print('saving %s...' % (path))
        target_dir = os.path.dirname(path)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)
        joblib.dump(model, path, compress=True)
        print('Done')
        print("=======================")
        return

    def feature_extraction(self, X):
        """Return the TF-IDF matrix for the texts in ``X``.

        On the first call (no vectorizer yet) a new ``TfidfVectorizer`` is
        fitted on ``X``; later calls reuse the fitted vectorizer.
        """
        if self.vectorizer is None:
            self.vectorizer = TfidfVectorizer()
            self.vectorizer.fit(X)
            # Drop the stored stop-word set before pickling; it is not
            # needed by transform().
            self.vectorizer.stop_words_ = None
        return self.vectorizer.transform(X)

    def save_model(self):
        """Persist the fitted vectorizer and model to their output paths."""
        self.save(self.vectorizer, output_vectorizer)
        self.save(self.model, output_model)

    def load_model(self):
        """Load vectorizer and model from their output paths (None if absent)."""
        self.vectorizer = self.load(output_vectorizer)
        self.model = self.load(output_model)

    def load(self, model):
        """Load a joblib file from path ``model``; return None if it does not exist.

        NOTE(review): joblib files are pickles — only load trusted files.
        """
        print('loading %s ...' % (model))
        if os.path.isfile(model):
            return joblib.load(model)
        else:
            return None

    def predict(self, list_domains):
        """Vectorize ``list_domains`` and return the predicted labels."""
        X = self.feature_extraction(list_domains)
        return self.model.predict(X)

    def training(self):
        """Fit on the training frame, evaluate on the test frame, save outputs."""
        # TRAIN: group the texts per IID, then flatten into parallel lists.
        print('leng---train----->' + str(len(df_train_main_use)))
        samples_train = generate_list_des_domain(df_train_main_use, 'IID')
        x_train, y_train = self.prepare_data(samples_train)
        print(x_train[0])
        x_train_vec = self.feature_extraction(x_train)
        # Show only a short preview; printing every label floods the output.
        print(y_train[:20])
        print('===========++++++++++++++++++==============')
        print('len x', x_train_vec.shape)
        print('len y', len(y_train))
        self.fit(x_train_vec, y_train)
        # ===================
        # TEST: keep the rows in their original order. The prediction CSV
        # looks up domain id / name by row position in df_test_main_use,
        # so regrouping the rows by IID here would pair predictions with
        # the wrong domains.
        print('leng---test----->' + str(len(df_test_main_use)))
        x_test, y_test = self._prepare_rows_in_order(df_test_main_use)
        x_test_vec = self.feature_extraction(x_test)
        y_truth, y_pred, y_prob = self.evaluation(x_test_vec, y_test)
        self.process_and_save_prob_data(y_truth, y_pred, y_prob,
                                        output_result_predict,
                                        x_test)
        self.save_model()

    def _prepare_rows_in_order(self, dfx):
        """Return ``(texts, labels)`` with one entry per row of ``dfx``, in row order."""
        texts = dfx['title_desc_preprocess'].tolist()
        labels = [dict_IID_to_label_use[iid] for iid in dfx['IID']]
        return texts, labels

    def process_and_save_prob_data(self, y_true, y_pre, y_prob, output_path, x_true):
        """Write the selected IIDs of every test sample to a '|'-delimited CSV.

        Args:
            y_true: True labels (not used when writing).
            y_pre: Predicted labels; only its length is used.
            y_prob: Per-sample class probabilities.
            output_path: Destination CSV path.
            x_true: Input texts, aligned with ``y_prob``.

        Sample ``i`` must correspond to row position ``i`` of the test
        frame, because domain id and name are looked up by that position.
        """
        # One dict per sample: the (up to five) most probable class
        # indices, e.g. [{0: 0.98, 3: 0.64, ...}, {...}, ...].
        # NOTE(review): the column index of predict_proba is used as the
        # label — assumes labels are exactly 0..K-1; confirm.
        list_each_dict = [
            keep_top_value_in_dict(dict(enumerate(row_prob)))
            for row_prob in y_prob
        ]
        # =============
        # Save data
        print("Writing data to csv file '" + output_path + "'")
        # newline='' is what the csv module expects; utf-8 keeps the
        # Vietnamese text intact regardless of the platform default.
        with open(output_path, "w", newline='', encoding='utf-8') as f:
            f_writer = csv.writer(f, delimiter='|')
            f_writer.writerow(
                ['domain_ID', 'domain_clean', 'IID',
                 'title_desc_preprocess',
                 'date', 'tree_version'])
            for i in range(len(y_pre)):
                # One output row per selected IID of this domain.
                _, IID_y_predict = get_list_IID_label(list_each_dict[i])
                for IID_ in IID_y_predict:
                    f_writer.writerow(
                        [dict_index_with_domain_ID_use[i],
                         dict_index_with_domain_clean_use[i],
                         IID_, x_true[i], MONTH, TREE_VERSION])
        time_process = time.time() - start_time
        print("Done writing csv data")
        print("=======================")
        print("PROCESSED TIME: ", str(time_process))

    def predict_exist_model(self):
        """Placeholder; not implemented."""
        pass

    def prepare_data(self, dataset):
        """Flatten ``{IID: [text, ...]}`` into parallel ``(texts, labels)`` lists.

        Labels are looked up in the module-level ``dict_IID_to_label_use``.
        """
        x = []
        y = []
        for name, list_domains in dataset.items():
            label = dict_IID_to_label_use[name]
            for domain_title in list_domains:
                y.append(label)
                x.append(domain_title)
        return x, y

    def fit(self, x, y):
        """Fit a fresh ``XGBClassifier`` (default parameters) on ``x``, ``y``."""
        print('fit model...')
        self.model = XGBClassifier()
        self.model.fit(x, y)

    def run(self):
        """Entry point: run the full train / evaluate / save pipeline."""
        self.training()

In [20]:
# Train on the training frame, evaluate on the test frame, write the report and
# prediction files, then persist the fitted vectorizer and model.
c = Classification()
c.run()

leng---train----->104021
length-df =  104021
vietdesi thiết_kế in_ấn quảng_cáo thiết_kế kế web logo banner chuyên_nghiệp cao cấpchuẩn seo
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

# END