In [1]:
import numpy as np
import pandas as pd
import json
import urllib.request, json
import underthesea
import re
import emoji
import transformers
import torch
import joblib
from tqdm import tqdm_notebook
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler
from sklearn.feature_extraction.text import TfidfVectorizer


# Load the pretrained PhoBERT encoder and its matching tokenizer by model name.
phobert = transformers.AutoModel.from_pretrained("vinai/phobert-base")
# Only move the model to the GPU when one is actually present; an unconditional
# .cuda() raises on CPU-only hosts before any other cell can run.
if torch.cuda.is_available():
    phobert.cuda()
tokenizer = transformers.AutoTokenizer.from_pretrained("vinai/phobert-base")

In [None]:
def clean_str(string):
    """Keep only Vietnamese/Latin letters and digits in ``string``.

    Every character outside the whitelist in the pattern below is replaced by
    a single space, the result is passed through ``give_emoji_free_text`` and
    surrounding whitespace is stripped.
    """
    letters_and_digits = re.sub(
        r"[^aAàÀảẢãÃáÁạẠăĂằẰẳẲẵẴắẮặẶâÂầẦẩẨẫẪấẤậẬbBcCdDđĐeEèÈẻẺẽẼéÉẹẸêÊềỀểỂễỄếẾệỆfFgGhHiIìÌỉỈĩĨíÍịỊjJkKlLmMnNoOòÒỏỎõÕóÓọỌôÔồỒổỔỗỖốỐộỘơƠờỜởỞỡỠớỚợỢpPqQrRsStTuUùÙủỦũŨúÚụỤưƯừỪửỬữỮứỨựỰvVwWxXyYỳỲỷỶỹỸýÝỵỴzZ0-9]",
        " ",
        string,
    )
    return give_emoji_free_text(letters_and_digits).strip()

def give_emoji_free_text(text):
    """Return ``text`` with every single-character emoji removed.

    The lookup table is resolved at call time so the function works across
    ``emoji`` package layouts:

    * ``emoji.EMOJI_DATA`` when the package provides it;
    * otherwise ``emoji.UNICODE_EMOJI``, which is either a flat
      ``{emoji: name}`` dict or a dict nested by language code (in which case
      the ``'en'`` table is used).

    Testing characters directly against the nested form never matches, and the
    attribute is absent in releases that only ship ``EMOJI_DATA``.

    Only individual characters are checked, so joiners or modifiers that are
    not emoji on their own are left in place.
    """
    emoji_table = getattr(emoji, "EMOJI_DATA", None)
    if emoji_table is None:
        legacy_table = getattr(emoji, "UNICODE_EMOJI", {})
        # A nested table has language codes as keys; a flat one has no 'en' key.
        emoji_table = legacy_table.get("en", legacy_table)
    # One pass over the text instead of a str.replace per emoji character.
    return "".join(char for char in text if char not in emoji_table)

def remove_blank(s):
    """Collapse every run of consecutive spaces in ``s`` into a single space.

    Leading and trailing runs are collapsed too (not stripped). Only the space
    character ``' '`` is treated as blank; other whitespace is left untouched.
    """
    collapsed = []
    previous = ''
    for char in s:
        # Track the previous character explicitly: indexing s[i-1] at i == 0
        # wraps around to the last character and makes the handling of a
        # leading space depend on how the string ends.
        if char != ' ' or previous != ' ':
            collapsed.append(char)
        previous = char
    return ''.join(collapsed)

def text_lowercase(string):
    """Return ``string`` converted to lower case."""
    lowered = string.lower()
    return lowered

def tokenize(strings):
    """Word-segment ``strings`` with underthesea.

    The call requests ``format="text"``, so the segmented result is returned
    as text rather than as a list of tokens.
    """
    segmented = underthesea.word_tokenize(strings, format="text")
    return segmented

def remove_stopwords(strings, stopwords_path='vietnamese-stopwords.txt'):
    """Drop stop words from a whitespace-separated string.

    Args:
        strings: Text to filter; it is split on whitespace.
        stopwords_path: UTF-8 text file with one stop word per line. Defaults
            to ``'vietnamese-stopwords.txt'`` in the working directory.

    Returns:
        The remaining words joined by single spaces ('' if none remain).

    Raises:
        OSError: If the stop-word file cannot be opened.
    """
    # 'with' guarantees the handle is closed; a set gives O(1) membership tests.
    with open(stopwords_path, 'r', encoding="utf-8") as stopword_file:
        stop_words = {line.replace("\n", '') for line in stopword_file}

    kept_words = [word for word in strings.split() if word not in stop_words]
    return ' '.join(kept_words)

def text_preprocessing(strings):
    """Prepare raw text for the PhoBERT tokenizer.

    Steps, in order: ``clean_str`` -> ``text_lowercase`` -> ``tokenize``.
    """
    return tokenize(text_lowercase(clean_str(strings)))

def text_preprocessing2(strings):
    """Prepare raw text for the TF-IDF vectorizer.

    Steps, in order: ``clean_str`` -> ``text_lowercase`` -> ``remove_stopwords``.
    Unlike ``text_preprocessing`` this variant does not word-segment the text.
    """
    return remove_stopwords(text_lowercase(clean_str(strings)))

def removeEmptySentens(data):
    """Drop, in place, the rows of ``data`` whose column ``0`` is '' or missing.

    Args:
        data: DataFrame with a column labelled ``0``; it is modified in place.

    Returns:
        None.
    """
    # Assign the replaced column back to the frame. Calling
    # replace(..., inplace=True) on the selected column is a chained in-place
    # update, which pandas warns about and which does not reach the frame
    # under copy-on-write.
    data[0] = data[0].replace('', np.nan)
    data.dropna(subset=[0], inplace=True)

def returnResult(a):
    """Count predicted labels.

    Returns a tuple ``(zeros, ones)``: how many items of ``a`` equal 0 and how
    many equal 1. Items with any other value are ignored.
    """
    zeros = sum(1 for label in a if label == 0)
    ones = sum(1 for label in a if label == 1)
    return zeros, ones

def _fetch_review_texts(product_id):
    """Download the first page (limit 300) of reviews for ``product_id`` and return the non-empty texts."""
    from urllib.parse import quote  # local import: only this helper needs it

    # The id comes straight from the request path, so percent-encode it before
    # splicing it into the query string.
    url = ("https://tiki.vn/api/v2/reviews?product_id=" + quote(str(product_id), safe="")
           + "&sort=score%7Casc,id%7Casc,stars%7Call&page=1&limit=300&include=comments")
    with urllib.request.urlopen(url) as response:
        payload = json.loads(response.read())

    # A missing 'data' key or a missing/empty 'content' is treated as "no review".
    reviews = payload.get('data') or []
    return [review.get('content') for review in reviews if review.get('content')]


def _phobert_features(texts, max_len=254):
    """Encode ``texts`` with the global ``tokenizer``/``phobert`` and return one feature row per text."""
    token_ids = [tokenizer.encode(text, add_special_tokens=True) for text in texts]

    # Truncate to max_len, or right-pad with 0 up to max_len.
    padded = np.array([ids[:max_len] + [0] * (max_len - len(ids)) for ids in token_ids])
    print('padded:', padded[0])  # row 0 always exists; row 1 does not for a single review
    print('len padded:', padded.shape)

    # Positions holding id 0 are masked out.
    # NOTE(review): PhoBERT's vocabulary appears to use id 0 for <s> and a
    # different id for <pad> - confirm with tokenizer.pad_token_id. Padding and
    # mask are kept as they were so the features stay consistent with whatever
    # the saved SVM was trained on.
    attention_mask = np.where(padded == 0, 0, 1)

    # Run on whichever device the model currently lives on (GPU or CPU).
    device = next(phobert.parameters()).device
    dataset = TensorDataset(torch.tensor(padded, dtype=torch.long), torch.tensor(attention_mask))
    dataloader = DataLoader(dataset, sampler=SequentialSampler(dataset), batch_size=32)

    # Inference only: eval mode, no gradients.
    phobert.eval()
    feature_rows = []
    with torch.no_grad():
        for input_ids, mask in tqdm_notebook(dataloader):
            outputs = phobert(input_ids.to(device), attention_mask=mask.to(device))
            # Element [1] of the model output is used as the sentence feature.
            feature_rows.extend(outputs[1].tolist())
    return feature_rows


def runAPI(ids):
    """Classify the Tiki reviews of one product.

    Reviews are downloaded, preprocessed twice (once for PhoBERT, once for
    TF-IDF), turned into a feature matrix and passed to the SVM stored in
    ``save_model_nlp_svm.pkl``.

    Args:
        ids: Product id, as received from the request path.

    Returns:
        ``(n0, n1)``: the number of reviews predicted as label 0 and as label 1.
        ``(0, 0)`` when the product has no usable review text.

    Raises:
        urllib.error.URLError: If the review API cannot be reached.
        ValueError: If the feature matrix is wider than the model input.
    """
    product_id = ids
    print(product_id)

    review_texts = _fetch_review_texts(product_id)
    if not review_texts:
        return 0, 0

    # Keep a review only when BOTH preprocessed forms are non-empty, so the two
    # feature sources stay row-aligned.
    bert_texts = []
    tfidf_texts = []
    for review_text in review_texts:
        bert_text = text_preprocessing(review_text)
        tfidf_text = text_preprocessing2(review_text)
        if bert_text != '' and tfidf_text != '':
            bert_texts.append(bert_text)
            tfidf_texts.append(tfidf_text)
    if not bert_texts:
        # Every review was reduced to nothing by preprocessing.
        return 0, 0

    # NOTE(review): the vectorizer is fitted on this request's reviews, so its
    # columns are unlikely to line up with the vocabulary the SVM was trained
    # with - consider loading the fitted vectorizer saved at training time.
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(tfidf_texts).toarray()

    bert_matrix = _phobert_features(bert_texts)
    features = np.concatenate((bert_matrix, tfidf_matrix), axis=1)

    # Zero-pad the columns up to the fixed width (presumably the width the
    # saved model was trained with - confirm).
    expected_width = 5311
    missing_columns = expected_width - features.shape[1]
    if missing_columns < 0:
        raise ValueError(
            "feature matrix has %d columns but the model expects %d"
            % (features.shape[1], expected_width)
        )
    features = np.concatenate((features, np.zeros((len(features), missing_columns))), axis=1)

    # joblib.load unpickles the file: only load model files from a trusted source.
    model = joblib.load(r'save_model_nlp_svm.pkl')
    return returnResult(model.predict(features))

In [None]:
# Report which device would be used; this cell only prints, it moves nothing.
USE_GPU = True

print('using device: cuda' if USE_GPU and torch.cuda.is_available() else 'using device: cpu')

In [None]:
from flask import Flask, request
import flask
from flask import jsonify
from IPython.display import clear_output
app = Flask(__name__)


# Route 1 of the API: GET /<ids>, where <ids> is passed to runAPI as the product id.
@app.route("/<ids>", methods=["GET"])
def _hello_world(ids):
    """Handle a request: classify the product's reviews and return the two counts as JSON.

    The response object has the keys ``tt`` (first value returned by
    ``runAPI``) and ``tc`` (second value).
    """
    # Clear the notebook cell output left over from the previous request.
    clear_output()
    first_count, second_count = runAPI(ids)
    payload = {"tt": first_count, "tc": second_count}
    return jsonify(payload)


if __name__ == "__main__":
    print("App run!")
    # Start the server: all interfaces, port 80, requests handled one at a time.
    app.run(host='0.0.0.0', port=80, debug=False, threaded=False)