In [1]:
import logging
logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s', level=logging.INFO)

In [2]:
# extract text from html_document
def extract_text(html_document):
        from bs4 import BeautifulSoup
        article_soup = BeautifulSoup(html_document, 'lxml')
        
        text = article_soup.get_text(strip=True)
        text = text.strip().replace(' ', '')
        return text

In [3]:
def get_corpus_from_mysql(sql_database, sql_table, column_name):
    article_text = ""

    import mysql.connector
    sql_db = mysql.connector.connect(
        host = '127.0.0.1',
        user = 'root',
        password = '',
        database = sql_database
    )

    cursor = sql_db.cursor()

    # to check whether TABLE exists or not
    cursor.execute("SHOW TABLES LIKE '{}'".format(sql_table))
    temp_result = cursor.fetchone()
    if (temp_result):
        logging.info("TABLE {} exists in DATABASE {}".format(sql_table, sql_database))
    else:
        logging.info("TABLE {} exists in DATABASE {}".format(sql_table, sql_database))
        
        
    # select the value of specific column
    select_sql = "SELECT {}, {} FROM {}".format(column_name[0], column_name[1], sql_table)
    logging.info("SELECT syntax: {}".format(select_sql))
    cursor.execute(select_sql)
    result = cursor.fetchall()
    
    return result

In [4]:
sql_database = 'contentanalysis'
sql_table = 'articles'
column_name = ('id', 'content')
result = get_corpus_from_mysql(sql_database, sql_table, column_name)

# get text
logging.info("start to extract text from the html documnet")
content = []
for ad in result:
    text = extract_text(ad[1])
    content.append([ad[0], text])
    if (ad[0]%1000) == 0:
        logging.info("Have extracted {} text".format(ad[0]))

# get word embedding from bert-as-server
logging.info("start to get sentence vector from BERT-as-server")
vector = []
#######REMEMBER to start the server#######################
from bert_serving.client import BertClient
bertclient = BertClient()
for ad in content:
    #logging.info(len(ad[1]))
    if len(ad[1]) == 0:
        vec = ['-']*768
        vector.append([ad[0], vec])
    else:
        vec = bertclient.encode([ad[1]])
        vector.append([ad[0], vec[0]])
    if (ad[0]%1000) == 0:
        logging.info("Have generated {} sentence vector".format(ad[0]))
print(vector[:5])
logging.info("sentence vector DONE")

2021-09-15 14:58:21,092: INFO: TABLE articles exists in DATABASE contentanalysis
2021-09-15 14:58:21,103: INFO: SELECT syntax: SELECT id, content FROM articles
2021-09-15 14:58:28,025: INFO: start to extract text from the html documnet
2021-09-15 14:58:30,130: INFO: Have extracted 1000 text
2021-09-15 14:58:32,280: INFO: Have extracted 2000 text
2021-09-15 14:58:34,922: INFO: Have extracted 3000 text
2021-09-15 14:58:38,174: INFO: Have extracted 4000 text
2021-09-15 14:58:41,066: INFO: Have extracted 5000 text
2021-09-15 14:58:44,086: INFO: Have extracted 6000 text
2021-09-15 14:58:47,606: INFO: Have extracted 7000 text
2021-09-15 14:58:51,103: INFO: Have extracted 8000 text
2021-09-15 14:58:53,313: INFO: Have extracted 9000 text
2021-09-15 14:58:55,131: INFO: Have extracted 10000 text
2021-09-15 14:58:56,793: INFO: Have extracted 11000 text
2021-09-15 14:58:58,290: INFO: Have extracted 12000 text
2021-09-15 14:58:59,049: INFO: Have extracted 16000 text
2021-09-15 14:59:00,434: INFO: H

2021-09-16 07:32:41,340: INFO: Have generated 38000 sentence vector
2021-09-16 08:04:04,487: INFO: Have generated 39000 sentence vector
2021-09-16 08:35:32,803: INFO: Have generated 40000 sentence vector
2021-09-16 09:06:58,191: INFO: Have generated 41000 sentence vector
2021-09-16 09:37:54,837: INFO: Have generated 42000 sentence vector
2021-09-16 10:09:08,590: INFO: Have generated 43000 sentence vector
2021-09-16 10:28:40,742: INFO: Have generated 44000 sentence vector
2021-09-16 11:00:06,761: INFO: Have generated 45000 sentence vector
2021-09-16 11:31:34,309: INFO: Have generated 46000 sentence vector
2021-09-16 12:03:03,138: INFO: Have generated 47000 sentence vector
2021-09-16 12:34:21,127: INFO: Have generated 48000 sentence vector
2021-09-16 12:36:07,222: INFO: Have generated 50000 sentence vector
2021-09-16 13:37:07,911: INFO: Have generated 51000 sentence vector
2021-09-16 14:08:26,914: INFO: Have generated 52000 sentence vector
2021-09-16 14:39:51,757: INFO: Have generated 53

[[1, array([ 2.02674717e-02,  7.57471561e-01, -4.66564029e-01, -1.12558909e-01,
       -3.34122032e-02, -1.41966462e+00,  4.18391317e-01, -7.26783514e-01,
        5.53853773e-02, -2.35040665e-01, -4.52869594e-01, -2.62251526e-01,
        1.38920680e-01, -1.02300324e-01,  1.05231786e+00, -1.07180104e-01,
        3.92980993e-01, -1.83953956e-01,  3.90699476e-01,  1.35855451e-01,
       -4.10960644e-01,  5.45931101e-01, -5.04030697e-02,  1.05124459e-01,
        9.32210684e-01, -1.83526814e-01,  2.96283603e-01, -1.71599805e-01,
        1.39829546e-01, -3.80621821e-01, -8.92593339e-02, -1.98223186e-03,
        3.40232670e-01, -7.26842701e-01, -6.70038583e-03,  4.93229926e-01,
       -3.62546086e-01, -1.42731100e-01,  7.73018062e-01, -1.04714490e-01,
       -7.92013556e-02, -8.72298837e-01, -3.00289810e-01,  4.25681055e-01,
       -4.69243348e-01, -5.31427525e-02,  2.22486436e-01,  1.61917537e-01,
        4.32501584e-02, -5.75746521e-02, -7.18718708e-01,  8.66720009e+00,
        1.28066942e-

In [5]:
%%time
import pandas as pd
all_list = []
for v in vector:
    temp_l = []
    temp_l.append(v[0])
    for value in v[1]:
        temp_l.append(value)
    all_list.append(temp_l)
print(all_list[:5])
col_name = ['article_id']
for i in range(768):
    col_name.append("bert_vector_{}".format(i))
    
all_df = pd.DataFrame(all_list, columns=col_name)
# export the result as a csv file
file_name = 'bert_as_server_vector_v2-512.csv'
all_df.to_csv(file_name, index=False)
logging.info("Saved all BERT vector as {}".format(file_name))
all_df.head(5)

[[1, 0.020267472, 0.75747156, -0.46656403, -0.11255891, -0.033412203, -1.4196646, 0.41839132, -0.7267835, 0.055385377, -0.23504066, -0.4528696, -0.26225153, 0.13892068, -0.10230032, 1.0523179, -0.1071801, 0.392981, -0.18395396, 0.39069948, 0.13585545, -0.41096064, 0.5459311, -0.05040307, 0.10512446, 0.9322107, -0.18352681, 0.2962836, -0.1715998, 0.13982955, -0.38062182, -0.089259334, -0.0019822319, 0.34023267, -0.7268427, -0.006700386, 0.49322993, -0.3625461, -0.1427311, 0.77301806, -0.10471449, -0.079201356, -0.87229884, -0.3002898, 0.42568105, -0.46924335, -0.053142752, 0.22248644, 0.16191754, 0.04325016, -0.057574652, -0.7187187, 8.6672, 0.12806694, 0.38339704, -1.2156874, -0.06698326, 0.19969922, 0.7663105, -0.05637453, -0.9939056, 0.15783726, -0.40328723, -0.17959341, 0.120717056, -0.07817364, -0.056609575, -0.2980815, 0.3424655, 0.66119015, 0.12653387, 0.24351135, 0.1117923, 0.2096138, -0.23489137, 1.0055087, -0.050649613, -0.7995177, -0.052079648, 0.054094665, -0.39085862, -0.10

2021-09-17 14:26:47,938: INFO: Saved all BERT vector as bert_as_server_vector_v2-512.csv


Wall time: 2min 2s


Unnamed: 0,article_id,bert_vector_0,bert_vector_1,bert_vector_2,bert_vector_3,bert_vector_4,bert_vector_5,bert_vector_6,bert_vector_7,bert_vector_8,...,bert_vector_758,bert_vector_759,bert_vector_760,bert_vector_761,bert_vector_762,bert_vector_763,bert_vector_764,bert_vector_765,bert_vector_766,bert_vector_767
0,1,0.0202675,0.757472,-0.466564,-0.112559,-0.0334122,-1.41966,0.418391,-0.726784,0.0553854,...,0.475443,0.578828,0.251246,0.337179,-0.043612,0.104732,0.123725,0.26182,-0.153349,0.209349
1,2,-,-,-,-,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,-
2,4,1.00211,-0.0622116,-0.234405,0.0806226,0.292167,-0.993474,-0.0271607,0.221724,-0.120717,...,-0.0480543,0.101174,0.0944484,0.263728,-0.216997,0.333728,0.149238,0.116588,-0.238715,0.327553
3,5,0.87569,0.177629,-0.273085,-0.0341245,-0.194976,-1.38861,0.0671176,-0.0742332,0.0493314,...,-0.0981333,-0.21729,0.40726,0.453563,-0.317844,0.503338,0.0391796,0.32122,-0.430772,0.316583
4,6,0.902296,0.0502674,-0.295325,0.068135,-0.265932,-1.1516,-0.145657,0.135472,0.0543197,...,-0.176754,-0.101848,0.0861506,0.0852659,-0.58392,0.641287,0.115043,0.203288,-0.314522,0.207483


In [8]:
print(len(all_df))

90840
