In [None]:
# Import the necessary libraries
import os
import re
import urllib.request

import pandas as pd
from bs4 import BeautifulSoup
from nltk import ngrams
from scipy import sparse as sp
from sklearn import metrics
from sklearn import svm
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from underthesea import word_tokenize

# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in
# 0.23; prefer the standalone package and keep the old path only as a fallback.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

In [None]:
# Corpus: 6024 VTC News articles; labels are 0 = economy, 1 = sport,
# 2 = education, 3 = health, 4 = technology.
emb = None  # module-level slot for the TF-IDF vectorizer; assigned inside embedding()
df = pd.read_csv('corpus.csv', encoding='utf-8')  # UTF-8 because the articles are Vietnamese
# Preview the first ten rows, then the (rows, columns) shape.
print(df.head(10), df.shape, sep='\n')

In [None]:
def tokenizer(row):
    """Word-segment one document and return the segmenter's result.

    Delegates to ``word_tokenize`` with ``format="text"``.
    NOTE(review): with that format the library is expected to return a single
    string rather than a list of tokens -- confirm against the underthesea docs.
    """
    segmented = word_tokenize(row, format="text")
    return segmented
def embedding(X_train, X_test, vectorizer_path='tfidf.pkl'):
    """Fit a TF-IDF vectorizer on the training documents and vectorize both splits.

    The vectorizer is fitted on ``X_train`` only and then applied to ``X_test``,
    so the test documents do not influence the learned vocabulary or weights.

    Side effects:
        * the fitted vectorizer is stored in the module-level ``emb``;
        * the fitted vectorizer is pickled to ``vectorizer_path`` with joblib.

    Args:
        X_train: iterable of training documents (already tokenized text).
        X_test: iterable of test documents (already tokenized text).
        vectorizer_path: where to persist the fitted vectorizer. Defaults to
            ``'tfidf.pkl'``, the file the prediction cells load.

    Returns:
        A ``(train_matrix, test_matrix)`` tuple of TF-IDF feature matrices.
    """
    global emb
    vectorizer = TfidfVectorizer(min_df=5, max_df=0.8, max_features=3000, sublinear_tf=True)
    # fit_transform learns the vocabulary and vectorizes the training set in
    # one pass instead of a separate fit() followed by transform().
    train_matrix = vectorizer.fit_transform(X_train)
    test_matrix = vectorizer.transform(X_test)

    # Publish the vectorizer only once it is fitted, so a failed fit never
    # leaves an unusable object in the global.
    emb = vectorizer

    # Persist for reuse at prediction time.
    joblib.dump(vectorizer, vectorizer_path)
    return train_matrix, test_matrix

In [None]:
# Word-segment every article. The "content" column is overwritten, so
# re-running this cell would segment already-segmented text.
content_segmented = df["content"].apply(tokenizer)
df["content"] = content_segmented
print(content_segmented)

In [None]:
# Hold out 20% of the articles for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df["content"],
    df["label"],
    test_size=0.2,
    random_state=42,
)

In [None]:
# Replace the raw-text splits with their TF-IDF matrices. The names are
# rebound, so this cell must not be re-run without re-running the split first.
X_train, X_test = embedding(X_train, X_test)
# Show the matrix, its shape and its concrete type.
print(X_train, X_train.shape, type(X_train), sep='\n')

In [None]:
# Train a linear-kernel SVM (C=5) on the TF-IDF features; fit() returns the
# estimator itself, so construction and training can be chained.
model = svm.SVC(C=5, kernel='linear').fit(X_train, y_train)
# Persist the trained classifier for the prediction cells.
joblib.dump(model, 'saved_model.pkl')

In [None]:
# Accuracy check on the held-out split.
test_accuracy = model.score(X_test, y_test)
print("Model score=", test_accuracy)
print("Done")

In [None]:
def crawl(url):
    """Download a VTC News article and return its cleaned body text.

    The page text is extracted with BeautifulSoup, and the article body is
    taken to be everything that follows the ``(VTC News) -`` marker up to the
    ``(adsvtc = window.adsvtc || []).push();`` ad snippet. The body is
    lower-cased, and every run of digits / punctuation is replaced by a single
    space.

    Args:
        url: address of the article to download.

    Returns:
        The cleaned body text. If the ``(VTC News) -`` marker is not found, an
        error line is printed and the raw, uncleaned page text is returned
        instead (best-effort fallback).
    """
    # The context manager closes the HTTP response even if read() fails.
    with urllib.request.urlopen(url) as response:
        html = response.read()
    text = BeautifulSoup(html, "html.parser").get_text(strip=True)

    # Raw strings keep the regex escapes from being parsed as (invalid)
    # Python string escapes.
    match = re.search(r"(\(VTC News\) -)(.+)(\(adsvtc)?", text)
    if match is None:
        print('! error, could not find the structure defined before!')
        return text

    # `.+` is greedy, so group 2 may still contain the ad snippet; cut it off.
    body = match.group(2).split('(adsvtc = window.adsvtc || []).push();')[0]
    body = body.lower()
    return re.sub(
        r'[0-9\""\!\@\#\$\%\^\&\*\(\)\_\.\+\-\,\?\:\/"\'\<\>\=\~\[\]\{\}\|\–\;]+',
        ' ',
        body,
    )

def analyze(re):
    """Print the topic sentence that matches a predicted class label.

    ``re`` is compared with ``==`` against each known label (0-4) in turn, and
    the matching sentence is printed. Nothing is printed for any other value,
    and the function always returns ``None``.

    Note: the parameter name shadows the ``re`` module inside this function;
    it is kept as-is so existing callers are unaffected.
    """
    topic_messages = (
        (0, 'Your news is about economic'),
        (1, 'Your news is about sport'),
        (2, 'Your news is about education'),
        (3, 'Your news is about health'),
        (4, 'Your news is about science, technology'),
    )
    # Every label is checked (no early exit), mirroring independent tests.
    for label, message in topic_messages:
        if re == label:
            print(message)


In [None]:
# Ask for an article URL, download and clean it, and echo the cleaned text.
url = input('Enter url to the news:')
data = crawl(url)
print(data)
# Wrap the single document in a one-row frame; the text ends up in column 0,
# which the following cells read.
data_frame = pd.DataFrame(pd.Series(data))

In [None]:
# Word-segment the crawled article with the same tokenizer that was applied
# to the training corpus.
article_segmented = data_frame[0].apply(tokenizer)
data_frame[0] = article_segmented

In [None]:
# Reload the TF-IDF vectorizer that embedding() pickled, then turn the
# tokenized article (column 0) into a feature matrix.
emb = joblib.load('tfidf.pkl')
X_val = emb.transform(data_frame[0])

In [None]:
# Reload the persisted classifier so this cell does not depend on the
# training cell having run in the current kernel.
# NOTE: joblib.load unpickles the file -- only load model files you created or trust.
model = joblib.load('saved_model.pkl')
result = model.predict(X_val)
# analyze() prints the topic itself and returns None, so wrapping it in
# print() emitted a stray "None". X_val holds exactly one article, so pass
# its single prediction.
analyze(result[0])
print("All done")