Alert! This script needs to access your GDrive

Please upload the files to 'drive/My Drive/Colab Notebooks'

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install polyglot

Collecting polyglot
[?25l  Downloading https://files.pythonhosted.org/packages/e7/98/e24e2489114c5112b083714277204d92d372f5bbe00d5507acf40370edb9/polyglot-16.7.4.tar.gz (126kB)
[K     |████████████████████████████████| 133kB 3.3MB/s 
[?25hBuilding wheels for collected packages: polyglot
  Building wheel for polyglot (setup.py) ... [?25l[?25hdone
  Created wheel for polyglot: filename=polyglot-16.7.4-py2.py3-none-any.whl size=52559 sha256=17db4dfa6f5442d7be0360e53296d5d11025af56795b6feee890397b5c702fa1
  Stored in directory: /root/.cache/pip/wheels/5e/91/ef/f1369fdc1203b0a9347d4b24f149b83a305f39ab047986d9da
Successfully built polyglot
Installing collected packages: polyglot
Successfully installed polyglot-16.7.4


# Library

In [3]:
import pandas as pd
import re
from collections import Counter
import pickle
import numpy as np
import string

import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import spacy
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline

from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_validate
from imblearn.under_sampling import RandomUnderSampler

from polyglot.mapping import Embedding
from gensim.models import Word2Vec

pd.set_option('max_columns', 1000)
pd.set_option('max_rows', 1000)

punctuation = string.punctuation # list of punctuation
digit = [i for i in range(0,10)] # list of digits

english_stemmer = SnowballStemmer("english", ignore_stopwords=True) # english stemmer
en_stops = set(stopwords.words('english'))  # english stopwords
nlp = spacy.load("en_core_web_sm") # model to do lemmatization
words, embeddings = pickle.load(open('drive/My Drive/Colab Notebooks/polyglot-en.pkl', 'rb'), encoding='latin1') # word embedding from polyglot

# Special tokens
Token_ID = {"<UNK>": 0, "<S>": 1, "</S>":2, "<PAD>": 3}
ID_Token = {v:k for k,v in Token_ID.items()}

# Map words to indices and vice versa
word_id = {w:i for (i, w) in enumerate(words)}
id_word = dict(enumerate(words))

# Normalize digits by replacing them with #
DIGITS = re.compile("[0-9]", re.UNICODE)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.




# Data import

In [4]:
df_data = pd.read_excel('./drive/My Drive/Colab Notebooks/News Title.xls')
print(df_data.shape[0])
df_data.head()

65535


Unnamed: 0,No,News Title,Category
0,1,Google+ rolls out 'Stories' for tricked out ph...,Technology
1,2,Dov Charney's Redeeming Quality,Business
2,3,White God adds Un Certain Regard to the Palm Dog,Entertainment
3,4,"Google shows off Androids for wearables, cars,...",Technology
4,5,China May new bank loans at 870.8 bln yuan,Business


In [5]:
df_data['Category'].value_counts(dropna=False)

# the target class is imbalanced

Entertainment    23961
Business         17707
Technology       16776
Medical           7091
Name: Category, dtype: int64

# Preprocessing

In [6]:
'''
Function to clean the data, which includes:
1. Lowercasing
2. Punctuation removal
3. Digit removal
''' 
def cleaning(text):
    # lowercase
    normal = text.lower()
    # remove punctuation
    normal = re.sub(r'[^\w\s]', '', normal) 
    # remove numbers
    normal = re.sub(r'\d+', ' ', normal)
    return normal

'''
Function to normalize the form of the token (lemmatization)
and to remove stopwords
'''
def normalize_and_remove_stopwords(text):
    tokens = nlp(text)
    token_new = []
    
    for k in tokens:
        if k.lemma_ not in en_stops:
            token_new.append(k.lemma_)

    str_clean = ' '.join(token_new)
    return str_clean

'''
Function to do stemming, in this case, we use lemmatization
instead of stemming
'''
def stemming(text):
    tokens = nltk.word_tokenize(text)
    stem_sentence = []
    for k in tokens:
        stem_word = english_stemmer.stem(k)
        stem_sentence.append(stem_word)

    stem_sentence_str = ' '.join(stem_sentence)
    return stem_sentence_str

'''
Data preprocessing function, which includes:
1. Text cleaning,
2. Text normalization, and
3. Stopword removal
'''
def preprocessing(list_text):
    text_clean = []
    for t in list_text:
        normal = cleaning(t)
#         normal = stemming(normal)
        normal = normalize_and_remove_stopwords(normal)
        text_clean.append(normal)
    return text_clean

In [7]:
raw_text = df_data['News Title']

clean_text = preprocessing(raw_text) # do the preprocessing
clean_text[:3]

['google roll story trick photo playback',
 'dov charney redeem quality',
 'white god add un certain regard palm dog']

In [8]:
# save the clean comments to csv, so we can use it later on
df_clean_title = pd.DataFrame(clean_text, columns=['title'])
df_clean_title.to_csv('drive/My Drive/Colab Notebooks/df_clean_title_no_stemming.csv', index=False, encoding='utf-8')

In [9]:
df_clean_title = pd.read_csv('./drive/My Drive/Colab Notebooks/df_clean_title_no_stemming.csv')
clean_title = df_clean_title['title'] # clean text

# Feature extraction

In [10]:
'''
Function to extract TF (1-gram) features
'''
def tf_extraction(text, ngram_start, ngram_end):
    ngram = CountVectorizer(ngram_range=(ngram_start, ngram_end), max_features=3000)
    ngram_matrix = ngram.fit_transform(np.array(text)).todense()
    return ngram_matrix

# unigram features
ngram_feat = tf_extraction(clean_title, 1, 1)
print(ngram_feat[:3])

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [11]:
'''
In case the word is not available in the vocabulary,
we can try multiple case normalizing procedure.
We consider the best substitute to be the one with the lowest index,
which is equivalent to the most frequent alternative.

Source: https://nbviewer.jupyter.org/gist/aboSamoor/6046170
''' 
def case_normalizer(word, dictionary):
    w = word
    lower = (dictionary.get(w.lower(), 1e2), w.lower())
    upper = (dictionary.get(w.upper(), 1e2), w.upper())
    title = (dictionary.get(w.title(), 1e2), w.title())
    results = [lower, upper, title]
    results.sort()
    index, w = results[0]
    if index != 1e2:
        return w
    return word

'''
Find the closest alternative in case the word is OOV.

Source: https://nbviewer.jupyter.org/gist/aboSamoor/6046170
'''
def normalize(word, word_id):
    if not word in word_id:
        word = DIGITS.sub("#", word)
    if not word in word_id:
        word = case_normalizer(word, word_id)

    if not word in word_id:
        return None
    return word

In [12]:
'''
Function to retrieve the Euclidean distance
between two words in the embedding vectors 
'''
def get_distance(word1, word2, embeddings, word_id, id_word):
    word1 = normalize(word1, word_id)
    word2 = normalize(word2, word_id)
    if not word1 or not word2: # if word 1 or word 2 not found in the embedding vectors, return 100 as the distance score
        return 1e2
    word1_index = word_id[word1]
    word2_index = word_id[word2]
    e1 = embeddings[word1_index]
    e2 = embeddings[word2_index]
    distance = ((e2 - e1) ** 2).sum() ** 0.5
    return distance

In [13]:
'''
Function to extract the embedding features, which include minimum distance of text to:
1. Entertainment word
2. Technology word
3. Medical word
4. Business word
'''
def embedding_extraction(text):
    all_embedding_feat = []
    for t in text:
        entertainment_distances = []
        technology_distances = []
        medical_distances = []
        business_distances = []
        token = nltk.word_tokenize(t)
        for k in token:
            entertainment_distances.append(get_distance(k, 'entertainment', embeddings, word_id, id_word))
            technology_distances.append(get_distance(k, 'technology', embeddings, word_id, id_word))
            medical_distances.append(get_distance(k, 'medical', embeddings, word_id, id_word))
            business_distances.append(get_distance(k, 'business', embeddings, word_id, id_word))
        all_embedding_feat.append([min(entertainment_distances), min(technology_distances), min(medical_distances), min(business_distances)])
    return all_embedding_feat

embed_feat = embedding_extraction(clean_title)
embed_feat[:3]

[[2.913120148087336, 2.948760506100634, 3.5125143573508284, 3.091267832678769],
 [2.4571364339319977,
  3.0249680383428195,
  3.3134951266334514,
  2.932971141591275],
 [3.041295033226138,
  3.5203397205450035,
  3.7982305924660396,
  3.269813735043243]]

In [14]:
'''
Function to extract orthography and url occurence features
'''
def ortography(text):
    all_orto_feat = []
    for t in text:
        capital_count = sum(1 for c in t if c.isupper())
        exclamation_count = sum(1 for c in t if c == "!")
        punctuation_count = sum(1 for c in t if c in punctuation)
        word_len = len(nltk.word_tokenize(t))
        char_len = len(t)
        digit_occurence = sum(1 for c in t if c in digit)
        orto_feat = [capital_count, exclamation_count, punctuation_count, word_len, char_len, digit_occurence]
        all_orto_feat.append(orto_feat)
    return all_orto_feat

orto_feat = ortography(df_data['News Title'])
orto_feat[:3]

[[2, 0, 3, 10, 58, 0], [4, 0, 1, 5, 31, 0], [7, 0, 0, 10, 48, 0]]

In [15]:
'''
Function to extract TF-IDF (1-gram) features
'''
def tf_idf_extraction(text):
    vectorizer = TfidfVectorizer(max_features=3000)
    tfidf_matrix = vectorizer.fit_transform(np.array(text)).todense()
    return tfidf_matrix

tfidf_feat = tf_idf_extraction(clean_title)
print(tfidf_feat[:3])

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


# Modelling

In [16]:
category = df_data['Category'].astype('category').cat.codes # target variable

## No sampling

In [17]:
# list of features combinations
feat_list = [np.hstack((ngram_feat, embed_feat)), np.hstack((tfidf_feat, embed_feat))]
feat_name = ['tf and embedding', 'tf-idf and embedding']
mnb = MultinomialNB()
bnb = BernoulliNB()

# list of model to do prediction
model_list = [mnb, bnb]
model_name = ['Multinomial Naive Bayes', 'Bernoulli Naive Bayes']

# build the model and evaluate the performance of it for each feature combination
df_recap = pd.DataFrame()
for f, fn in zip(feat_list, feat_name):
    print("Features : ", fn)
    X = f
    y = category
    for m, n in zip(model_list, model_name):
        scoring = ['accuracy', 'f1_macro', 'precision_macro', 'recall_macro']
        scores = cross_validate(m, X, y, cv=4, scoring=scoring)
        acc = np.mean(scores['test_accuracy'])
        f1 = np.mean(scores['test_f1_macro'])
        precision = np.mean(scores['test_precision_macro'])
        recall = np.mean(scores['test_recall_macro'])
        print("Classifier : ", n)
        print("Accuracy:", acc)
        print("F1-Measure:", f1)
        print("Precision:", precision)
        print("Recall:", recall)
        df_recap = df_recap.append({
            'features': fn,
            'classifier': n,
            'accuracy': acc,
            'f1_score': f1,
            'precision': precision,
            'recall': recall
        }, ignore_index=True)
        print('='*90)

Features :  tf and embedding
Classifier :  Multinomial Naive Bayes
Accuracy: 0.8750743445681832
F1-Measure: 0.862972451637912
Precision: 0.8648583814099748
Recall: 0.8612402290195584
Classifier :  Bernoulli Naive Bayes
Accuracy: 0.8749827843827729
F1-Measure: 0.8630592125250387
Precision: 0.864534543396968
Recall: 0.8616718862690685
Features :  tf-idf and embedding
Classifier :  Multinomial Naive Bayes
Accuracy: 0.8749827992848437
F1-Measure: 0.8623230265997273
Precision: 0.8780468486455774
Recall: 0.85049955941954
Classifier :  Bernoulli Naive Bayes
Accuracy: 0.8749827843827729
F1-Measure: 0.8630592125250387
Precision: 0.864534543396968
Recall: 0.8616718862690685


In [18]:
# the recap of scenarios with no sampling
df_recap

Unnamed: 0,accuracy,classifier,f1_score,features,precision,recall
0,0.875074,Multinomial Naive Bayes,0.862972,tf and embedding,0.864858,0.86124
1,0.874983,Bernoulli Naive Bayes,0.863059,tf and embedding,0.864535,0.861672
2,0.874983,Multinomial Naive Bayes,0.862323,tf-idf and embedding,0.878047,0.8505
3,0.874983,Bernoulli Naive Bayes,0.863059,tf-idf and embedding,0.864535,0.861672


## Undersampling

In [19]:
# list of features combinations
feat_list = [np.hstack((ngram_feat, embed_feat)), np.hstack((tfidf_feat, embed_feat))]
feat_name = ['tf and embedding', 'tf-idf and embedding']
mnb = MultinomialNB()
bnb = BernoulliNB()

# list of model to do prediction
model_list = [mnb, bnb]
model_name = ['Multinomial Naive Bayes', 'Bernoulli Naive Bayes']

# build the model and evaluate the performance of it for each feature combination
df_recap_undersample = pd.DataFrame()
for f, fn in zip(feat_list, feat_name):
    print("Features = ", fn)
    X = f
    y = category
    under = RandomUnderSampler(random_state=0)
    Xt, yt = under.fit_resample(X, y)
    display(Counter(yt))
    for m, n in zip(model_list, model_name):
        scoring=['accuracy', 'f1_macro', 'precision_macro', 'recall_macro']
        scores=cross_validate(m, Xt, yt, cv=4, scoring=scoring)
        acc=np.mean(scores['test_accuracy'])
        f1=np.mean(scores['test_f1_macro'])
        precision=np.mean(scores['test_precision_macro'])
        recall=np.mean(scores['test_recall_macro'])
        print("Classifier : ", n)
        print("Accuracy:", acc)
        print("F1-Measure:", f1)
        print("Precision:", precision)
        print("Recall:", recall)
        df_recap_undersample = df_recap_undersample.append({
            'features': fn,
            'classifier': n,
            'accuracy': acc,
            'f1_score': f1,
            'precision': precision,
            'recall': recall
        }, ignore_index=True)
        print('='*90)

Features =  tf and embedding




Counter({0: 7091, 1: 7091, 2: 7091, 3: 7091})

Classifier :  Multinomial Naive Bayes
Accuracy: 0.8628895783387391
F1-Measure: 0.8626083980159777
Precision: 0.8625976044133797
Recall: 0.8628899133159926
Classifier :  Bernoulli Naive Bayes
Accuracy: 0.8629248342969962
F1-Measure: 0.8628363197817955
Precision: 0.8628325521843135
Recall: 0.8629253433430222
Features =  tf-idf and embedding




Counter({0: 7091, 1: 7091, 2: 7091, 3: 7091})

Classifier :  Multinomial Naive Bayes
Accuracy: 0.863065858130024
F1-Measure: 0.8628218543335067
Precision: 0.8627525531783041
Recall: 0.863066247824465
Classifier :  Bernoulli Naive Bayes
Accuracy: 0.8629248342969962
F1-Measure: 0.8628363197817955
Precision: 0.8628325521843135
Recall: 0.8629253433430222


In [20]:
# the recap of scenarios with undersampling
df_recap_undersample

Unnamed: 0,accuracy,classifier,f1_score,features,precision,recall
0,0.86289,Multinomial Naive Bayes,0.862608,tf and embedding,0.862598,0.86289
1,0.862925,Bernoulli Naive Bayes,0.862836,tf and embedding,0.862833,0.862925
2,0.863066,Multinomial Naive Bayes,0.862822,tf-idf and embedding,0.862753,0.863066
3,0.862925,Bernoulli Naive Bayes,0.862836,tf-idf and embedding,0.862833,0.862925
