In [18]:
import re
import collections
import numpy as np
import pandas as pd

### 1. Create corpus from Wikipedia

In [219]:
import wikipedia
import pandas as pd

def generate_text(topic, min_char=150):
    """Fetch a Wikipedia page and return its longer lines labelled with the topic.

    The page content is split on newlines, so each retained "sentence" is
    really one line (typically a paragraph) of the article.

    Args:
        topic: Page title passed to ``wikipedia.page``.
        min_char: Minimum length, in characters (inclusive), a line must have
            to be kept.

    Returns:
        A list of ``(topic, line)`` tuples, one per retained line. A list is
        returned (rather than a lazy ``zip``) so the result can be iterated
        more than once.
    """
    lines = wikipedia.page(topic).content.split('\n')
    # Lines shorter than min_char are dropped.
    return [(topic, line) for line in lines if len(line) >= min_char]

# Wikipedia pages to download; the page title doubles as the row label.
topics = ['malaysia', 'singapore', 'china']

# One flat list of (topic, paragraph) pairs across all pages.
tmp = [pair for page in topics for pair in generate_text(page)]

# Split the pairs into parallel label / text lists.
y = [pair[0] for pair in tmp]
text = [pair[1] for pair in tmp]

df = pd.DataFrame({'topic': y, 'text': text}, columns=['topic', 'text'])
df.head()

Unnamed: 0,topic,text
0,malaysia,Malaysia (/məˈleɪʒə/ mə-LAY-zhə or /məˈleɪsiə/...
1,malaysia,Malaysia has its origins in the Malay kingdoms...
2,malaysia,The country is multi-ethnic and multi-cultural...
3,malaysia,"Since its independence, Malaysia has had one o..."
4,malaysia,"The name ""Malaysia"" is a combination of the wo..."


### 2A. Pre-processing - Basic

In [228]:
import re

# Convert text to lower-case and split punctuation/symbols off from words
def tokenize(text):
    """Lower-case ``text`` and split it into word, punctuation and placeholder tokens.

    Numbers (including ones with thousands separators or decimals such as
    ``1,000.5``) become a single ``<NUMBER>`` token and line breaks become
    ``<LINEBREAK>``.

    Args:
        text: The string to tokenize; may be null (``None`` / ``NaN``).

    Returns:
        A list of string tokens. Null or empty input yields an empty list so
        that callers can always ``' '.join`` the result.
    """
    if not pd.notnull(text):
        return []

    norm_text = text.lower().strip()

    # Replace numbers *before* padding punctuation: once ',' and '.' are
    # surrounded by spaces, a value like "1,000.5" can no longer be matched
    # as one number.
    norm_text = re.sub(r'\b[0-9]+(?:[,\.][0-9]+)*\b', ' <NUMBER> ', norm_text)

    # Pad punctuation with spaces on both sides
    for char in ['.', '"', ',', '(', ')', '!', '?', ';', ':']:
        norm_text = norm_text.replace(char, ' ' + char + ' ')

    # Replace breaks with a placeholder token, then collapse runs of spaces
    norm_text = re.sub(r'\n', ' <LINEBREAK> ', norm_text)
    norm_text = re.sub(r' {2,}', ' ', norm_text)

    norm_text = norm_text.strip()
    # ''.split(' ') would give [''], so handle the empty case explicitly.
    return norm_text.split(' ') if norm_text else []

In [229]:
# Run the basic tokenizer over every paragraph and keep the result per row.
df['tokens'] = df['text'].apply(tokenize)
print('shape', df.shape)
df.head()

shape (279, 5)


Unnamed: 0,topic,text,tokens,cluster,count
0,malaysia,Malaysia (/məˈleɪʒə/ mə-LAY-zhə or /məˈleɪsiə/...,"[malaysia, (, /məˈleɪʒə/, mə-lay-zhə, or, /məˈ...",2,1
1,malaysia,Malaysia has its origins in the Malay kingdoms...,"[malaysia, has, its, origins, in, the, malay, ...",6,1
2,malaysia,The country is multi-ethnic and multi-cultural...,"[the, country, is, multi-ethnic, and, multi-cu...",4,1
3,malaysia,"Since its independence, Malaysia has had one o...","[since, its, independence, ,, malaysia, has, h...",2,1
4,malaysia,"The name ""Malaysia"" is a combination of the wo...","[the, name, "", malaysia, "", is, a, combination...",6,1


### 2B. Pre-processing - Spacy

In [221]:
import spacy

# Load the English model. The 'en' shortcut only resolves on older spaCy
# installs, so fall back to the full package name when it cannot be found.
try:
    nlp = spacy.load('en')
except OSError:
    nlp = spacy.load('en_core_web_sm')

In [222]:
def extract_verb(doc):
    """Return the lemmas of the adjective, verb, pronoun, proper-noun and noun tokens.

    Despite the name, verbs are only one of the five part-of-speech classes
    that are kept; every other token is dropped.

    Args:
        doc: An iterable of tokens exposing ``pos`` and ``lemma_``.

    Returns:
        A list of lemma strings, in document order.
    """
    pos = spacy.parts_of_speech
    wanted = (pos.ADJ, pos.VERB, pos.PRON, pos.PROPN, pos.NOUN)

    lemmas = []
    for token in doc:
        if token.pos in wanted:
            lemmas.append(token.lemma_)
    return lemmas

def extract_tokens(doc):
    """Return the lemmas kept by ``extract_verb`` followed by the noun-chunk lemmas.

    Args:
        doc: A parsed document that is iterable over tokens and exposes
            ``noun_chunks``.

    Returns:
        A single list: token lemmas first, then one lemma string per noun chunk.
    """
    token_lemmas = extract_verb(doc)
    chunk_lemmas = [chunk.lemma_ for chunk in doc.noun_chunks]
    return token_lemmas + chunk_lemmas


# Lemmatise every paragraph with spaCy, keeping only the parts of speech
# selected by extract_verb.
# NOTE: this overwrites the 'tokens' column written by the basic tokenizer.
# `n_threads` is intentionally not passed: it is a deprecated no-op in later
# spaCy 2.x releases and is rejected by newer versions.
tokenized = [extract_verb(doc) for doc in nlp.pipe(df.text, batch_size=10000)]
df['tokens'] = tokenized
print('shape',df.shape)
df.head()

shape (279, 3)


Unnamed: 0,topic,text,tokens
0,malaysia,Malaysia (/məˈleɪʒə/ mə-LAY-zhə or /məˈleɪsiə/...,"[malaysia, mə-lay, zhə, mə-lay, see-ə, malaysi..."
1,malaysia,Malaysia has its origins in the Malay kingdoms...,"[malaysia, have, its, origin, malay, kingdom, ..."
2,malaysia,The country is multi-ethnic and multi-cultural...,"[country, be, multi, ethnic, multi, cultural, ..."
3,malaysia,"Since its independence, Malaysia has had one o...","[its, independence, malaysia, have, have, good..."
4,malaysia,"The name ""Malaysia"" is a combination of the wo...","[name, malaysia, be, combination, word, malay,..."


### 3A. Doc2Vec

In [93]:
from gensim.models import doc2vec

# Tag each document with its row position so its vector can be looked up by
# integer id afterwards.
tagged_docs = [doc2vec.TaggedDocument(tokens, [i]) for i, tokens in enumerate(df.tokens)]
# NOTE(review): `size` and `iter` are the older gensim keyword names; newer
# releases appear to use `vector_size` and `epochs` — confirm against the
# installed gensim version before upgrading.
model = doc2vec.Doc2Vec(tagged_docs, size=100, window=8, min_count=5, workers=4, iter=20, seed=0)
# `model.docvecs` is a lookup object, not an array: it has no `.shape` and
# cannot be handed to scikit-learn. Materialise one row per document instead.
doc_vectors = np.array([model.docvecs[i] for i in range(len(tagged_docs))])

print(doc_vectors.shape)

### 3B. TFIDF

In [230]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Tokens are re-joined into one string per document because TfidfVectorizer
# applies its own tokenisation and stop-word filtering.
# NOTE: this rebinds `doc_vectors`, replacing the Doc2Vec vectors if that cell
# was run first.
vec = TfidfVectorizer(max_features=10000, stop_words='english', strip_accents='unicode', max_df=0.5, min_df=0.01)
doc_vectors = vec.fit_transform(df.tokens.apply(' '.join))
# get_feature_names() is gone from recent scikit-learn releases; prefer
# get_feature_names_out() when it exists and fall back on older installs.
if hasattr(vec, 'get_feature_names_out'):
    vocab = list(vec.get_feature_names_out())
else:
    vocab = vec.get_feature_names()

print(doc_vectors.shape)

(279, 1252)


### 4. Clustering

In [231]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import MiniBatchKMeans

In [232]:
n = 8 # num topics
# Fix the seed so cluster ids are stable between runs; without it the same
# paragraph lands in a different cluster number every time the cell is run.
kmean = MiniBatchKMeans(n, max_iter=200, random_state=0)
# fit_transform returns one column per cluster centre (cluster-distance space).
dist = kmean.fit_transform(doc_vectors)
Y = kmean.labels_


# Index the distance frame like df so the join below pairs rows correctly
# even if df does not carry a default 0..N-1 index.
pd_dist = pd.DataFrame(dist, columns=['topic'+str(i) for i in range(n)], index=df.index)

df['cluster'] = Y
results = df.join(pd_dist)
results.head()

Unnamed: 0,topic,text,tokens,cluster,count,topic0,topic1,topic2,topic3,topic4,topic5,topic6,topic7
0,malaysia,Malaysia (/məˈleɪʒə/ mə-LAY-zhə or /məˈleɪsiə/...,"[malaysia, (, /məˈleɪʒə/, mə-lay-zhə, or, /məˈ...",7,1,1.160762,1.020485,1.00682,1.074643,1.013091,1.005256,1.012445,0.854317
1,malaysia,Malaysia has its origins in the Malay kingdoms...,"[malaysia, has, its, origins, in, the, malay, ...",5,1,1.149768,1.018124,1.022901,1.085439,1.031698,0.871327,1.040618,0.979079
2,malaysia,The country is multi-ethnic and multi-cultural...,"[the, country, is, multi-ethnic, and, multi-cu...",4,1,1.145528,1.016103,1.0052,1.081613,0.909561,0.980437,1.032442,1.008901
3,malaysia,"Since its independence, Malaysia has had one o...","[since, its, independence, ,, malaysia, has, h...",6,1,1.156174,1.013717,1.002109,1.075794,1.018792,1.035296,0.81599,0.977437
4,malaysia,"The name ""Malaysia"" is a combination of the wo...","[the, name, "", malaysia, "", is, a, combination...",5,1,1.158562,1.039677,1.023455,1.099471,1.047707,0.953124,1.059544,0.999549


### Extract keywords for each cluster

In [233]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Concatenate all tokens of a cluster into one long pseudo-document, then
# score terms with TF-IDF across clusters.
grouped = df.groupby('cluster').agg({'tokens': lambda x: ' '.join([' '.join(t) for t in x])})
# NOTE: rebinds `vec`, replacing the document-level vectorizer.
vec = TfidfVectorizer(max_features=10000, stop_words='english', strip_accents='unicode', max_df=0.5, min_df=0.05)
X = vec.fit_transform(grouped.tokens)
# get_feature_names() is gone from recent scikit-learn releases; prefer
# get_feature_names_out() when it exists and fall back on older installs.
if hasattr(vec, 'get_feature_names_out'):
    features = list(vec.get_feature_names_out())
else:
    features = vec.get_feature_names()

def get_top(features, scores, min_score=0.01, top_n=100):
    """Return the highest-scoring ``(feature, score)`` pairs.

    Args:
        features: Sequence of feature names.
        scores: Sequence of scores, parallel to ``features``.
        min_score: Pairs whose score is not strictly greater than this are
            discarded.
        top_n: Maximum number of pairs to return.

    Returns:
        A list of ``(feature, score)`` tuples sorted by descending score;
        ties keep their original relative order.
    """
    kept = [(feature, score) for feature, score in zip(features, scores) if score > min_score]
    kept.sort(key=lambda pair: pair[1], reverse=True)
    return kept[:top_n]

# One keyword list per row of X, i.e. per row of `grouped`, in the same order.
keywords = [get_top(features, scores) for scores in X.toarray()]

# Index the Series by cluster label: a bare pd.Series(keywords) is indexed
# 0..k-1 and would be aligned to the wrong rows (leaving NaNs) whenever a
# cluster id is absent from `grouped`.
grouped['keywords'] = pd.Series(keywords, index=grouped.index)
grouped.head()

Unnamed: 0_level_0,tokens,keywords
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1
0,the education system features a non-compulsory...,"[(secondary, 0.482077966541), (schools, 0.3491..."
1,"singapore is a global commerce , finance and t...","[(tax, 0.187161739826), (best, 0.161505225048)..."
2,the malaysian constitution says it guarantees ...,"[(dynasty, 0.295337667668), (war, 0.1341327237..."
3,popular sports in malaysia include association...,"[(basketball, 0.371056393487), (sports, 0.3710..."
4,the country is multi-ethnic and multi-cultural...,"[(elected, 0.316998035239), (federal, 0.246885..."


In [234]:
# For each cluster print its ten top keywords and the three member texts with
# the smallest value in that cluster's own 'topicN' column.
for cluster_id, members in results.groupby('cluster'):
    print('\nCLUSTER#', cluster_id)
    top_words = [word for word, _ in grouped.keywords[cluster_id][0:10]]
    print('keywords: ', ", ".join(top_words), '\n')
    nearest = members.sort_values('topic' + str(cluster_id))['text'].iloc[:3]
    print("\n-\n".join(nearest))


CLUSTER# 0
keywords:  secondary, schools, primary, students, school, university, compulsory, universities, stage, junior 

The education system features a non-compulsory kindergarten education followed by six years of compulsory primary education, and five years of optional secondary education. Schools in the primary education system are divided into two categories: national primary schools, which teach in Malay, and vernacular schools, which teach in Chinese or Tamil. Secondary education is conducted for five years. In the final year of secondary education, students sit for the Malaysian Certificate of Education examination. Since the introduction of the matriculation programme in 1999, students who completed the 12-month programme in matriculation colleges can enroll in local universities. However, in the matriculation system, only 10 per cent of places are open to non-bumiputera students.
-
Since 1986, compulsory education in China comprises primary and junior secondary school, whi

### Labelling

In [227]:
# Tally how many paragraphs of each source topic landed in each cluster.
# A constant helper column is added so that summing it yields the row count.
df['count'] = 1
df.groupby(['topic', 'cluster'])[['count']].sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,count
topic,cluster,Unnamed: 2_level_1
china,0,4
china,1,10
china,2,1
china,3,10
china,4,4
china,5,2
china,6,1
china,7,77
malaysia,0,2
malaysia,2,34


In [214]:
# Hand-written labels keyed by cluster id. A mapping with a default is used
# instead of a fixed-length list so the assignment still works when the number
# of clusters differs from the number of labels written here.
# NOTE(review): these labels look like they were chosen for a different corpus
# and clustering run — re-check them against the current keywords.
cluster_labels = {0: 'unknown', 1: 'crime', 2: 'war', 3: 'movie-manga', 4: 'singapore'}
grouped['labels'] = [cluster_labels.get(cluster_id, 'unknown') for cluster_id in grouped.index]
grouped

Unnamed: 0_level_0,tokens,keywords,labels
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,kyoto seika university japan have offer compet...,"[(water, 0.294164579135), (secondary, 0.210117...",unknown
1,legal system singapore be base english common ...,"[(code, 0.388392398476), (offence, 0.358117190...",crime
2,victim recognize their own role victim they ma...,"[(argue, 0.223907995858), (casualty, 0.2180578...",war
3,manga manga be comic create japan creator japa...,"[(film, 0.827452219334), (manga, 0.33148697471...",movie-manga
4,singapore /ˈsɪŋɡəpɔːr/ republic singapore refe...,"[(island, 0.275399194911), (singaporean, 0.247...",singapore
