In [1]:
from TNT_Api import TNT_Api
UN = TNT_Api(tag='聯合國',nlimit=10000)

In [2]:
UN.get_data()

1377 documents found in tag 聯合國
downloading, 10000 to go
0 data downloaded
100 data downloaded
200 data downloaded
300 data downloaded
400 data downloaded
500 data downloaded
600 data downloaded
700 data downloaded
800 data downloaded
900 data downloaded
1000 data downloaded
1100 data downloaded
1200 data downloaded
1300 data downloaded


In [3]:
## Simple Cleaning and Spelling Correction
ocr = UN.ocr.copy().reset_index()
def replace_line_change(t):
    if len(t)>0: return t[0].replace('\n',' ')
    else: return None
ocr['ocr_text'] = ocr.ocr.map(replace_line_change,na_action='ignore')

In [4]:
## Remove non-English
from langdetect import detect, DetectorFactory
from langdetect import detect_langs
DetectorFactory.seed =0 
def remove_non_eng(text):
    if detect(text)=='en':return text
    else: return None
ocr['ocr_eng'] = ocr.ocr_text.map(remove_non_eng,na_action='ignore')

In [5]:
ocr = ocr.dropna(axis=0,how='any')

In [7]:
ocr.to_csv('data/UN_raw.csv')

### Checkpoint

In [5]:
import enchant
import spelling_correction as sc
import numpy as np
import pandas as pd
ocr = pd.read_csv('data/UN_raw.csv')
chkr = enchant.Dict('en_US')

In [6]:
import numpy as np
test = ocr.ocr_eng.copy()
def check_mis_spell(text):
    mis_rate = np.sum([chkr.check(w) for w in sc.words(text)])/len(sc.words(text))
    return mis_rate
mis_rate = test.map(check_mis_spell,na_action='ignore')

In [7]:
mis_rate.describe()

count    1284.000000
mean        0.744819
std         0.129981
min         0.142857
25%         0.647444
50%         0.763569
75%         0.849251
max         1.000000
Name: ocr_eng, dtype: float64

In [13]:
text_length = test.map(len,na_action='ignore')
total_words = text_length.sum()
print ('There are total {:d} documents and {:d} words'.format(len(text_length),text_length.sum()))

There are total 1284 documents and 2199153 words


In [None]:
import time
def auto_correct(text):
    wordlist = []
    correct_dic = {}
    for w in sc.words(text):
        if chkr.check(w): wordlist.append(w)
        else: 
            c = sc.correction(w)
            correct_dic[w]=c
    art = " ".join(wordlist)
    return art,correct_dic
start = time.time()
subset = test.map(auto_correct,na_action='ignore')
end = time.time()-start
print (end)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

documents = list(ocr)
no_features = 200

# NMF is able to use tf-idf
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(documents)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()

tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tf = tf_vectorizer.fit_transform(documents)
tf_feature_names = tf_vectorizer.get_feature_names()


no_topics = 3
# Run NMF
nmf = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf)
# Run LDA
lda = LatentDirichletAllocation(n_topics=no_topics, max_iter=5, learning_method='online', learning_offset=50.,random_state=0).fit(tf)

def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print "Topic %d:" % (topic_idx)
        print " ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]])

no_top_words = 10
display_topics(nmf, tfidf_feature_names, no_top_words)
print "=============================="
display_topics(lda, tf_feature_names, no_top_words)