In [None]:
import codecs, nltk, string
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import numpy as np
import gensim

# lemmatizer instance shared by nlp_pipeline below
wordnet_lemmatizer = WordNetLemmatizer()

# set of ASCII punctuation characters, used to drop punctuation tokens
exclude = set(string.punctuation)
# NLTK's English stopword list, used to drop stopwords in both functions below
stop_word_list = stopwords.words('english')

# word vectors in word2vec *text* format (binary=False); the path is relative to the notebook's working directory
small_model = gensim.models.KeyedVectors.load_word2vec_format('../small-embeddings.txt', binary=False)


# input should be a string
def text_embedding(text):
    """Embed a document as the per-dimension mean of its word vectors.

    The text is lowercased, tokenized with NLTK, stripped of punctuation,
    non-alphabetic tokens and English stopwords, and every remaining token
    that has a vector in ``small_model`` contributes to the average.

    Args:
        text: the raw document, as a string.

    Returns:
        A list of floats with one entry per embedding dimension, or an
        empty list when no token of the document has a vector.
    """
    # lowercasing is only appropriate if the embedding vocabulary is lowercase too
    tokens = nltk.word_tokenize(text.lower())

    vectors = []
    for token in tokens:
        # drop punctuation and anything that is not purely alphabetic
        if token in exclude or not token.isalpha():
            continue
        # drop stopwords
        if token in stop_word_list:
            continue
        # out-of-vocabulary words are skipped silently
        try:
            vectors.append(small_model[token])
        except KeyError:
            continue

    # zip(*vectors) walks the vectors dimension by dimension;
    # with no vectors it yields nothing, so the result is []
    return [float(sum(column)) / len(column) for column in zip(*vectors)]


# input should be a string
def nlp_pipeline(text):
    """Clean a document and return it as a space-separated string of lemmas.

    Steps: word tokenization, POS tagging, lowercasing + WordNet
    lemmatization (as a verb when the POS tag contains "V", with the
    lemmatizer's default otherwise), removal of punctuation /
    non-alphabetic tokens, removal of English stopwords.

    Args:
        text: the raw document, as a string.

    Returns:
        The surviving lemmas joined by single spaces (an empty string if
        nothing survives).
    """
    # sentence splitting (nltk.sent_tokenize) is intentionally skipped here
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(text))

    lemmas = []
    for token, pos in tagged_tokens:
        lowered = token.lower()

        # verbs get the verb-specific lemmatization
        if "V" in pos:
            lemma = wordnet_lemmatizer.lemmatize(lowered, "v")
        else:
            lemma = wordnet_lemmatizer.lemmatize(lowered)

        # remove punctuation and numbers
        if lemma in exclude or not lemma.isalpha():
            continue

        # remove stopwords - be careful with this step
        if lemma in stop_word_list:
            continue

        lemmas.append(lemma)

    # the output is text
    return " ".join(lemmas)

In [None]:
#topic models, finally!

# let's load the dataset
# (the with-block closes the file handle as soon as its content has been read)
with codecs.open("dataset.tsv", "r", "utf-8") as dataset_file:
    dataset = dataset_file.read().strip().split("\n")

corpus = []

# you can run with all data at home
# dataset[0] is skipped (presumably a header row - confirm against the file);
# only the following 999 lines are processed here
for line in dataset[1:1000]:
    # the document text is the fourth tab-separated column
    text = line.split("\t")[3]
    
    # for LDA we need tokens not embeddings!
    # be careful, each text-processing step you'll do will influence the analysis
    text = nlp_pipeline(text).split(" ")
    # keep only documents with at least two tokens after cleaning
    if len(text)>1:
        corpus.append(text)
            
print ("ready!")

In [None]:
from gensim import corpora, models

# for running LDA in gensim we need a dictionary of all the words
# (built from the tokenized documents collected in `corpus`)
dictionary = corpora.Dictionary(corpus)
# and to count the word frequency in each doc
# (doc2bow converts one token list into its bag-of-words representation)
X = [dictionary.doc2bow(text) for text in corpus]

print ("ready!")

In [None]:
# train a 30-topic LDA model on the bag-of-words corpus
# random_state is fixed so that Restart & Run All reproduces the same topics;
# change the seed (or remove it) to explore how stable the topics are
ldamodel = models.ldamodel.LdaModel(X, num_topics=30, id2word = dictionary, iterations=500, random_state=0)
print ("done!")

In [None]:
# print every topic with its 5 highest-weighted words
# num_topics is passed explicitly because print_topics only shows 20 topics
# by default, which would hide part of the model trained above
for topic in ldamodel.print_topics(num_topics=ldamodel.num_topics, num_words=5):
    print (topic)

In [None]:
# homework - implement the word intrusion evaluation task!

In [None]:
# first of all, an explanation
# NOTE: the old sklearn.cross_validation module was removed in scikit-learn 0.20;
# StratifiedKFold now lives in sklearn.model_selection
from sklearn.model_selection import StratifiedKFold
import codecs
import numpy as np
import warnings
warnings.filterwarnings("ignore")


# the with-block closes the file handle as soon as its content has been read
with codecs.open("dataset.tsv", "r", "utf-8") as dataset_file:
    dataset = dataset_file.read().strip().split("\n")

corpus = []
labels =  []

# we skip dataset[0] (presumably a header row) and load the next 99 lines of our dataset
for line in dataset[1:100]:
    fields = line.split("\t")
    # the topic, like "usa" "uk", etc is the label that we want to predict
    label = fields[2]
    text = fields[3]
    corpus.append(text)
    labels.append(label)

# again, we use np arrays as they are more efficient
X = np.array(corpus)
y = np.array(labels) 
    
# we use 10 fold cross validation, and we say to consider the labels (y) when splitting
# split() returns a one-shot generator, so the folds are stored in a list:
# kf_total is looped over more than once in the cells below
kf_total = list(StratifiedKFold(n_splits=10, shuffle=True).split(X, y))

    
    

In [None]:
# what does it produce?
# let's check it out!

# it splits the data in 10 folds (and every time you use 9 for training and 1 for testing)
for train, test in kf_total:
    # what are here "train" and "test"?
    print (train)
    print (test)
    # one fold is enough to see the structure
    break
# the numbers in train and test are indices - they mean: "element x goes in training, element y goes in test", etc.

In [None]:
# how do we map these numbers to the real documents?
# like this:
for train, test in kf_total:
    # this means: use the indices in train / test to create the training and the test set
    X_train, X_test = X[train], X[test]
    y_train, y_test = y[train], y[test]
    
    # indexing with an array of indices is the fast way, but you can also do it by hand like this
    # (note: the lists built below replace the arrays created just above)
    X_train = []
    y_train = []
    for index in train:
        #take the element with that index in X
        element = X[index]
        element_label = y[index]
        X_train.append(element)
        y_train.append(element_label)
    
    X_test = []
    y_test = []
    for index in test:
        #take the element with that index in X
        element = X[index]
        element_label = y[index]
        X_test.append(element)
        y_test.append(element_label)
        

In [None]:
# let's try to cluster articles

# i'm re-loading everything here, because I want to use the titles of the articles to interpret the clusters
# (the with-block closes the file handle as soon as its content has been read)
with codecs.open("dataset.tsv", "r", "utf-8") as dataset_file:
    dataset = dataset_file.read().strip().split("\n")

# one example row split into its columns, handy for inspecting the file layout
article = dataset[4].split("\t")

corpus = []
titles =  []

# you can run with all data at home
for line in dataset[1:1000]:
    # split each row only once and pick the columns we need
    fields = line.split("\t")
    # to better understand which clusters are created, let's check the titles of the articles
    title = fields[1]
    text = fields[3]
    # you embed the text
    text = text_embedding(text)
    
    # text_embedding returns an empty list when no word of the document has a vector;
    # those documents are skipped so corpus and titles stay aligned
    if len(text)>0:
        corpus.append(text)
        titles.append(title)
print ("ready!")

In [None]:
from sklearn.cluster import KMeans

# usual thing, np arrays
X = np.array(corpus)
y = np.array(titles) 

# we define kmeans, with 10 clusters (you can change this number and see how the results change)
# then we train it using only the documents
# random_state is fixed so that the cluster assignment is the same on every run
kmeans = KMeans(n_clusters=10, random_state=0).fit(X)


In [None]:
# these are the labels we obtain
# (the cells below read labels_[k] as the cluster of titles[k])
kmeans.labels_

In [None]:
# to see which docs are in which clusters, we need to loop over all labels

# the number of clusters is read from the fitted model, so this cell keeps
# working when n_clusters is changed in the KMeans cell
for i in range(kmeans.n_clusters):
    # print the title of the document if the doc is in this cluster
    print ("this is cluster number",i)
    # then you loop over all titles, together with their position k
    for k, title in enumerate(titles):
        
        # this is the cluster label of the k-th title
        label = kmeans.labels_[k]
        
        # does it belong to this cluster?
        if i == label:
            #if yes, then print it out!
            print (title)
    print (" ")

In [None]:
# let's count which are the most popular words in the titles of each cluster
from collections import Counter

# the number of clusters is read from the fitted model instead of being hard-coded
for i in range(kmeans.n_clusters):
    
    # we create a list where we put the words from the titles
    title_words = []
    
    print ("this is cluster number",i)
    for k in range(len(titles)):
        label = kmeans.labels_[k]
        # check the cluster first: the pipeline (tokenizing, tagging, lemmatizing) is
        # expensive, so we only run it on titles that belong to this cluster
        if i != label:
            continue
        # we clean the title with our pipeline
        # split() without argument returns [] for an empty string, so a title
        # that is cleaned away completely does not add an empty "word"
        title = nlp_pipeline(titles[k]).split()
        # we put each word in the list
        title_words.extend(title)
    
    # then we count and print the 10 most common
    most_common = Counter(title_words).most_common(10)
    print (most_common)
    print (" ")