En esta notebook se procesará una secuencia de títulos de notas para ejemplificar diversas técnicas de NLP.

In [109]:
import numpy as np
import re

## Cargo dataset

In [206]:
# Read the whole dataset into memory. The context manager guarantees the
# handle is closed even if read() raises, and the explicit encoding avoids
# depending on the platform default (the text contains non-ASCII punctuation).
with open('sentences_raw.txt', 'r', encoding='utf-8') as f:
    full_text = f.read()

In [207]:
# Each record is a title followed by its subheading; records are separated by "\n".
# Only the first 3000 characters are shown to keep the output readable.
preview = full_text[:3000]
print('Contenido completo:', preview, sep='\n')

Contenido completo:
Trump Jr. Won’t Provide Details of a Call With His Father. Donald Trump Jr. claimed that his conversation with the president was protected under lawyer-client privilege because lawyers for both men were on the call.
Hall of Fame quarterback Warren Moon sued for sexual harassment. A California woman is suing Hall of Fame quarterback Warren Moon for alleged sexual misconduct and battery when she was employed at his sports marketing firm.
Mother opens up on terrifying moment she 'DIED' giving birth to twins. Mother Kristie Miller, from the Gold Coast, has opened up about the terrifying moment she almost died on the table after giving birth to her twins at 29 weeks.
LA will become largest US city with recreational pot. Los Angeles is in line to become the nation's largest city with legal recreational marijuana after the City Council voted Wednesday to license sales and cultivation next year.
New West man finds more than $1,500 cash in a box, returns it to owner. The own

# Preprocesamiento:

In [208]:
# Lower-case the whole corpus up front so that later tokenization and counting
# treat e.g. "Trump" and "trump" as the same term.
full_text = full_text.lower()

In [209]:
# Build a list with one "title + subheading" string per element by splitting
# the lower-cased text on newlines.
sentences = full_text.split('\n')
print('Total de items de lista:')
len(sentences)

Total de items de lista:


38246

In [212]:
# Replace every character that is not a letter or digit (underscore included)
# with one space per character, so token boundaries are preserved.
# The pattern is a raw string: in a normal literal '\W' and '\#' are invalid
# escape sequences and trigger Deprecation/SyntaxWarnings. The explicit ' ' and
# '#' of the original class were redundant because \W already matches them.
non_alpha_regex = re.compile(r'[\W_]', re.UNICODE)
preprocesed_sentences = [non_alpha_regex.sub(' ', sentence) for sentence in sentences]

In [216]:
# Peek at the first three cleaned items to sanity-check the regex substitution.
print('Ejemplo de primeros 3 elementos de la lista:')
preprocesed_sentences[:3]

Ejemplo de primeros 3 elementos de la lista:


['trump jr  won t provide details of a call with his father  donald trump jr  claimed that his conversation with the president was protected under lawyer client privilege because lawyers for both men were on the call ',
 'hall of fame quarterback warren moon sued for sexual harassment  a california woman is suing hall of fame quarterback warren moon for alleged sexual misconduct and battery when she was employed at his sports marketing firm ',
 'mother opens up on terrifying moment she  died  giving birth to twins  mother kristie miller  from the gold coast  has opened up about the terrifying moment she almost died on the table after giving birth to her twins at 29 weeks ']

# Stopwords, Lemmatization y Stemming

In [217]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer, ENGLISH_STOP_WORDS

In [218]:
# Show the English stop-word list imported from scikit-learn (a frozenset of words).
print(ENGLISH_STOP_WORDS)

frozenset({'thereupon', 'hasnt', 'although', 'has', 'almost', 'beyond', 'once', 'interest', 'as', 'often', 'two', 'beforehand', 'de', 'describe', 'us', 'except', 'amount', 'former', 'see', 'cry', 'ourselves', 'via', 'three', 'being', 'most', 'wherever', 'herein', 'been', 'hereby', 'ten', 'thru', 'with', 'below', 'latterly', 'its', 'fill', 'to', 'please', 'cant', 'few', 'perhaps', 'whoever', 'call', 'hers', 'serious', 'anyone', 'much', 'top', 'upon', 'were', 'con', 'show', 'ltd', 'namely', 'together', 'him', 'thick', 'your', 'mostly', 'would', 'not', 'co', 'or', 'though', 'whenever', 'whence', 'everywhere', 'mill', 'several', 'something', 'too', 'whole', 'side', 'even', 'whose', 'somewhere', 'himself', 'whatever', 'without', 'nobody', 'among', 'keep', 'amongst', 'no', 'then', 'will', 'elsewhere', 'over', 'full', 'sometime', 'found', 'four', 'give', 'it', 'around', 'next', 'thin', 'amoungst', 'while', 'after', 'are', 'throughout', 'move', 'from', 'sincere', 'the', 'cannot', 'at', 'her', 

In [219]:
import nltk
from nltk.stem.wordnet import WordNetLemmatizer
# Fetch the NLTK resources this notebook relies on: 'punkt' for
# nltk.word_tokenize and 'wordnet' for WordNetLemmatizer.
# NOTE(review): newer NLTK releases may additionally require 'punkt_tab'
# and/or 'omw-1.4' — confirm against the installed version.
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/julianganzabal/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/julianganzabal/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Ejemplo Lemma y Stemming

In [220]:
# Side-by-side demo of Porter stemming vs. WordNet lemmatization on one sentence.
stemmer = nltk.stem.porter.PorterStemmer()
# A single lemmatizer instance is reused for every token and part of speech.
lemmatizer = WordNetLemmatizer()

testing_sentence = 'The president is driving the cars and playing. Are the presidential cars faster?'
tokens = nltk.word_tokenize(testing_sentence)
print('Original')
print(tokens)
print()
stemmed = []
lemmatized = []
for t in tokens:
    stemmed.append(stemmer.stem(t))
    # The lemmatizer needs a part of speech, which is unknown here, so the
    # token is run through each one in a fixed order (verb, adjective,
    # satellite adjective, adverb, noun), feeding each result into the next.
    lemma = t
    for pos in ('v', 'a', 's', 'r', 'n'):
        lemma = lemmatizer.lemmatize(lemma, pos=pos)
    lemmatized.append(lemma)
print('Stemmed:')
print(stemmed)
print()
print('Lemmatized:')
print(lemmatized)

Original
['The', 'president', 'is', 'driving', 'the', 'cars', 'and', 'playing', '.', 'Are', 'the', 'presidential', 'cars', 'faster', '?']

Stemmed:
['the', 'presid', 'is', 'drive', 'the', 'car', 'and', 'play', '.', 'are', 'the', 'presidenti', 'car', 'faster', '?']

Lemmatized:
['The', 'president', 'be', 'drive', 'the', 'car', 'and', 'play', '.', 'Are', 'the', 'presidential', 'car', 'fast', '?']


In [221]:
def tokenize_and_stem(text):
    """Tokenize ``text`` and return a list of lemmatized tokens.

    Despite its name, this function lemmatizes with WordNet rather than
    stemming. Because the part of speech of each token is unknown, every
    token is passed through the lemmatizer once per part of speech, in the
    order verb, adjective, satellite adjective, adverb, noun, with each
    result fed into the next call.

    Length filter: the lemma is kept when it has more than two characters;
    otherwise the original token is kept if it has more than two characters;
    otherwise the token is dropped.

    Parameters
    ----------
    text : str
        Text to tokenize with ``nltk.word_tokenize``.

    Returns
    -------
    list of str
        The lemmatized (or, as a fallback, original) tokens that passed the
        length filter, in order of appearance.
    """
    # One lemmatizer per call instead of five per token.
    lemmatizer = WordNetLemmatizer()
    output = []
    for t in nltk.word_tokenize(text):
        lemma = t
        for pos in ('v', 'a', 's', 'r', 'n'):
            lemma = lemmatizer.lemmatize(lemma, pos=pos)
        if len(lemma) > 2:
            output.append(lemma)
        elif len(t) > 2:
            # Lemma collapsed to <= 2 characters: fall back to the raw token.
            output.append(t)
    return output

In [222]:
# Try the tokenizer on one item of `sentences` (lower-cased but not yet
# regex-cleaned, so punctuation is still present in the input).
index = 2
print(sentences[index])
print(tokenize_and_stem(sentences[index]))

mother opens up on terrifying moment she 'died' giving birth to twins. mother kristie miller, from the gold coast, has opened up about the terrifying moment she almost died on the table after giving birth to her twins at 29 weeks.
['mother', 'open', 'terrify', 'moment', 'she', "'died", 'give', 'birth', 'twin', 'mother', 'kristie', 'miller', 'from', 'the', 'gold', 'coast', 'have', 'open', 'about', 'the', 'terrify', 'moment', 'she', 'almost', 'die', 'the', 'table', 'after', 'give', 'birth', 'her', 'twin', 'week']


# CountVectorizer, TFIDF

In [234]:
# Bag-of-words vectorizer over unigrams and bigrams.
#   max_df=0.9    -> ignore terms present in more than 90% of the documents
#   min_df=5      -> ignore terms present in fewer than 5 documents
#   max_features  -> cap on the vocabulary size
#   stop_words    -> 'english' selects scikit-learn's built-in ENGLISH_STOP_WORDS;
#                    passing the frozenset itself is rejected by the parameter
#                    validation of recent scikit-learn releases
#   tokenizer     -> custom tokenizer + lemmatizer defined in this notebook
count_vectorizer = CountVectorizer(
    max_df=0.9,
    min_df=5,
    max_features=100000,
    stop_words='english',
    tokenizer=tokenize_and_stem,
    ngram_range=(1, 2),
)

In [235]:
# Learn the vocabulary from the cleaned items and build the document-term
# count matrix in a single pass.
doc_term_matrix_count = count_vectorizer.fit_transform(preprocesed_sentences)

In [236]:
# Rows are documents, columns are vocabulary terms (unigrams and bigrams).
doc_term_matrix_count.shape

(38246, 22695)

### ¿Cuál fue el token que más apareció?

In [272]:
# The repr of the sparse matrix reports its dtype, storage format and number
# of stored (non-zero) entries.
doc_term_matrix_count

<38246x22695 sparse matrix of type '<class 'numpy.int64'>'
	with 702062 stored elements in Compressed Sparse Row format>

In [240]:
# Summing over axis 0 collapses the documents: one total count per vocabulary term.
doc_term_matrix_count.sum(axis=0).shape

(1, 22695)

In [257]:
# Total occurrences of each term across all documents (a 1 x n_terms result).
term_totals = doc_term_matrix_count.sum(axis=0)
# argsort is ascending, so take the single row and reverse it to get the
# term indices ordered from most to least frequent.
ascending_order = np.array(np.argsort(term_totals))[0]
sorted_indexes = ascending_order[::-1]

In [264]:
# The 50 most frequent terms. get_feature_names() was removed from recent
# scikit-learn releases in favour of get_feature_names_out(); use the new
# accessor when available and fall back to the old one otherwise.
_get_feature_names = (getattr(count_vectorizer, 'get_feature_names_out', None)
                      or count_vectorizer.get_feature_names)
np.array(_get_feature_names())[sorted_indexes][:50]

array(['say', 'year', 'trump', 'new', 'make', 'president', 'time', 'tax',
       'woman', 'police', 'world', 'man', 'people', 'state', 'win', 'plan',
       'week', 'house', 'day', 'city', 'old', '2017', 'star', 'report',
       'sexual', 'big', 'late', 'just', 'look', 'home', 'come', 'game',
       'donald', 'help', 'best', 'high', 'like', 'christmas', 'company',
       'know', 'donald trump', 'work', 'use', 'need', 'want', 'leave',
       'market', 'continue', 'year old', 'good'],
      dtype='<U27')

### ¿En cuántos artículos aparecen Trump y Tweet? ¿Cuáles son?

In [347]:
# Column positions of the two terms in the fitted vocabulary.
vocabulary = count_vectorizer.vocabulary_
trump_idx = vocabulary['trump']
tweet_idx = vocabulary['tweet']
print('trump index:',trump_idx)
print('tweet index:',tweet_idx)

# The element-wise product of the two count columns is non-zero only for
# documents containing both terms; nonzero()[0] yields those row indices.
trump_column = doc_term_matrix_count[:,trump_idx]
tweet_column = doc_term_matrix_count[:,tweet_idx]
trump_tweet_articles = trump_column.multiply(tweet_column).nonzero()[0]
print('Cantidad de Articulos:', len(trump_tweet_articles))
print('Articulos')
np.array(preprocesed_sentences)[trump_tweet_articles]

trump index: 20703
tweet index: 20986
Cantidad de Articulos: 267
Articulos


array([ 'chelsea handler criticized for calling out trump in fire evacuation tweet   jesus christ  obsessed with the president much   one critic of chelsea handler s trump tweet says',
       'dems ask tillerson if trump anti islam tweets put us at risk  a group of house democrats is calling on secretary of state rex tillerson to tell them whether president donald trump s frequent use of social media is putting u s  diplomats and americans traveling abroad at risk ',
       'people can t stop reading a professor s theory of a trump russia conspiracy   true or not  seth abramson lays out a story of astonishing intrigue and stupidity  one tweet at a time ',
       'on the night news desk when trump s tweeting starts  lara jakes  recent night editor for our washington bureau  and steve kenny  night editor in new york  discuss how year 1 of the trump era has affected their jobs and their sleep ',
       'jeff flake  2020   sen  jeff flake tweeted a picture tuesday afternoon that is the mos

## TF-IDF (Term Frequency - Inverse Document Frequency)
- TF -> Cantidad de veces que aparece una palabra particular en el documento
- DF -> Cantidad de documentos en los que aparece esa palabra particular
- Además, cada documento se normaliza

TF-IDF no es exactamente el cociente TF/DF, pero la idea general es esa:

http://scikit-learn.org/stable/modules/feature_extraction.html#tfidf-term-weighting

In [265]:
# TF-IDF re-weighting of the raw counts:
#   norm='l2'          -> scale each document row to unit Euclidean length
#   use_idf=True       -> apply inverse-document-frequency weighting
#   smooth_idf=True    -> smooth the idf term to avoid division by zero
#   sublinear_tf=False -> use raw term counts rather than 1 + log(tf)
tfidf_transformer = TfidfTransformer(norm=u'l2', use_idf=True, smooth_idf=True, sublinear_tf=False)

In [268]:
# Learn the idf weights from the count matrix and return its TF-IDF version.
doc_term_matrix_tfidf = tfidf_transformer.fit_transform(doc_term_matrix_count)

In [270]:
# Re-weighting does not change the matrix dimensions (documents x terms).
doc_term_matrix_tfidf.shape

(38246, 22695)

In [274]:
# The repr of the sparse matrix reports its dtype, storage format and number
# of stored entries.
doc_term_matrix_tfidf

<38246x22695 sparse matrix of type '<class 'numpy.float64'>'
	with 702062 stored elements in Compressed Sparse Row format>

In [289]:
# Verificamos que estan normalizados
index = 10
(doc_term_matrix_tfidf[index]*doc_term_matrix_tfidf[index].T).sum()

1.0000000000000002

## Encontrar artículos relacionados

In [290]:
from sklearn.neighbors import NearestNeighbors

In [293]:
# Brute-force nearest-neighbour search under cosine distance, returning the
# 10 closest documents per query.
# NOTE(review): `radius` only matters for radius_neighbors() queries; the
# cells visible in this notebook only call kneighbors().
nearest_neig = NearestNeighbors(n_neighbors = 10, algorithm='brute', radius = 1.0, metric = 'cosine')

In [294]:
# Index the TF-IDF rows. fit() returns the estimator itself, so
# `nearest_neig_fitted` refers to the same object as `nearest_neig`.
nearest_neig_fitted = nearest_neig.fit(doc_term_matrix_tfidf)

In [311]:
# Query the neighbours of one document. The query row is part of the fitted
# data, so it comes back as its own nearest neighbour at distance 0.
similar_to_article = 3
distances, indexes = nearest_neig_fitted.kneighbors(doc_term_matrix_tfidf[similar_to_article])
distances, indexes

(array([[ 0.        ,  0.        ,  0.1128909 ,  0.14895583,  0.16753991,
          0.26477447,  0.51142003,  0.53651656,  0.60692289,  0.67297547]]),
 array([[    3,  1215,  1604,  1623,  1649,   949,  1050,   164,  1391,
         33997]]))

In [312]:
# `indexes` has one row per query; take row 0 to list the neighbour texts
# for the single document queried above.
np.array(preprocesed_sentences)[indexes][0]

array([ 'la will become largest us city with recreational pot  los angeles is in line to become the nation s largest city with legal recreational marijuana after the city council voted wednesday to license sales and cultivation next year ',
       'la will become largest us city with recreational pot  los angeles is in line to become the nation s largest city with legal recreational marijuana after the city council voted wednesday to license sales and cultivation next year ',
       'la to become largest us city with recreational pot  los angeles will become the nation s largest city with recreational pot after the city council voted wednesday to license sales next year ',
       'la to become largest us city with recreational pot  los angeles will become the nation s largest city with recreational pot after the city council voted wednesday to license sales next year  after months of debate and political snags  the   click to continue  ',
       'los angeles to become largest us city w

## Topic Modeling
http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.NMF.html

In [348]:
from sklearn.decomposition import NMF, LatentDirichletAllocation

In [361]:
# Factorize the TF-IDF matrix into 10 topics with NMF.
#   W: document-topic weights, one row per document
#   H: topic-term weights, one row per topic
# random_state is fixed so the factorization is reproducible.
# NOTE(review): recent scikit-learn releases replaced `alpha` with
# `alpha_W` / `alpha_H` (with different scaling) — confirm against the
# installed version before re-running this cell.
NMF_model = NMF(n_components=10, random_state=1,alpha=0.1, l1_ratio=0.5, max_iter=200, tol=1e-4)
W = NMF_model.fit_transform(doc_term_matrix_tfidf)
H = NMF_model.components_

In [362]:
# Sanity check of the factorization dimensions: (docs x topics), (topics x terms), (docs x terms).
print(W.shape, H.shape,doc_term_matrix_tfidf.shape)

(38246, 10) (10, 22695) (38246, 22695)


In [369]:
def print_top_words(H, feature_names, n_top_words):
    """Print the highest-weighted terms of every topic, one line per topic.

    Parameters
    ----------
    H : iterable of 1-D arrays
        One row of term weights per topic; each row must support ``argsort()``.
    feature_names : sequence of str
        Term for each column index of ``H``.
    n_top_words : int
        Number of terms to list per topic, highest weight first.

    Returns
    -------
    None
        Output goes to stdout, followed by one trailing blank line.
    """
    for topic_idx, weights in enumerate(H):
        # argsort is ascending; walking the last n_top_words positions
        # backwards yields the column indices by decreasing weight.
        ranked_columns = weights.argsort()[:-n_top_words - 1:-1]
        top_terms = [feature_names[col] for col in ranked_columns]
        print("Topic #{:d}: ".format(topic_idx) + ", ".join(top_terms))
    print()

### Listado de topics

In [370]:
# List the 10 strongest terms of every topic. get_feature_names() was removed
# from recent scikit-learn releases in favour of get_feature_names_out(); use
# the new accessor when available and fall back to the old one otherwise.
_topic_feature_names = (
    count_vectorizer.get_feature_names_out()
    if hasattr(count_vectorizer, 'get_feature_names_out')
    else count_vectorizer.get_feature_names()
)
print_top_words(H, _topic_feature_names, 10)

Topic #0: new, make, say, time, christmas, best, people, holiday, 2017, york
Topic #1: trump, president, donald, donald trump, jerusalem, israel, capital, president donald, israel capital, jerusalem israel
Topic #2: tax, republican, senate, gop, plan, reform, cut, tax reform, tax cut, tax plan
Topic #3: flynn, michael, michael flynn, guilty, fbi, national security, security, adviser, lie, plea
Topic #4: sexual, harassment, sexual harassment, lauer, allegation, matt, matt lauer, misconduct, sexual misconduct, conyers
Topic #5: year, old, year old, girl, boy, person year, person, old girl, time, miss
Topic #6: police, man, say, shoot, officer, charge, kill, arrest, woman, police say
Topic #7: world, cup, world cup, draw, england, 2018, russia, cup draw, australia, final
Topic #8: korea, north, north korea, missile, south, korean, nuclear, south korea, test, north korean
Topic #9: moore, roy, roy moore, alabama, senate, republican, candidate, alabama senate, senate candidate, kimmel



### Páginas en topics

In [391]:
# Dominant topic per document: argsort orders each row of W ascending, so the
# last column holds the index of the topic with the largest weight.
pages_top_topic_index = np.argsort(W)[:,-1]
pages_top_topic_index

array([1, 4, 0, ..., 8, 0, 9])

In [414]:
# Row indices of the documents whose dominant topic is `topic_id`.
topic_id = 8
pages_in_topic = np.where(pages_top_topic_index==topic_id)[0]

In [415]:
# Rank those documents by their weight for the topic, highest first. The
# resulting indices are positions within `pages_in_topic`, not global row ids.
pages_idx_sorted = np.argsort(W[pages_in_topic][:,topic_id])[::-1]

In [416]:
# Show the 10 documents most strongly associated with the selected topic:
# first restrict to the topic's documents, then apply the within-topic ranking.
np.array(preprocesed_sentences)[pages_in_topic][pages_idx_sorted][:10]

array([ 'south korea doubts north korea s ability to launch nuclear icbm  south korea s foreign minister says there is  no concrete evidence  that north korea has mastered delivering a nuclear armed intercontinental ballistic missile ',
       'could north korea s nuclear tipped missile actually reach entire us   north korea s bomb blast rhetoric might just bombast ',
       'the north korea standoff will get worse  in the dark of night tuesday  north korea launched its latest mobile missile  a humongous intercontinental range   ',
       'why north korea s latest ballistic missile test is worrisome  the latest test means a significant step forward in north korea s missile development ',
       'surgeon says  it s a miracle  north korea defector is alive  the soldier from north korea was like a  broken jar  when he arrived at the south korea trauma center ',
       'stability and strategy  why is china so easy on north korea   while the world watches with concern as north korea tests y