In [1]:
import re
import logging

import pandas as pd
import numpy as np

import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords

nltk.download('punkt')
nltk.download('stopwords')

import spacy

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from joblib import dump, load

from num2words import num2words

stopwords = set(stopwords.words('english'))
nlp = spacy.load("en_core_web_sm")

# punct = '[!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~’\"—“”•]'
punct = '[!\"#$%&()*+,-./:;<=>?@[\\]^_`{|}~’—“”•]'

logging.basicConfig(level=logging.INFO, force=True)

[nltk_data] Downloading package punkt to /home/aldos/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/aldos/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## All-in-one preprocess function 


### The process of data preparation is divided into 3 main parts
#### Tokenization: 
  Text into individual tokens, which are mostly words (some combinations of words and/ or numbers are present)
#### Removing stopwords and punctuation:
  We gotta get rid of all the words and symbold that don't add a lot of meaning to the text
#### Lemmatization:
  Probably the most interesting part: we transform words into their normal form, so that all the words from the list ['run', 'running', 'runs'] becomes 'run'. This allows for better generalization as all these three form have almost the same meaning

In [2]:
def preprocess_text(text: str) -> str:
    """
    Preprocess text:
    1) tokenize lowercase text
    2) exclude all the stopwords and punctuation
    3) get lemmas of the tokens
    
    :text str: string of raw text
    :returns str: string of preprocessed text
    """
    doc = " ".join([token for token in word_tokenize(text.lower())
                        if token not in stopwords and token not in punct])
    
    result = [token.lemma_ for token in nlp(doc)] 
    
    return " ".join(result)

### !!! A part of numerical data appears to be in some kind of strange format so this function throws an error at some point while preprocessing data. Further analysis of numerical data types should be conducted to understand the problem and solve it. !!!

In [None]:
# def preprocess_text_with_nums(text: str) -> str:
#     """
#     Preprocess text:
#     1) tokenize lowercase text
#     2) turn numbers into their textual representation
#     3) exclude all the stopwords and punctuation
#     4) get lemmas of the tokens
    
#     :text str: string of raw text
#     :returns str: string of preprocessed text
#     """
    
#     doc = []
#     for token in word_tokenize(text.lower()):
#         if token.isdigit():
#             try:
#                 token = " ".join([word for word in re.sub(punct, ' ', num2words(token)).split()
#                                       if word not in punct and word not in stopwords])
#                 doc.append(token)
            
#             except Exception:
#                 logging.exception("An exception in tokenization process...")
#         else:
#             if token not in punct and token not in stopwords:
#                 doc.append(token)
    
#     # make a string
#     doc = " ".join(doc)
    
#     # take lemmas of tokens
#     result = [token.lemma_ for token in nlp(doc)] 
    
#     return " ".join(result)

In [3]:
test = "All work and no play makes Jack a dull boy, all work and no play in 1984"

In [4]:
# %%time
# preprocess_text_with_nums(test)

In [5]:
%%time
preprocess_text(test)

CPU times: user 13.1 ms, sys: 328 µs, total: 13.4 ms
Wall time: 12.1 ms


'work play make jack dull boy work play 1984'

### Loading data

In [None]:
# Load the first dataset
df1 = pd.read_csv('data/articles1.csv')
df1.drop('Unnamed: 0', axis=1, inplace=True)
df1.head()

In [None]:
# Preprocess contents of a content column (hehe) and store it as a new one
df1_clean_column = df1.content.apply(lambda x: preprocess_text(x))
df1['clear_text'] = df1_clean_column
df1.to_csv('data/articles1_clean.csv')

In [None]:
# Load the second dataframe
df2 = pd.read_csv('data/articles2.csv')
df2.drop('Unnamed: 0', axis=1, inplace=True)
print(f"df2 shape: {df2.shape}")
df2.head()

In [None]:
# Preprocess contents of a content column (hehe) and store it as a new one
df2_clear_column = df2.content.apply(lambda x: preprocess_text(x))
df2['clear_text'] = df2_clear_column
df2.to_csv('data/articles2_clean.csv')

In [None]:
# Load the third dataframe
df3 = pd.read_csv('data/articles3.csv')
df3.drop('Unnamed: 0', axis=1, inplace=True)
print(df3.shape)
df3.head()

In [None]:
# Cleanse text in articles3.csv and persist the new version on disk
df3_clear_column = df3.content.apply(lambda x: preprocess_text(x))
df3['clear_text'] = df3_clear_column
df3.to_csv('data/articles3_clean.csv')

In [None]:
# Merge 3 cleaned dataframes into one and check the result
df_full = pd.concat([df1, df2, df3], axis=0)
print(f"Resulting DataFrame shape: {df_full.shape}")
df_full.sample(5)

In [None]:
# Persist full dataframe on disk
df_full.to_csv('data/articles_full_cleaned.csv')

In [6]:
df_full = pd.read_csv('data/articles_full_cleaned.csv')
print(df_full.shape)
df_full.head()

(142570, 11)


Unnamed: 0.1,Unnamed: 0,id,title,publication,author,date,year,month,url,content,clear_text
0,0,17283,House Republicans Fret About Winning Their Hea...,New York Times,Carl Hulse,2016-12-31,2016.0,12.0,,WASHINGTON — Congressional Republicans have...,washington congressional republican new fear c...
1,1,17284,Rift Between Officers and Residents as Killing...,New York Times,Benjamin Mueller and Al Baker,2017-06-19,2017.0,6.0,,"After the bullet shells get counted, the blood...",bullet shell get count blood dry votive candle...
2,2,17285,"Tyrus Wong, ‘Bambi’ Artist Thwarted by Racial ...",New York Times,Margalit Fox,2017-01-06,2017.0,1.0,,"When Walt Disney’s “Bambi” opened in 1942, cri...",walt disney bambi open 1942 critic praise spar...
3,3,17286,"Among Deaths in 2016, a Heavy Toll in Pop Musi...",New York Times,William McDonald,2017-04-10,2017.0,4.0,,"Death may be the great equalizer, but it isn’t...",death may great equalizer necessarily evenhand...
4,4,17287,Kim Jong-un Says North Korea Is Preparing to T...,New York Times,Choe Sang-Hun,2017-01-02,2017.0,1.0,,"SEOUL, South Korea — North Korea’s leader, ...",seoul south korea north korea leader kim say s...


### It appears that some articles have urls which can be used for boosting ranking as described in the paper "The Anatomy of a Large-Scale Hypertextual Web Search Engine". WIll definetely implement it in future works on this project

### TF-IDF

In [7]:
# Create TF-IDF vectorizer and fit it whole corpora

def initialize_tfidf(data, column_name):
    
    tfidf = TfidfVectorizer()
    tfidf_matrix = tfidf.fit_transform(data[column_name])
    
    return tfidf, tfidf_matrix

In [36]:
tfidf, tfidf_matrix = initialize_tfidf(df_full, 'clear_text')

In [None]:
# Save the model and reverse indexed matrix

def save_models(tfidf, tfidf_matrix):
    
    filename = 'models/tfidf.sav'
    dump(tfidf, filename)

    filename = 'models/tfidf_matrix.sav'
    dump(tfidf_matrix, filename)

save_models(tfidf, tfidf_matrix)

In [10]:
# Load the models so we don't need to recompute the tf-idf values

tfidf = load('models/tfidf.sav')
tfidf_matrix = load('models/tfidf_matrix.sav')

In [13]:
def get_query_scores(tfidf, tfidf_matrix, query):
    """
    Get top 10 docs for each word
    """
    query_scoring = {}
    for word in preprocess_text(query).split():

        word_id = tfidf.vocabulary_[word]
        new_vec = tfidf_matrix[:, word_id]
        
        # Argsort returns list of indicies for elements
        # from the original list in ascending order
        # e.g. original = [3, 2, 1]
        # original.argsort() = [3, 1, 0] <- one is the 
        # smallest element and its index in original == 3
        
        top_docs = new_vec.toarray()[:, 0].argsort()[:-11:-1]

        query_scoring[word] = top_docs.tolist()
    
        
    return query_scoring

In [24]:
%%time
# Get most relevant documents from tf-idf matrix and 
# create a list of top documents (max 10 for each word in a query)
current_query = 'work desk'
query_scoring = get_query_scores(tfidf, tfidf_matrix, current_query)

top_docs = []
for k, v in query_scoring.items():
    top_docs.extend(v)

CPU times: user 296 ms, sys: 0 ns, total: 296 ms
Wall time: 295 ms


In [39]:
def calc_cosine_sim(tfidf, tfidf_matrix, top_docs, query):
    """
    Transform query into a vector of len(vocabulary) dim 
    and calculate cosine similarity between this vector and tf-idf suggestions
    """
    query_vec = tfidf.transform(['work desk'])
    cos_sim_scores = {doc_id: cosine_similarity(
                        query_vec, tfidf_matrix[doc_id]).flatten()[0]
                          for doc_id in top_docs}
    
    return cos_sim_scores

In [41]:
%%time
cos_sim_scores = calc_cosine_sim(tfidf, tfidf_matrix, top_docs, current_query)

CPU times: user 37.2 ms, sys: 298 µs, total: 37.5 ms
Wall time: 34.7 ms


In [43]:
# Get top 5 docs by cosine similarity value
def get_top_docs(cos_sim_scores):
    return [doc[0] for doc in sorted(cos_sim_scores.items(), key=lambda x:x[1], reverse=True)[:5]]

In [44]:
%%time
top_5_docs = get_top_docs(cos_sim_scores)

CPU times: user 10 µs, sys: 1e+03 ns, total: 11 µs
Wall time: 13.1 µs


In [53]:
%%time
# Persist answers on a disk
for idx, doc_id in enumerate(top_5_docs):
    with open(f'answers/answer_{idx}', 'w+') as f:
        f.write(df_full.iloc[doc_id].content)

CPU times: user 8.56 ms, sys: 543 µs, total: 9.1 ms
Wall time: 6.87 ms


In [54]:
# Print top 5 articles
for idx, doc_id in enumerate(top_5_docs):
    print(f"Top {idx} doc_id: {doc_id}")

Top 0 doc_id: 94054
Top 1 doc_id: 108355
Top 2 doc_id: 90128
Top 3 doc_id: 70823
Top 4 doc_id: 109725


In [52]:
# One of the top 5 articles
df_full.iloc[94054].content

'In the office, one’s desk is one’s domain. But unlike your messy bedroom at home, this abode is on display whether you like it or not, especially in   open office plans. So what message does it send if your desk is stacked with paper, covered in “Star Trek” paraphernalia or bedecked with molding coffee cups? Experts say: not the best. “How we dress and how we maintain our work space are ways we communicate with people,” says Michelle Augenstein, founder of   consulting group  . “A messy desk is a barrier to communication. ” “A messy desk is a barrier to communication. ” It can even undermine your own credibility, she says. “It projects your work ethic to people, whether you like it or not. Are you known as the Lego lady or the ‘Star Wars’ aficionado first [by your  ]?” If so, you might want to disassemble the shrine instead of risking your personal quirks overshadowing your reputation as a hard worker. John Ore, head of product at   Business Insider, finds himself newly tidy at work. 