# Text Vectorizaion

In [4]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [5]:

path = 'data/articles.csv'
df = pd.read_csv(path)

## Data cleaning

In [6]:
df.head()

Unnamed: 0,author,claps,reading_time,link,title,text
0,Justin Lee,8.3K,11,https://medium.com/swlh/chatbots-were-the-next...,Chatbots were the next big thing: what happene...,"Oh, how the headlines blared:\nChatbots were T..."
1,Conor Dewey,1.4K,7,https://towardsdatascience.com/python-for-data...,Python for Data Science: 8 Concepts You May Ha...,If you’ve ever found yourself looking up the s...
2,William Koehrsen,2.8K,11,https://towardsdatascience.com/automated-featu...,Automated Feature Engineering in Python – Towa...,Machine learning is increasingly moving from h...
3,Gant Laborde,1.3K,7,https://medium.freecodecamp.org/machine-learni...,Machine Learning: how to go from Zero to Hero ...,If your understanding of A.I. and Machine Lear...
4,Emmanuel Ameisen,935,11,https://blog.insightdatascience.com/reinforcem...,Reinforcement Learning from scratch – Insight ...,Want to learn about applied Artificial Intelli...


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 337 entries, 0 to 336
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   author        337 non-null    object
 1   claps         337 non-null    object
 2   reading_time  337 non-null    int64 
 3   link          337 non-null    object
 4   title         337 non-null    object
 5   text          337 non-null    object
dtypes: int64(1), object(5)
memory usage: 15.9+ KB


In [8]:
df.isnull().sum()

author          0
claps           0
reading_time    0
link            0
title           0
text            0
dtype: int64

In [9]:
df['text'] = df['text'].apply(lambda x: x.replace('\n', ' '))
df

Unnamed: 0,author,claps,reading_time,link,title,text
0,Justin Lee,8.3K,11,https://medium.com/swlh/chatbots-were-the-next...,Chatbots were the next big thing: what happene...,"Oh, how the headlines blared: Chatbots were Th..."
1,Conor Dewey,1.4K,7,https://towardsdatascience.com/python-for-data...,Python for Data Science: 8 Concepts You May Ha...,If you’ve ever found yourself looking up the s...
2,William Koehrsen,2.8K,11,https://towardsdatascience.com/automated-featu...,Automated Feature Engineering in Python – Towa...,Machine learning is increasingly moving from h...
3,Gant Laborde,1.3K,7,https://medium.freecodecamp.org/machine-learni...,Machine Learning: how to go from Zero to Hero ...,If your understanding of A.I. and Machine Lear...
4,Emmanuel Ameisen,935,11,https://blog.insightdatascience.com/reinforcem...,Reinforcement Learning from scratch – Insight ...,Want to learn about applied Artificial Intelli...
...,...,...,...,...,...,...
332,Daniel Simmons,3.4K,8,https://itnext.io/you-can-build-a-neural-netwo...,You can build a neural network in JavaScript e...,Click here to share this article on LinkedIn »...
333,Eugenio Culurciello,2.8K,13,https://towardsdatascience.com/artificial-inte...,"Artificial Intelligence, AI in 2018 and beyond...",These are my opinions on where deep neural net...
334,Devin Soni,5.8K,4,https://towardsdatascience.com/spiking-neural-...,"Spiking Neural Networks, the Next Generation o...",Everyone who has been remotely tuned in to rec...
335,Carlos E. Perez,3.9K,7,https://medium.com/intuitionmachine/neurons-ar...,Surprise! Neurons are Now More Complex than We...,One of the biggest misconceptions around is th...


In [10]:
# save cleaned data
df.to_csv('data/cleaned_articles.csv', index=False)

## Vectorize text

In [11]:
# Define a TF-IDF Vectorizer Object, while removing all english stop words such as 'the', 'a', ...
tfidf = TfidfVectorizer(stop_words='english')

# Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(df['text'])
tfidf_matrix.shape

(337, 20095)

In [12]:
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
print(cosine_sim)

[[1.         0.06265938 0.03636636 ... 0.03262549 0.06377183 0.06731326]
 [0.06265938 1.         0.16808102 ... 0.03386742 0.04663115 0.07390585]
 [0.03636636 0.16808102 1.         ... 0.03861725 0.04874319 0.10531746]
 ...
 [0.03262549 0.03386742 0.03861725 ... 1.         0.18912234 0.09062875]
 [0.06377183 0.04663115 0.04874319 ... 0.18912234 1.         0.15074329]
 [0.06731326 0.07390585 0.10531746 ... 0.09062875 0.15074329 1.        ]]


In [13]:
def get_similars(text, cosine_sim=cosine_sim):
    text = text.lower()

    idx = []

    # iterate over articles, and add their index if content contains substring
    for i in range(len(df)):
        if text in df['text'][i].lower():
            idx.append(i)

    # Get the pairwsie similarity scores of all articles with that text
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the articles based on the similarity scores
    sim_scores = sorted(sim_scores, reverse=True)

    # Get the scores of the 3 most similar articles
    sim_scores = sim_scores[1:4]

    # Get the article indices
    article_indices = [i[0] for i in sim_scores]
    print(article_indices)

    if article_indices == []:
        return None
    # Return the top 3 most similar articles
    return df.iloc[article_indices]

In [14]:
get_similars('data science')

[72, 71, 70]


Unnamed: 0,author,claps,reading_time,link,title,text
72,Andrej Karpathy,35K,8,https://medium.com/@karpathy/software-2-0-a641...,Software 2.0 – Andrej Karpathy – Medium,I sometimes see people refer to neural network...
71,Dhruv Parthasarathy,4.3K,12,https://blog.athelas.com/a-brief-history-of-cn...,A Brief History of CNNs in Image Segmentation:...,"At Athelas, we use Convolutional Neural Networ..."
70,Sophia Ciocca,53K,9,https://medium.com/s/story/spotifys-discover-w...,How Does Spotify Know You So Well? – Member Fe...,Member Feature Story A software engineer expla...
