In [1]:
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
corpus = [
	"At 20 years of age the will reigns; at 30, the wit; and at 40, the judgement.",
	"Challenges are what make life interesting and overcoming them is what makes life meaningful.",
	"Let your life be shaped by decisions you made, not by the ones you didn't.",
	"The privilege of a lifetime is being who you are.",
	"To see the world, things dangerous to come to. To see behind walls, to draw closer. To find each other and to feel. That, is the purpose of life.",
	"We should count time by heart throbs. He most lives who thinks most, feels the noblest, acts the best.",
	"Continuous effort - not strength or intelligence, is the key to unlocking our potential.",
	"Knowledge is knowing what to say. Wisdom is knowing when to say it.",
	"Critique to sharpen; not to put down.",
	"Cowards die many times before their deaths; the braves only but once.",
	"Strength doesn't come from what you can do. It comes from overcoming the things you once thought you couldn't.",
	"Our greatest fear should not be of failure, but of succeeding at things in life that don't really matter.",
	"Creativity comes from constraint.",
	"To love someone means you have a desire to change together with that person.",
	"Aspire for the heights but prepare for the depths.",
	"What hurts more, the pain of hard work or the pain of regret?",
	"Those who have achieved all their aims probably set them too low.",
	"What we fight with is so small, and when we win, it makes us small. What we want is to be defeated, decisively, by successively greater things.",
	"Life is the sum of our choices.",
	"To talk well and eloquently is a very great art, but it is an equally great one to know the right time to stop.",
	"You can live your whole life and never know who you are; until you see the world through the eyes of others.",
	"CHANGE begins, when you start trying.",
	"Design influences meaning.",
	"Nobody made a greater mistake than he who did nothing because he could only do a little.",
	"Many succeed momentarily by what they know; some succeed temporarily by what they do; few succeed permanently by what they are.",
	"When you find peace within yourself, you become the kind of person who can live at peace with others.",
	"Determine never to be idle. No person will have occasion to complain of the want of time, who never loses any. It is wonderful how much may be done, if we are always doing.",
	"When I am working on a problem, I never think about beauty. I think only of how to solve the problem. But when I have finished, if the solution is not beautiful, I know it is wrong.",
	"If it is important to you, you will find a way. If not, you will find an excuse.",
	"In science, we reserve our highest honors for those who prove us wrong.",
	"Logic will get you from A to B. Imagination will take you anywhere.",
	"Out of our vulnerabilities come our strengths.",
	"The eyes of others our prisons, their thoughts our cages.",
	"People are attracted to you by what they see in you; they remain attracted to you by what you see in yourself.",
	"Don't be afraid your life will end; be afraid that it will never begin.",
	"Climb the mountain not to plant your flag, but to embrace the challenge, enjoy the air and behold the view. Climb it so you can see the world, not so the world can see you.",
	"The mark of a successful man is one that has spent an entire day on the bank of a river without feeling guilty about it.",
	"In the beginner's mind, there are many possibilities; in the expert's mind, there are few.",
	"The idea of education was to learn to think for yourself.",
	"It is our choices that show what we really are, far more than our abilities.",
	"Before success comes the courage to fail.",
	"A man is rich in proportion to the number of things he can afford to let alone.",
	"Every time we open our mouths, men look into our minds.",
	"A man cannot be comfortable without his own approval.",
	"There are two things to aim at in life: first, to get what you want; and, after that, to enjoy it. Only the wisest of mankind achieve the second.",
	"We all make decisions, but in the end, our decisions made us.",
	"He who establishes his argument by noise and command shows that his reason is weak.",
	"You attract into your life that which you are.",
	"Greatness lies not in being strong, but in the right use of strength.",
	"Diplomacy is the art of telling plain truths without giving offence.",
	"The highest reward for a person's work is not what he gets for it, but what he becomes by it."
]

In [3]:
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### Prepare stemmer and stop words

In [4]:
lemmatizer = WordNetLemmatizer()
stop_words = stopwords.words('english')

### Perform data cleansing

In [5]:
def preprocess(docs):
    docs_clean = []
    punc = str.maketrans("", "", string.punctuation)
    for doc in docs:
        doc_no_punc = doc.translate(punc)
        words = doc_no_punc.lower().split()
        words = [lemmatizer.lemmatize(word, 'v')
                for word in words if word not in stop_words]
        docs_clean.append(' '.join(words))
    return docs_clean

docs_clean = preprocess(corpus)

### After preprocessing, generate Tfidf for feature vectors

In [6]:
tfidf = TfidfVectorizer()

feature_vectors = tfidf.fit_transform(docs_clean).toarray()

In [10]:
features = tfidf.get_feature_names()



In [27]:
indexes = [i for i in range(len(corpus))]

In [28]:
import pandas as pd
import numpy as np

### Create dataframe for feature vectors 

In [31]:
tfidf_df = pd.DataFrame(data=feature_vectors, index=indexes, columns=features)
tfidf_df.shape

(51, 242)

In [32]:
tfidf_df

Unnamed: 0,20,30,40,abilities,achieve,act,afford,afraid,age,aim,...,wisdom,wisest,wit,within,without,wonderful,work,world,wrong,years
0,0.353553,0.353553,0.353553,0.0,0.0,0.0,0.0,0.0,0.353553,0.0,...,0.0,0.0,0.353553,0.0,0.0,0.0,0.0,0.0,0.0,0.353553
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.241662,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.341626,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.349168,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Create new query and preprocess

In [13]:
query = ['life wise choice']

In [14]:
query_clean = preprocess(query)

### Use Tfidf on query

In [15]:
query_feature_vector = tfidf.transform(query_clean).toarray()

In [16]:
similarity = cosine_similarity(query_feature_vector, feature_vectors)

### Reduce dimension from 2 to 1

In [17]:
query_similarity = similarity[0] # do this step to reduce dimension from 2 to 1

In [18]:
query_similarity.shape

(51,)

### Create a pandas series that list the cosine similarity values over all documents

In [19]:
series = pd.Series(query_similarity, index=tfidf_df.index)

### Sort the series descending order

In [20]:
sorted_series = series.sort_values(ascending=False)

### Only get the rows with non-zero cosine similarity values

In [21]:
sorted_series = sorted_series[sorted_series != 0]

In [22]:
sorted_series

48    0.566525
2     0.457722
19    0.418844
3     0.263450
21    0.252043
12    0.229580
35    0.227760
45    0.199282
5     0.179548
dtype: float64

In [42]:
for i in range(len(sorted_series)):
    print(f'{corpus[sorted_series.index[i]]} [score = {sorted_series.iloc[i]}]', end='\n')

Greatness lies not in being strong, but in the right use of strength. [score = 0.5665251217588586]
Let your life be shaped by decisions you made, not by the ones you didn't. [score = 0.4577222910359949]
To talk well and eloquently is a very great art, but it is an equally great one to know the right time to stop. [score = 0.4188444278712081]
The privilege of a lifetime is being who you are. [score = 0.2634504186983477]
CHANGE begins, when you start trying. [score = 0.25204318290656375]
Creativity comes from constraint. [score = 0.22958043890516397]
Climb the mountain not to plant your flag, but to embrace the challenge, enjoy the air and behold the view. Climb it so you can see the world, not so the world can see you. [score = 0.22776047935509783]
We all make decisions, but in the end, our decisions made us. [score = 0.1992822831024917]
We should count time by heart throbs. He most lives who thinks most, feels the noblest, acts the best. [score = 0.17954827567225767]


In [35]:
corpus[sorted_series.index[0]]

'Greatness lies not in being strong, but in the right use of strength.'

In [40]:
sorted_series[1]

KeyError: 1