# Content-based recommender system using Cosine-Similarity

In [None]:
# !pip install rake_nltk

In [None]:
import pandas as pd
from rake_nltk import Rake
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Load the blog extract. Later cells read its 'info' column (blog text) and
# use its 'title' column as the index.
df = pd.read_excel("blogs_extract.xlsx")
df.head()

Unnamed: 0,title,info
0,how to write a good official resume,building and maintaining a good resume is extr...
1,why should you do internships,internships are defined as “any official or fo...
2,what are some of the best sites to apply for i...,internships are a great tool in the life of an...
3,what are the various scholarships available fo...,scholarships are a great boon for students who...
4,what are the upcoming careers in the science f...,science has been impacting our lives majorly f...


In [None]:
df.shape

(97, 2)

In [None]:
def _extract_key_words(text):
    """Return the distinct key words RAKE extracts from ``text``.

    Parameters
    ----------
    text : str
        Raw text of one blog post.

    Returns
    -------
    list of str
        The keys of RAKE's word-degree dictionary for this text.
    """
    # Rake() uses NLTK's English stop words by default and discards
    # punctuation. A fresh instance is created per text, exactly as the
    # original loop did, so no state is shared between blogs.
    rake = Rake()
    rake.extract_keywords_from_text(text)
    return list(rake.get_word_degrees().keys())


# Build the new column in a single assignment. Writing to the `row` objects
# yielded by df.iterrows() is not reliable: a row may be a copy, in which case
# the write never reaches `df`.
df['Key_words'] = df['info'].apply(_extract_key_words)

# dropping the info column
df.drop(columns = ['info'], inplace = True)

In [None]:
# Use the blog title as the row label so rows can be looked up by title.
df = df.set_index('title')
df.head()

Unnamed: 0_level_0,Key_words
title,Unnamed: 1_level_1
how to write a good official resume,"[growth, depending, experience, research, work..."
why should you do internships,"[part, experience, problem, solving, job, mark..."
what are some of the best sites to apply for internships in india,"[best, internship, get, noticed, find, interns..."
what are the various scholarships available for mbbs students in india,"[5000, pursuing, graduation, academic, fields,..."
what are the upcoming careers in the science field,"[even, wearable, technology, edge, research, a..."


In [None]:
# Columns that feed the bag of words. Each is expected to hold a list of words
# per blog (at this point only Key_words). bag_of_words itself is excluded so
# the output column is never fed back into its own input.
source_columns = [col for col in df.columns if col != 'bag_of_words']

# One space-separated string per blog, assigned as a whole column. Writing to
# the `row` objects yielded by df.iterrows() is not reliable because a row may
# be a copy of the data. Joining once over all words also avoids the stray
# trailing spaces of the previous per-column concatenation.
df['bag_of_words'] = [
    ' '.join(word for cell in cells for word in cell)
    for cells in df[source_columns].itertuples(index = False, name = None)
]

# keep only the bag_of_words column
df.drop(columns = source_columns, inplace = True)

In [None]:
df.head()

Unnamed: 0_level_0,bag_of_words
title,Unnamed: 1_level_1
how to write a good official resume,growth depending experience research work well...
why should you do internships,part experience problem solving job market rec...
what are some of the best sites to apply for internships in india,best internship get noticed find interns start...
what are the various scholarships available for mbbs students in india,5000 pursuing graduation academic fields trans...
what are the upcoming careers in the science field,even wearable technology edge research aerospa...


In [None]:
# Ordered title lookup: position -> blog title. It is used later to translate
# a title into the matching row of the similarity matrix.
indices = pd.Series(df.index)

# Term-count matrix built from every blog's bag of words (one row per blog).
count = CountVectorizer()
count_matrix = count.fit_transform(df['bag_of_words'])

indices.head()

0                  how to write a good official resume
1                        why should you do internships
2    what are some of the best sites to apply for i...
3    what are the various scholarships available fo...
4    what are the upcoming careers in the science f...
Name: title, dtype: object

In [None]:
# Pairwise cosine similarity between the count vectors of all blogs. Given a
# single matrix, cosine_similarity compares it against itself.
cosine_sim = cosine_similarity(count_matrix)
cosine_sim

array([[1.        , 0.16064915, 0.14796631, ..., 0.13443321, 0.18630692,
        0.12686482],
       [0.16064915, 1.        , 0.2276498 , ..., 0.13870662, 0.15055722,
        0.09981492],
       [0.14796631, 0.2276498 , 1.        , ..., 0.12170846, 0.1280041 ,
        0.10421757],
       ...,
       [0.13443321, 0.13870662, 0.12170846, ..., 1.        , 0.22047928,
        0.11283387],
       [0.18630692, 0.15055722, 0.1280041 , ..., 0.22047928, 1.        ,
        0.12923392],
       [0.12686482, 0.09981492, 0.10421757, ..., 0.11283387, 0.12923392,
        1.        ]])

In [None]:
# function that takes in blog title as input and returns the most similar blogs
# (the top 10 by default)
def recommendations(title, cosine_sim = cosine_sim, top_n = 10):
    """Return the titles of the blogs most similar to ``title``.

    Parameters
    ----------
    title : str
        Exact title of a blog listed in ``indices``. If the title occurs more
        than once, the first occurrence is used.
    cosine_sim : array-like, optional
        Similarity matrix indexed by blog position; row ``i`` holds the scores
        of blog ``i`` against every blog. Defaults to the matrix computed in
        the earlier cell.
    top_n : int, optional
        Number of recommendations to return (default 10).

    Returns
    -------
    list of str
        Up to ``top_n`` titles, most similar first. The queried blog itself is
        never included.

    Raises
    ------
    IndexError
        If no blog has the given title.
    """
    # getting the index of the blog that matches the title
    matches = indices[indices == title]
    if matches.empty:
        raise IndexError('no blog titled {!r}'.format(title))
    idx = matches.index[0]

    # Remove the queried blog by its row number *before* ranking. Skipping the
    # first entry of the sorted scores is only correct when the blog itself
    # sorts first, which ties (e.g. a duplicated blog scoring 1.0) do not
    # guarantee.
    score_series = pd.Series(cosine_sim[idx]).drop(idx).sort_values(ascending = False)

    # translate row numbers back into titles (index built once, not per item)
    titles = df.index
    return [titles[i] for i in score_series.index[:top_n]]

In [None]:
recommendations('how to write a good official resume')

['how you can start your own side hustle',
 'which are the most common interview questions',
 'what is the best way to earn my pocket money online as a student',
 'how to create the perfect linkedin profile',
 'how to be a problem solver',
 'how to pick the right college',
 'how can i stop my social media addiction',
 'what are a few tips to improve my academic performance',
 'should you start to learn to code',
 'good journaling habits']

In [None]:
recommendations('why should you do internships')

['importance of being in clubs or committees',
 'should you consider preparing for masters',
 'how to develop leadership during college',
 'why you should participate in extracurricular activities',
 'how to become a content writer in india',
 'what are good co-curricular activities to improve my cv',
 'the importance of having a mentor',
 'how much does college cgpa matter',
 'is it okay to take drop a year post 12th?',
 'what if i do not get selected via campus placements']

In [None]:
recommendations('what are the various scholarships available for mbbs students in india')

['how can i apply for scholarship in india?',
 'top foreign countries to pursue a masters’ from',
 'which stream of engineering to choose',
 'where can students learn online courses in india?',
 'what are some good career options in arts are there any',
 'corporate job vs government job',
 'should i go for ib or being in india, choose cbse',
 'why do so many students in india study engineering',
 'how will the nep 2020 impact students',
 'what are the highest paying jobs in india']

In [None]:
recommendations('how can students turn their ideas into a business?')

['how to start my own startup',
 'why is choosing the right career important',
 'should you start to learn to code',
 'how to be a problem solver',
 'is public speaking important for me?',
 'what are the top career myths',
 'how you can start your own side hustle',
 'top ways of being successful',
 'should you consider preparing for masters',
 'what if i do not get selected via campus placements']

In [None]:
recommendations('what are the future prospects in architecture')

['how much does college cgpa matter',
 'how to become a professional from a student',
 'how to become a lifelong learner',
 'what if i do not get selected via campus placements',
 'how can career counselling assist you',
 'is it worth becoming a politician',
 'jobs that may become automated in the future',
 'what are the future careers to arise after the pandemic',
 'evergreen skills that everyone should develop',
 'why are psychometric tests necessary']

In [None]:
recommendations('what is the best way to earn my pocket money online as a student')

['how to become a content writer in india',
 'should you start to learn to code',
 'best books to become a better student',
 'what if i do not get selected via campus placements',
 'what are some of the best sites to apply for internships in india',
 'how to write a good official resume',
 'should you consider preparing for masters',
 'how can i stop my social media addiction',
 'how can i learn a new language',
 'what are good co-curricular activities to improve my cv']

In [None]:
recommendations('tips to focus during online classes')

['what is the importance of time management for students',
 'how to learn more effectively',
 'how can i stop my social media addiction',
 'how much does college cgpa matter',
 'what are a few tips to improve my academic performance',
 'the future of education after the pandemic',
 'how can i face the hostel life in college?',
 'how can students these days deal with social anxiety',
 'is it okay to take drop a year post 12th?',
 'what are the top career myths']

In [None]:
recommendations('how can i stop my social media addiction')

['good journaling habits',
 'top ways of being successful',
 'how can i learn a new language',
 'tips to focus during online classes',
 'myths about foreign education (1)',
 'how to learn more effectively',
 'how you can start your own side hustle',
 'what is the importance of time management for students',
 'how can students these days deal with social anxiety',
 'should you start to learn to code']

## Including Read_Counts

In [None]:
# Creating a new column of Read Counts.
# These are placeholder values: random integers in [0, 100], one per blog.
# A fixed seed keeps the column (and anything saved from it) identical on
# every run of the notebook.
READ_COUNTS_SEED = 42
df['read_counts'] = np.random.RandomState(READ_COUNTS_SEED).randint(0, 101, df.shape[0])
df.head()

Unnamed: 0_level_0,bag_of_words,read_counts
title,Unnamed: 1_level_1,Unnamed: 2_level_1
First Artificial Enzyme From Two Non-Biological Groups Created,scientists lmrr protein chemical reactions the...,34
AI Will Revolutionize DNA Evidence – Once We Can Trust The Results,scientists programmers battle ensued shutterst...,83
The Hubble Space Telescope Is Falling (Synopsis),using timespan two ways “ end ” loses altitude...,78
5 NASA Photos That Changed The World,advances accepted fact today may explaining na...,20
Double Comments of the Week #178: From Point Particles To The Very First Galaxies Of All,"fine suppose hand negative curvature ."" bars p...",23


In [None]:
# Write the current frame, including its title index, to a CSV file.
df.to_csv('blogs_altered.csv')

In [None]:
# function that takes in blog title as input and returns the most similar blogs
# (the top 10 by default). NOTE: this redefines, and therefore replaces, the
# `recommendations` function declared earlier in the notebook.
def recommendations(title, cosine_sim = cosine_sim, top_n = 10):
    """Return the titles of the blogs most similar to ``title``.

    With the default ``top_n`` the result is the 5 most similar blogs followed
    by the next 5 most similar ones, i.e. the 10 best matches in order.

    Parameters
    ----------
    title : str
        Exact title of a blog listed in ``indices``. If the title occurs more
        than once, the first occurrence is used.
    cosine_sim : array-like, optional
        Similarity matrix indexed by blog position; row ``i`` holds the scores
        of blog ``i`` against every blog. Defaults to the module-level matrix.
    top_n : int, optional
        Number of recommendations to return (default 10).

    Returns
    -------
    list of str
        Up to ``top_n`` titles, most similar first. The queried blog itself is
        never included.

    Raises
    ------
    IndexError
        If no blog has the given title.
    """
    # getting the index of the blog that matches the title
    matches = indices[indices == title]
    if matches.empty:
        raise IndexError('no blog titled {!r}'.format(title))
    idx = matches.index[0]

    # Exclude the queried blog by row number before ranking; relying on it
    # being the first sorted entry breaks when another blog ties with it
    # (for example a duplicated blog with similarity 1.0).
    score_series = pd.Series(cosine_sim[idx]).drop(idx).sort_values(ascending = False)

    # row numbers of the best matches, then their titles
    best_positions = score_series.index[:top_n]
    titles = df.index
    return [titles[i] for i in best_positions]

In [None]:
def recommendations_read_counts(title):
  """Recommend 10 blogs, ordering the 5 best matches by popularity.

  The 5 blogs most similar to ``title`` come first, sorted by their
  ``read_counts`` value (most read first; ties keep similarity order). They
  are followed by the next 5 most similar blogs in similarity order.

  Parameters
  ----------
  title : str
      Exact title of a blog listed in ``indices``. If the title occurs more
      than once, the first occurrence is used.

  Returns
  -------
  list of str
      Up to 10 titles; the queried blog itself is never included.

  Raises
  ------
  IndexError
      If no blog has the given title.
  """
  # getting the index of the blog that matches the title
  matches = indices[indices == title]
  if matches.empty:
    raise IndexError('no blog titled {!r}'.format(title))
  idx = matches.index[0]

  # similarity scores of every other blog, in descending order
  score_series = pd.Series(cosine_sim[idx]).drop(idx).sort_values(ascending = False)
  ranked_positions = list(score_series.index)
  top_5_positions = ranked_positions[:5]
  next_5_positions = ranked_positions[5:10]

  # Sort the top 5 by read count while keeping each count attached to its own
  # blog. Row positions are used instead of titles so that duplicated titles
  # still resolve to a single read count.
  read_counts = df['read_counts']
  top_5_positions.sort(key = lambda pos: read_counts.iloc[pos], reverse = True)

  titles = df.index
  return [titles[pos] for pos in top_5_positions + next_5_positions]

In [None]:
recommendations_read_counts('A Question For The President')

['NAS cosponsors ScienceDebate2008; joins AAAS and a bajillion bloggers',
 'Should there be a Presidential Debate about Science?',
 'Science Debate 2008 - my Question #1: Scientific Advice to the President',
 'The Donald Trump War on Science: Scholarly and Professional Society Statements in Support of Open Science Communications',
 'Exclusive: Interview with Senator John Edwards on Science-Related Topics',
 'Pew Survey of Scientists & the Public: Implications for Public Engagement and Communication',
 "It's Official: Bush Will Pull a Gingrich",
 'Obama on Science',
 'Raising the barriers:  restricting access to scientific literature will hurt STEM education',
 'Scientists Should Be Skeptical of Funding Through the Stimulus Package (updated)']

In [None]:
recommendations_read_counts('5 NASA Photos That Changed The World')

['Why observatories shoot lasers at the Universe',
 'Ten Questions to Ask Your Biology Teacher About Intelligent Design',
 'Happy Earth Day, 2011 Edition!',
 'What the James Webb Space Telescope means',
 'Celestial Square One',
 'Your Friday Dose of Woo: Generation woo',
 'The Most Impossible Idea From Star Trek (Synopsis)',
 "The 10 Best Books I've Read This Year",
 'If Aliens Exist, should we be eager to meet them?',
 'ScienceOnline2010 - interview with Scott Huler']

In [None]:
recommendations('Worthless advice on bioterrorism to the new administration')

['Exclusive: Interview with Senator John Edwards on Science-Related Topics',
 'An end to climate silence',
 'Pandemic prep teachable moment',
 'Obama takes a science test',
 'Pandemic prep teachable moment',
 'What if, rather than being too stringent about drug approval, the FDA is not being stringent enough?',
 'The War On Science: What It Is And How To Win It',
 'Overdiagnosis of breast cancer due to mammography',
 'Making Liquid Fuels From Sun And Air',
 'A rebuke to the antivaccine movement: A hundred million cases of disease prevented and millions of lives saved by vaccines']

In [None]:
recommendations_read_counts('Harpooning For Dummies')

['"Liquid biopsies" for cancer: not ready for prime time',
 'Adverse effects of chemotherapy in breast cancer: Balancing risks and benefits',
 'What if, rather than being too stringent about drug approval, the FDA is not being stringent enough?',
 'Vertebroplasty for compression fractures due to osteoporosis: Modern acupuncture',
 'Early detection of cancer, part 2: Breast cancer and the never-ending confusion over screening',
 'Overdiagnosis of breast cancer due to mammography',
 'The Canadian Breast Screening Study attacked: Why do doctors have such a hard time with the concept of overdiagnosis?',
 'Balancing scientific rigor versus patient good in clinical trials',
 'The early detection of cancer: More complicated than you think',
 'Crank spin versus science on mammography']