In [22]:
import pandas as pd
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import TfidfVectorizer

In [23]:
users = pd.read_csv('lucid.csv/lucid_table_users.csv', encoding='latin-1')
users.head()

Unnamed: 0,id,name,username,email,image,provider,provider_id,password,remember_token,created_at,updated_at,short_bio
0,1,Eniayomi Oluwaseyi,eniayomi,oluwaseyieniayomi@gmail.com,/storage/oluwaseyieniayomi/images//5lxMjdbOVca...,google,100892238782850407484,,79kIhqDiWzqx1i3eFpGMpxfLdPVIvSfv8A3vN2PNEl4ig9...,2019-08-01 17:57:22,2019-08-01 17:57:22,Software Developer | DevOPs Engineer
1,2,Elijah Okokon,DMatrix,okoelijah@gmail.com,/storage/2/images//DB7axhFJpFUzPuCNPRMQ6X1iNwb...,google,105584000674868760138,,Rz08FtDtdyAp8Cm5XJq5XumgZhEqytSu5MxxyHZYlHBhd0...,2019-08-01 18:05:58,2019-08-01 18:05:58,Web Developer
2,3,Jeffrey Ogah,jeff.ogah,jeff.ogah@gmail.com,https://lh3.googleusercontent.com/-FQgpN5l_UoY...,google,101235103722282698796,,ZxNMYG05B6RQuSWaJlqlc8bGbPyFxNGNnNf5D86biFix4G...,2019-08-01 20:55:39,2019-08-01 20:55:39,Front End Developer | React Developer | Mentor...
3,4,Oluwaseyi Oluwapelumi,nathan,nathanoluwaseyi@gmail.com,/storage/4/images/thumbnail/TLWSEBK7C-UMECCGUP...,google,107233455544779563919,,uxbOXr4Cw8tJyilWDKQRhi7TjiV7gBzjMnX0cb4i3ndFu6...,2019-08-02 16:52:47,2019-08-02 16:52:47,| Software Developer | DevOps Engineer | @linu...
4,5,PoRH,lamar,paulchibiukeigweze@gmail.com,/storage/5/images/thumbnail/images (2)_small_1...,google,103431607062264005570,,EeCB77oXHo3zdZgYTVhWlw2EyHkTtLSJb2PjWFF9RuG7H1...,2019-08-03 11:33:51,2019-08-03 11:33:51,I Am lamar and you don't think am real?


In [24]:
lucid_tfidf = TfidfVectorizer(stop_words='english')
# filling the missing values with empty string
users['short_bio'] = users['short_bio'].fillna('')
# computing TF-IDF matrix required for calculating cosine similarity
users_matrix = lucid_tfidf.fit_transform(users['short_bio'])

In [25]:
users_matrix.shape

(2293, 2252)

In [26]:
cosine_similarity = linear_kernel(users_matrix, users_matrix)

In [27]:
indices = pd.Series(users['name'].index)

In [28]:
def recommend(index, cosine_sim=cosine_similarity):
    id = indices[index]
    # Get the pairwsie similarity scores of all names
    # sorting them and getting top 5
    similarity_scores = list(enumerate(cosine_sim[id]))
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
    similarity_scores = similarity_scores[1:6]

    # Get the names index
    lucid_index = [i[0] for i in similarity_scores]

    # Return the top 5 most similar names
    return users['name'].iloc[lucid_index]

In [29]:
recommend(2)

207         Olawale Esan
274    Kenny Oyinkansola
336        Emediong Okon
363        Nnamdi Ibeanu
371          adio mojeed
Name: name, dtype: object

In [30]:
recommend(6)

665     Tolulope Ogunfuwa
1090        Fisayo Aikomo
18            Lawal Idris
235     Alexander Ibrahim
877           taiwo coker
Name: name, dtype: object

In [31]:
posts = pd.read_csv('lucid.csv/lucid_table_posts.csv', encoding='latin-1')
posts.head()

Unnamed: 0,id,user_id,title,content,tags,slug,created_at,updated_at,image,status_id,action,post_id
0,1,2077,What i have learnt so far on HTML,I learnt how to use the table tag as i have us...,,what-i-have-learnt-so-far-on-html-985,2019-08-25 19:39:01,2019-08-28 11:30:00,1,,,
1,2,1719,HTML BEGINS HERE,"Â I am on this journey with start.ng, and here...",Technology,html-begins-here-4d6,2019-08-25 19:44:36,2019-08-28 11:30:00,1,,,
2,4,1310,My Laziness In The Open,I have not been attending classes on the HNG c...,,my-laziness-in-the-open-029,2019-08-25 19:50:33,2019-08-28 11:30:00,1,,,
3,6,1787,MY TASK 2,My journey on **StartNG** pre-internship progr...,,my-task-2-649,2019-08-25 19:51:36,2019-08-28 11:30:00,1,,,
4,7,167,Task 2,"Â A Summary on The âidongesit.htmlâ CV, It...",,task-2-7e5,2019-08-25 19:51:39,2019-08-28 11:30:00,1,,,


In [32]:
lucids_tfidf = TfidfVectorizer(stop_words='english')
# filling the missing values with empty string
posts['content'] = posts['content'].fillna('')
# computing TF-IDF matrix required for calculating cosine similarity
posts_matrix = lucids_tfidf.fit_transform(posts['content'])

In [33]:
posts_matrix.shape

(876, 475)

In [34]:
cosines_similarity = linear_kernel(posts_matrix, posts_matrix)

In [35]:
indicess = pd.Series(posts['title'].index)

In [36]:
def recommend(index, cosine_sim=cosines_similarity):
    id = indicess[index]
    # Get the pairwsie similarity scores of all names
    # sorting them and getting top 5
    similarity_scores = list(enumerate(cosine_sim[id]))
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
    similarity_scores = similarity_scores[1:6]

    # Get the names index
    lucid_index = [i[0] for i in similarity_scores]

    # Return the top 5 most similar names
    return posts['title'].iloc[lucid_index]

In [37]:
recommend(6)

712    <p><strong>Introduction</strong></p>\n<p>The i...
714    <p><strong>Introduction</strong></p>\n<p>The i...
41     As a young lady in this computer age, I really...
111    <h3><strong>MY EXPERIENCE:</strong></h3>\n<ol>...
261    <p>Few days since the start of HNG 6.0 pre - i...
Name: content, dtype: object

In [38]:
recommend(10)

1    Â I am on this journey with start.ng, and here...
2    I have not been attending classes on the HNG c...
3    My journey on **StartNG** pre-internship progr...
4    Â A Summary on The âidongesit.htmlâ CV, It...
5    Using the Hyper Text Markup Language (HTML) ha...
Name: content, dtype: object