In [134]:
# imports for required libraries
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle

In [109]:
# load dataset as pandas dataframe
df = pd.read_csv('TopStaredRepositories.csv')

In [110]:
# preview the first 5 rows of the dataset (5 is the default for head)
df.head()

Unnamed: 0,Username,Repository Name,Description,Last Update Date,Language,Number of Stars,Tags,Url,Gravatar
0,freeCodeCamp,freeCodeCamp,The https://freeCodeCamp.com open source codeb...,2017-06-24T15:56:17Z,JavaScript,290k,"nonprofits,certification,curriculum,react,node...",https://github.com/freeCodeCamp/freeCodeCamp,https://avatars0.githubusercontent.com/u/98925...
1,twbs,bootstrap,"The most popular HTML, CSS, and JavaScript fra...",2017-06-24T15:40:21Z,JavaScript,112k,"javascript,css,html,bootstrap,jekyll-site,scss",https://github.com/twbs/bootstrap,https://avatars0.githubusercontent.com/u/29185...
2,EbookFoundation,free-programming-books,Freely available programming books,2017-06-23T01:09:34Z,,87.8k,"education,list,books,resource",https://github.com/EbookFoundation/free-progra...,https://avatars0.githubusercontent.com/u/14127...
3,facebook,react,"A declarative, efficient, and flexible JavaScr...",2017-06-24T19:33:49Z,JavaScript,69.7k,,https://github.com/facebook/react,https://avatars3.githubusercontent.com/u/69631...
4,d3,d3,"Bring data to life with SVG, Canvas and HTML.",2017-05-31T06:03:47Z,JavaScript,65.7k,visualization,https://github.com/d3/d3,https://avatars1.githubusercontent.com/u/15627...


In [111]:
# show the dataset dimensions as (rows, columns); the recorded output is (980, 9)
df.shape

(980, 9)

In [112]:
# columns whose text describes what a project is about
columns = [
    'Repository Name',
    'Description',
    'Language',
    'Tags',
]

df.loc[:, columns].head()

Unnamed: 0,Repository Name,Description,Language,Tags
0,freeCodeCamp,The https://freeCodeCamp.com open source codeb...,JavaScript,"nonprofits,certification,curriculum,react,node..."
1,bootstrap,"The most popular HTML, CSS, and JavaScript fra...",JavaScript,"javascript,css,html,bootstrap,jekyll-site,scss"
2,free-programming-books,Freely available programming books,,"education,list,books,resource"
3,react,"A declarative, efficient, and flexible JavaScr...",JavaScript,
4,d3,"Bring data to life with SVG, Canvas and HTML.",JavaScript,visualization


## Checking null values
The following section is concerned with null values within the chosen columns of the dataset and deals with them appropriately

In [113]:
# True when at least one cell in the selected columns is missing
df[columns].isna().to_numpy().any()

True

In [114]:
# count missing values in the language and tag columns, plus rows missing any selected column
total_rows = df.shape[0]

null_lang = df['Language'].isna().sum()
null_tags = df['Tags'].isna().sum()

rows_with_any_null = df[columns].isna().any(axis=1)
null_rows = len(df[rows_with_any_null])

# report each count with its share of the whole dataset
print(f'Language Rows with NaNs: {null_lang} ({(null_lang/total_rows) *100:.0f}%)')

print(f'Tag Rows with NaNs: {null_tags} ({(null_tags/total_rows) *100:.0f}%)')

print(f'Combined Rows with NaNs: {null_rows} ({(null_rows/total_rows) *100:.0f}%)')

Language Rows with NaNs: 103 (11%)
Tag Rows with NaNs: 491 (50%)
Combined Rows with NaNs: 544 (56%)


Ideally we would remove rows with null values; however, with 56% of the rows missing a language, tags, or both, that is too much data to sacrifice. We still have complete entries for the repository name and description, so we should be able to infer enough detail about each project without the other columns, and will instead fill the missing values with empty strings.

## Description of each column
- **Repository Name:** Important for output as well as including keywords which are directly related to the topic of the project
- **Description:** Similar to above, useful for outputting in final deployment and for majority of projects has the highest word count of all columns
- **Language:** Single descriptor for most relevant language/technology with respect to the project
- **Tags:** Similar to above, and contains only words directly related to the project without "fluff" like the description, although this is less of an issue once stopwords have been removed

In [115]:
# build a separate dataframe holding only the important columns, with null values replaced by
# empty strings. fillna is chained onto the selection so a new frame is returned; calling
# fillna(inplace=True) on a column slice of `df` raises SettingWithCopyWarning and is not
# guaranteed to modify the object we go on to use.
df_important = df[columns].fillna('')
df_important.head(5)

Unnamed: 0,Repository Name,Description,Language,Tags
0,freeCodeCamp,The https://freeCodeCamp.com open source codeb...,JavaScript,"nonprofits,certification,curriculum,react,node..."
1,bootstrap,"The most popular HTML, CSS, and JavaScript fra...",JavaScript,"javascript,css,html,bootstrap,jekyll-site,scss"
2,free-programming-books,Freely available programming books,,"education,list,books,resource"
3,react,"A declarative, efficient, and flexible JavaScr...",JavaScript,
4,d3,"Bring data to life with SVG, Canvas and HTML.",JavaScript,visualization


In [116]:
# replace each comma (and any whitespace around it) with a single space in the tags and
# description columns so neighbouring words stay separate tokens. Joining the description
# pieces with '' fused words in text such as "a,b" into "ab".
# assign() returns a new frame, so the cell can be re-run safely and never writes into a
# slice of `df`; the vectorised .str.replace also avoids the deprecated applymap.
comma_pattern = r'\s*,\s*'
df_important = df_important.assign(
    Tags=df_important['Tags'].str.replace(comma_pattern, ' ', regex=True),
    Description=df_important['Description'].str.replace(comma_pattern, ' ', regex=True),
)

df_important.head(5)

Unnamed: 0,Repository Name,Description,Language,Tags
0,freeCodeCamp,The https://freeCodeCamp.com open source codeb...,JavaScript,nonprofits certification curriculum react node...
1,bootstrap,The most popular HTML CSS and JavaScript frame...,JavaScript,javascript css html bootstrap jekyll-site scss
2,free-programming-books,Freely available programming books,,education list books resource
3,react,A declarative efficient and flexible JavaScrip...,JavaScript,
4,d3,Bring data to life with SVG Canvas and HTML.,JavaScript,visualization


## Weighting
By placing more emphasis on selected columns, we can give their words a higher weight when the strength of each term is calculated. For example, as the language column is a single-word descriptor of the primary language used in the project, it could make sense to treat it as a key column and increase the weight it holds. This can be done simply by duplicating the selected column, so that every language appears twice in the combined text, which increases its importance.

In [117]:
# optional weighting step, currently disabled: uncommenting the lines below adds a second copy
# of the language column to df_important
#df_important['Language_duplicated'] = df_important['Language']
#df_important.head(5)

In [118]:
# build one text document per repository by joining its column values with single spaces
corpus = df_important.apply(lambda row: ' '.join(row), axis=1)
corpus.head(5)

0    freeCodeCamp The https://freeCodeCamp.com open...
1    bootstrap The most popular HTML CSS and JavaSc...
2    free-programming-books  Freely available progr...
3    react A declarative efficient and flexible Jav...
4    d3 Bring data to life with SVG Canvas and HTML...
dtype: object

In [122]:
# despite its name, this variable is the TfidfVectorizer object itself, configured to
# lowercase text, drop English stop words and extract 1- to 3-word n-grams
tfidf_vectorizer_params = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
    ngram_range=(1, 3),
)

In [123]:
# fit the vectorizer on the corpus and transform it in one step; note that `tfidf_vectorizer`
# holds the resulting TF-IDF matrix, not the vectorizer object (`tfidf_vectorizer_params`)
tfidf_vectorizer = tfidf_vectorizer_params.fit_transform(corpus)

In [133]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in favour of
# get_feature_names_out(); prefer the new method and fall back on older installations
if hasattr(tfidf_vectorizer_params, 'get_feature_names_out'):
    feature_names = tfidf_vectorizer_params.get_feature_names_out()
else:
    feature_names = tfidf_vectorizer_params.get_feature_names()

# dense view of the TF-IDF matrix with one column per extracted n-gram
pd.DataFrame(tfidf_vectorizer.toarray(), columns=feature_names)

Unnamed: 0,07511,07511 matlab,10,10 objective,1000,1000 contributors,1000 contributors framework,1024,1024 https,1024 https play,...,zxcvbn,zxcvbn low,zxcvbn low budget,zxing,zxing android,zxing android barcode,zxing official,zxing official zxing,zxing zebra,zxing zebra crossing
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
975,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
976,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
977,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
978,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [135]:
# disabled persistence step. NOTE(review): as written this would pickle `tfidf_vectorizer`,
# which is the TF-IDF matrix returned by fit_transform, not the fitted vectorizer
# (`tfidf_vectorizer_params`); it also never closes the file handle returned by open().
# If this is re-enabled, confirm which object is needed and use a `with open(...)` block.
#pickle.dump(tfidf_vectorizer, open('tfidf_vectorizer.pickle', 'wb'))

## Cosine Similarity
With the tf-idf scores generated, we can now create a new matrix in which each intersecting row and column holds the cosine similarity score for the corresponding pair of projects.

In [136]:
# pairwise cosine similarity of every repository against every other one; when the second
# argument is omitted, cosine_similarity compares the matrix with itself
vect_cos_sim = cosine_similarity(tfidf_vectorizer)

In [137]:
# label both axes with the repository names and preview the first 5 rows
repo_names = df_important['Repository Name']
pd.DataFrame(vect_cos_sim, index=repo_names, columns=repo_names).head()

Repository Name,freeCodeCamp,bootstrap,free-programming-books,react,d3,You-Dont-Know-JS,tensorflow,awesome,vue,angular.js,...,laptop,bolt,js-the-right-way,plotly.js,hiring-without-whiteboards,Calligraphy,http-prompt,masscan,mint-ui,iina
Repository Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
freeCodeCamp,1.0,0.007676,0.027607,0.015231,0.022984,0.074595,0.0,0.0,0.008986,0.004282,...,0.0,0.0,0.002276,0.047318,0.002164,0.0,0.0,0.0,0.0,0.0
bootstrap,0.007676,1.0,0.0,0.015239,0.027493,0.011496,0.0,0.0,0.097276,0.039724,...,0.017589,0.0,0.021117,0.010092,0.004453,0.0,0.003334,0.0,0.017155,0.0
free-programming-books,0.027607,0.0,1.0,0.0,0.0,0.027565,0.0,0.019464,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
react,0.015231,0.015239,0.0,1.0,0.007402,0.011089,0.0,0.0,0.03688,0.008501,...,0.0,0.0,0.004519,0.022551,0.004295,0.0,0.0,0.0,0.0,0.0
d3,0.022984,0.027493,0.0,0.007402,1.0,0.005584,0.010382,0.0,0.008982,0.020864,...,0.0,0.0,0.033251,0.063494,0.002163,0.0,0.0,0.0,0.0,0.0


## Making recommendations
Now that we have a way of generating similarity scores for pairs of projects, we can define a function which takes a project name as an argument and returns the top n projects according to their similarity scores. Note that, rather than keeping the entire cosine similarity matrix in memory, the function generates a much smaller matrix which only holds the scores for the project of interest. This way we do not need a 980x980 matrix, only a 1x980 one.

In [176]:
# recommend function, used to find cosine similarity scores for a particular project and return the top results in a sorted list
def recommend(project_title, n, decimals=1):
    """Return the n repositories most similar to `project_title` as display text.

    Parameters
    ----------
    project_title : str
        Exact (case-sensitive) value from the 'Repository Name' column. If several
        rows share the name, the first one is used.
    n : int
        Maximum number of recommendations; values below 1 give an empty string.
    decimals : int, optional
        Decimal places used when rounding the confidence. Defaults to 1, which
        matches the original output format.

    Returns
    -------
    str
        One line per recommendation in the form '<name> --> confidence: <score>',
        ordered from most to least similar.

    Raises
    ------
    IndexError
        If no repository is called `project_title`.
    """
    # locate the project by row position: the TF-IDF matrix is addressed by position,
    # so this stays correct even if df_important's index labels are not 0..N-1
    name_matches = (df_important['Repository Name'] == project_title).to_numpy().nonzero()[0]
    if len(name_matches) == 0:
        raise IndexError(f'no repository named {project_title!r} in the dataset')
    title_iloc = int(name_matches[0])

    # similarity of this one project against every project (a single row, flattened)
    project_cos_sim = cosine_similarity(tfidf_vectorizer[title_iloc], tfidf_vectorizer).flatten()

    # rank best first and drop the query project by position; slicing off the first
    # result instead would drop the wrong row when another project ties at a score of 1.0
    ranked = sorted(
        ((pos, score) for pos, score in enumerate(project_cos_sim) if pos != title_iloc),
        key=lambda pair: pair[1],
        reverse=True,
    )
    topN = ranked[:max(n, 0)]

    # read names by column and position rather than row[0], whose positional
    # integer lookup on a label-indexed Series is deprecated in pandas
    names = df_important['Repository Name']
    response = '\n'.join(
        f'{names.iloc[pos]} --> confidence: {round(score, decimals)}' for pos, score in topN
    )

    return response

Based on initial testing, the results seem quite intuitive. Flask, a Python web framework, gets results for pyspider, django and scrapy amongst other Python-related projects.

In [177]:
# print the 10 projects most similar to "flask"
print(recommend('flask', n=10))

pyspider --> confidence: 0.1
tornado --> confidence: 0.1
python-patterns --> confidence: 0.1
django --> confidence: 0.1
scrapy --> confidence: 0.1
python-fire --> confidence: 0.1
awesome-python --> confidence: 0.1
martini --> confidence: 0.1
scikit-learn --> confidence: 0.1
pelican --> confidence: 0.1


The results hold up even for something like the WebFundamentals project, a collection of best practices for web development: the function returns other similar projects such as android-best-practices and js-the-right-way.

In [178]:
# print the 10 projects most similar to "WebFundamentals"
print(recommend('WebFundamentals', n=10))

lighthouse --> confidence: 0.2
android-best-practices --> confidence: 0.2
frontend-guidelines --> confidence: 0.1
js-the-right-way --> confidence: 0.1
Best-App --> confidence: 0.1
nightmare --> confidence: 0.1
ionic --> confidence: 0.1
vux --> confidence: 0.1
beautiful-web-type --> confidence: 0.1
pure --> confidence: 0.1
