**This notebook is used for calculating k nearest neighbors for a given project mostly based on their descriptions**

- Loads in data from csv
- Generates list of top 500 (as of now) common words excluding stopwords
- Adds some other variables used for distance calculation (Stars/contributers/forks Count, programming language)
- Creates knn model using sklearn and stores it for retrieval 

In [1]:
import pandas as pd
import numpy as np
import nltk
import operator
import string
import pickle

from sklearn.neighbors import NearestNeighbors
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /home/rick/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/rick/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [25]:
# reading in data
pd.set_option('display.max_columns', 500)
PATH ="../data/repositories-1.2.0-2018-03-12.csv"
dataRepo = pd.read_csv(PATH, nrows = 400000, index_col=False)
dataRepo.head()

[nltk_data] Downloading package stopwords to /home/rick/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/rick/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,ID,Host Type,Name with Owner,Description,Fork,Created Timestamp,Updated Timestamp,Last pushed Timestamp,Homepage URL,Size,Stars Count,Language,Issues enabled,Wiki enabled,Pages enabled,Forks Count,Mirror URL,Open Issues Count,Default branch,Watchers Count,UUID,Fork Source Name with Owner,License,Contributors Count,Readme filename,Changelog filename,Contributing guidelines filename,License filename,Code of Conduct filename,Security Threat Model filename,Security Audit filename,Status,Last Synced Timestamp,SourceRank,Display Name,SCM type,Pull requests enabled,Logo URL,Keywords
0,1,GitHub,brianmhunt/knockout-modal,Opinionated modals with Knockout.js,False,2014-09-15 01:21:34 UTC,2016-12-28 16:33:17 UTC,2016-12-18 18:31:32 UTC,http://brianmhunt.github.io/knockout-modal/,512,7,JavaScript,True,True,True,0,,1,master,2,24038237,,MIT,1,README.md,,,LICENSE,,,,,2016-05-27 15:42:48 UTC,5,GitHub,,,,
1,2,GitHub,SteveSanderson/knockout.mapping,Object mapping plugin for KnockoutJS,False,2010-11-01 09:27:43 UTC,2018-02-11 10:04:55 UTC,2017-06-21 22:54:45 UTC,,924,543,JavaScript,True,True,False,782,,85,master,61,1041356,,MIT,21,README.md,,,LICENSE,,,,,2016-05-27 15:21:05 UTC,8,GitHub,,git,,
2,3,GitHub,azman-co/knockout-model,A set of useful model and collection features ...,True,2014-09-13 03:14:07 UTC,2017-03-18 22:40:02 UTC,2015-01-14 02:01:03 UTC,,472,1,JavaScript,False,True,False,0,,0,master,1,23984550,devco/knockup,,5,README.md,,,,,,,,2016-05-27 16:07:08 UTC,2,GitHub,,,,
3,4,GitHub,zonuexe/aozora-ruby-parser.js,Aozora-bunko ruby parser,False,2014-12-27 21:02:09 UTC,2016-12-28 16:45:20 UTC,2015-01-07 18:04:42 UTC,http://zonuexe.github.io/aozora-ruby-parser.js/,536,3,JavaScript,True,True,True,1,,2,master,1,28546273,,,1,README.md,,,,,,,,2016-05-27 17:22:36 UTC,3,GitHub,,,,
4,5,GitHub,immense/knockout-pickatime,Pickadate timepicker binding for Knockout.js,False,2014-12-04 21:13:48 UTC,2017-03-18 22:40:04 UTC,2014-12-11 16:12:08 UTC,http://rawgit.com/immense/knockout-pickatime/m...,192,1,CoffeeScript,True,True,False,0,,0,master,2,27560378,,MIT,1,README.md,,,LICENSE,,,,,2016-05-27 18:52:59 UTC,4,GitHub,,,,


In [26]:
# build functions that we need for processing descriptions and data

# counting word frequencies
def words_freq(s):
    d={}
    for i in s.split():

        if i in d:
            d[i] +=1
        else:
            d[i] = 1
    return d

# converting variables to dummies
def getDummies (data, feature):
    dummies = pd.get_dummies(data[feature])
    data = pd.concat([data, dummies], axis=1)
    data = data.drop([feature],axis=1)
    return data

# get_neighbors returns k nearest neighbours for an instance

# this method is old but kept here for comparison for any new ways of finding neighbors
def get_neighbors(k, instance, data, labels):
    distances = []
    index = labels[labels == instance].index[0]
    inst = data.iloc[index]
    for i in range(len(data)):
        dist = np.linalg.norm(np.array(inst) - np.array(data.iloc[i]))
        distances.append((dist, i))
    distances.sort(key=lambda x: x[0])
    neighbors = distances[1:k+1]
    indexes = [x[1] for x in neighbors]
    print(indexes)
    # loc OR iloc?
    return labels.iloc[indexes]


# current best method for getting neighbors
def get_neighbors_improved(instance, data, labels):
    index = labels[labels == instance].index[0]
    inst = np.array(data[index]).reshape(1, -1)
    distances, indices = nbrs.kneighbors(inst)
    print(indices)
    # loc OR iloc?
    return labels.iloc[indices[0]]

In [27]:
dataRepo.Keywords.fillna(0, inplace=True)
dataRepo.Description.fillna(0, inplace=True)
dataRepo = dataRepo[dataRepo.Description != 0]

In [28]:
# findig keywords to use in knn

descriptions = list(dataRepo.Description)

joinedDescriptions = " ".join(descriptions)
joinedDescriptions = joinedDescriptions.lower().translate(str.maketrans('', '', string.punctuation))
stop_words = set(stopwords.words('english'))

word_tokens = word_tokenize(joinedDescriptions)

filtered_sentence = [w for w in word_tokens if not w in stop_words]

filtered_sentence = []

for w in word_tokens:
    if w not in stop_words:
        filtered_sentence.append(w)

filtered_sentence = " ".join(filtered_sentence)

word_freq = words_freq(filtered_sentence)
sortedList = sorted(word_freq.items(), key = operator.itemgetter(1), reverse=True)
sortedDict = dict(sortedList)
topWords = list(sortedDict.keys())[:500]
print(topWords)

['library', 'go', 'simple', 'api', 'plugin', 'nodejs', 'ruby', 'using', 'client', 'framework', 'php', 'module', 'javascript', 'wrapper', 'files', 'data', 'web', 'package', 'golang', 'based', 'server', 'implementation', 'use', 'file', 'python', 'tool', 'rails', 'code', 'gem', 'application', 'written', 'support', 'node', 'generator', 'project', 'json', 'interface', 'http', 'command', 'app', 'provides', 'service', 'easy', 'language', 'html', 'line', 'system', 'create', 'utility', 'jquery', 'text', 'engine', 'object', 'extension', 'google', 'parser', 'like', 'css', 'test', 'component', 'applications', 'database', 'class', 'build', 'testing', 'java', 'used', 'functions', 'generate', 'small', 'bundle', 'way', 'objects', 'allows', 'ios', 'middleware', 'browser', 'laravel', 'angularjs', 'set', 'lightweight', 'tools', 'get', 'bindings', 'collection', 'version', 'via', 'template', 'add', 'source', 'image', 'helper', 'configuration', 'make', 'projects', 'js', 'apps', 'integration', 'rest', 'cli',

In [29]:
# normalize numeric variables, should we weigh these somehow?
numeric_vars = ['Stars Count', 'Contributors Count', 'Forks Count']
knn_num = dataRepo[numeric_vars]
knn_num = knn_num.apply(lambda col: ((col-np.mean(col))/np.std(col)), axis=0)

# add dummies for language and keywords
knn_data = dataRepo[['Language', 'Description']]
knn_data = getDummies(knn_data, 'Language')

for word in topWords:
    newcol = np.zeros(len(knn_data))
    i = 0;
    for row in knn_data['Description']:
        if(word in row):
            newcol[i] = 1
        i += 1
    knn_data[word] = newcol

# join numeric variables to knn data
knn_data[numeric_vars] = knn_num
knn_data = knn_data.drop(['Description'],axis=1)
knn_data.head()

names = dataRepo['Name with Owner']

MemoryError: 

In [9]:
# calculate k nearest neighbours for a project using the old slow method
nbrs = get_neighbors(5, "immense/knockout-pickatime", knn_data, names)
print(nbrs)

[5, 2078, 832, 1145, 1522]
5         immense/knockout-pickadate
2290       icambron/moment-countdown
935         jiyinyiyong/compact-json
1264    chrisenytc/livi18n.socket.js
1672                Maslosoft/Binder
Name: Name with Owner, dtype: object


In [10]:
# Making a NN model, this takes a while but then retrieving info is very fast
knn_data = np.array(knn_data)
nbrs = NearestNeighbors(n_neighbors=6, algorithm='auto').fit(knn_data)

In [11]:
# getting the neighbors from the "improved" model
nbrsTEST = get_neighbors_improved("immense/knockout-pickatime", knn_data, names)
print(nbrsTEST)

[[   4    5 2078  832 1522 1145]]
4         immense/knockout-pickatime
5         immense/knockout-pickadate
2290       icambron/moment-countdown
935         jiyinyiyong/compact-json
1672                Maslosoft/Binder
1264    chrisenytc/livi18n.socket.js
Name: Name with Owner, dtype: object


In [None]:
# dump model to pickle
pickle.dump(nbrs, open('knn_model', 'wb'))