In [4]:
# imports for required libraries
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
import pickle

from sqlalchemy import create_engine

In [5]:
engine = create_engine('sqlite:///app.db')

## Load data from SQL to pandas dataframe
The following cell will issue a query on the database and assign the result to a dataframe

In [80]:
df = pd.read_sql('SELECT rater_id, project_id, language FROM ratings JOIN project on project_id = id', con=engine)
df.head(5)

Unnamed: 0,rater_id,project_id,language
0,5,16,C++
1,2,11,Shell
2,6,11,Shell
3,2,980,Swift
4,2,22,C++


In [95]:
users = pd.read_sql('SELECT * FROM user', con=engine)
users['id']

0    1
1    2
2    3
3    4
4    5
5    6
6    7
Name: id, dtype: int64

In [82]:
corpus = df.groupby('rater_id')['language'].apply(' '.join)
corpus.head

<bound method NDFrame.head of rater_id
1    Python Python Python JavaScript JavaScript HTM...
2    Shell Swift C++ Swift JavaScript Swift JavaScr...
3                       Vue Vue  CSS JavaScript Python
4                 Java Java Kotlin Java Ruby Ruby Ruby
5                                                  C++
6                                                Shell
7    JavaScript TypeScript CSS CSS HTML HTML Python...
Name: language, dtype: object>

In [None]:

corpus = df.groupby('rater_id')['language'].apply(' '.join)
corpus.head

In [83]:
cv_params = CountVectorizer(token_pattern = '[a-zA-Z0-9$&+,:;=?@#|<>.^*()%!-]+')

In [84]:
cv = cv_params.fit_transform(corpus)

In [85]:
pd.DataFrame(cv.toarray(), columns=cv_params.get_feature_names())

Unnamed: 0,c++,css,html,java,javascript,kotlin,python,ruby,shell,swift,typescript,vue
0,0,2,1,0,2,0,3,0,0,0,0,0
1,1,0,0,0,3,0,0,0,1,3,0,0
2,0,1,0,0,1,0,1,0,0,0,0,2
3,0,0,0,3,0,1,0,3,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,1,0,0,0
6,1,2,2,0,1,0,1,0,0,0,1,0


In [86]:
pickle.dump(cv, open('count_vectorizer.pickle', 'wb'))

In [87]:
vect_cos_sim = cosine_similarity(cv, cv)

Show cosine similarity matrix

In [88]:
result = pd.DataFrame(data=vect_cos_sim)
result

Unnamed: 0,0,1,2,3,4,5,6
0,1.0,0.316228,0.62361,0.0,0.0,0.0,0.748455
1,0.316228,1.0,0.253546,0.0,0.223607,0.223607,0.258199
2,0.62361,0.253546,1.0,0.0,0.0,0.0,0.436436
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0.0,0.223607,0.0,0.0,1.0,0.0,0.288675
5,0.0,0.223607,0.0,0.0,0.0,1.0,0.0
6,0.748455,0.258199,0.436436,0.0,0.288675,0.0,1.0


Shift index and column numbers up by 1

In [89]:
result.index += 1
result.columns += 1
result

Unnamed: 0,1,2,3,4,5,6,7
1,1.0,0.316228,0.62361,0.0,0.0,0.0,0.748455
2,0.316228,1.0,0.253546,0.0,0.223607,0.223607,0.258199
3,0.62361,0.253546,1.0,0.0,0.0,0.0,0.436436
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0
5,0.0,0.223607,0.0,0.0,1.0,0.0,0.288675
6,0.0,0.223607,0.0,0.0,0.0,1.0,0.0
7,0.748455,0.258199,0.436436,0.0,0.288675,0.0,1.0
