# BERT on Repo Description

1. Construct a sentence corpus for each software type using labeled/manually validated repo descriptions
2. Calculate embeddings for each corpus
3. Compare every repo description with each software type corpus using the cosine-similarity score
    - It took about 1.5 hrs to run the embedding on repo data

Author: Cierra and Crystal

In [None]:
# IPython magic: clear the names defined in the interactive namespace so the
# notebook starts from a clean state.
# NOTE(review): without the -f flag IPython asks for confirmation, which
# interrupts "Run All" -- confirm whether `%reset -f` is what is wanted here.
%reset

In [1]:
#pgadmin
import os
import psycopg2 as pg


#bert
from sentence_transformers import SentenceTransformer, util
import torch

import pandas as pd

import re

import nltk
nltk.download("punkt")

from nltk import tokenize

import scipy

import datetime

import math

[nltk_data] Downloading package punkt to /home/dab3dj/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Embedding Model

In [7]:
# Sanity-check the CUDA setup.
# torch.cuda.current_device() raises when no usable GPU is present, so it is
# only queried when CUDA is actually available; the embedder in the next cell
# runs on CPU, so this check must not stop a CPU-only run.
cuda_available = torch.cuda.is_available()
print("CUDA available:", cuda_available)  # True on a GPU node

if cuda_available:
    # index of the currently selected GPU
    print("Current CUDA device index:", torch.cuda.current_device())

# Number of visible GPUs (0 on a CPU-only machine); shown as the cell output.
cuda_device_count = torch.cuda.device_count()
cuda_device_count

1

In [20]:
# Sentence-embedding model.
#   'paraphrase-MiniLM-L6-v2'   -> quicker model (alternative, not used)
#   'paraphrase-mpnet-base-v2'  -> most accurate, long run time (used below)
embedder = SentenceTransformer(
    'paraphrase-mpnet-base-v2',
    device='cpu',
)

# Data

## I. Unlabelled Repo Data

In [11]:
# Load the cleaned English GitHub repo table.
repo_csv_path = "~/git/dspg21oss/data/dspg21oss/clean_eng_github_repos_157k.csv"
repo_data = pd.read_csv(repo_csv_path)

In [12]:
# Descriptions as a plain Python list, one entry per row of repo_data, in row order.
repo_description = repo_data.loc[:, "description"].to_list()

## II. Labelled Repo Data -- Software Type Corpus

In [13]:
# Manually labelled repo spreadsheets (Excel workbooks), one per language.
labelled_workbooks = {
    "python": '~/git/dspg21oss/data/dspg21oss/labelled_repo/oss_software_labelled_python_sz.xlsx',
    "c": '~/git/dspg21oss/data/dspg21oss/labelled_repo/oss_software_labelled_c_sz.xlsx',
    "java": '~/git/dspg21oss/data/dspg21oss/labelled_repo/oss_software_labelled_java_sz.xlsx',
}
python_data = pd.read_excel(labelled_workbooks["python"])
c_data = pd.read_excel(labelled_workbooks["c"])
java_data = pd.read_excel(labelled_workbooks["java"])

In [14]:
# Build the description corpus for a single software type (the Python label here).
type_name = "python_label"

# Rows whose label equals 1 (numeric), reduced to the slug and the label column.
is_labelled = python_data[type_name] == 1
labelled_slugs = python_data.loc[is_labelled, ["slug", type_name]]

# Left-merge onto repo_data to pick up the cleaned description for each slug,
# then keep only the descriptions as a list.
corpus_type_i = (
    labelled_slugs
    .merge(repo_data, how='left', on='slug')
    ["description"]
    .tolist()
)

In [38]:
# Load the table of manually validated software-type labels (CSV).
validated_csv_path = '~/git/dspg21oss/data/dspg21oss/oss_software_labelled.csv'
validated_data = pd.read_csv(validated_csv_path)

# Embedding 

In [73]:
# Embeddings for the full list of unvalidated repo descriptions.
#
# encode() is given the whole list at once so that it can batch the sentences
# internally, instead of paying the per-call overhead once per description.
# The stacked result is unpacked back into a list with one tensor per
# description, so query_embeddings keeps the same shape of data as before.
queries = repo_description

if queries:
    query_embeddings = list(
        embedder.encode(queries, show_progress_bar=False, convert_to_tensor=True)
    )
else:
    # nothing to embed; keep the result an (empty) list
    query_embeddings = []


In [None]:
# Label columns in validated_data, one per software type.
types = ['ai_label', 'blockchain_label','clang_label', 'database_label',
         'dataviz_label','java_label','javascript_label','php_label','python_label']

# Number of most-similar corpus sentences whose scores are kept per repo description.
num_sentences = 10

# NOTE(review): validated repos are also part of the query set, so a labelled repo
# can match its own description with a score of ~1.0 -- confirm this is intended.

# print start time
t1 = datetime.datetime.now()
print("Start:", t1)

# Only slug and description are needed for the corpus merge. Taking the slice once,
# before the loop, also keeps the score columns added below out of later merges.
repo_text = repo_data[["slug", "description"]]

# for each software type
for sw in types:
    # grab the repos validated as this software type (label == 1)
    corpus_type = validated_data[validated_data[sw] == 1][["slug", sw]]
    # left merge to get the cleaned description of each validated repo
    corpus_type = corpus_type.merge(repo_text, on='slug', how='left')
    corpus_type = corpus_type["description"].tolist()
    # drop missing descriptions (non-string values such as NaN)
    corpus_type = [x for x in corpus_type if isinstance(x, str)]
    if not corpus_type:
        raise ValueError("No validated descriptions found for software type " + repr(sw))

    # torch.topk raises if k exceeds the number of corpus sentences
    top_k = min(num_sentences, len(corpus_type))

    # Encode the corpus once, directly as a tensor, so the similarity call does not
    # have to convert the same array again for every query.
    corpus_type_embeddings = embedder.encode(
        corpus_type, show_progress_bar=False, convert_to_tensor=True
    )

    # print time for each type
    t2 = datetime.datetime.now()
    print("type ", sw,": ", t2)

    # top-k cosine-similarity scores for each unvalidated repo description
    result = []
    for query_embedding in query_embeddings:
        cos_scores = util.pytorch_cos_sim(query_embedding, corpus_type_embeddings)[0]
        top_results = torch.topk(cos_scores, k=top_k)
        # plain list of the k highest scores, highest first
        result.append(top_results.values.tolist())

    # add results to repo_data, e.g. "ai_label" -> "ai_sim_score"
    result_col = sw.replace("label","sim_score")
    repo_data[result_col] = result

t3 =  datetime.datetime.now()
# count the queries directly so the message does not depend on the loop variable
print("Finished", len(query_embeddings), "descriptions at", t3)
print("It took", t3-t1, "to run.")

Start: 2021-07-26 17:09:26.684372
type  ai_label :  2021-07-26 17:09:27.225207


In [None]:
# Save the per-type similarity scores as CSV.
# The path is home-relative, like the input paths used to load the data, so the
# notebook is not tied to one user account.
repo_data.to_csv('~/git/dspg21oss/data/dspg21oss/full_repo_sim_scores_co.csv', index=False)


# Similarity Score Analysis

In [7]:
from scipy import stats
from scipy.stats import skew
import statistics #calculate mean and others

In [3]:
# Read the saved similarity scores back in.
# Home-relative path, like the other input paths, instead of a user-specific one.
repo_data = pd.read_csv('~/git/dspg21oss/data/dspg21oss/full_repo_sim_scores_co.csv')


In [None]:
# Preview the first five rows of the loaded score table.
repo_data.head(n=5)

In [8]:
# Keep an independent snapshot of the freshly loaded frame. The following cells
# assign columns on repo_data in place, and a plain `=` would only create a second
# name for the same object, so the "copy" would change along with it.
repo_data_copy = repo_data.copy()

# list of sim score columns
score_cols = ["ai_sim_score", "blockchain_sim_score", 'clang_sim_score', 'database_sim_score', 'dataviz_sim_score',
       'java_sim_score', 'javascript_sim_score', 'php_sim_score','python_sim_score']

In [9]:
# Each score cell holds the text form of a list ("[a, b, ...]"); turn every cell
# back into a list of floats and overwrite the column with the parsed lists.
for col in score_cols:
    parsed_rows = []
    for raw_cell in repo_data[col]:
        inner = str(raw_cell)[1:-1]  # drop the surrounding brackets
        parsed_rows.append([float(token) for token in inner.split(",")])

    repo_data[col] = parsed_rows

In [10]:
# Summary statistics of the top-k scores, per repo and per software type.
# For every score column this adds five columns: mean, range (max - min), max,
# median and skewness of that repo's score list.
for col in score_cols:
    score_lists = repo_data[col]

    # "ai_sim_score" -> "ai_" (the trailing underscore stays in the prefix)
    prefix = col.replace("sim_score","")

    # NOTE(review): the mean suffix carries its own leading underscore, so that
    # column comes out as e.g. "ai__mean_score" while the others are
    # "ai_range_score" etc. Kept as is because the name is written to the saved
    # CSV -- confirm with downstream users before renaming.
    summaries = {
        "_mean_score": [statistics.mean(scores) for scores in score_lists],
        "range_score": [max(scores) - min(scores) for scores in score_lists],
        "max_score": [max(scores) for scores in score_lists],
        "median_score": [statistics.median(scores) for scores in score_lists],
        "skewness_score": [stats.skew(scores) for scores in score_lists],
    }

    # add the stat columns in the order listed above
    for suffix, values in summaries.items():
        repo_data[prefix + suffix] = values

In [11]:
# Preview the first five rows, now including the per-type summary columns.
repo_data.head(n=5)

Unnamed: 0,slug,description,ai_sim_score,blockchain_sim_score,clang_sim_score,database_sim_score,dataviz_sim_score,java_sim_score,javascript_sim_score,php_sim_score,...,php__mean_score,php_range_score,php_max_score,php_median_score,php_skewness_score,python__mean_score,python_range_score,python_max_score,python_median_score,python_skewness_score
0,vuejs/vue,Vue js is a progressive incrementally adopt...,"[0.38483116030693054, 0.34321022033691406, 0.3...","[0.3770439028739929, 0.37109243869781494, 0.36...","[0.4809216260910034, 0.42691099643707275, 0.42...","[0.3832893371582031, 0.38218772411346436, 0.38...","[0.6460678577423096, 0.47009673714637756, 0.46...","[0.45489370822906494, 0.3935528099536896, 0.37...","[0.6812090873718262, 0.5953835844993591, 0.560...","[0.43424174189567566, 0.43411487340927124, 0.4...",...,0.376956,0.113091,0.434242,0.388413,-0.080969,0.402169,0.091767,0.45304,0.402336,0.328891
1,facebook/react,A declarative efficient and flexible JavaScr...,"[0.40412580966949463, 0.40174245834350586, 0.3...","[0.4677104353904724, 0.4392547011375427, 0.383...","[0.5760725140571594, 0.5097417235374451, 0.488...","[0.49702656269073486, 0.45085686445236206, 0.4...","[0.5633409023284912, 0.5507672429084778, 0.506...","[0.5418217182159424, 0.5393773317337036, 0.452...","[1.0000003576278687, 0.763877272605896, 0.6752...","[0.5178160071372986, 0.48840513825416565, 0.47...",...,0.420226,0.176267,0.517816,0.425113,0.077832,0.510649,0.108225,0.589151,0.498362,1.435497
2,tensorflow/tensorflow,An Open Source Machine Learning Framework for ...,"[0.6374237537384033, 0.5639429092407227, 0.554...","[0.4424801468849182, 0.4372060000896454, 0.425...","[0.47011685371398926, 0.42693886160850525, 0.4...","[0.49735206365585327, 0.48971667885780334, 0.4...","[0.616600513458252, 0.5324415564537048, 0.4950...","[0.5114375352859497, 0.40726539492607117, 0.40...","[0.5229647755622864, 0.4638928174972534, 0.430...","[0.4243718385696411, 0.36536622047424316, 0.36...",...,0.342346,0.114485,0.424372,0.328773,1.296752,0.598564,0.087771,0.643442,0.586319,0.330979
3,twbs/bootstrap,The most popular HTML CSS and JavaScript fra...,"[0.34504151344299316, 0.2958582937717438, 0.27...","[0.3497481048107147, 0.34957581758499146, 0.33...","[0.49070748686790466, 0.47657108306884766, 0.4...","[0.372490257024765, 0.3629458248615265, 0.3476...","[0.40439078211784363, 0.3792816698551178, 0.35...","[0.38311436772346497, 0.3724273443222046, 0.36...","[1.0000003576278687, 0.6907554864883423, 0.591...","[0.42142561078071594, 0.3685976564884186, 0.36...",...,0.350699,0.11513,0.421426,0.34691,0.833566,0.403796,0.123711,0.476278,0.387911,0.533113
4,ohmyzsh/ohmyzsh,A delightful community driven with 1700 c...,"[0.35356998443603516, 0.3418255150318146, 0.32...","[0.43588021397590637, 0.3517952263355255, 0.34...","[0.38053128123283386, 0.340486079454422, 0.331...","[0.4280346632003784, 0.4265906810760498, 0.423...","[0.47275614738464355, 0.42588570713996887, 0.4...","[0.3862834870815277, 0.3327464759349823, 0.326...","[0.3588447570800781, 0.332401305437088, 0.3097...","[0.42803463339805603, 0.399021178483963, 0.363...",...,0.362576,0.096991,0.428035,0.356334,1.179895,0.423431,0.086344,0.483283,0.412352,1.354862


In [13]:
import csv

In [15]:
# Save the scores together with their summary statistics as CSV.
# Home-relative path, like the input paths used to load the data, instead of a
# user-specific absolute one.
repo_data.to_csv('~/git/dspg21oss/data/dspg21oss/repo_sim_scores_stats_co.csv', index=False)

In [18]:
# Round-trip check: reload the saved file and look at one score cell.
# Home-relative path instead of a user-specific absolute one.
a = pd.read_csv('~/git/dspg21oss/data/dspg21oss/repo_sim_scores_stats_co.csv')
# single label-based lookup (row 0, score column) instead of chained indexing
a.loc[0, 'ai_sim_score']

'[0.38483116030693054, 0.34321022033691406, 0.34321022033691406, 0.3432101905345917, 0.3432101905345917, 0.3432101905345917, 0.3432101905345917, 0.3432101905345917, 0.3432101905345917, 0.31159618496894836]'

In [19]:
# First element of the reloaded cell in row 0 of the score column.
a.at[0, 'ai_sim_score'][0]

'['