In [229]:

import tokenize
import json
import os
import pandas as pd
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

In [2]:
# Source file that the exploratory cells below open and tokenize.
FILEPATH='./raw/scikit-learn/sklearn/decomposition/_factor_analysis.py'

In [3]:
# Gather every string-literal token in FILEPATH as a
# (first line, last line, literal text) triple. Despite the variable name these
# are STRING tokens (which is where docstrings live), not `#` comments.
with open(FILEPATH, 'rb') as f:
    comment_list = [
        (tok.start[0], tok.end[0], tok.string)
        for tok in tokenize.tokenize(f.readline)
        if tok.type == tokenize.STRING  # tokenize.STRING is the token type numbered 3
    ]

In [4]:
# Load the JSON index for the same file and derive two lookups from its
# 'FunctionDef' section:
#   ln_fdef          line number   -> names of the functions listing that line
#   function_params  function name -> that function's 'params' entry
with open('./index/scikit-learn/sklearn/decomposition/_factor_analysis.json', 'r') as f:
    file_content = json.load(f)

ln_fdef = {}
function_params = {}
for fd, fd_info in file_content['FunctionDef'].items():
    for ln in fd_info['lineno']:
        ln_fdef.setdefault(ln, []).append(fd)
    function_params[fd] = fd_info['params']

In [5]:
# Pair each string literal with the functions indexed on the line directly
# above its first line or directly below its last line.
cfunc_pairs = []
for first_line, last_line, literal in comment_list:
    for neighbour in (first_line - 1, last_line + 1):
        for func_name in ln_fdef.get(neighbour, ()):
            cfunc_pairs.append((func_name, literal))

In [6]:
# For every matched pair, show the function's parameter list followed by the
# string literal that was found next to it.
for func, comment in cfunc_pairs:
    print(function_params[func], comment, sep='\n')

['self', 'X', 'y']
"""Fit the FactorAnalysis model to X using SVD based approach.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data.

        y : Ignored
            Ignored parameter.

        Returns
        -------
        self : object
            FactorAnalysis class instance.
        """
['self', 'X']
"""Apply dimensionality reduction to X using the model.

        Compute the expected mean of the latent variables.
        See Barber, 21.2.33 (or Bishop, 12.66).

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data.

        Returns
        -------
        X_new : ndarray of shape (n_samples, n_components)
            The latent variables of X.
        """
['self']
"""Compute data covariance with the FactorAnalysis model.

        ``cov = components_.T * components_ + diag(noise_variance)``

        Returns
        -------
        cov : ndarray

In [7]:
# with open(os.path.join('./index/', "file_key.txt")) as f:
#         j = json.loads(f.read())

In [48]:
def extract_parameters_from_func_description(func_description):
    """Extract parameter names and flattened descriptions from a docstring.

    A line containing ``' : '`` starts an entry, and the text before its first
    colon is the parameter name. The entry extends over the following lines
    until a blank line or a line containing a colon is reached. From the joined
    text everything up to the first colon is dropped, then remaining colons,
    backslashes, carriage returns, tabs and commas are removed and runs of
    whitespace are collapsed to a single space.

    Everything from the ``Returns`` section header onward is ignored. The
    header is recognised as a line consisting solely of the word ``Returns``
    (optionally followed by a colon). Matching the bare word anywhere in the
    text would also fire on prose such as a summary beginning with
    "Returns the ..." and throw away every parameter after it.

    Parameters
    ----------
    func_description : str
        Raw docstring text.

    Returns
    -------
    dict
        Parameter name mapped to its cleaned description. A name that occurs
        more than once keeps its last description.
    """
    lines = func_description.split('\n')

    # Truncate at the Returns section header, if there is one.
    for idx, line in enumerate(lines):
        if line.strip().rstrip(':').strip() == 'Returns':
            lines = lines[:idx]
            break

    parameters = {}
    total = len(lines)
    for line_no, line in enumerate(lines):
        if ' : ' not in line:
            continue

        # Absorb continuation lines: stop at a blank line or at any line with a
        # colon (which is taken to start the next entry).
        end = line_no + 1
        while end < total and lines[end].strip() != '' and ':' not in lines[end]:
            end += 1

        name = line.split(':')[0].strip()
        description = ' '.join(lines[line_no:end])
        description = ''.join(description.split(':')[1:])
        # Commas are removed because the descriptions are later written to a
        # comma-separated file without quoting.
        for unwanted in ('\\', '\r', '\t', ','):
            description = description.replace(unwanted, '')

        parameters[name] = ' '.join(description.split())

    return parameters
        

def get_parameter_definition_locations(json_filepath, function_name):
    """Print the documented parameters of one function from an indexed file.

    The raw source is read from ``./raw/<json_filepath with .json -> .py>`` and
    its index from ``./index/<json_filepath>``. String literals sitting on the
    line directly before or after a line listed for a function in the index are
    treated as that function's description.

    NOTE: the location lookup itself is not implemented. The function prints
    debugging output (``DEBUG`` is hard-wired to True) and always returns an
    empty dict.

    Parameters
    ----------
    json_filepath : str
        Index file path relative to ``./index``.
    function_name : str
        Key of the function in the index's 'FunctionDef' section.

    Returns
    -------
    dict
        Always empty.

    Raises
    ------
    KeyError
        If no string literal was paired with ``function_name``.
    """
    DEBUG = True

    # String-literal tokens of the raw source: (first line, last line, text).
    source_path = os.path.join('./raw', json_filepath.replace('.json', '.py'))
    with open(source_path, 'rb') as f:
        comment_list = [
            (tok.start[0], tok.end[0], tok.string)
            for tok in tokenize.tokenize(f.readline)
            if tok.type == tokenize.STRING  # token type 3
        ]

    with open(os.path.join('./index', json_filepath), 'r') as f:
        file_content = json.load(f)

    # line number -> function names, and function name -> parameter list.
    ln_fdef = {}
    function_params = {}
    for fd, fd_info in file_content['FunctionDef'].items():
        for ln in fd_info['lineno']:
            ln_fdef.setdefault(ln, []).append(fd)
        function_params[fd] = fd_info['params']

    # A later literal adjacent to the same function overwrites an earlier one.
    cfunc_pairs = {}
    for first_line, last_line, literal in comment_list:
        for neighbour in (first_line - 1, last_line + 1):
            for fd in ln_fdef.get(neighbour, ()):
                cfunc_pairs[fd] = literal

    if DEBUG:
        for fd, literal in cfunc_pairs.items():
            print(fd)
            print(function_params[fd])
            print(literal)
        print('\n')

    param_description_map = extract_parameters_from_func_description(cfunc_pairs[function_name])
    param_location_map = {}  # never populated: the lookup is still to be written

    if DEBUG:
        print('Parameters for function: {}'.format(function_name))
        for param_name, param_description in param_description_map.items():
            print(param_name, ':', param_description)

    return param_location_map
    
def get_unlabeled_csv(json_filepath):
    """Collect (parameter, description, imports) rows for one indexed file.

    The raw source is read from ``./raw/<json_filepath with .json -> .py>`` and
    its index from ``./index/<json_filepath>``. String literals on the line
    directly before or after a line listed for a function in the index are
    treated as that function's description, and every parameter entry parsed
    from those descriptions yields one row.

    Parameters
    ----------
    json_filepath : str
        Index file path relative to ``./index``.

    Returns
    -------
    list of tuple
        ``(parameter_name, description, imports)`` where ``imports`` is the
        space-joined content of the index's 'Import' section, repeated on every
        row of the file.
    """
    DEBUG = False

    # String-literal tokens of the raw source: (first line, last line, text).
    source_path = os.path.join('./raw', json_filepath.replace('.json', '.py'))
    with open(source_path, 'rb') as f:
        comment_list = [
            (tok.start[0], tok.end[0], tok.string)
            for tok in tokenize.tokenize(f.readline)
            if tok.type == tokenize.STRING  # token type 3
        ]

    with open(os.path.join('./index', json_filepath), 'r') as f:
        file_content = json.load(f)

    imports = ' '.join(file_content['Import'])

    # line number -> function names, and function name -> parameter list.
    ln_fdef = {}
    function_params = {}
    for fd, fd_info in file_content['FunctionDef'].items():
        for ln in fd_info['lineno']:
            ln_fdef.setdefault(ln, []).append(fd)
        function_params[fd] = fd_info['params']

    # A later literal adjacent to the same function overwrites an earlier one.
    cfunc_pairs = {}
    for first_line, last_line, literal in comment_list:
        for neighbour in (first_line - 1, last_line + 1):
            for fd in ln_fdef.get(neighbour, ()):
                cfunc_pairs[fd] = literal

    if DEBUG:
        for fd, literal in cfunc_pairs.items():
            print(fd)
            print(function_params[fd])
            print(literal)
        print('\n')

    param_description_maps = [
        extract_parameters_from_func_description(literal)
        for literal in cfunc_pairs.values()
    ]

    # Every parsed entry becomes a row; no further filtering is applied.
    return [
        (param_name, description, imports)
        for param_description_map in param_description_maps
        for param_name, description in param_description_map.items()
    ]
    
def get_unlabeled_data_across_all_files(root_dir, file_limit = 10):
    """Gather unlabeled rows from the index files under a root and write data.csv.

    Walks ``index/<root_dir>``, calls ``get_unlabeled_csv`` on each ``.json``
    file found, and writes all rows to ``data.csv`` in the current directory
    (overwriting it). Fields are joined with commas without quoting, so a
    field that itself contains a comma will shift the columns of its row.

    Parameters
    ----------
    root_dir : str
        Directory below ``index`` to walk.
    file_limit : int, default 10
        Maximum number of index files to process over the whole walk; ``-1``
        disables the limit and ``0`` processes nothing.

    Returns
    -------
    list of tuple
        The ``(parameter, comment, imports)`` rows that were written.
    """
    def _iter_index_files():
        # Yield each index file path relative to 'index', because
        # get_unlabeled_csv prepends './index' and './raw' itself.
        for root, _dirs, files in os.walk(os.path.join('index', root_dir)):
            relative_root = os.sep.join(root.split(os.sep)[1:])
            for file in files:
                if file.endswith('.json'):
                    yield os.path.join(relative_root, file)

    ret = []
    if file_limit != 0:
        # A single flat loop lets the limit end the whole walk; breaking out of
        # a per-directory loop would only skip the rest of that one directory.
        for count, json_path in enumerate(_iter_index_files(), start=1):
            ret += get_unlabeled_csv(json_path)
            if file_limit != -1 and count >= file_limit:
                break

    with open('data.csv', 'w') as file:
        # Header names all three columns that each row carries.
        file.write('parameter,comment,imports\n')
        for param_name, comment, imports in ret:
            file.write(param_name + ',' + comment + ',' + imports + '\n')

    return ret

In [49]:
# Build data.csv from every indexed scikit-learn file (-1 disables the file
# limit). The rows are bound to a name and only their count is displayed,
# because echoing the full list would flood the notebook output.
unlabeled_rows = get_unlabeled_data_across_all_files('scikit-learn', -1)
len(unlabeled_rows)

KeyboardInterrupt: 

Feature set: parameter names, their docstring comments, and labels.

In [238]:
def convert_to_input_format(parameter, comment, options, labels):
    """Build one (text, label) sample.

    The three text fields are joined with single spaces, in the order
    parameter, comment, options; ``labels`` is passed through unchanged.
    """
    text = ' '.join((parameter, comment, options))
    return text, labels

def extract_csv_to_numpy_array(path):
    """Read labeled samples from a comma-separated file.

    The first line is skipped as a header. Every other line is split on
    commas; fields 0-2 are combined into the sample text by
    ``convert_to_input_format`` and field 3 is the label. Rows are kept only
    when the label is non-empty and contains no space.

    Lines with fewer than four fields (blank lines, rows that have not been
    labeled yet) are skipped instead of raising ``IndexError``. When no row
    qualifies, two empty arrays are returned instead of failing on the
    2-D slice.

    Parameters
    ----------
    path : str
        Path of the CSV file.

    Returns
    -------
    tuple of numpy.ndarray
        ``(texts, labels)``, two 1-D string arrays of equal length.
    """
    rows = []
    with open(path, 'r') as file:
        next(file, None)  # skip the header line (a no-op on an empty file)
        for raw_line in file:
            fields = raw_line.strip().split(',')
            if len(fields) < 4:
                continue
            sample = convert_to_input_format(fields[0], fields[1], fields[2], fields[3])
            label = sample[-1]
            if label != '' and ' ' not in label:
                rows.append(sample)

    if not rows:
        return np.array([], dtype=str), np.array([], dtype=str)

    np_array = np.array(rows)
    return np_array[:, 0], np_array[:, 1]
                    
def convert_labels_to_discrete(labels: np.ndarray):
    """Encode labels with a freshly fitted ``LabelEncoder``.

    Returns
    -------
    tuple
        ``(encoded_labels, encoder)``: the output of ``fit_transform`` and the
        fitted encoder, which callers keep to decode predictions.
    """
    encoder = LabelEncoder()
    encoded = encoder.fit_transform(labels)
    return encoded, encoder

def get_bow_model(data: np.ndarray):
    """Fit a ``CountVectorizer`` on ``data`` and return dense counts.

    Parameters
    ----------
    data : np.ndarray
        The texts to vectorize.

    Returns
    -------
    tuple
        ``(bow, count_vec)``: ``bow`` is the vectorizer's ``fit_transform``
        output converted to a plain ``numpy.ndarray``, and ``count_vec`` is the
        fitted vectorizer.
    """
    count_vec = CountVectorizer()
    # toarray() produces the ndarray directly, avoiding the intermediate
    # np.matrix from todense() and the extra copy needed to convert it.
    bow = count_vec.fit_transform(data).toarray()
    return bow, count_vec

def get_dense_vect_for_single_str(item, count_vec):
    """Vectorize one string with an already fitted vectorizer.

    Parameters
    ----------
    item : str
        Text to vectorize.
    count_vec : object
        Fitted vectorizer; its ``transform`` is called with ``[item]`` and must
        return a sparse matrix.

    Returns
    -------
    numpy.ndarray
        The single-row dense result of the transform. A plain ndarray is
        returned rather than the ``np.matrix`` that ``todense()`` yields,
        because newer scikit-learn estimators reject ``np.matrix`` input.
    """
    return count_vec.transform([item]).toarray()

In [239]:
# Read the sample texts and label strings from data.csv, then fit the
# bag-of-words model on the texts: `bow` holds the dense counts and
# `count_vec` the fitted vectorizer.
data, labels = extract_csv_to_numpy_array('data.csv')
bow, count_vec = get_bow_model(data)

In [240]:
# Replace the label strings with integer codes; `le` is kept to decode predictions.
# NOTE(review): `labels` is rebound to its encoded form, so re-running only this
# cell would fit the encoder on the integer codes instead of the original
# strings. Re-run the loading cell first.
labels, le= convert_labels_to_discrete(labels)

In [241]:
# Fit Gaussian naive Bayes on the dense counts; fit() returns the estimator
# itself, so the fitted model is bound directly and then echoed.
nb_classifier = GaussianNB().fit(X=bow, y=labels)
nb_classifier

GaussianNB()

In [242]:
# Fraction of rows whose predicted class equals the encoded label. This is
# measured on the same rows the classifier was fitted on, so it is training
# accuracy rather than a held-out estimate.
np.mean(nb_classifier.predict(bow) == labels)

0.9612068965517241

In [253]:
# Predict the first row of `bow` and decode the class code back to its label string.
first_row_prediction = nb_classifier.predict(bow[:1])
le.inverse_transform(first_row_prediction)[0]

'docutils'

In [263]:
def assess_comment(comment, model, le, count_vec):
    """Predict the label for a single piece of text.

    ``comment`` is vectorized with ``count_vec``, classified by ``model``, and
    the predicted class code is decoded with ``le``.

    Returns
    -------
    The first (only) element of the decoded prediction.
    """
    features = get_dense_vect_for_single_str(comment, count_vec)
    predicted_codes = model.predict(features)
    return le.inverse_transform(predicted_codes)[0]

In [264]:
# Classify one free-form sentence with the fitted classifier, encoder and vectorizer.
assess_comment('The number of samples in the data is 1000', nb_classifier, le, count_vec)



'numpy'

In [243]:
# def chirag_comparison(comment):
#     if 'array-like' in comment or 'ndarray' in comment or 'RandomState' in comment:
#         return 'numpy'
#     else:
#         return 'builtin'

# sum(le.transform([chirag_comparison(x) for x in data]) == labels) / len(labels)