In [1]:
import os
import random
import pandas as pd
import numpy as np
import re
from network import Network
import csv
from sklearn import metrics, cross_validation

In [2]:
possible_factors = ["/$", "def", "end", "{", "}", "&", "\(", "\)", "nil", "var", "print", "END", "do", "int", "str", "static",
                    "public", "/+/+", "put", "return", "map" "->", "BigInt", "php", "|", "each"]
file_extensions = {".py": "python",
                   ".rb": "ruby",
                   ".ruby": "ruby",
                   ".jruby": "ruby",
                   ".python": "python",
                   ".clojure": "clojure",
                   ".php": "php",
                   ".ocaml": "ocaml",
                   ".java": "java",
                   ".javascript": "javascript",
                   ".python3": "python",
                   ".racket": "scheme",
                   ".ghc": "haskell",
                   ".tcl": "tcl",
                   ".scala": "scala"}

In [3]:
def get_training_samples(directory, extension_dict):
    "Read in the examples, kick out a list of tuples of text and classifications."
    text_list = []
    lang_list = []
    for root, dirs, files in os.walk(directory):
        for particular_file in files:
            extension = os.path.splitext(particular_file)[1]
            if extension in file_extensions:
                try:
                    file_in = open(os.path.join(root, particular_file))
                    text = file_in.read()
                    text_list.append(text)
                    lang_list.append(extension_dict[extension])
                except:
                    print("hiccuped on : " + particular_file)
    df = pd.DataFrame({"code": text_list,
                       "language": lang_list})
    data_array = np.array(list(zip(text_list, lang_list)))
    return data_array

In [4]:
training_data = get_training_samples("benchmarksgame-2014-08-31/", file_extensions)

hiccuped on : pidigits.ocaml-2.ocaml


In [5]:
def old_text_factors(snippet, regex_objects):
    "Takes a code snippet and returns a vector of features in [0,1]"
    normalizing_factor = len(snippet)
    factors = []
    for factor in range(len(regex_objects)):
        target = regex_objects[factor]
        total_num = len(target.findall(snippet))
        frequency = total_num / normalizing_factor
        factors.append(frequency)
    factor_array = np.array(factors)  
    factor_array = factor_array.reshape(-1, 1)
    return factor_array

In [6]:
def text_factors(snippet, regex_objects):
    "Takes a code snippet and returns a vector of features in [0,1]"
    normalizing_factor = len(snippet)
    factors = []
    for factor in range(len(regex_objects)):
        target = regex_objects[factor]
        if target.search(snippet) is None:
            value = 0
        else:
            value = 1
        factors.append(value)
    factor_array = np.array(factors)  
    factor_array = factor_array.reshape(-1, 1)
    return factor_array

In [7]:
def compile_regex_objects(factors):
    "Given possible strings in the code, return a list of regex objects that can match for them."
    objects = []
    for factor in factors:
        try:
            current_object = re.compile(factor)
        except:
            print("Problem with {}".format(factor))
        objects.append(current_object)
    return objects

In [8]:
training_data.shape

(367, 2)

In [9]:
training_data[0][1]

'clojure'

In [10]:
def lang_vectorizer(given_lang):
    languages = ["clojure", "python", "javascript", "ruby", "haskell", "scheme", "java", "scala", "php", "ocaml", "tcl"]
    index = languages.index(given_lang)
    return_array = np.zeros((len(languages),1))
    return_array[index] = 1
    return return_array
    
def create_training_data(classifier_fcn, samples, regex_objects):
    data_list = []
    for sample in samples:
        output_encoding = lang_vectorizer(sample[1])
        input_encoding = classifier_fcn(sample[0], regex_objects)
        data_list.append((input_encoding, output_encoding))
    return data_list

In [11]:
regex_objects = compile_regex_objects(possible_factors)
training_arrays = create_training_data(text_factors, training_data, regex_objects)
first_net = Network([len(training_arrays[0][0]), 20, len(training_arrays[0][1])])
training_arrays[1]

(array([[0],
        [1],
        [1],
        [0],
        [0],
        [1],
        [1],
        [1],
        [1],
        [0],
        [1],
        [0],
        [1],
        [1],
        [1],
        [0],
        [0],
        [1],
        [1],
        [0],
        [0],
        [0],
        [0],
        [1],
        [0]]), array([[ 1.],
        [ 0.],
        [ 0.],
        [ 0.],
        [ 0.],
        [ 0.],
        [ 0.],
        [ 0.],
        [ 0.],
        [ 0.],
        [ 0.]]))

In [12]:
len(training_arrays)

367

In [13]:
def unvectorize_lang(vector):
    """Return a 10-dimensional unit vector with a 1.0 in the jth
    position and zeroes elsewhere.  This is used to convert a digit
    (0...9) into a corresponding desired output from the neural
    network."""
    index = get_one_index(vector)
    languages = ["clojure", "python", "javascript", "ruby", "haskell", "scheme", "java", "scala", "php", "ocaml", "tcl"]
    lang = languages[index]
    return lang

def get_one_index(vector):
    count = 0
    for item in vector:
        if item == 1:
            return count
        else:
            count += 1

In [14]:
lang_vector = lang_vectorizer('ruby')
unvectorize_lang(lang_vector)

'ruby'

In [15]:
get_one_index(lang_vector)

3

In [16]:
train, test = cross_validation.train_test_split(training_arrays, test_size=0.1)

In [17]:
len(test)

37

In [18]:
trans_test = []
for item in range(len(test)):
    trans_test.append((test[item][0], get_one_index(test[item][1])))

In [19]:
first_net.SGD(train, 60, 10, 1.0, test_data=trans_test)

Epoch 0: 3 / 37
Epoch 1: 8 / 37
Epoch 2: 12 / 37
Epoch 3: 12 / 37
Epoch 4: 12 / 37
Epoch 5: 13 / 37
Epoch 6: 17 / 37
Epoch 7: 14 / 37
Epoch 8: 17 / 37
Epoch 9: 16 / 37
Epoch 10: 17 / 37
Epoch 11: 17 / 37
Epoch 12: 17 / 37
Epoch 13: 20 / 37
Epoch 14: 20 / 37
Epoch 15: 22 / 37
Epoch 16: 22 / 37
Epoch 17: 21 / 37
Epoch 18: 21 / 37
Epoch 19: 23 / 37
Epoch 20: 22 / 37
Epoch 21: 22 / 37
Epoch 22: 22 / 37
Epoch 23: 23 / 37
Epoch 24: 23 / 37
Epoch 25: 24 / 37
Epoch 26: 25 / 37
Epoch 27: 25 / 37
Epoch 28: 26 / 37
Epoch 29: 25 / 37
Epoch 30: 24 / 37
Epoch 31: 26 / 37
Epoch 32: 24 / 37
Epoch 33: 23 / 37
Epoch 34: 24 / 37
Epoch 35: 25 / 37
Epoch 36: 26 / 37
Epoch 37: 26 / 37
Epoch 38: 25 / 37
Epoch 39: 26 / 37
Epoch 40: 25 / 37
Epoch 41: 26 / 37
Epoch 42: 26 / 37
Epoch 43: 26 / 37
Epoch 44: 26 / 37
Epoch 45: 26 / 37
Epoch 46: 25 / 37
Epoch 47: 26 / 37
Epoch 48: 26 / 37
Epoch 49: 26 / 37
Epoch 50: 26 / 37
Epoch 51: 26 / 37
Epoch 52: 26 / 37
Epoch 53: 26 / 37
Epoch 54: 26 / 37
Epoch 55: 26 / 37
Epoc

In [20]:
special_answer_list = []
with open("test.csv") as a_cool_file:
    answer_file = csv.reader(a_cool_file)
    for row in answer_file:
        special_answer_list.append((row[0], row[1]))

In [21]:
special_answer_list.pop(0)

('Filename', 'Language')

In [22]:
answer_database = []
for row in special_answer_list:
    with open('test/' + row[0]) as file:
        text = file.read()
        answer_database.append((text, row[1]))

In [23]:
ultimate_test = []
for row in answer_database:
    input_encoding = text_factors(row[0], regex_objects)
    output_encoding = lang_vectorizer(row[1])
    ultimate_test.append((input_encoding, output_encoding))

In [24]:
first_net.feedforward(ultimate_test[0][0])

array([[ 0.06291866],
       [ 0.08873967],
       [ 0.00106952],
       [ 0.3397637 ],
       [ 0.00124911],
       [ 0.02339446],
       [ 0.02519172],
       [ 0.39470497],
       [ 0.01299165],
       [ 0.00108363],
       [ 0.00483397]])

In [25]:
X, y = zip(*ultimate_test)
cross_validation.cross_val_score(first_net, X, y, scoring='accuracy')

TypeError: Cannot clone object 'A network with 3 layers.' (type <class 'network.Network'>): it does not seem to be a scikit-learn estimator it does not implement a 'get_params' methods.