In [117]:
import pandas as pd
import numpy as np
import requests
import re

from bs4 import BeautifulSoup

r = requests.get("http://rosettacode.org/mw/index.php?title=Special:Ask&offset=0&limit=500&q=%5B%5BIs+task%3A%3Atrue%5D%5D&p=mainlabel%3D%2Fformat%3Dbroadtable&sort=_MDAT&order=DESC")
data = r.text
soup = BeautifulSoup(data, 'html.parser')

table = soup.find("table", {"class": "smwtable"})
links = table.findAll('a')

## Import Data

In [3]:
new = pd.read_pickle('examples.pkl')

In [4]:
code = new['Code'].values.tolist()
lang = new['Language'].values.tolist()

In [5]:
def use_of_curly_brackets(text):
    left = text.count("{")
    right = text.count("}")
    return left + right

def use_of_period(text):
    return text.count(".")

def use_of_defn(text):
    return text.count("(defn")

def use_of_slash(text):
    return text.count("/")

def use_of_star(text):
    return text.count("*")

def use_of_function(text):
    return text.count("function")

def end_of_line(text):
    return text.count(";\n")

def use_of_double_arrow(text):
    return text.count("=>")

def use_of_single_arrow(text):
    return text.count("->")

def use_of_double_colon(text):
    return text.count("::")

def use_of_at(text):
    return text.count("@")

def use_of_dollar(text):
    return text.count("$")

def use_of_semi(text):
    return text.count(";")

def use_of_parens(text):
    left = text.count("(")
    right = text.count(")")
    return left + right

def use_of_exclamation(text):
    return text.count("!")

def use_of_bash(text):
    return text.count("#")

def use_of_brackets(text):
    left = text.count("[")
    right = text.count("]")
    return left + right

def use_of_var(text):
    return text.count("var")

def use_of_end(text):
    return text.count("end")

def use_of_double_parens(text):
    return text.count("))")

def python_dict(text):
    return text.count("= {")

def python_init(text):
    return text.count("__init__(")

def python_self(text):
    para = text.count('(self)')
    access = text.count("self.")
    return para + access

def use_of_parens(text):
    return text.count('"')

def use_of_def(text):
    return text.count("def ")

def use_of_colon(text):
    return text.count(":")

In [6]:
def percentage_of_punctuation(text):
    total_length = len(text)
    text = re.sub(r'[\w\s]', '', text)
    punc_length = len(text)
    
    return punc_length / total_length

In [11]:
from textblob import TextBlob
from sklearn.base import TransformerMixin
from collections import Counter


class BagOfWordsFeaturizer(TransformerMixin):
    def __init__(self, num_words=None):
        self.num_words = num_words
        
    def fit(self, X, y=None):
        words = []
        for x in X:
            x = TextBlob(x.lower())
            words += [word.lemmatize() for word in x.words]
        if self.num_words:
            words = Counter(words)
            self._vocab = [word for word, _ in words.most_common(self.num_words)]
        else:
            self._vocab = list(set(words))
        return self
    
    def transform(self, X):
        vectors = []
        for x in X:
            x = TextBlob(x.lower())
            word_count = Counter(x.words)
            vector = [0] * len(self._vocab)
            for word, count in word_count.items():
                try:
                    idx = self._vocab.index(word)
                    vector[idx] = count
                except ValueError:
                    pass
            vectors.append(vector)
        return vectors

In [13]:
class FunctionFeaturizer(TransformerMixin):
    def __init__(self, *featurizers):
        self.featurizers = featurizers
        
    def fit(self, X, y=None):
        '''All SciKit-learn compatible transformers and classifiers have the same
        interface. `fit` should always return the same object (self)'''
        return self
    
    def transform(self, X):
        '''Given a list of original data, return a list of feature vectors'''
        feature_vectors = []
        for x in X:
            feature_vector = [f(x) for f in self.featurizers]
            feature_vectors.append(feature_vector)
        
        return np.array(feature_vectors)

In [14]:
from sklearn.pipeline import make_pipeline, make_union

code_featurizer = make_union(
    BagOfWordsFeaturizer(30),
    FunctionFeaturizer(len,
                       use_of_def,
                       use_of_curly_brackets,
                       use_of_brackets,
                       use_of_period,
                       use_of_defn,
                       use_of_star,
                       use_of_function,
                       end_of_line,
                       use_of_double_arrow,
                       use_of_single_arrow,
                       use_of_double_colon,
                       use_of_at,
                       use_of_dollar,
                       use_of_semi,
                       use_of_parens,
                       use_of_exclamation,
                       use_of_bash,
                       use_of_var,
                       use_of_end,
                       use_of_double_parens,
                       python_dict,
                       python_init,
                       python_self,
                       use_of_parens,
                       use_of_colon,
                       percentage_of_punctuation)
)

In [17]:
from sklearn.cross_validation import train_test_split

X_train, X_test, y_train, y_test = train_test_split(code, lang)

In [18]:
from sklearn.tree import DecisionTreeClassifier

pipe = make_pipeline(code_featurizer, DecisionTreeClassifier())
pipe.fit(X_train, y_train)
pipe.score(X_test, y_test)

0.75945945945945947

In [19]:
from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(pipe.predict(X_test), y_test))

             precision    recall  f1-score   support

          C       0.91      0.85      0.88       119
    Clojure       0.76      0.66      0.71        68
    Haskell       0.76      0.82      0.79       115
       Java       0.84      0.79      0.81        94
 JavaScript       0.76      0.87      0.81        76
      OCaml       0.71      0.83      0.76        78
        PHP       0.80      0.63      0.70        68
       Perl       0.79      0.90      0.84        90
     Python       0.68      0.68      0.68       128
       Ruby       0.63      0.65      0.64       105
      Scala       0.77      0.69      0.72       105
     Scheme       0.76      0.73      0.75        64

avg / total       0.76      0.76      0.76      1110



In [20]:
manual_test = ["""(ns my-cli.core)

(defn -main [& args]
  (println "My CLI received arguments:" args))

(defn add-main [& args]
  (->> (map #(Integer/parseInt %) args)
       (reduce + 0)
       (println "The sum is:")))"""]

In [21]:
pipe.predict(manual_test)

array(['Clojure'], 
      dtype='<U10')

In [30]:
file_names = pd.read_csv('test.csv',  header=None)

In [86]:
import glob

test_code = pd.DataFrame()

path = 'test/*'
files = glob.glob(path)   
for file in files: 
    f=open(file, 'r')
#     f.readlines()
    test_code = test_code.append({'filename' : file[file.find("/")+1:], 'code': f.readlines()}, ignore_index=True)
    f.close()
    #if you want to print only the filenames, use 'print file' instead of three previous lines
#     code_examples = code_examples.append({'Language': language, 'Code': code}, ignore_index=True)
test_code = test_code.set_index('filename')

In [62]:
file_names.columns = ['filename', 'code']

In [84]:
file_names = file_names.set_index('filename')

In [112]:
bigdata = pd.concat([file_names, test_code], ignore_index=True)

In [113]:
bigdata

Unnamed: 0,code
0,clojure
1,clojure
2,clojure
3,clojure
4,python
5,python
6,python
7,python
8,javascript
9,javascript
