# Programming Language Classifier

In [1]:
import glob
import pandas as pd
import numpy as np
from scipy import sparse
from sklearn import svm
from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.cross_validation import train_test_split
import matplotlib.pyplot as plt


%matplotlib inline

## Using Glob to read each type of file in the directory

### The text of each file is loaded into a list, and the type of file is loaded into another list.

In [2]:
def read_code(directory, ftype):
    files = glob.glob('{}/**/*.{}'.format(directory, ftype), recursive=True)
    for fiyl in files:
        y_file_type.append(ftype)
        with open(fiyl) as f:
            x_texts.append(f.read())


x_texts = []
y_file_type = []


file_types = ['gcc', 'c', 'csharp', 'sbcl', 'clojure', 'java', 'javascript', 
              'ocaml', 'perl', 'hack', 'php', 'python3', 'jruby', 'yarv', 
              'scala', 'racket', 'ghc']

for ft in file_types:
    read_code('/Users/David/documents/tiy/programming-language-classifier/benchmarksgame-2014-08-31/benchmarksgame/bench/', ft)

## Pipeline Creation

 The first part of the pipeline is the CountVectorizer.  The CountVectorizer works by turning 'words' of at least two characters into a numeric representation that can be passed into a classifier. 

The second part of the pipeline is the TfiftTransformer.  This takes the vectorized data and weighs each feature based on how unique it is.  

The final part of the pipeline is the estimator method, which in this case is LinearSVC.  LinearSVC works by creating a vector based on the training data features.  When using predict, LinearSVC will decide if the feature is close enough to that vector to be considered the same class.

In [3]:
pipeline_map = [
                ('hashin', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('lin', svm.LinearSVC()),
                ]

In [4]:
pipeline = Pipeline(pipeline_map)

## Splitting the data into two parts, training and testing.

In [5]:
X_train, X_test, y_train, y_test = train_test_split(
    x_texts, y_file_type, test_size=0.4, random_state=1965)

## Passing the training data into the pipeline and printing out the score for our training data.  

#### .96 is a pretty good $R^{2}$.  That means that our model can predict the correct programming langues 96% of the time, when using the data that we trained it with.

In [6]:
pipeline.fit(X_train, y_train)

print(pipeline.score(X_train, y_train))


0.968571428571


## Now that the pipeline is trained, we pass the test data in to see how well our model can predict data it has seen before.

In [7]:
print(pipeline.score(X_test, y_test))

0.82905982906


#### An $R^{2}$ score of .829 is not too bad.  This means that the model will predict the correct language 82.9% when the model is fed a new set of data.

## Let's see how well the model does with Bryce's testing files which are about 3 MB.

#### Here is a list of all of the correct programming languages that correspond to the test files that we have.

In [8]:
bryce_tests = ['clojure', 'clojure', 'clojure', 'clojure','python3', 'python3',
               'python3',  'python3',  'javascript', 'javascript', 'javascript',
               'javascript', 'jruby', 'jruby', 'jruby', 'ghc', 'ghc',
                'ghc', 'racket', 'racket', 'racket', 'java', 'java',
                'scala', 'scala', 'php', 'php', 'php', 'ocaml',
                'ocaml']

#### The for loop below is running through each file in the directory and checking the models prediction against the correct answer.

In [9]:
predictions = []
count = 0
num_correct = 0
for n in list(range(1,25)) + list(range(28,32)):
    type_count = {}
    with open('test/{}'.format(n)) as f:
        f = f.read()
        predicter = pipeline.predict([f])
        predictions.append(predicter)
        if predicter == bryce_tests[count]:
            num_correct += 1
        count += 1


#### Displaying the percentage the model was able to predict correctly.  76.7% is not too bad, but not quite as good as the model did with the original test data.

In [10]:
print(num_correct/len(bryce_tests))

0.7666666666666667


## More model testing using 75 MB of test files

#### A peer compiled a directory of several examples of each programming language.  Again, we are comparing the model's prediction to the correct answer.

In [11]:
x_texts = []
y_file_type = []


file_types = ['gcc', 'c', 'csharp', 'sbcl', 'clojure', 'java', 
              'javascript', 'ocam', 'perl', 'hack', 'php', 'python3',
              'jruby', 'yarv', 'scala', 'racket', 'ghc', 'cs']


for ft in file_types:
    try:
        read_code('/Users/David/documents/tiy/programming-language-classifier/data/', ft)
    except:
        pass

file_dict = dict(zip(x_texts, y_file_type))

correct_count = 0
for key, value in file_dict.items():
    predicter = pipeline.predict([key])
    if predicter == value:
        correct_count += 1



#### Displaying the percentage the model was able to predict correctly.  71.8% correct is a little worse than previous results, but still not too bad.

In [12]:
print(correct_count/len(file_dict))

0.7187039764359352
