# __Programming language classification model__

The point of this program is to identyfy the programming language of delivered code.
Classyfying model uses __Naive Bayes classifier__.

Programing languages' codes used in training data: (JavaScript, Swift, Python, Java, C++, Ruby, Rust, C, Scala, R, Go, Mathematica, Kotlin, Fortran, Julia,PHP, Matlab, haskell, perl)

Program can be splited for two parts:
1. Handling and cleanin training data, stored in CSV file
2. Calculations based on Naive Bayes classifier

To run this model,  paste the test code to file "test_data.txt" and run all cells from top to the bottom.
After you once run all cells, only change code in "test_data.txt" and run last cell.

## __Inner functions__

### __Prepering the training data__

In [2]:
def clear_rows(rows):
    
    """ Preliminary data cleanin. Preparing the data set to split for particular languages"""
        
    #Deleting semicolons
    rows = [x.replace(';', '') for row in rows for x in row] 
    
    #Deleting new line sign 
    rows = list(map(lambda s: s.strip(), rows)) 
    
    #All lower case
    rows = list(map(lambda s: s.lower(), rows)) 
    
    return(rows)

In [3]:
def lang_range(rows,prog_lang_loc):
    
    """ Preparing the list of ranges taken by each language in training data list """
    
    index_list = []
    for lang in prog_lang_loc:
        for row in rows:
            if lang in row:
                index_list.append(rows.index(row))
                break
    
    return(index_list)

In [4]:
def training_dictionary(rows, index_list, langs):
    
    """ Preparing the dictionary of programming languages and codes """
    
    train_dict = {} # dictionary for training classifier

    for lang in langs:
        if langs.index(lang) < len(langs) - 1: 
            train_dict[lang] = rows[index_list[langs.index(lang)]:index_list[langs.index(lang) + 1]]
        else:
            train_dict[lang] = rows[index_list[langs.index(lang)]:]                   
            
    return(train_dict)

In [5]:
def clear_dict(train_dict,langs):
    
    """ Clearing the training dictionary. Deleting punctuation, numbers, empty elemets in list. Spliting elements for words"""
    
    for lang in langs:
        
        #Deleting punctuation and characters
        translator=str.maketrans(string.punctuation, ' '*len(string.punctuation))
        train_dict[lang] = list(map(lambda row: row.translate(translator), train_dict[lang]))
        
        #Spliting elements for words 
        words = []
        for element in train_dict[lang]:
            words.extend(element.split())
            
        train_dict[lang] = words
        
        #Deleting numbers
        train_dict[lang] = [element for element in train_dict[lang] if element.isalpha()]
        
        #Deleting one-sign elements
        train_dict[lang] = [element for element in train_dict[lang] if len(element)>1]
        
        
    return(train_dict)

In [6]:
def lenght_correction(train_dict,langs):
    
    """ Because of great differences in number of codes for each leanguage this function is 
    responsible for equalizing the lenght of data"""
    
    lang_check = {}
    for lang in langs:
        lang_check[lang] = len(train_dict[lang])
            
    # Checking the number of elemnts attached to each language
    check_list = sorted(lang_check.values())
    
    # Reduction of data to the lowest number
    for lang in langs:
        train_dict[lang] = train_dict[lang][:check_list[0]]
        
    return(train_dict)   

### __Prepering the test data__

In [7]:
def read_the_file(filename):
    
    """ Reading the test file """
    
    file_object = open(filename, 'r')
    words = file_object.read().split()
    words = clear_the_file(words)
    
    return(words)

In [8]:
def clear_the_file(words):
    
    """ Clearing test file. Same standard as training data"""
    
    #All lower case 
    words = list(map(lambda s: s.lower(), words)) 
    
    #Deleting punctuation
    translator=str.maketrans(string.punctuation, ' '*len(string.punctuation))
    words = list(map(lambda word: word.translate(translator), words))
    
    #Spliting elements for single words
    words2 = []
    for word in words:
        words2.extend(word.split())
    
    words = words2
    
    #Deleting numbers
    words = [word for word in words if word.isalpha()]
    
    #Deleting one-sign elements
    words = [word for word in words if len(word)>1]
    
    return(words)

### __Calculations__

In [9]:
def train_the_classifier(train_dict, langs):
    
    """ Model training """
    
    #Summary of each word used in each language
    trained_data = {}
    
    #Summary of each word used in any language
    all_words = Counter()
    
    #Train the model
    for lang in langs:
        trained_data[lang] = {}
        for word in train_dict[lang]:
            if word in trained_data[lang]:
                trained_data[lang][word] +=1
            else:
                trained_data[lang][word] = 1 
            all_words[word] += 1
            
    return(trained_data, all_words)


In [10]:
def calc_prob(lang, code, trained_data, all_words ):
    
    """ Calculating probability for language """
    prob = 0.0
    for word in code:
        
        #If word is contained in language training_dict
        try:
            if prob == 0.0:
                prob = trained_data[lang][word]/all_words[word]
            else:
                prob = prob * trained_data[lang][word]/all_words[word]
        except(KeyError):
            
            #If word is contained in all_words list 
            if word in all_words:
                prob = prob * (1/all_words[word])
            else:
                continue
                
    return(prob)

In [11]:
def language_prob(trained_data, all_words, code, langs):
    
    """ Predicting the programing language """
    
    lang_chance = {}
    
    for lang in langs:
        prob = calc_prob(lang, code, trained_data, all_words )
        lang_chance[lang] = prob
        
    print(lang_chance)
    return('Hopefully programming language is ' + max(lang_chance, key=lang_chance.get))
        
  

__Imports and loading the training data__

In [12]:
import csv, string,math
from collections import Counter

filename = 'data.csv'

rows = []
prog_lang_loc = ['javascript,10001','swift,10475','python,10750','java,11462','c++,12090','ruby,12631','rust,13303','c,13624','scala,14288','r,14819','go,15192','mathematica,15859','kotlin,16489','fortran,17106','julia,17565','php,18026','matlab,18380','haskell,18711','perl,19360']
langs = ['javascript','swift','python','java','c++','ruby','rust','c','scala','r','go','mathematica','kotlin','fortran','julia','php','matlab','haskell','perl']

with open(filename, 'r', encoding="utf8") as csvfile:
    
    csvreader = csv.reader(csvfile)
    
    for row in csvreader:
        rows.append(row)

rows = clear_rows(rows)
index_list = lang_range(rows,prog_lang_loc)
train_dict = training_dictionary(rows, index_list, langs)
train_dict = clear_dict(train_dict,langs)
train_dict = lenght_correction(train_dict,langs)
trained_data, all_words = train_the_classifier(train_dict, langs)



# __Run ONLY  this part__

In [13]:
filename = 'test_data.txt'#enter here te file name (test_data.txt)
code = read_the_file(filename)

language_prob(trained_data, all_words, code, langs)

{'javascript': 1.2507743666739968e-37, 'swift': 1.2453718801516847e-37, 'python': 7.980857348209553e-30, 'java': 3.409624525924143e-43, 'c++': 8.510522148184023e-40, 'ruby': 1.3472277291133397e-38, 'rust': 2.4073014451496573e-40, 'c': 1.2765341344775553e-42, 'scala': 2.3922597994212834e-39, 'r': 3.892326814097173e-34, 'go': 4.3642861261119365e-39, 'mathematica': 2.3299372368441483e-34, 'kotlin': 1.8364170335770386e-36, 'fortran': 5.427751526863307e-43, 'julia': 3.063402424498982e-39, 'php': 9.806807683199635e-36, 'matlab': 3.473370965487525e-45, 'haskell': 1.1547614459632093e-40, 'perl': 4.1117425974589356e-36}


'Hopefully programming language is python'