In [900]:
import csv, string, math, re 
from collections import Counter

# NOTE: the helper functions called at the bottom of this cell (clear_rows,
# lang_range, training_dictionary, clear_dict, train_the_classifier) are
# defined in later cells of this notebook; on a fresh kernel those cells
# must be executed before this one.

# Path of the CSV file holding the training snippets.
filename = 'data.csv'

# Marker strings that lang_range() searches for in the cleaned data to find
# where each language's section starts. The part before the comma is the
# language name; the number presumably identifies the first record of that
# language in the data set -- TODO confirm against data.csv.
prog_lang_loc = ['javascript,10001','swift,10475','python,10750','java,11462','c++,12090','ruby,12631','rust,13303','c,13624','scala,14288','r,14819','go,15192','mathematica,15859','kotlin,16489','fortran,17106','julia,17565','php,18026','matlab,18380','haskell,18711','perl,19360']
# Language labels, in the same order as the markers in prog_lang_loc.
langs = ['javascript','swift','python','java','c++','ruby','rust','c','scala','r','go','mathematica','kotlin','fortran','julia','php','matlab','haskell','perl']

# The csv module requires files to be opened with newline='' so that line
# breaks embedded in quoted fields are handed to the reader untouched.
with open(filename, 'r', encoding="utf8", newline='') as csvfile:
    rows = list(csv.reader(csvfile))

# Training pipeline: flatten/normalise the cells, locate each language's
# section, slice the data per language, tokenise, and count the words.
rows = clear_rows(rows)
index_list = lang_range(rows,prog_lang_loc)
train_dict = training_dictionary(rows, index_list, langs)
train_dict = clear_dict(train_dict,langs)
trained_data, all_words = train_the_classifier(train_dict, langs)


In [901]:
def clear_rows(rows):
    """Flatten and normalise the raw CSV rows before they are split per language.

    Every cell of every row becomes one entry of a single flat list. For each
    cell, all semicolons are removed, leading and trailing whitespace
    (including newline characters) is stripped, and the text is lower-cased.

    Parameters:
        rows: iterable of rows, each row being an iterable of strings.

    Returns:
        A new flat list of cleaned strings; the input is not modified.
    """
    return [
        cell.replace(';', '').strip().lower()
        for record in rows
        for cell in record
    ]


 


In [902]:
def lang_range(rows,prog_lang_loc):
    """Find where each language's section starts in the data set.

    For every marker in ``prog_lang_loc`` the first entry of ``rows`` that
    contains the marker (``marker in entry``) is located and its position is
    recorded.

    Parameters:
        rows: sequence of cleaned entries to search (the caller passes the
            flat list of strings produced by clear_rows).
        prog_lang_loc: markers to look for, in the desired output order.

    Returns:
        List of positions in ``rows``, one per marker that was found, in the
        order of ``prog_lang_loc``. A marker that matches no entry contributes
        nothing, so the result can be shorter than ``prog_lang_loc``.
    """
    index_list = []
    for lang in prog_lang_loc:
        # enumerate yields the position directly; the previous code looked it
        # up through an undefined name (cleared_rows) instead of ``rows``.
        for position, row in enumerate(rows):
            if lang in row:
                index_list.append(position)
                break

    return index_list

In [903]:
def training_dictionary(rows, index_list, langs):
    """Build a dictionary mapping each language to its slice of the data.

    The language at position ``i`` of ``langs`` receives
    ``rows[index_list[i]:index_list[i + 1]]``; the last language receives
    everything from its start index to the end of ``rows``.

    Parameters:
        rows: sequence of cleaned data entries.
        index_list: start position of every language's section, aligned with
            ``langs``.
        langs: language names, in the same order as ``index_list``.

    Returns:
        Dict of language name -> list of entries belonging to that language.
    """
    train_dict = {}  # training dictionary
    final_position = len(langs) - 1

    for lang in langs:
        position = langs.index(lang)
        start = index_list[position]
        # A slice end of None runs to the end of rows (last language).
        stop = index_list[position + 1] if position < final_position else None
        train_dict[lang] = rows[start:stop]

    return train_dict

In [904]:
def clear_dict(train_dict,langs):
    """Turn each language's raw entries into a list of word tokens.

    For every language in ``langs`` the entries are processed as follows:
    each punctuation character (``string.punctuation``) is replaced by a
    space, the text is split on whitespace, and only tokens that are purely
    alphabetic and longer than one character are kept.

    Parameters:
        train_dict: dict of language -> list of strings; updated in place.
        langs: languages (keys of ``train_dict``) to process.

    Returns:
        The same ``train_dict`` object, now mapping language -> list of words.
    """
    # Translation table mapping every punctuation character to a space.
    punct_to_space = str.maketrans(string.punctuation, ' ' * len(string.punctuation))

    for lang in langs:
        train_dict[lang] = [
            token
            for entry in train_dict[lang]
            for token in entry.translate(punct_to_space).split()
            # drop numbers / mixed tokens and single-character tokens
            if token.isalpha() and len(token) > 1
        ]

    return train_dict

In [905]:
def train_the_classifier(train_dict, langs):
    """Count word occurrences per language and across all languages.

    Parameters:
        train_dict: dict of language -> list of word tokens.
        langs: languages (keys of ``train_dict``) to train on.

    Returns:
        Tuple ``(trained_data, all_words)`` where ``trained_data`` maps each
        language to a plain dict of word -> count within that language, and
        ``all_words`` is a Counter of word -> count over every language.
    """
    trained_data = {}      # per-language word counts
    all_words = Counter()  # word counts over all languages

    for lang in langs:
        # A plain dict is used on purpose: looking up an unseen word must
        # raise KeyError rather than return 0.
        per_lang = {}
        trained_data[lang] = per_lang
        for token in train_dict[lang]:
            per_lang[token] = per_lang.get(token, 0) + 1
            all_words[token] += 1

    return trained_data, all_words


In [906]:
def read_the_file(filename):
    """Read a source-code file and return its cleaned list of words.

    The file is read in full, split on whitespace, and the resulting pieces
    are passed through clear_the_file.

    Parameters:
        filename: path of the text file to classify.

    Returns:
        The list of word tokens returned by clear_the_file.
    """
    # The context manager guarantees the handle is closed even if reading
    # fails; the file was previously left open.
    # NOTE(review): the training CSV is opened with encoding="utf8" while this
    # file uses the platform default encoding -- confirm that is intended.
    with open(filename, 'r') as file_object:
        words = file_object.read().split()

    return clear_the_file(words)

In [952]:
def clear_the_file(words):
    """Normalise raw whitespace-separated pieces of a file into word tokens.

    Each piece is lower-cased, every punctuation character
    (``string.punctuation``) is replaced by a space, and the result is split
    on whitespace again. Only tokens that are purely alphabetic and longer
    than one character are kept.

    Parameters:
        words: iterable of strings.

    Returns:
        A new list of cleaned word tokens, in their original order.
    """
    # Translation table mapping every punctuation character to a space.
    punct_to_space = str.maketrans(string.punctuation, ' ' * len(string.punctuation))

    return [
        token
        for piece in words
        for token in piece.lower().translate(punct_to_space).split()
        # drop numbers / mixed tokens and single-character tokens
        if token.isalpha() and len(token) > 1
    ]

In [953]:
def calc_prob(lang, code, trained_data, all_words ):
    """Score how strongly the words in ``code`` point to ``lang``.

    For every word of ``code`` that was seen for ``lang`` during training,
    the ratio (count of the word in ``lang``) / (count of the word over all
    languages) is computed, and the ratios are multiplied together. Words
    whose lookup raises KeyError are skipped.

    Parameters:
        lang: language to score.
        code: iterable of word tokens from the file being classified.
        trained_data: dict of language -> dict of word -> count.
        all_words: mapping of word -> count over all languages.

    Returns:
        The product of the ratios, or 0 when no word of ``code`` is known
        for ``lang``.
    """
    # None means "no known word seen yet". Comparing the running product with
    # 0 for that purpose would restart the product whenever it underflows to
    # 0.0 on a long input.
    prob = None
    for word in code:
        try:
            lang_count = trained_data[lang][word]
            total_count = all_words[word]
        except KeyError:
            # Word (or language) unknown to the model: ignore it.
            continue

        if prob is None:
            prob = lang_count / total_count
        else:
            prob = prob * lang_count / total_count

    return 0 if prob is None else prob

In [956]:
def language_prob(trained_data, all_words, code, langs):
    """Score ``code`` against every language and return the best match.

    The score of each language is obtained from calc_prob; the full
    language -> score dictionary is printed before returning.

    Parameters:
        trained_data: dict of language -> dict of word -> count.
        all_words: mapping of word -> count over all languages.
        code: iterable of word tokens from the file being classified.
        langs: candidate languages.

    Returns:
        The language with the highest score (the earliest one in ``langs``
        on ties).

    Raises:
        ValueError: if ``langs`` is empty (raised by ``max``).
    """
    lang_chance = {
        candidate: calc_prob(candidate, code, trained_data, all_words)
        for candidate in langs
    }

    print(lang_chance)
    best_match = max(lang_chance, key=lang_chance.get)
    return best_match
        
  

In [957]:
# Enter the name of the file to classify here (e.g. code.txt).
filename = 'test_data.txt'
# Read the file and reduce it to a list of cleaned word tokens.
code = read_the_file(filename)

# Score the tokens against every trained language; the returned best match
# is displayed as the cell's output.
language_prob(trained_data, all_words, code, langs)


{'javascript': 4.880673204007586e-11, 'swift': 1.4077064071157346e-13, 'python': 5.2493690582965744e-12, 'java': 6.779470893429329e-10, 'c++': 1.2337194191215784e-12, 'ruby': 2.229518268372761e-12, 'rust': 9.270360226680397e-13, 'c': 9.882278363673579e-10, 'scala': 2.9590204514369967e-13, 'r': 1.5028443250410055e-09, 'go': 2.1371336199803024e-09, 'mathematica': 1.1484656886326058e-09, 'kotlin': 5.800462090660877e-10, 'fortran': 2.2850776696018937e-12, 'julia': 3.168112881069258e-14, 'php': 4.9169746315203076e-08, 'matlab': 2.837713748455233e-08, 'haskell': 2.8644764595090476e-07, 'perl': 0.000509052708097724}


'perl'