In [101]:
import pandas as pd
import numpy as np
df_of_train_tweets = None
with open('training-tweets.txt', encoding='utf8') as file:
    language_list = []
    content_list = []
    for line in file.readlines():
        try:
            tweet_id, user_name, language, tweet = line.split(maxsplit=3)
            language_list.append(language.strip())
            content_list.append(tweet.strip())
        except ValueError:
            pass
    data = {'Language': language_list, 'Content': content_list}
    df_of_train_tweets = pd.DataFrame.from_dict(data)        

In [102]:
df_of_train_tweets

Unnamed: 0,Language,Content
0,eu,"""eunoon, zeatik dakaat sentsazioa goizegi esna..."
1,es,Pedazo de tarta k me e kurrao!!!! Ske k buena ...
2,es,No hace falta que te digan que vas a morir par...
3,es,Empieza mi findeeeee :))))
4,es,Pr√≥xima parada: Carnaval del Toro de Ciudad Ro...
5,es,Graniza y tal.
6,es,¬øMi idolo? Lo que mas quiero en este mundo. ht...
7,es,"""Graniza y luego sale el sol. Muy normal en Ir..."
8,en,"""IM CONFUSED ,WHAT IS THE REAL JUSTIN? @justin..."
9,es,@Nyahika Creo que esta el dependiente majo que...


In [103]:
import string
def accepted_character(a_character, accepted_char_set):
    if a_character in accepted_char_set:
        return True
    else:
        return False

In [104]:
class Language:
    def __init__(self, symbol, description, dataset, ngram=None, probability=None):
        self.symbol = symbol
        self.description = description
        self.dataset = dataset
        self.ngram = []
        self.probability = probability
        
    def add_ngram(self, ngram):
        self.ngram.append(ngram)

In [105]:
language_symbol = {
    'eu': 'basque',
    'ca': 'catalan',
    'gl': 'galician',
    'es': 'spanish',
    'en': 'english',
    'pt': 'portugese'
}

In [106]:
list_of_languages = []
total_nb_of_tweets = df_of_train_tweets.shape[0]
for i in language_symbol.keys():
    language_dataset = df_of_train_tweets[df_of_train_tweets['Language'] == i]
    prob_of_language = language_dataset.shape[0]/total_nb_of_tweets
    a_language = Language(i, language_symbol.get(i), language_dataset, None, prob_of_language)
    
    list_of_languages.append(a_language)

In [107]:
list_of_letters = [character for character in string.ascii_letters] #readjust for the nb of vocabulary
unigram_dataset = pd.DataFrame(np.zeros((52,2)), columns=['Probability', 'Instances'])
unigram_dataset['Characters'] = list_of_letters
unigram_dataset = unigram_dataset.set_index('Characters')
print(unigram_dataset)

            Probability  Instances
Characters                        
a                   0.0        0.0
b                   0.0        0.0
c                   0.0        0.0
d                   0.0        0.0
e                   0.0        0.0
f                   0.0        0.0
g                   0.0        0.0
h                   0.0        0.0
i                   0.0        0.0
j                   0.0        0.0
k                   0.0        0.0
l                   0.0        0.0
m                   0.0        0.0
n                   0.0        0.0
o                   0.0        0.0
p                   0.0        0.0
q                   0.0        0.0
r                   0.0        0.0
s                   0.0        0.0
t                   0.0        0.0
u                   0.0        0.0
v                   0.0        0.0
w                   0.0        0.0
x                   0.0        0.0
y                   0.0        0.0
z                   0.0        0.0
A                   

In [108]:
unigram_dataset

Unnamed: 0_level_0,Probability,Instances
Characters,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.0,0.0
b,0.0,0.0
c,0.0,0.0
d,0.0,0.0
e,0.0,0.0
f,0.0,0.0
g,0.0,0.0
h,0.0,0.0
i,0.0,0.0
j,0.0,0.0


In [109]:
#create unigram for each language
for language in list_of_languages:
    language.ngram.append(unigram_dataset.copy())
    list_of_letters = [character for character in string.ascii_letters]
    for character in list_of_letters:
        language.ngram[0].loc[character]['Instances'] = ''.join(language.dataset['Content']).count(character)

In [110]:
list_of_languages[0].ngram[0]

Unnamed: 0_level_0,Probability,Instances
Characters,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.0,2489.0
b,0.0,384.0
c,0.0,100.0
d,0.0,383.0
e,0.0,1495.0
f,0.0,61.0
g,0.0,347.0
h,0.0,186.0
i,0.0,1245.0
j,0.0,189.0


In [89]:
for language in list_of_languages:
    total_instances = language.ngram[0]['Instances'].sum()
    for index, row in language.ngram[0].iterrows():
        row['Probability'] = row['Instances']/total_instances

In [90]:
list_of_languages[0].ngram[0]

Unnamed: 0_level_0,Probability,Instances
Characters,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.157552,2489.0
b,0.024307,384.0
c,0.00633,100.0
d,0.024244,383.0
e,0.094632,1495.0
f,0.003861,61.0
g,0.021965,347.0
h,0.011774,186.0
i,0.078807,1245.0
j,0.011964,189.0


In [23]:
df_of_test_tweets = None
with open('test-tweets-given.txt', encoding='utf8') as file:
    language_list = []
    content_list = []
    for line in file.readlines():
        try:
            tweet_id, user_name, language, tweet = line.split(maxsplit=3)
            language_list.append(language.strip())
            content_list.append(tweet.strip())
        except ValueError:
            pass
    data = {'Language': language_list, 'Content': content_list}
    df_of_test_tweets = pd.DataFrame.from_dict(data)

In [24]:
import math
def prob_of_language(list_of_languages, line):
    best_probability = float('-inf')
    best_language = None
    for language in list_of_languages:
        total_probability = math.log(language.probability)
        for character in line:
            if character in string.ascii_letters:
                character_probability = language.ngram[0].loc[character]['Probability']
                total_probability *= math.log(character_probability)
        if total_probability > best_probability:
            best_probability = total_probability
            best_language = language.symbol
    return best_language

In [None]:
list_of_guesses = []
for index, row in df_of_test_tweets.iterrows():
    list_of_guesses.append(prob_of_language(list_of_languages, row['Content']))
df_of_test_tweets['guess'] = list_of_guesses

In [101]:
df_of_test_tweets

Unnamed: 0,Language,Content,guess
0,es,@AnderDelPozo @PesqueWhite hahaha yo tambien m...,es
1,es,Foto antes de LA FOTO. #orla @ Universidad de ...,eu
2,es,@Nagore_Robles Que mal lo vas a pasar viendo a...,es
3,es,"""Hay cosas complicadas, y luego est√° quitarse ...",es
4,es,"""Me duele todo, que asco.""",es
...,...,...,...
6995,ca,"""Tirar la canya quan vas disfresat de tia. S'h...",en
6996,ca,Bon dia companys ||‚òÖ|| avui dia radiant ..cont...,en
6997,ca,@rosalizandra bona cursa a tots!!! @TheRedRun...,en
6998,es,@AnnaGomez1997 pues yo. tss http://t.co/HXGdwx...,es


In [102]:
guess_status = []
for index, row in df_of_test_tweets.iterrows():
    if row['Language'] == row['guess']:
        guess_status.append(True)
    else:
        guess_status.append(False)
df_of_test_tweets['Status'] = guess_status

In [103]:
df_of_test_tweets

Unnamed: 0,Language,Content,guess,Status
0,es,@AnderDelPozo @PesqueWhite hahaha yo tambien m...,es,True
1,es,Foto antes de LA FOTO. #orla @ Universidad de ...,eu,False
2,es,@Nagore_Robles Que mal lo vas a pasar viendo a...,es,True
3,es,"""Hay cosas complicadas, y luego est√° quitarse ...",es,True
4,es,"""Me duele todo, que asco.""",es,True
...,...,...,...,...
6995,ca,"""Tirar la canya quan vas disfresat de tia. S'h...",en,False
6996,ca,Bon dia companys ||‚òÖ|| avui dia radiant ..cont...,en,False
6997,ca,@rosalizandra bona cursa a tots!!! @TheRedRun...,en,False
6998,es,@AnnaGomez1997 pues yo. tss http://t.co/HXGdwx...,es,True


In [104]:
(df_of_test_tweets[df_of_test_tweets['Status'] == True].shape[0])/df_of_test_tweets.shape[0]

0.317

In [140]:
list_of_languages[0].ngram[0].loc['a']['Probability']

0.15755158880870995

In [111]:
list_of_letters = [character for character in string.ascii_letters] #readjust for the nb of vocabulary
new_list_of_letters = list_of_letters.copy()
new_list_of_letters.append('instances')
bigram_dataset = pd.DataFrame(np.zeros((52,53)), columns=new_list_of_letters, index=list_of_letters)
#bigram_dataset['Characters'] = list_of_letters
#bigram_dataset = bigram_dataset.set_index('Characters')

In [112]:
bigram_dataset

Unnamed: 0,a,b,c,d,e,f,g,h,i,j,...,R,S,T,U,V,W,X,Y,Z,instances
a,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
b,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
d,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
e,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
f,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
g,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
h,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
i,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
j,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [113]:
language = list_of_languages[0]
language.ngram.append(bigram_dataset.copy())
language_as_String = ''.join(language.dataset['Content'])
list_of_letters_x = [character for character in string.ascii_letters]
list_of_letters_y = [character for character in string.ascii_letters]
print(language.ngram[1].loc['a', 'a'])
for x in list_of_letters_x:
    for y in list_of_letters_y:
        language.ngram[1].loc[x, y] = language_as_String.count(x+y)
        
language.ngram[1]
#for character in list_of_letters:
    #language.ngram[0].loc[character]['Instances'] = ''.join(language.dataset['Content']).count(character)

0.0


Unnamed: 0,a,b,c,d,e,f,g,h,i,j,...,R,S,T,U,V,W,X,Y,Z,instances
a,149.0,52.0,4.0,23.0,20.0,6.0,39.0,13.0,146.0,80.0,...,2.0,1.0,3.0,1.0,0.0,0.0,1.0,0.0,5.0,0.0
b,160.0,0.0,0.0,0.0,93.0,0.0,0.0,0.0,55.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
c,10.0,0.0,1.0,0.0,2.0,0.0,0.0,1.0,8.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
d,89.0,0.0,0.0,1.0,90.0,0.0,1.0,0.0,68.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
e,108.0,15.0,7.0,12.0,63.0,1.0,57.0,17.0,37.0,4.0,...,0.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0
f,15.0,1.0,0.0,0.0,9.0,3.0,0.0,0.0,8.0,0.0,...,2.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
g,90.0,0.0,0.0,0.0,32.0,1.0,0.0,0.0,37.0,0.0,...,0.0,0.0,2.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
h,54.0,0.0,0.0,0.0,15.0,0.0,1.0,0.0,20.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
i,116.0,29.0,2.0,36.0,45.0,5.0,15.0,13.0,38.0,7.0,...,1.0,0.0,1.0,2.0,0.0,0.0,0.0,0.0,2.0,0.0
j,109.0,0.0,0.0,1.0,7.0,0.0,0.0,0.0,5.0,6.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [122]:

# for language in list_of_languages:
#     language.ngram.append(bigram_dataset.copy())
#     list_of_letters = [character for character in string.ascii_letters]
#     for character in list_of_letters:
#         language.ngram[1].loc[character]['Instances'] = ''.join(language.dataset['Content']).count(character)


language = list_of_languages[0]
language.ngram.append(bigram_dataset.copy())
language_as_String = ''.join(language.dataset['Content'])
list_of_letters_x = [character for character in string.ascii_letters]
# for i in range(53):
    
sum = 0
count = 0
total_instances = 0 
for x in list_of_letters_x:
    
    for y in list_of_letters_y:
        if (count == 51):
            sum += language.ngram[1].loc[x,y]
            count = 0
            language.ngram[1].loc[x,'instances'] = sum
            total_instances += sum
            sum = 0
            
        else:
            count += 1
            sum += language.ngram[1].loc[x,y]
            

In [123]:
list_of_languages[0].ngram[1]

Unnamed: 0,a,b,c,d,e,f,g,h,i,j,...,R,S,T,U,V,W,X,Y,Z,instances
a,149.0,52.0,4.0,23.0,20.0,6.0,39.0,13.0,146.0,80.0,...,2.0,1.0,3.0,1.0,0.0,0.0,1.0,0.0,5.0,1829.0
b,160.0,0.0,0.0,0.0,93.0,0.0,0.0,0.0,55.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,379.0
c,10.0,0.0,1.0,0.0,2.0,0.0,0.0,1.0,8.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,95.0
d,89.0,0.0,0.0,1.0,90.0,0.0,1.0,0.0,68.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,374.0
e,108.0,15.0,7.0,12.0,63.0,1.0,57.0,17.0,37.0,4.0,...,0.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1297.0
f,15.0,1.0,0.0,0.0,9.0,3.0,0.0,0.0,8.0,0.0,...,2.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,57.0
g,90.0,0.0,0.0,0.0,32.0,1.0,0.0,0.0,37.0,0.0,...,0.0,0.0,2.0,0.0,0.0,0.0,0.0,1.0,1.0,341.0
h,54.0,0.0,0.0,0.0,15.0,0.0,1.0,0.0,20.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,179.0
i,116.0,29.0,2.0,36.0,45.0,5.0,15.0,13.0,38.0,7.0,...,1.0,0.0,1.0,2.0,0.0,0.0,0.0,0.0,2.0,1055.0
j,109.0,0.0,0.0,1.0,7.0,0.0,0.0,0.0,5.0,6.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,171.0


In [124]:
for language in list_of_languages:
    for index, row in language.ngram[1].iterrows():
        row['Probability'] = row['instances']/total_instances
list_of_languages[0].ngram[1]

Unnamed: 0,a,b,c,d,e,f,g,h,i,j,...,R,S,T,U,V,W,X,Y,Z,instances
a,149.0,52.0,4.0,23.0,20.0,6.0,39.0,13.0,146.0,80.0,...,2.0,1.0,3.0,1.0,0.0,0.0,1.0,0.0,5.0,1829.0
b,160.0,0.0,0.0,0.0,93.0,0.0,0.0,0.0,55.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,379.0
c,10.0,0.0,1.0,0.0,2.0,0.0,0.0,1.0,8.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,95.0
d,89.0,0.0,0.0,1.0,90.0,0.0,1.0,0.0,68.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,374.0
e,108.0,15.0,7.0,12.0,63.0,1.0,57.0,17.0,37.0,4.0,...,0.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1297.0
f,15.0,1.0,0.0,0.0,9.0,3.0,0.0,0.0,8.0,0.0,...,2.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,57.0
g,90.0,0.0,0.0,0.0,32.0,1.0,0.0,0.0,37.0,0.0,...,0.0,0.0,2.0,0.0,0.0,0.0,0.0,1.0,1.0,341.0
h,54.0,0.0,0.0,0.0,15.0,0.0,1.0,0.0,20.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,179.0
i,116.0,29.0,2.0,36.0,45.0,5.0,15.0,13.0,38.0,7.0,...,1.0,0.0,1.0,2.0,0.0,0.0,0.0,0.0,2.0,1055.0
j,109.0,0.0,0.0,1.0,7.0,0.0,0.0,0.0,5.0,6.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,171.0
