In [87]:
import pandas as pd
import numpy as np
# Load the training tweets into a DataFrame with two columns: 'Language' and 'Content'.
# A well-formed line holds four whitespace-separated fields:
#     <tweet id> <user name> <language code> <tweet text ...>
# maxsplit=3 keeps the tweet text (which itself contains whitespace) in one piece.
language_list = []
content_list = []
skipped_lines = 0
with open('training-tweets.txt', encoding='utf8') as file:
    for line in file:  # stream the file; readlines() would load every line up front
        fields = line.split(maxsplit=3)
        if len(fields) != 4:
            # Blank or truncated line: still skipped (best effort), but counted
            # so that dropped data is visible instead of silently swallowed.
            skipped_lines += 1
            continue
        tweet_id, user_name, language, tweet = fields
        language_list.append(language.strip())
        content_list.append(tweet.strip())
if skipped_lines:
    print('Skipped', skipped_lines, 'malformed line(s) in training-tweets.txt')
data = {'Language': language_list, 'Content': content_list}
df_of_train_tweets = pd.DataFrame.from_dict(data)

In [88]:
# Display the training DataFrame built above (columns: 'Language', 'Content').
df_of_train_tweets

Unnamed: 0,Language,Content
0,eu,"""eunoon, zeatik dakaat sentsazioa goizegi esna..."
1,es,Pedazo de tarta k me e kurrao!!!! Ske k buena ...
2,es,No hace falta que te digan que vas a morir par...
3,es,Empieza mi findeeeee :))))
4,es,Próxima parada: Carnaval del Toro de Ciudad Ro...
...,...,...
18313,es,"""4 días, 16 horas e 28 minutos 🌴 #BN"""
18314,es,@Anttoo_18 la hora y la edad sacan mi instinto...
18315,es,@Paoferrari57 HOY desde las 1:20 NCAA Women's ...
18316,es,Yo no se pa que cojones van a supervivientes.....


In [89]:
import string
def accepted_character(a_character, accepted_char_set):
    """Tell whether `a_character` belongs to `accepted_char_set`.

    Args:
        a_character: the value to look up.
        accepted_char_set: any container supporting the `in` operator.

    Returns:
        True if `a_character` is contained in `accepted_char_set`, else False.
    """
    return a_character in accepted_char_set

In [90]:
class Language:
    """A candidate language together with its training data and n-gram tables.

    Args:
        symbol: short language code (e.g. 'es').
        description: human-readable language name.
        dataset: the training rows belonging to this language.
        ngram: optional initial n-gram table(s). A list/tuple is copied; any
            other non-None value is stored as the single first table.
            Defaults to no tables.
        probability: optional prior probability of the language.
    """

    def __init__(self, symbol, description, dataset, ngram=None, probability=None):
        self.symbol = symbol
        self.description = description
        self.dataset = dataset
        # The argument used to be ignored (always replaced by []); honour it now.
        # A fresh list is always created so instances never share a container
        # with each other or with the caller.
        if ngram is None:
            self.ngram = []
        elif isinstance(ngram, (list, tuple)):
            self.ngram = list(ngram)
        else:
            self.ngram = [ngram]
        self.probability = probability

    def add_ngram(self, ngram):
        """Append one n-gram table to this language's list of tables."""
        self.ngram.append(ngram)

In [91]:
# Maps each language code to a human-readable language name.
language_symbol = {
    'eu': 'basque',
    'ca': 'catalan',
    'gl': 'galician',
    'es': 'spanish',
    'en': 'english',
    'pt': 'portuguese',  # spelling corrected (was 'portugese')
}

In [92]:
# Build one Language object per known language code, holding the training rows
# of that language and its prior probability.
list_of_languages = []
total_nb_of_tweets = df_of_train_tweets.shape[0]
for symbol, description in language_symbol.items():
    language_dataset = df_of_train_tweets[df_of_train_tweets['Language'] == symbol]
    # Prior P(language) = share of training tweets written in this language.
    # Deliberately NOT named `prob_of_language`: that name belongs to the
    # classifier function defined further down, and re-running this cell would
    # otherwise overwrite the function with a float.
    language_prior = language_dataset.shape[0] / total_nb_of_tweets
    a_language = Language(symbol, description, language_dataset, None, language_prior)

    list_of_languages.append(a_language)

In [93]:
# Zero-initialised unigram template: one row per ASCII letter (a-z then A-Z),
# indexed by 'Characters', with float columns 'Probability' and 'Instances'.
list_of_letters = list(string.ascii_letters)  # readjust for the nb of vocabulary
unigram_columns = ['Probability', 'Instances']
unigram_dataset = pd.DataFrame(
    # Shape is derived from the vocabulary instead of a hard-coded (52, 2).
    np.zeros((len(list_of_letters), len(unigram_columns))),
    index=pd.Index(list_of_letters, name='Characters'),
    columns=unigram_columns,
)
print(unigram_dataset)

            Probability  Instances
Characters                        
a                   0.0        0.0
b                   0.0        0.0
c                   0.0        0.0
d                   0.0        0.0
e                   0.0        0.0
f                   0.0        0.0
g                   0.0        0.0
h                   0.0        0.0
i                   0.0        0.0
j                   0.0        0.0
k                   0.0        0.0
l                   0.0        0.0
m                   0.0        0.0
n                   0.0        0.0
o                   0.0        0.0
p                   0.0        0.0
q                   0.0        0.0
r                   0.0        0.0
s                   0.0        0.0
t                   0.0        0.0
u                   0.0        0.0
v                   0.0        0.0
w                   0.0        0.0
x                   0.0        0.0
y                   0.0        0.0
z                   0.0        0.0
A                   

In [94]:
# Rich display of the zero-initialised unigram template.
unigram_dataset

Unnamed: 0_level_0,Probability,Instances
Characters,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.0,0.0
b,0.0,0.0
c,0.0,0.0
d,0.0,0.0
e,0.0,0.0
f,0.0,0.0
g,0.0,0.0
h,0.0,0.0
i,0.0,0.0
j,0.0,0.0


In [95]:
#create unigram for each language
for language in list_of_languages:
    unigram = unigram_dataset.copy()
    # Join the language's tweets once; the original re-joined them for every letter.
    all_text = ''.join(language.dataset['Content'])
    # Assign the whole column in one step. The previous chained form
    # `.loc[character]['Instances'] = ...` may write to a temporary copy and is
    # not guaranteed to update the frame.
    unigram['Instances'] = [float(all_text.count(character)) for character in unigram.index]
    # Keep the unigram table in slot 0 even when the cell is re-run
    # (a plain append would stack a second copy on every execution).
    if language.ngram:
        language.ngram[0] = unigram
    else:
        language.add_ngram(unigram)

In [86]:
# Inspect the unigram table (slot 0) of the first language in list_of_languages.
list_of_languages[0].ngram[0]

Unnamed: 0_level_0,Probability,Instances
Characters,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.0,2489.0
b,0.0,384.0
c,0.0,100.0
d,0.0,383.0
e,0.0,1495.0
f,0.0,61.0
g,0.0,347.0
h,0.0,186.0
i,0.0,1245.0
j,0.0,189.0


In [96]:
# Turn raw letter counts into relative frequencies: P(letter | language).
for language in list_of_languages:
    unigram = language.ngram[0]
    total_instances = unigram['Instances'].sum()
    # Vectorised column assignment. Writing into the `row` objects yielded by
    # iterrows() is not guaranteed to reach the underlying frame.
    # If a language has no letters at all the result is NaN for every row.
    unigram['Probability'] = unigram['Instances'] / total_instances

In [97]:
# Inspect the first language's unigram table again, after the 'Probability' column was computed.
list_of_languages[0].ngram[0]

Unnamed: 0_level_0,Probability,Instances
Characters,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.157552,2489.0
b,0.024307,384.0
c,0.00633,100.0
d,0.024244,383.0
e,0.094632,1495.0
f,0.003861,61.0
g,0.021965,347.0
h,0.011774,186.0
i,0.078807,1245.0
j,0.011964,189.0


In [None]:
# Load the test tweets into a DataFrame with two columns: 'Language' and 'Content'.
# A well-formed line holds four whitespace-separated fields:
#     <tweet id> <user name> <language code> <tweet text ...>
# maxsplit=3 keeps the tweet text (which itself contains whitespace) in one piece.
language_list = []
content_list = []
skipped_lines = 0
with open('test-tweets-given.txt', encoding='utf8') as file:
    for line in file:  # stream the file; readlines() would load every line up front
        fields = line.split(maxsplit=3)
        if len(fields) != 4:
            # Blank or truncated line: still skipped (best effort), but counted
            # so that dropped data is visible instead of silently swallowed.
            skipped_lines += 1
            continue
        tweet_id, user_name, language, tweet = fields
        language_list.append(language.strip())
        content_list.append(tweet.strip())
if skipped_lines:
    print('Skipped', skipped_lines, 'malformed line(s) in test-tweets-given.txt')
data = {'Language': language_list, 'Content': content_list}
df_of_test_tweets = pd.DataFrame.from_dict(data)

In [None]:
import math
def prob_of_language(list_of_languages, line):
    """Return the symbol of the language that best explains `line`.

    Each language is scored with a unigram naive-Bayes model in log space:

        score = log P(language) + sum(log P(character | language))

    The sum runs over the ASCII letters of `line`; every other character is
    ignored.

    Args:
        list_of_languages: iterable of objects exposing `symbol`,
            `probability` (the prior) and `ngram[0]`, a DataFrame indexed by
            character with a 'Probability' column.
        line: the text to classify.

    Returns:
        The `symbol` of the highest-scoring language (the earliest one wins
        ties), or None when no language reaches a finite score.
    """
    letters = [character for character in line if character in string.ascii_letters]
    best_probability = float('-inf')
    best_language = None
    for language in list_of_languages:
        prior = language.probability
        if prior is None or prior <= 0:
            # log(0) is undefined: a language without a positive prior cannot win.
            continue
        # One plain dict lookup per character instead of a DataFrame .loc call.
        char_probabilities = language.ngram[0]['Probability'].to_dict()
        total_probability = math.log(prior)
        for character in letters:
            character_probability = char_probabilities.get(character, 0.0)
            if not character_probability > 0:
                # Zero (or NaN) likelihood: the language is ruled out for this
                # text; math.log would raise ValueError on 0.
                total_probability = float('-inf')
                break
            # Log-probabilities are summed (the product of the raw
            # probabilities), not multiplied with each other.
            total_probability += math.log(character_probability)
        if total_probability > best_probability:
            best_probability = total_probability
            best_language = language.symbol
    return best_language

In [None]:
# Classify every test tweet and store the predicted language code in a 'guess' column.
list_of_guesses = [
    prob_of_language(list_of_languages, tweet_text)
    for tweet_text in df_of_test_tweets['Content']
]
df_of_test_tweets['guess'] = list_of_guesses

In [None]:
# Display the test tweets together with the 'guess' column added above.
df_of_test_tweets

In [None]:
# Flag each test tweet: True when the predicted language equals the labelled one.
guess_status = [
    actual == predicted
    for actual, predicted in zip(df_of_test_tweets['Language'], df_of_test_tweets['guess'])
]
df_of_test_tweets['Status'] = guess_status

In [None]:
# Display the test tweets again, now including the 'Status' column.
df_of_test_tweets

In [None]:
# Accuracy: share of test tweets whose 'Status' is True.
correct_rows = df_of_test_tweets[df_of_test_tweets['Status'] == True]
len(correct_rows) / len(df_of_test_tweets)

In [None]:
# Look up P('a') in the unigram table of the first language (single .loc[row, column] access).
list_of_languages[0].ngram[0].loc['a', 'Probability']

In [None]:
# Zero-initialised bigram template: one row per ASCII letter (indexed by
# 'Characters'), one column per ASCII letter plus a trailing 'instances' column.
list_of_letters = list(string.ascii_letters)  # readjust for the nb of vocabulary
new_list_of_letters = list_of_letters + ['instances']
bigram_dataset = pd.DataFrame(
    # Shape is derived from the vocabulary instead of a hard-coded (52, 53).
    np.zeros((len(list_of_letters), len(new_list_of_letters))),
    index=pd.Index(list_of_letters, name='Characters'),
    columns=new_list_of_letters,
)

In [None]:
# Rich display of the zero-initialised bigram template.
bigram_dataset

In [None]:
# Fill a second per-language table (slot 1) with letter counts.
# NOTE(review): this still starts from `unigram_dataset`, not `bigram_dataset`,
# so slot 1 holds per-letter counts rather than letter-pair counts - confirm the
# intended bigram layout before relying on it.
for language in list_of_languages:
    counts = unigram_dataset.copy()
    # Count each letter over the joined tweets instead of incrementing one
    # DataFrame cell per character. The chained `.loc[c]['Instances'] += 1`
    # was very slow and is not guaranteed to write through to the frame.
    all_text = ''.join(language.dataset['Content'])
    counts['Instances'] = counts['Instances'] + [
        float(all_text.count(character)) for character in counts.index
    ]
    # Assumes the unigram table already sits in slot 0. Reuse slot 1 on re-runs
    # so repeated execution does not stack extra copies.
    if len(language.ngram) > 1:
        language.ngram[1] = counts
    else:
        language.ngram.append(counts)

In [None]:
 # Show slot 1 of the last language explicitly instead of relying on the
 # `language` loop variable leaked from the previous cell (same object after a full run).
 list_of_languages[-1].ngram[1]