In [20]:
import pandas as pd
import numpy as np
df_of_train_tweets = None
with open('training-tweets.txt', encoding='utf8') as file:
    language_list = []
    content_list = []
    for line in file.readlines():
        try:
            tweet_id, user_name, language, tweet = line.split(maxsplit=3)
            language_list.append(language.strip())
            content_list.append(tweet.strip())
        except ValueError:
            pass
    data = {'Language': language_list, 'Content': content_list}
    df_of_train_tweets = pd.DataFrame.from_dict(data)        

In [2]:
df_of_train_tweets

Unnamed: 0,Language,Content
0,eu,"""eunoon, zeatik dakaat sentsazioa goizegi esna..."
1,es,Pedazo de tarta k me e kurrao!!!! Ske k buena ...
2,es,No hace falta que te digan que vas a morir par...
3,es,Empieza mi findeeeee :))))
4,es,Próxima parada: Carnaval del Toro de Ciudad Ro...
5,es,Graniza y tal.
6,es,¿Mi idolo? Lo que mas quiero en este mundo. ht...
7,es,"""Graniza y luego sale el sol. Muy normal en Ir..."
8,en,"""IM CONFUSED ,WHAT IS THE REAL JUSTIN? @justin..."
9,es,@Nyahika Creo que esta el dependiente majo que...


In [21]:
import string
def accepted_character(a_character, accepted_char_set):
    if a_character in accepted_char_set:
        return True
    else:
        return False

In [22]:
class Language:
    def __init__(self, symbol, description, dataset, ngram=None, probability=None):
        self.symbol = symbol
        self.description = description
        self.dataset = dataset
        self.ngram = []
        self.probability = probability
        
    def add_ngram(self, ngram):
        self.ngram.append(ngram)

In [23]:
language_symbol = {
    'eu': 'basque',
    'ca': 'catalan',
    'gl': 'galician',
    'es': 'spanish',
    'en': 'english',
    'pt': 'portugese'
}

In [24]:
list_of_languages = []
total_nb_of_tweets = df_of_train_tweets.shape[0]
for i in language_symbol.keys():
    language_dataset = df_of_train_tweets[df_of_train_tweets['Language'] == i]
    prob_of_language = language_dataset.shape[0]/total_nb_of_tweets
    a_language = Language(i, language_symbol.get(i), language_dataset, None, prob_of_language)
    list_of_languages.append(a_language)

In [8]:
list_of_letters = [character for character in string.ascii_letters] #readjust for the nb of vocabulary
unigram_dataset = pd.DataFrame(np.zeros((52,2)), columns=['Probability', 'Instances'])
unigram_dataset['Characters'] = list_of_letters
unigram_dataset = unigram_dataset.set_index('Characters')
# print(unigram_dataset)

In [34]:
print(list_of_letters)

['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z']


In [37]:
import nltk

freq = nltk.FreqDist(list_of_letters)
print(freq)


# for language in list_of_languages: # is way too slow, need to verify results
#     language.ngram.append(unigram_dataset.copy())
#     for line in language.dataset['Content']:
#         for character in line:
#             if character in string.ascii_letters:
#                 language.ngram[0].loc[character]['Instances'] += 1

<FreqDist with 52 samples and 52 outcomes>


In [14]:
list_of_languages[0].ngram[0]

Unnamed: 0_level_0,Probability,Instances
Characters,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.0,2489.0
b,0.0,384.0
c,0.0,100.0
d,0.0,383.0
e,0.0,1495.0
f,0.0,61.0
g,0.0,347.0
h,0.0,186.0
i,0.0,1245.0
j,0.0,189.0


In [15]:
for language in list_of_languages:
    total_instances = language.ngram[0]['Instances'].sum()
    for index, row in language.ngram[0].iterrows():
        row['Probability'] = row['Instances']/total_instances

In [16]:
list_of_languages[0].ngram[0]

Unnamed: 0_level_0,Probability,Instances
Characters,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.157552,2489.0
b,0.024307,384.0
c,0.00633,100.0
d,0.024244,383.0
e,0.094632,1495.0
f,0.003861,61.0
g,0.021965,347.0
h,0.011774,186.0
i,0.078807,1245.0
j,0.011964,189.0


In [18]:
df_of_test_tweets = None
with open('test-tweets-given.txt', encoding='utf8') as file:
    language_list = []
    content_list = []
    for line in file.readlines():
        try:
            tweet_id, user_name, language, tweet = line.split(maxsplit=3)
            language_list.append(language.strip())
            content_list.append(tweet.strip())
        except ValueError:
            pass
    data = {'Language': language_list, 'Content': content_list}
    df_of_test_tweets = pd.DataFrame.from_dict(data)

In [17]:
import math
def prob_of_language(list_of_languages, line):
    best_probability = float('-inf')
    best_language = None
    for language in list_of_languages:
        total_probability = math.log(language.probability)
        for character in line:
            if character in string.ascii_letters:
                character_probability = language.ngram[0].loc[character]['Probability']
                total_probability *= math.log(character_probability)
        if total_probability > best_probability:
            best_probability = total_probability
            best_language = language.symbol
    return best_language

In [19]:
list_of_guesses = []
for index, row in df_of_test_tweets.iterrows():
    list_of_guesses.append(prob_of_language(list_of_languages, row['Content']))
df_of_test_tweets['guess'] = list_of_guesses

In [20]:
df_of_test_tweets

Unnamed: 0,Language,Content,guess
0,es,@AnderDelPozo @PesqueWhite hahaha yo tambien m...,es
1,es,Foto antes de LA FOTO. #orla @ Universidad de ...,eu
2,es,@Nagore_Robles Que mal lo vas a pasar viendo a...,es
3,es,"""Hay cosas complicadas, y luego está quitarse ...",es
4,es,"""Me duele todo, que asco.""",es
5,eu,Ta aste txuriyakin hastekoo bizitzako parriaak...,eu
6,es,@DeieneA hooooombre :),eu
7,es,"""Huelga indefinida de ALSA el 5 de marzo, afec...",en
8,es,@medeome @MdeMero yo lo conseguí. Hace 22 años...,es
9,es,@GaleHunterJkun El lobo de Wall Street &gt;&gt...,eu


In [21]:
guess_status = []
for index, row in df_of_test_tweets.iterrows():
    if row['Language'] == row['guess']:
        guess_status.append(True)
    else:
        guess_status.append(False)
df_of_test_tweets['Status'] = guess_status

In [23]:
df_of_test_tweets

Unnamed: 0,Language,Content,guess,Status
0,es,@AnderDelPozo @PesqueWhite hahaha yo tambien m...,es,True
1,es,Foto antes de LA FOTO. #orla @ Universidad de ...,eu,False
2,es,@Nagore_Robles Que mal lo vas a pasar viendo a...,es,True
3,es,"""Hay cosas complicadas, y luego está quitarse ...",es,True
4,es,"""Me duele todo, que asco.""",es,True
5,eu,Ta aste txuriyakin hastekoo bizitzako parriaak...,eu,True
6,es,@DeieneA hooooombre :),eu,False
7,es,"""Huelga indefinida de ALSA el 5 de marzo, afec...",en,False
8,es,@medeome @MdeMero yo lo conseguí. Hace 22 años...,es,True
9,es,@GaleHunterJkun El lobo de Wall Street &gt;&gt...,eu,False


In [24]:
(df_of_test_tweets[df_of_test_tweets['Status'] == True].shape[0])/df_of_test_tweets.shape[0]

0.317

In [25]:
list_of_languages[0].ngram[0].loc['a']['Probability']

0.15755158880870995

In [30]:
list_of_letters = [character for character in string.ascii_letters] #readjust for the nb of vocabulary
new_list_of_letters = list_of_letters.copy()
new_list_of_letters.append('instances')
bigram_dataset = pd.DataFrame(np.zeros((52,53)), columns=new_list_of_letters)
bigram_dataset['Characters'] = list_of_letters
bigram_dataset = bigram_dataset.set_index('Characters')

In [31]:
bigram_dataset

Unnamed: 0_level_0,a,b,c,d,e,f,g,h,i,j,...,R,S,T,U,V,W,X,Y,Z,instances
Characters,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
a,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
b,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
d,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
e,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
f,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
g,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
h,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
i,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
j,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [33]:
import nltk
frequency = nltk.Freq0ist(character)

print(Frequency)

AttributeError: module 'nltk' has no attribute 'Freq0ist'