# Imports

In [1]:
import pandas as pd
import re
from nltk.corpus import stopwords
import nltk
from nltk.tag.stanford import StanfordPOSTagger as POS_Tag 
# Download the tagger from the site: https://nlp.stanford.edu/software/tagger.shtml Extract the file:stanford-postagger.jar
from nltk import FreqDist
import random
from nltk import classify
from nltk import NaiveBayesClassifier

# Read clean data 

In [2]:
es_twitts = pd.read_csv('data/clean/es_twitts.csv')

<h3>Tokenizing and Removing Noise from the Data</h3>

In [3]:
def remove_noise(desc, stop_words = ()):        
        
    desc = re.sub('https?:\/\/.*[\r\n]*','', desc)    
    desc = re.sub("[^A-ZÑa-zñ]+"," ", desc)
    desc = desc.lower()
    cleaned_tokens = []    
    
    for token in desc.split(): ## Tokenizing the Data        
        if token not in stop_words:
            #print(token)
            cleaned_tokens.append(token)
        #else:
            #print('STOPWORDS:',token)
    return cleaned_tokens

In [4]:
# Download stopwords
nltk.download('stopwords')

# Load stopwords
stop_words = stopwords.words('spanish')
stop_words.extend(['mexico', 'coronavirusoutbreak', 'casos', 'covid', 'coronavirus', 'coronavid', 'coronaviruspandemic', 'pandemia', 'cuarentena', 'virus','cdmx','mx'])

# TODO: Use pandas table 
tweets = es_twitts['text'].tolist()
tweets_cleaned_tokens_list = []

for tokens in tweets:
    # print(tokens)
    tweets_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\gibra\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
x=2
print(tweets[x])
print('')
print(tweets[x].split())
print('')
print(tweets_cleaned_tokens_list[x])

Si empezaste a trabajar, necesitas dar de alta a tus beneficiarios ante el IMSS, ahora lo puedes hacer desde tu domicilio a traves de internet y evita filas | SanaDistancia
QuedateEnCasa 
Coronavirus
COVID19 MexicoUnido          

https://t.co/zv3POwhVXe https://t.co/1VOKagjdOF

['Si', 'empezaste', 'a', 'trabajar,', 'necesitas', 'dar', 'de', 'alta', 'a', 'tus', 'beneficiarios', 'ante', 'el', 'IMSS,', 'ahora', 'lo', 'puedes', 'hacer', 'desde', 'tu', 'domicilio', 'a', 'traves', 'de', 'internet', 'y', 'evita', 'filas', '|', 'SanaDistancia', 'QuedateEnCasa', 'Coronavirus', 'COVID19', 'MexicoUnido', 'https://t.co/zv3POwhVXe', 'https://t.co/1VOKagjdOF']

['si', 'empezaste', 'trabajar', 'necesitas', 'dar', 'alta', 'beneficiarios', 'imss', 'ahora', 'puedes', 'hacer', 'domicilio', 'traves', 'internet', 'evita', 'filas', 'sanadistancia', 'quedateencasa', 'mexicounido']


<h3>Normalizing data   from the Data</h3>

In [6]:
spanish_postagger = POS_Tag(
    r'stanford-tagger\models\spanish-ud.tagger',
    r'stanford-tagger\stanford-postagger-4.0.0.jar',
    encoding='utf8'
) 

In [7]:
def normalization(words,taggstarts):
    nouns = []    
    tagged_words = spanish_postagger.tag(words) ### Normalizing data            
    print (tagged_words) 
    
    for (word, tag) in tagged_words:      
        print(word + ' ' + tag)        
        if tag.startswith(taggstarts):
            nouns.append(word)            
    #print(nouns)    
    return nouns

#https://stackoverflow.com/questions/14732465/nltk-tagging-spanish-words-using-a-corpus

In [8]:
n_tweets_noun_tokens_list = []
a_tweets_noun_tokens_list = []
v_tweets_noun_tokens_list = []
for words in tweets_cleaned_tokens_list[:2]:    
    n_tweets_noun_tokens_list.append(normalization(words,'N'))
    a_tweets_noun_tokens_list.append(normalization(words,'A'))
    v_tweets_noun_tokens_list.append(normalization(words,'V'))

[('cualquier', 'DET'), ('enfermedad', 'NOUN'), ('respiratoria', 'ADJ'), ('automediques', 'ADJ'), ('prevencioncoronavirus', 'ADJ')]
cualquier DET
enfermedad NOUN
respiratoria ADJ
automediques ADJ
prevencioncoronavirus ADJ
[('cualquier', 'DET'), ('enfermedad', 'NOUN'), ('respiratoria', 'ADJ'), ('automediques', 'ADJ'), ('prevencioncoronavirus', 'ADJ')]
cualquier DET
enfermedad NOUN
respiratoria ADJ
automediques ADJ
prevencioncoronavirus ADJ
[('cualquier', 'DET'), ('enfermedad', 'NOUN'), ('respiratoria', 'ADJ'), ('automediques', 'ADJ'), ('prevencioncoronavirus', 'ADJ')]
cualquier DET
enfermedad NOUN
respiratoria ADJ
automediques ADJ
prevencioncoronavirus ADJ
[('atenci', 'NOUN'), ('n', 'NOUN'), ('terminal', 'NOUN'), ('nuevo', 'ADJ'), ('circo', 'NOUN'), ('implementan', 'VERB'), ('medidas', 'NOUN'), ('uso', 'NOUN'), ('mascarilla', 'VERB'), ('parte', 'NOUN'), ('usuaris', 'NOUN'), ('conductores', 'ADJ'), ('hacen', 'VERB'), ('vida', 'NOUN'), ('dichas', 'ADJ'), ('instalaciones', 'NOUN'), ('parte'

In [16]:
n_tweets_noun_tokens_list

[['enfermedad'],
 ['atenci',
  'n',
  'terminal',
  'circo',
  'medidas',
  'uso',
  'parte',
  'usuaris',
  'vida',
  'instalaciones',
  'parte',
  'esfuerzos',
  'marzo']]

In [17]:
x=0
print(n_tweets_noun_tokens_list[x])
print ('')
print(n_tweets_noun_tokens_list[x])

['enfermedad']

['enfermedad']


<h3>Determining Word Density</h3>

In [18]:
def get_all_words(cleaned_tokens_list):
    for tokens in cleaned_tokens_list:
        for token in tokens:
            yield token

In [19]:
#all_pos_words = get_all_words(tweets_cleaned_tokens_list)
all_pos_words = get_all_words(n_tweets_noun_tokens_list)
freq_dist_pos = FreqDist(all_pos_words)
print(freq_dist_pos.most_common(100))

[('parte', 2), ('enfermedad', 1), ('atenci', 1), ('n', 1), ('terminal', 1), ('circo', 1), ('medidas', 1), ('uso', 1), ('usuaris', 1), ('vida', 1), ('instalaciones', 1), ('esfuerzos', 1), ('marzo', 1)]


<h3>Preparing Data for the Model</h3>

Converting Tokens to a Dictionary

In [13]:
def get_tweets_for_model(cleaned_tokens_list):
    for tweet_tokens in cleaned_tokens_list:
        yield dict([token, True] for token in tweet_tokens)

positive_tokens_for_model = get_tweets_for_model(n_tweets_noun_tokens_list)

<h3>Splitting the Dataset for Training and Testing the Model</h3>

In [27]:
positive_dataset = [(tweet_dict, "Positive")
                     for tweet_dict in positive_tokens_for_model]

dataset = positive_dataset

random.shuffle(dataset)

# from sklearn.model_selection import train_test_split
# train_set, test_set = train_test_split(dataset, test_size=0.30, random_state=42)

train_data = dataset[:7000]
test_data = dataset[7000:]

In [28]:
positive_dataset

[]

<h3>Building and Testing the Model</h3>

In [15]:
classifier = NaiveBayesClassifier.train(train_data)

print("Accuracy is:", classify.accuracy(classifier, test_data))

print(classifier.show_most_informative_features(10))

Accuracy is: 0
Most Informative Features
None
