In this phase we will train and test the sentiment analysis model using NLP (Natural Language Processing), which involves classifying tweet texts into pre-defined sentiments. Important: the dataset used to train the model comes from the TASS corpus, available here:

http://tass.sepln.org/tass_data/download.php?auth=4tNaxs9su4VeTvJejrj

In [None]:
import pandas as pd
import xml.etree.ElementTree as etree
import os
from glob import glob
from nltk.corpus import stopwords
import re, string
from nltk.tag.stanford import StanfordPOSTagger as POS_Tag
from nltk import FreqDist
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')
nltk.download('stopwords')

## Import classified tweets to train the model

In [None]:
def load_classified_tweets(xml_paths, columns=("content", "sentiment")):
    """Read labelled tweets from XML files into a DataFrame.

    Each child of an XML root is treated as one tweet. The tweet text is
    taken from its ``content`` element and the label from
    ``sentiment/polarity/value``.

    Args:
        xml_paths: Iterable of paths to the XML files to parse.
        columns: Names of the two resulting columns (text, label).

    Returns:
        A DataFrame with one row per tweet. A field whose XML element is
        missing is stored as a missing value instead of raising.
    """
    rows = []
    for xml_path in xml_paths:
        root = etree.parse(xml_path).getroot()
        for node in root:
            # findtext() returns None when the element is absent, so an
            # incomplete tweet does not abort the whole import.
            rows.append((
                node.findtext("content"),
                node.findtext("sentiment/polarity/value"),
            ))
    # Build the frame once: appending row by row is quadratic and
    # DataFrame.append no longer exists in pandas 2.x.
    return pd.DataFrame(rows, columns=list(columns))


columns = ["content", "sentiment"]

raw_data_path = r'data/raw/*TASS2019*.xml'
xml_files = glob(raw_data_path)

df_tweets_classified = load_classified_tweets(xml_files, columns)
df_tweets_classified

## Removing Noise from the Data

In [None]:
# Spanish stop words shipped with NLTK, with the acute accents removed from
# their vowels (the tweets get the same accent stripping during cleaning).
accent_table = str.maketrans('áéíóú', 'aeiou')
stop_words = [word.translate(accent_table) for word in stopwords.words('spanish')]

In [None]:
# Build the cleaned text in a separate "clean_content" column; the raw
# "content" column is left untouched so both can be compared side by side.
# Missing tweet texts are treated as empty strings so the string operations
# below never fail on them.
clean_content = df_tweets_classified["content"].fillna("").str.lower()

# Strip accents (and the diaeresis) in a single pass so the words match the
# accent-free stop-word list.
clean_content = clean_content.str.translate(str.maketrans("áéíóúü", "aeiouu"))

# Remove, in this order: URLs, @mentions, #hashtags, words of 1 to 3
# characters, and finally every run of characters other than a-z / ñ.
# The order matters: mentions and hashtags must go before their marker
# characters are wiped by the last pattern.
noise_patterns = (
    (r'http\S+', ''),
    (r'@\S+', ' '),
    (r'#\S+', ' '),
    (r'\b\w{1,3}\b', ' '),
    (r'[^a-zñ]+', ' '),
)
for noise_pattern, replacement in noise_patterns:
    clean_content = clean_content.str.replace(noise_pattern, replacement, regex=True)

# Drop stop words; split()/join also collapses the leftover whitespace.
stop_word_set = set(stop_words)
clean_content = clean_content.apply(
    lambda text: ' '.join(word for word in text.split() if word not in stop_word_set)
)

# Mark tweets that ended up with no text as NaN and drop them. This is done
# after stop-word removal so tweets made only of noise and stop words are
# discarded too, instead of surviving as empty strings.
df_tweets_classified["clean_content"] = clean_content.replace("", float("NaN"))
df_tweets_classified.dropna(subset=["clean_content"], inplace=True)

df_tweets_classified[['content', 'clean_content']]

## Splitting Data by Positive and Negative Sentiments

In [None]:
# List the distinct sentiment labels present in the dataset.
df_tweets_classified["sentiment"].unique()

In [None]:
# Keep only the tweets labelled 'P'.
positive_tweets = df_tweets_classified[df_tweets_classified["sentiment"] == 'P']
positive_tweets

In [None]:
# Keep only the tweets labelled 'N'.
negative_tweets = df_tweets_classified[df_tweets_classified["sentiment"] == 'N']
negative_tweets

## Tokenizing the Data

Tokenizing means splitting the text on whitespace and punctuation. A token is a sequence of characters taken from the text that serves as a unit; basically, we turn each text into a list of words, which makes it easier for the machine to process.

In [None]:
# One list of whitespace-separated tokens per cleaned tweet, for each class.
pos_token_tweets_list = positive_tweets['clean_content'].str.split().tolist()
neg_token_tweets_list = negative_tweets['clean_content'].str.split().tolist()

In [None]:
# Show the tokens of one sample tweet from each class.
sample_index = 9
print(pos_token_tweets_list[sample_index], '\n')
print(neg_token_tweets_list[sample_index])

## Normalizing the Data - Fast way

The normalization process takes some time to run. You can skip the "Manual way" below and use this "Fast way" instead: just run the code block below, which imports tokens that have already been normalized. If you want to understand the normalization process, skip this section and go to "Normalizing the Data - Manual way".

In [None]:
def _read_token_file(path):
    """Return one list of whitespace-separated tokens per line of *path*."""
    # NOTE(review): the file is read with the platform default encoding, as
    # before; confirm how the token files were written before pinning one.
    with open(path, "r") as f:
        return [line.split() for line in f]


# Import the tokens that were already normalized.
# Forward slashes work on Windows as well as on Linux/macOS, and match the
# other relative paths used in this notebook.
pos_tokens_normalized = _read_token_file('data/clean/pos_tokens_normalized.txt')
neg_tokens_normalized = _read_token_file('data/clean/neg_tokens_normalized.txt')

## Normalizing the Data - Manual way

For the "Manual way" of normalization, make sure you have downloaded the Java JRE: it is not included in this repository because of GitHub space limitations. Download version jre1.8.0_251 from the link below and place it at the root path of this project.

https://www.oracle.com/java/technologies/javase-jre8-downloads.html

In [None]:
# Stanford POS tagger configured with the Spanish model and the tagger jar
# stored under stanford-tagger/.
spanish_postagger = POS_Tag(
    model_filename=r'stanford-tagger/models/spanish.tagger',
    path_to_jar=r'stanford-tagger/stanford-postagger.jar',
    encoding='utf8',
)

# Point the JAVAHOME environment variable at the bundled Java executable
# (presumably what NLTK uses to launch the tagger -- confirm against NLTK docs).
java_path = "jre1.8.0_251/bin/java.exe"
os.environ['JAVAHOME'] = java_path

In [None]:
def normalization(words):
    """Tag *words* with the Spanish POS tagger and drop the unwanted tags.

    Args:
        words: List of tokens belonging to one tweet.

    Returns:
        The words, in their original order, whose tag is not in the
        excluded set below.
    """
    excluded_tags = {
        'np00000', 'word', 'nc0n000', 'di0000', 'pr000000',
        'vaip000', 'sp000', 'z0', 'i',
    }
    return [
        word
        for word, tag in spanish_postagger.tag(words)
        if tag not in excluded_tags
    ]

In [None]:
# Number of tweets per class sent through the (slow) tagger.
# The default of 1 only normalizes the first tweet of each class, which is
# enough to see the process at work; set it to None to normalize every tweet.
NORMALIZATION_LIMIT = 1

pos_tokens_normalized = [
    normalization(words) for words in pos_token_tweets_list[:NORMALIZATION_LIMIT]
]
neg_tokens_normalized = [
    normalization(words) for words in neg_token_tweets_list[:NORMALIZATION_LIMIT]
]

## Validating original tweets against their cleaned, tokenized and normalized versions

In [None]:
# Position of the tweet to inspect in each class (130 was used previously).
x = 0

# Original positive tweet followed by its normalized tokens.
print(positive_tweets["content"].iloc[x])
print(pos_tokens_normalized[x], '\n')

# Original negative tweet followed by its normalized tokens.
print(negative_tweets["content"].iloc[x])
print(neg_tokens_normalized[x])

## Determining Word Density

In [None]:
def get_all_words(cleaned_tokens_list):
    """Lazily yield every token of every token list, in order.

    Args:
        cleaned_tokens_list: Iterable of token iterables (one per tweet).

    Yields:
        Each token, tweet after tweet.
    """
    for tweet_tokens in cleaned_tokens_list:
        yield from tweet_tokens

In [None]:
# Flatten the per-tweet token lists of each class into a stream of words.
all_pos_words = get_all_words(pos_tokens_normalized)
all_neg_words = get_all_words(neg_tokens_normalized)

# Count how often each word occurs in each class.
freq_dist_pos, freq_dist_neg = FreqDist(all_pos_words), FreqDist(all_neg_words)

# Show the most frequent words of each class.
top_n = 20
print(freq_dist_pos.most_common(top_n), '\n')
print(freq_dist_neg.most_common(top_n))

## Preparing Data for the Model

Converting Tokens to a Dictionary

In [None]:
def get_tweets_for_model(cleaned_tokens_list):
    """Lazily turn each token list into a ``{token: True}`` feature dict.

    Args:
        cleaned_tokens_list: Iterable of token iterables (one per tweet).

    Yields:
        One dict per tweet mapping every token of that tweet to True.
    """
    for tokens in cleaned_tokens_list:
        yield {token: True for token in tokens}

# Materialize the feature dicts as lists: a bare generator is exhausted after
# its first use, so re-running the dataset-building cell would silently
# produce an empty dataset.
positive_tokens_for_model = list(get_tweets_for_model(pos_tokens_normalized))
negative_tokens_for_model = list(get_tweets_for_model(neg_tokens_normalized))

In [None]:
# Number of tweets available per class, and in total.
pos_count = len(pos_tokens_normalized)
neg_count = len(neg_tokens_normalized)
print(f"{pos_count} {neg_count} = {pos_count + neg_count}")

## Splitting the Dataset for Training and Testing the Model

In [None]:
import random

# Preferred number of examples in the training split.
TRAIN_SIZE = 2500
# Share of the data used for training when the dataset is too small for
# TRAIN_SIZE to leave anything for testing.
FALLBACK_TRAIN_FRACTION = 0.8

# Attach the class label to every feature dict.
positive_dataset = [(tweet_dict, "Positive")
                    for tweet_dict in positive_tokens_for_model]

negative_dataset = [(tweet_dict, "Negative")
                    for tweet_dict in negative_tokens_for_model]

dataset = positive_dataset + negative_dataset

# Shuffle so both splits contain a mix of positive and negative tweets.
random.shuffle(dataset)

# Keep the fixed split whenever it leaves test data; otherwise fall back to a
# proportional split so test_data is never empty for a small dataset.
if len(dataset) > TRAIN_SIZE:
    split_index = TRAIN_SIZE
else:
    split_index = int(len(dataset) * FALLBACK_TRAIN_FRACTION)

train_data = dataset[:split_index]
test_data = dataset[split_index:]

## Building and Testing the Model

In [None]:
from nltk import classify
from nltk import NaiveBayesClassifier

# Train a Naive Bayes classifier on the training split.
classifier = NaiveBayesClassifier.train(train_data)

# Evaluate it on the held-out split.
print("Accuracy is:", classify.accuracy(classifier, test_data))

# show_most_informative_features() prints its table itself and returns None,
# so it must not be wrapped in print() (that would emit a stray "None").
classifier.show_most_informative_features(15)

## Test Model with custom messages

In [None]:
# Classify a hand-written message with the trained model.
custom_tweet = "que malos dias ahorita con esto del covid"

custom_tokens = word_tokenize(custom_tweet)

# Same {token: True} feature format the classifier was trained on.
custom_features = {token: True for token in custom_tokens}
print(classifier.classify(custom_features))

## Export Model trained

In [None]:
# Save the trained classifier to disk.
import pickle

# The context manager closes the file even if pickling fails.
with open('sentiment_classifier.pickle', 'wb') as f:
    pickle.dump(classifier, f)