In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    input_paths = (os.path.join(dirname, filename) for filename in filenames)
    for input_path in input_paths:
        print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# Read the Bad Bunny lyrics CSV from the Kaggle input directory into a DataFrame.
lyrics_csv_path = "/kaggle/input/bad-bunny-lyrics/bad_bunny_lyrics.csv"
lyrics = pd.read_csv(lyrics_csv_path)

In [3]:
# Good regex resource: https://realpython.com/regex-python/#python-regex-metacharacters
import re

In [4]:
# practice_str = '[Letra de "NI BIEN NI MAL"]  [Intro: Bad Bunny] Yeh-yeh Yeh-yeh Yeh-yeh'
# Pick a single cell of the dataset to experiment on: first row, fifth column
# (presumably the lyrics text column -- TODO confirm against the CSV header).
sample_row, sample_col = 0, 4
practice_song_lyric = lyrics.iloc[sample_row, sample_col]
practice_song_lyric

In [5]:
# Print the sample lyric so escape sequences (e.g. line breaks) are rendered
# instead of being shown escaped as in the bare-expression echo of the value.
print(practice_song_lyric)

In [6]:
# Source: https://stackoverflow.com/questions/65022050/cleaning-song-lyrics-with-regex

In [7]:
# Remove the bracketed markers that denote sections of the song; the
# non-greedy match stops at the first closing bracket on the same line.
removed_brackets = re.sub(r"\[.*?\]", "", practice_song_lyric)
# Turn line breaks into periods for sentence tokenization.  Every whitespace
# run that contains at least one newline becomes a single ". ", so lines stay
# separated however many blank lines sat between them (deleting exactly two
# newlines first glued lines together when the run had even length).  The text
# is trimmed first so no stray ". " appears at the start or end.
no_brackets_or_new_lines = re.sub(r"\s*\n\s*", ". ", removed_brackets.strip())
print(no_brackets_or_new_lines)

In [8]:
# Expand the clipped colloquial spellings in the cleaned lyric into full
# Spanish words before handing the text to the NLP pipeline.
# Rules are applied in order, so the more specific "pa'l" forms must come
# before the plain "pa'" forms.
contraction_rules = [
    (r"va'", "vas"),
    (r"\bPa'l\b", "Para el"),
    (r"\bpa'l\b", "para el"),     # without this rule "pa'l" fell through to "pa'" and became "paral"
    (r"\bPa'(?=\w)", "Para "),    # glued to the next word: supply the missing space
    (r"\bPa'", "Para"),           # already followed by space/punctuation: no extra space
    (r"\bvo'(?=\w)", "voy "),     # e.g. "vo'a" -> "voy a"
    (r"\bvo'", "voy"),            # \b keeps words such as "nuevo'" untouched
    (r"\bpa'(?=\w)", "para "),
    (r"\bpa'", "para"),
    (r"(?<!\w)'e\b", "de"),       # only a standalone "'e", not a quote opening a longer word
    (r"puya'", "puyas"),
]
correct_puyas = no_brackets_or_new_lines
for contraction_pattern, expansion in contraction_rules:
    correct_puyas = re.sub(contraction_pattern, expansion, correct_puyas)
# Terminate the last sentence, but do not stack a second mark onto text that
# already ends with sentence punctuation.
correct_puyas = correct_puyas.rstrip()
if not correct_puyas.endswith(('.', '!', '?')):
    correct_puyas += '.'
correct_puyas

In [9]:
!pip install stanza 

In [10]:
# Source: https://stanfordnlp.github.io/stanza/
# Source: https://stanfordnlp.github.io/stanza/tokenize.html#:~:text=Training%2DOnly%20Options-,Description,invoked%20by%20the%20name%20tokenize%20.
# Note: Stanza does not offer a Spanish Sentiment Analyzer
# Check out Stanza's other sections like:
# - Pipeline and Processors
# - Part-of-Speech and Morphological Features
# - Lemmatization
import stanza

In [11]:
# Download the Spanish AnCora models for the four processors used by the pipeline.
stanza.download(
    'es',
    package='ancora',
    processors='tokenize,mwt,pos,lemma',
    verbose=True,
)

In [12]:
# Build the Spanish pipeline (tokenization, multi-word-token expansion, POS
# tagging, lemmatization).  `package='ancora'` is passed explicitly so the
# pipeline loads the same models that the download step fetched instead of
# whatever the library's default Spanish package happens to be.
stNLP = stanza.Pipeline(
    processors='tokenize,mwt,pos,lemma',
    lang='es',
    package='ancora',
    use_gpu=True,
)

In [13]:
# Smoke-test the pipeline on a short sentence and list each word with its lemma.
# The result is kept under its own name so that re-running this cell cannot
# overwrite `doc`, the lyric document that the later cells index into.
demo_doc = stNLP('Barack Obama nació en Hawaii.')
print(*[f'word: {word.text+" "}\tlemma: {word.lemma}' for sent in demo_doc.sentences for word in sent.words], sep='\n')

In [14]:
# Source: https://stanfordnlp.github.io/stanza/tokenize.html#:~:text=Training%2DOnly%20Options-,Description,invoked%20by%20the%20name%20tokenize%20.
# Run the pipeline on the cleaned lyric: tokenization and sentence segmentation
# are performed, along with the other processors the pipeline was built with.
# The resulting `doc` is what the following cells slice by sentence index.
doc = stNLP(correct_puyas)

In [15]:
# List every word of the fifth and sixth sentences alongside its lemma.
word_lemma_lines = []
for sent in doc.sentences[4:6]:
    for word in sent.words:
        word_lemma_lines.append(f'word: {word.text+" "}\tlemma: {word.lemma}')
print('\n'.join(word_lemma_lines))

In [16]:
# Show token ids and surface text for the same two sentences; the heading
# numbers them 1 and 2 within the slice.
for sentence_number, sentence in enumerate(doc.sentences[4:6], start=1):
    print(f'====== Sentence {sentence_number} tokens =======')
    token_lines = [f'id: {token.id}\ttext: {token.text}' for token in sentence.tokens]
    print('\n'.join(token_lines))

In [17]:
# Raw text of the fifth and sixth sentences, printed as a Python list.
sentence_texts = []
for sentence in doc.sentences[4:6]:
    sentence_texts.append(sentence.text)
print(sentence_texts)

In [18]:
# Echo the fifth sentence object; a slice (not an index) is used, so the cell
# output is a sequence containing at most that one sentence.
doc.sentences[4:5]

In [19]:
# replaces every run of Unicode whitespace (e.g. \u00a0 or \u2005) with a single space
#re.sub("\s+", " ", lyric, flags=re.UNICODE)

In [20]:
# Spanish word normalization Python packages and libraries (Cucco)
# ftfy - handles mojibake

0. Sentence Tokenization - nltk package (tokenize and FreqDist)
0. Finding semantically related words using word embeddings (already loaded the dataset)
    * Look at Google Colab book on word2vec to see an example of how to do this
0. Spanish lemmatizer - https://stackoverflow.com/questions/60534999/how-to-solve-spanish-lemmatization-problems-with-spacy
    * The link above has sources that show how to make your own lemmatizer, FreeLing (provides lemmatization in Spanish and other languages), and spacy-stanza ("spaCy's API with the Stanza's models")
    * https://github.com/pablodms/spacy-spanish-lemmatizer    
    * pattern is another library
0. Word Cloud
1. **Topic Modeling** - can later let the user generate songs based on a topic
2. **Sentiment Analysis** - user can create songs with a specific sentiment
    * Another option for sentiment analysis: https://colab.research.google.com/github/pysentimiento/pysentimiento/blob/master/notebooks/PySentimiento_Sentiment_Analysis_in_Spanish.ipynb#scrollTo=VtBRRj_RaVr7
3. Learn how to Use Kaggle Console effectively for projects

* How to track Jupyter notebook changes in GitHub - https://blog.reviewnb.com/jupyter-notebook-on-github-oss-examples/
* Top 8 Python libraries for NLP - https://www.analyticsvidhya.com/blog/2021/05/top-8-python-libraries-for-natural-language-processing-nlp-in-2021/

# Sentiment Analysis (Library #1)

In [21]:
!pip install sentiment-analysis-spanish
!pip install keras tensorflow

In [22]:
from sentiment_analysis_spanish import sentiment_analysis

In [23]:
# Instantiate the Spanish sentiment analyzer from the sentiment_analysis_spanish package.
sentiment = sentiment_analysis.SentimentAnalysisSpanish()

In [40]:
# How to read the scores:
#   close to 0   -> negative
#   close to 1   -> positive
#   close to 0.5 -> neutral
# Results made me think about: "It's not what you say but how you say it"
sentiment_lines = []
for sentence in doc.sentences[20:30]:
    score = sentiment.sentiment(sentence.text)
    sentiment_lines.append(f'The sentence:\t {sentence.text} has a sentiment of:\t{score}')
print('\n'.join(sentiment_lines))

# Sentiment Analysis (Library #2)

In [41]:
!pip install pysentimiento

In [42]:
from pysentimiento import create_analyzer
# Create the pysentimiento analyzer for the sentiment task on Spanish text.
analyzer = create_analyzer(task="sentiment", lang="es")

In [48]:
# Show the full prediction object returned by the analyzer for two sample sentences.
prediction_lines = []
for sentence in doc.sentences[20:22]:
    prediction = analyzer.predict(sentence.text)
    prediction_lines.append(f'The sentence:\t {sentence.text} has a sentiment of:\t{prediction}')
print('\n'.join(prediction_lines))

In [51]:
# Print the `probas` attribute of the sentiment prediction for ten sample sentences.
for sentence in doc.sentences[20:30]:
    sentiment_probas = analyzer.predict(sentence.text).probas
    print("The sentence is: ", sentence.text, "\n", sentiment_probas)

In [45]:
# Create a second pysentimiento analyzer, this one for the emotion task on Spanish text.
emotion_analyzer = create_analyzer(task="emotion", lang="es")

In [None]:
# Scratch cell.  The previous `[*print(f'')]` raised TypeError because print()
# returns None, which cannot be unpacked; only the blank-line print is kept.
print()

In [55]:
# Print the `output` attribute (the most likely emotion) for each sample sentence.
for sentence in doc.sentences[20:30]:
    predicted_emotion = emotion_analyzer.predict(sentence.text).output
    print("The sentence is: ", sentence.text, "\n", "The emotion is: ", predicted_emotion)

In [46]:
# can also output the probability of emotions
# (the comma before `analyzer_output_obj.probas` was missing, which made the
# whole cell a SyntaxError)
for sentence in doc.sentences[20:30]:
    analyzer_output_obj = emotion_analyzer.predict(sentence.text)
    print("The sentence is: ", sentence.text,"\n","The emotion probabilities are", analyzer_output_obj.probas)