# 2 - Extração de Características da Música e Pré-processamento (Português e Inglês)

In [1]:
#Dependências
import string
import os
import textlytics
import re
import nltk
import pandas as pd
from nltk.stem.snowball import SnowballStemmer
from collections import Counter
from collections import OrderedDict #Ordena alfabeticamente
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
import warnings
warnings.filterwarnings("ignore")
nltk.download('stopwords')
%matplotlib inline

[nltk_data] Downloading package punkt to /home/jorge/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/jorge/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /home/jorge/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
textlytics.config.setLanguage('english')

In [3]:
# Leitura do dataset
def read_data(path, sep):
    data = pd.read_csv(path, sep=sep)
    data = data.loc[:, ~data.columns.str.contains('^Unnamed')]
    return data

In [4]:
data = read_data('music_mar_2021/data/raw/pop_mar21.csv', sep=",")

In [5]:
data.head()

Unnamed: 0,lang,genero,exib,titulo,artista,letras
0,en,Tecnopop,936,Sick Sad World,Blood On The Dance Floor,The poor.The broken.The abused.And it's spoken...
1,en,Tecnopop,24437,Sexting,Blood On The Dance Floor,Less than three.Is just a tease.Send those noo...
2,en,Tecnopop,5726,Call Me Master,Blood On The Dance Floor,Tonight our bodies getting intertwined.It's fu...
3,en,Tecnopop,2151,Success Is The Best Revenge,Blood On The Dance Floor,I'm a killer.A chiller.A bonafied thriller.Ope...
4,en,Tecnopop,24252,Bewitched,Blood On The Dance Floor,You're attractive little witch you're beautifu...


In [6]:
#Função para recuperar as classes
def classesRetrieval(dataframe,labels):
    df = dataframe
    df['label'] = labels
    return df

In [7]:
def encoderLabel(data,labelfield):
    def encoder(label,labels_dict):
        return labels_dict[label]
    
    qtdLabels = len(set(data[labelfield]))
    nameLabels = set(data[labelfield])
    labels_dict = {}
    i = 0
    for l in nameLabels:
        labels_dict[l] = i
        i = i + 1
    print("As classes foram codificadas da seguinte forma:")
    print(labels_dict)
    print()
    return data[labelfield].apply(encoder, labels_dict=labels_dict)

In [8]:
#Preserva as classes em uma variavel
genero = data['genero']

In [9]:
print("Total de instâncias: {}".format(len(data)))

for gen in data.genero.unique():
    print("Gênero {}: {}".format(gen,len(data[data.genero == gen ])))

Total de instâncias: 8836
Gênero Tecnopop: 2169
Gênero Power-Pop: 1162
Gênero Pop Rock: 5505


In [10]:
data = data.drop(columns=['lang', 'genero','exib','titulo','artista'])
data.head(5)

Unnamed: 0,letras
0,The poor.The broken.The abused.And it's spoken...
1,Less than three.Is just a tease.Send those noo...
2,Tonight our bodies getting intertwined.It's fu...
3,I'm a killer.A chiller.A bonafied thriller.Ope...
4,You're attractive little witch you're beautifu...


### Extração de Características Estatísticas Textuais (CET) e Part of Speech Tagging (POS Tag)

In [11]:
#Torna o texto em letras minúsculas
def lowercase(text):
    text = str(text).lower()
    return text

In [12]:
#Remove algumas impurezas da coleção textual
def fix_words(text):
    text = str(text)
    text = re.sub(r'\bh[ah]*\b', "", text) # remove "hahahah" 
    text = re.sub(r'\b[oh]*\b', "", text) # remove "oohhhh" 
    #text = re.sub(r'\bq[ue]*\b', "que", text) # normaliza "queeeeeeeeee"
    return text

In [13]:
#Remove ruídos simples que podem interferir na contagem e extração das características estatísticas
def simpleNoiseRemoval(text):
    text = re.sub(r"[0-9]+", "", text)
    text = text.replace("#","")
    text = text.replace("(","")
    text = text.replace(")","")
    text = text.replace("...","")
    text = text.replace("!","")
    text = text.replace("&","")
    text = text.replace("&amp","")
    text = text.replace("'","")
    text = text.replace('"','')
    text = text.replace('‘','')
    
    text = text.replace('é','')
    text = text.replace('','')
    text = text.replace('aa','')
    text = text.replace('–','')
    text = text.replace('’','')
    text = text.replace('em','')
    return text

In [14]:
data['letras'] =  data['letras'].apply(lowercase)
data['letras'] =  data['letras'].apply(fix_words)
data['letras'] =  data['letras'].apply(simpleNoiseRemoval)

In [15]:
data.head(5)

Unnamed: 0,letras
0,the poor.the broken.the abused.and its spoken....
1,less than three.is just a tease.send those noo...
2,tonight our bodies getting intertwined.its fuc...
3,im a killer.a chiller.a bonafied thriller.open...
4,youre attractive little witch youre beautiful....


In [16]:
textlytics.features2Dataframe(data,'letras')

In [17]:
data.head(5)

Unnamed: 0,letras,Characters,Words,AvgWordLen,UniqueWords,Sentences,AvgWordsSentence,Syllables,AvgSyllableWords,RareWordsRatio,LexicalDiversity,Readability,ReadabilitySchoolarity,IncidenceVerbs,IncidenceAdj,IncidenceNouns,IncidenceCon,IncidencePron,ContentIncidence,ContentDiversity
0,the poor.the broken.the abused.and its spoken....,1073,213,4.126761,119,32,6.65625,249,1.169014,0.42723,0.558685,101.180315,5th grade,0.041,0.033,0.055,0.038,0.007,0.129,0.605634
1,less than three.is just a tease.send those noo...,1611,354,3.627119,173,73,4.849315,386,1.090395,0.316384,0.488701,109.665488,5th grade,0.081,0.03,0.076,0.041,0.047,0.187,0.528249
2,tonight our bodies getting intertwined.its fuc...,1758,372,3.887097,131,60,6.2,439,1.180108,0.13172,0.352151,100.704903,5th grade,0.079,0.043,0.066,0.049,0.06,0.188,0.505376
3,im a killer.a chiller.a bonafied thriller.open...,1190,258,3.833333,150,72,3.583333,285,1.104651,0.437984,0.581395,109.744428,5th grade,0.051,0.017,0.048,0.033,0.032,0.116,0.449612
4,youre attractive little witch youre beautiful....,1502,332,3.575301,81,39,8.512821,362,1.090361,0.078313,0.243976,105.949909,5th grade,0.072,0.011,0.099,0.061,0.029,0.182,0.548193


In [18]:
data = data.drop(columns=['Readability','ReadabilitySchoolarity'])

In [19]:
#Lista com o nome dos atributos (colunas) da representação CET
cet_att = list(data)[1:11]

In [20]:
pos_att = list(data)[11:18]

### Extração da representação Bag of Words

In [21]:
from nltk.corpus import stopwords
stopwords = stopwords.words('english')
    
# Remove as stopwords de um texto
def remove_stop_words(text, stopwords):
    for sw in stopwords:
        text = re.sub(r'\b%s\b' % sw, "", text)
        
    return text

In [22]:
def remove_punctuation(text):  
    # re.sub(replace_expression, replace_string, target)
    new_text = re.sub(r"\.|,|;|:|-|!|\?|´|`|^|'", " ", text)
    new_text = new_text.strip()
    return new_text

In [23]:
def remove_others(text):
    noise = ['"','(',')','[',']','{','}',"'",'%','*','“','“’','#','?','@',"/",'\\',"_",'&','amp','.','+','$','※','=','=','^','”']
    
    for n in noise:
        new_text = text.replace(n,"")
    
    new_text = re.sub(r"u+h","", new_text)
    new_text = re.sub(r"o+h","", new_text)
    new_text = re.sub(r"a+h","", new_text)
    new_text = re.sub(r"ah+","", new_text)
    return new_text

In [24]:
def remove_numbers(text):
    text = str(text)
    new_text = re.sub(r"[0-9]+", "", text)
    return new_text

In [25]:
#Cria um dicionário dos radicais para as palavras mais frequentes
def generate_retrieval_dict(dataset,fieldName):
    collection = " ".join(dataset[fieldName])
    stemmer = SnowballStemmer("english")
    #stemmer = RSLPStemmer()
    word_list = collection.split()
    root = []
    result = []
    retrieval_dict = {}
    for w in word_list:
        word_old = w
        word_new = stemmer.stem(w)
        result.append(word_new)
        if (len(word_new)<len(word_old)):
            root.append(word_new)
    root = [w for w in set(root)]
    if(len(root)>0):
        for r in root:
            retrieval = []
            for w in word_list:
                if(r in w):
                    retrieval.append(w)
            counts = dict(Counter(retrieval).most_common(1))
            labels, values = zip(*counts.items())
            meaningful = str(labels[0])
            retrieval_dict[r] = meaningful
    return retrieval_dict

In [26]:
# Stemming function
def stemming(text):
    # Instance the Snowball stemmer
    stemmer = SnowballStemmer("english")
    #stemmer = RSLPStemmer()
    word_list = text.split()
    result = []
    for w in word_list:
        result.append(stemmer.stem(w))
    result = " ".join(result)
    return result

In [27]:
# Lengthening pattern to match on text

lengthening_pattern = "a{3,}|b{3,}|c{3,}|d{3,}|e{3,}|f{3,}|g{3,}|h{3,}|i{3,}|j{3,}|" \
"k{3,}|l{3,}|m{3,}|n{3,}|o{3,}|p{3,}|q{3,}|r{3,}|s{3,}|t{3,}|" \
"u{3,}|v{3,}|x{3,}|w{3,}|y{3,}|z{3,}"

# Reduce lengthening in a text
def lengthening_reduction(text, lenPattern):
    lengthenings = re.findall(lenPattern, text)
    if lengthenings:
        lengthenings = lengthenings[0]
        text = re.sub(lengthenings, lengthenings[0:1], text)
    return text

In [28]:
def lengthening_reduction_multiple(text, lenPattern):
    lengthenings = re.findall(lenPattern, text)
    for lengthening in lengthenings:
        text = re.sub(lengthening, lengthening[0:1], text)
    return text

In [29]:
from nltk.corpus import wordnet

def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [30]:
from nltk.stem import WordNetLemmatizer
# The argument "special_terms" refers to terms that should not be parsed.

def lemmatizing(text, special_terms=[]):
    
    tokens = textlytics.word_tokenize(text)
    tags = nltk.pos_tag(tokens)
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = []
    
    for token, tag in tags:
        if token in special_terms:
            lemmatized_tokens.append(token)
        else:
            lemma = lemmatizer.lemmatize(token, pos=get_wordnet_pos(tag))
            lemmatized_tokens.append(lemma)
        
    lemmatized_tokens_string = " ".join([token for token in lemmatized_tokens])
    
    return lemmatized_tokens_string

In [31]:
#Troca as palavras radicalizadas pela melhor representação
def stemming_retrieval(text,dic):
    p = 0
    result = text.split()
    while p < len(result):
        if(result[p] in dic):
            key = result[p]
            result[p] = dic[key]
        p+=1
    result = " ".join(result)
    return result

In [32]:
def only_ascii_letters(text):    
    letters = set(string.ascii_letters)
    tokens = textlytics.word_tokenize(text)
    new_tokens = []

    for token in tokens:
        word = ''.join((filter(lambda x: x in letters, token)))
        if(len(word)>=3):
            new_tokens.append(word)

    new_text = ' '.join(new_tokens)
    
    return new_text

In [33]:
def preprocessing(dataframe, fieldName, config):   
    if config["lowercase"] == True:
        dataframe[fieldName] = dataframe[fieldName].apply(lowercase)
        
    if config["remove_numbers"] == True:
        dataframe[fieldName] = dataframe[fieldName].apply(remove_numbers)
    
    if config["remove_stopwords"] == True:
        dataframe[fieldName] = dataframe[fieldName].apply(remove_stop_words, stopwords=stopwords)
        
    if config["generate_retrieval_dict"] == True:
        retrieval_dict = generate_retrieval_dict(dataframe,fieldName)
        
    if config["stemming"] == True:
        dataframe[fieldName] = dataframe[fieldName].apply(stemming)

    if config["stemming_retrieval"] == True:
        dataframe[fieldName] = dataframe[fieldName].apply(stemming_retrieval, dic=retrieval_dict)
        
    if config["lemmatizing"] == True:
        dataframe[fieldName] = dataframe[fieldName].apply(lemmatizing)
        
    if config["reduce_lengthening"] == True:
        dataframe[fieldName] = dataframe[fieldName].apply(lengthening_reduction_multiple, lenPattern=lengthening_pattern)
        
    if config["remove_others"] == True:
        dataframe[fieldName] = dataframe[fieldName].apply(remove_others)
        
    if config["remove_punctuation"] == True:
        dataframe[fieldName] = dataframe[fieldName].apply(remove_punctuation)
        
    if config["only_ascii_letters"] == True:
        dataframe[fieldName] = dataframe[fieldName].apply(only_ascii_letters)
        
    return dataframe

In [34]:
config = {  "lowercase": True, \
            "remove_punctuation": True, \
          "remove_others": True, \
            "remove_numbers": True, \
            "remove_stopwords": True,
         "generate_retrieval_dict": False,
         "stemming":True,
          "lemmatizing":True,
          "reduce_lengthening":False,
         "stemming_retrieval":False,
         "only_ascii_letters": True}

In [35]:
print(data["letras"].head(5))

0    the poor.the broken.the abused.and its spoken....
1    less than three.is just a tease.send those noo...
2    tonight our bodies getting intertwined.its fuc...
3    im a killer.a chiller.a bonafied thriller.open...
4    youre attractive little witch youre beautiful....
Name: letras, dtype: object


In [36]:
preprocessing(data, 'letras', config)
print(data["letras"].head(5))

0    poor broken abuse spoken american dream turn n...
1    less three tease send noodz mak drool hit make...
2    tonight bodi get intertwine fuck filthi fee bl...
3    killer chiller bonafi thriller open like bottl...
4    your attract littl witch your beautiful wicked...
Name: letras, dtype: object


In [37]:
def transformData(data, fieldName, my_tokenizer, weight):
    
    if weight == "TP":
        vectorizer = CountVectorizer(tokenizer=my_tokenizer, binary=True)
        X = vectorizer.fit_transform(data[fieldName])
    
    elif weight == "TF":
        vectorizer = CountVectorizer(tokenizer=my_tokenizer)
        X = vectorizer.fit_transform(data[fieldName])
        
    elif weight == "TFIDF":
        vectorizer = TfidfVectorizer(tokenizer=my_tokenizer)
        X = vectorizer.fit_transform(data[fieldName])
        
    feature_names = vectorizer.get_feature_names()
    text_collection = OrderedDict([(index, text) for index, text in enumerate(data[fieldName])])
    corpus_index = [n for n in text_collection]
    df = pd.DataFrame(X.todense(), index=corpus_index, columns=feature_names)
    
    return df

In [38]:
from textlytics import word_tokenize

In [39]:
BowTFIDF = transformData(data,'letras',word_tokenize,'TFIDF')

In [40]:
BowTFIDF.head(5)

Unnamed: 0,aba,aback,abacus,abad,abandon,abandoned,abandonment,abate,abattoir,abba,...,zooropavosprung,zooropayou,zoovier,zorn,zsa,zucchero,zucchini,zulu,zuma,zwei
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Feature Selection (for BoW only)

In [41]:
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression



X= BowTFIDF

labels ={}
i = 0

for label in set(genero):
    labels[label] = i
    i+=1

y = [labels[l] for l in genero]

selector = SelectPercentile(percentile=20).fit(X,y)

#selector = SelectFromModel(estimator=LogisticRegression()).fit(X, y)

In [42]:
col_index = selector.get_support()

In [43]:
cols = BowTFIDF.columns[col_index]

In [44]:
BowTFIDF = BowTFIDF[cols]

In [45]:
BowTFIDF.head()

Unnamed: 0,aba,abc,aberdeen,abide,abil,ability,aboard,abolish,abridge,absenc,...,zanzibar,zebedee,zee,zest,zig,zillion,zoom,zoomin,zorn,zulu
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Devolvendo as classes para cada representação

In [46]:
cet = classesRetrieval(data[cet_att],genero)

pos = classesRetrieval(data[pos_att],genero)

bow = classesRetrieval(BowTFIDF,genero)

In [47]:
cet.head()

Unnamed: 0,Characters,Words,AvgWordLen,UniqueWords,Sentences,AvgWordsSentence,Syllables,AvgSyllableWords,RareWordsRatio,LexicalDiversity,label
0,1073,213,4.126761,119,32,6.65625,249,1.169014,0.42723,0.558685,Tecnopop
1,1611,354,3.627119,173,73,4.849315,386,1.090395,0.316384,0.488701,Tecnopop
2,1758,372,3.887097,131,60,6.2,439,1.180108,0.13172,0.352151,Tecnopop
3,1190,258,3.833333,150,72,3.583333,285,1.104651,0.437984,0.581395,Tecnopop
4,1502,332,3.575301,81,39,8.512821,362,1.090361,0.078313,0.243976,Tecnopop


In [48]:
pos.head()

Unnamed: 0,IncidenceVerbs,IncidenceAdj,IncidenceNouns,IncidenceCon,IncidencePron,ContentIncidence,ContentDiversity,label
0,0.041,0.033,0.055,0.038,0.007,0.129,0.605634,Tecnopop
1,0.081,0.03,0.076,0.041,0.047,0.187,0.528249,Tecnopop
2,0.079,0.043,0.066,0.049,0.06,0.188,0.505376,Tecnopop
3,0.051,0.017,0.048,0.033,0.032,0.116,0.449612,Tecnopop
4,0.072,0.011,0.099,0.061,0.029,0.182,0.548193,Tecnopop


In [49]:
bow.head()

Unnamed: 0,aba,abc,aberdeen,abide,abil,ability,aboard,abolish,abridge,absenc,...,zebedee,zee,zest,zig,zillion,zoom,zoomin,zorn,zulu,label
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Tecnopop
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Tecnopop
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Tecnopop
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Tecnopop
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Tecnopop


### Exportando todas a representações geradas (CSV)

In [50]:
def export_to_csv(dataframe,filename):
    dataframe.to_csv(filename, index=False)
    print("Exportado.")

In [53]:
export_to_csv(pos,'music_mar_2021/data/traditional_rep/pop_mar21_pos.csv')

Exportado.
