In [32]:
import pandas as pd
import time
from langdetect import detect
from transformers import pipeline
from google.cloud import translate_v2 as translate
from joblib import Parallel, delayed
import os

In [33]:
data = pd.read_csv('../raw_data/data_lyrics_10k.csv')
#data_sample = data[:29]

In [34]:
def detect_language(text):
    if text != 'None':
        return detect(text)
    else:
        return 'None'
    
def translate_text(text, language):
    if language != ('en', 'None'):
        translate_client = translate.Client()
        result = translate_client.translate(text, target_language='en')
        return result["translatedText"].replace('&#39;',"'")
    else:
        return text


In [35]:
def preprocess_language(data):

    data['language'] = Parallel(n_jobs=os.cpu_count())\
        (delayed(detect_language)(lyric)\
        for lyric in data['lyrics'])
        
    data['translated_lyrics'] = Parallel(n_jobs=os.cpu_count())\
        (delayed(translate_text)(lyric, language)\
        for lyric, language in zip(data['lyrics'], data['language']))
    
    return data

In [36]:
"""
# works without parallelization
def get_language_backup(df):
    df["language"] = df["lyrics"].apply(lambda x: detect(x) if x != 'None' else 'None')
    return df


def translate_text_backup(text, language):
    if language != ('en', 'None'):
        translate_client = translate.Client()
        result = translate_client.translate(text, target_language='en')
        return result["translatedText"].replace('&#39;',"'")
    else:
        return text
    
    
def get_translation_backup(df):
    df['translated_lyrics'] = df.apply(lambda row: translate_text(row['lyrics'], row['language']), axis=1)
    return df


def preprocess_lyrics_language_backup(df):
    df = get_language_backup(df)
    df = get_translation_backup(df)
    return df
"""

'\n# works without parallelization\ndef get_language_backup(df):\n    df["language"] = df["lyrics"].apply(lambda x: detect(x) if x != \'None\' else \'None\')\n    return df\n\n\ndef translate_text_backup(text, language):\n    if language != (\'en\', \'None\'):\n        translate_client = translate.Client()\n        result = translate_client.translate(text, target_language=\'en\')\n        return result["translatedText"].replace(\'&#39;\',"\'")\n    else:\n        return text\n    \n    \ndef get_translation_backup(df):\n    df[\'translated_lyrics\'] = df.apply(lambda row: translate_text(row[\'lyrics\'], row[\'language\']), axis=1)\n    return df\n\n\ndef preprocess_lyrics_language_backup(df):\n    df = get_language_backup(df)\n    df = get_translation_backup(df)\n    return df\n'

In [37]:
model_path = "j-hartmann/emotion-english-distilroberta-base"
classifier = pipeline("text-classification", model=model_path, tokenizer=model_path, top_k=None, max_length=512, truncation=True)


def get_emotions(lyrics):
    if lyrics != 'None':
        emotions = classifier(lyrics)
        emotions = sorted(emotions[0], key=lambda x: x["label"])
        new_dict = {}
        for dictionary in emotions:
            new_dict[dictionary['label']] = dictionary['score']
        return new_dict
    else:
        emotions = {
            'anger':0,
            'disgust':0,
            'fear':0,
            'joy':0,
            'neutral':0,
            'sadness':0,
            'surprise':0
        }
        return emotions


def preprocess_emotions(data):
    data["emotions"] = data['translated_lyrics'].apply(lambda x : get_emotions(x))
    data = pd.concat([data, data['emotions'].apply(pd.Series).fillna('None')], axis=1)

    return data.drop(columns=['emotions'])

In [38]:
def full_preprocess(data):
    return preprocess_emotions(preprocess_language(data))

In [40]:
start = time.time()
preprocessed_data = full_preprocess(data)     
end = time.time()
print(f"data full_preprocessing: {(end - start)/120} h")

data full_preprocessing: 33.85306013623873 h


In [41]:
# save dataframe to csv
preprocessed_data.to_csv('../raw_data/data_lyrics_10k_preprocessed.csv', mode='a')

In [42]:
preprocessed_data

Unnamed: 0.1,Unnamed: 0,valence,year,acousticness,artists,danceability,duration_ms,energy,explicit,id,...,lyrics,language,translated_lyrics,anger,disgust,fear,joy,neutral,sadness,surprise
0,0,0.817,2013,0.015800,['Parmalee'],0.551,214933,0.863,0,3Bdqlr7jQLNhITAgcBGQBG,...,Well take a look at what's left in that sunset...,en,Well take a look at what's left in that sunset...,0.038732,0.022354,0.056098,0.144097,0.580219,0.074745,0.083755
1,1,0.548,2003,0.006610,['JAY-Z'],0.494,234627,0.887,1,7sLpSWxQazJzDVG6YGzlVs,...,"If you're havin' girl problems, I feel bad for...",en,"If you're havin' girl problems, I feel bad for...",0.297886,0.332039,0.033108,0.010371,0.056380,0.252725,0.017490
2,2,0.732,2014,0.047700,['Sam Hunt'],0.590,235507,0.940,0,3BuPop8SzLG2Q88TJcFAjp,...,Snapbacks and Levi jeans PBR and burnt CDs Run...,en,Snapbacks and Levi jeans PBR and burnt CDs Run...,0.146808,0.020869,0.231433,0.041871,0.131439,0.199754,0.227825
3,3,0.475,1981,0.000473,['Iron Maiden'],0.340,288947,0.974,0,7EvjTEzuv7TWaIaWY63sWV,...,,,,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,4,0.550,1930,0.994000,"['Markos Vamvakaris', 'Apostolos Xatzixristos']",0.410,197653,0.169,0,38PozVGXXoeO8dTEVzy74Y,...,,,,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9995,0.726,2012,0.002240,['Muse'],0.671,184200,0.766,1,1tjHKKI0r82IB5KL29whHs,...,You won't get much closer 'til you sacrifice i...,en,You won't get much closer 'til you sacrifice i...,0.060204,0.003664,0.850040,0.018728,0.033371,0.010278,0.023715
9996,9996,0.834,1943,0.944000,['M. K. Thyagaraja Bhagavathar'],0.422,181853,0.477,0,0PkzQsgs6DeCAQPvx4iFgo,...,,,,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
9997,9997,0.902,1975,0.682000,['Ismael Rivera'],0.776,214377,0.139,0,6Hb2J7m0fhGiIU4Zx9Pk4C,...,Borinquen! (Borinqueneando borinco Borinquenea...,es,Borinquen! (Borinqueneando borinco Borinquenea...,0.012464,0.001729,0.002078,0.806134,0.075666,0.012552,0.089377
9998,9998,0.896,1995,0.187000,"['Marco Antonio Solís', 'Los Bukis']",0.724,179987,0.666,0,2oziP5rlqR0kKHbGzIOL0b,...,,,,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
