In [2]:
import html
import os
import time

import pandas as pd
from google.cloud import translate_v2 as translate
from joblib import Parallel, delayed
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException
from transformers import pipeline

In [33]:
# Load the raw lyrics dataset.
data = pd.read_csv('../raw_data/data_lyrics_10k.csv')
# The functions below use the string 'None' as the "no lyrics" sentinel, but
# recent pandas versions parse a literal "None" cell as NaN by default.
# Restore the sentinel so missing lyrics take the intended code paths instead
# of reaching the language detector / classifier as a float.
data['lyrics'] = data['lyrics'].fillna('None')
# Optional: small slice for a quick trial run.
#data_sample = data[:29]

In [34]:
def detect_language(text):
    """Return the language code that langdetect assigns to ``text``.

    Args:
        text: Lyrics of one song. The string 'None', a non-string value
            (e.g. NaN or None coming from pandas) and blank strings are all
            treated as "no lyrics".

    Returns:
        The detected language code, or the sentinel string 'None' when there
        are no lyrics or when langdetect cannot classify the text.
    """
    # Guard first: passing NaN/None or an empty string to the detector would
    # raise and abort the whole parallel run.
    if not isinstance(text, str) or text == 'None' or not text.strip():
        return 'None'
    try:
        return detect(text)
    except LangDetectException:
        # Raised for text without usable features; fall back to the sentinel
        # so a single odd row does not abort the run.
        return 'None'
    
def translate_text(text, language):
    """Translate ``text`` to English unless it is already English or missing.

    Args:
        text: Lyrics to translate.
        language: Language code produced by ``detect_language``; 'en' and the
            sentinel 'None' mean that no translation is needed.

    Returns:
        ``text`` unchanged when ``language`` is 'en' or 'None'; otherwise the
        English translation returned by the Google Translate client, with
        HTML entities decoded.
    """
    # Membership test, not `!=` against a tuple: a string never equals a tuple,
    # so the old check was always True and every row was sent to the API.
    if language in ('en', 'None'):
        return text

    # A new client is constructed on every call.
    translate_client = translate.Client()
    result = translate_client.translate(text, target_language='en')
    # The translated text can contain HTML entities (e.g. &#39;); decode all of
    # them rather than only the apostrophe.
    return html.unescape(result["translatedText"])


In [35]:
def preprocess_language(data):
    """Add 'language' and 'translated_lyrics' columns to ``data``.

    Both columns are computed row by row with joblib, using one job per CPU
    reported by ``os.cpu_count()``. The frame is modified in place and also
    returned.

    Args:
        data: DataFrame with a 'lyrics' column.

    Returns:
        The same DataFrame with the two new columns.
    """
    workers = os.cpu_count()

    # Step 1: detect the language of every lyric.
    detection_jobs = (delayed(detect_language)(lyric) for lyric in data['lyrics'])
    data['language'] = Parallel(n_jobs=workers)(detection_jobs)

    # Step 2: translate each lyric, passing along the language found in step 1.
    translation_jobs = (
        delayed(translate_text)(lyric, language)
        for lyric, language in zip(data['lyrics'], data['language'])
    )
    data['translated_lyrics'] = Parallel(n_jobs=workers)(translation_jobs)

    return data

In [37]:
# Checkpoint used for emotion scoring (model and tokenizer share the same id).
model_path = "j-hartmann/emotion-english-distilroberta-base"

# Text-classification pipeline used by get_emotions. Inputs are truncated at
# max_length=512; top_k=None is passed so that scores are not limited to the
# single best label.
classifier = pipeline(
    "text-classification",
    model=model_path,
    tokenizer=model_path,
    top_k=None,
    max_length=512,
    truncation=True,
)


def get_emotions(lyrics):
    """Map one lyrics string to a ``{label: score}`` dictionary.

    Args:
        lyrics: Text to score, or the sentinel string 'None' for missing
            lyrics.

    Returns:
        For the 'None' sentinel, a dict with the seven emotion labels all set
        to 0. Otherwise a dict built from the first element of the classifier
        output, with labels inserted in alphabetical order.
    """
    if lyrics == 'None':
        # No lyrics: return zero for every emotion label.
        return dict.fromkeys(
            ('anger', 'disgust', 'fear', 'joy', 'neutral', 'sadness', 'surprise'),
            0,
        )

    predictions = classifier(lyrics)
    # predictions[0] is iterated as a list of {'label': ..., 'score': ...} dicts.
    ordered = sorted(predictions[0], key=lambda entry: entry["label"])
    return {entry['label']: entry['score'] for entry in ordered}


def preprocess_emotions(data):
    """Append one score column per emotion label to ``data``.

    The scores are computed from the 'translated_lyrics' column via
    ``get_emotions``. A temporary 'emotions' column is added to the input
    frame to hold the per-row dictionaries and is dropped from the returned
    frame.

    Args:
        data: DataFrame with a 'translated_lyrics' column.

    Returns:
        A new DataFrame containing the original columns plus one column per
        emotion label; missing scores are filled with the string 'None'.
    """
    data["emotions"] = data['translated_lyrics'].apply(get_emotions)

    # Expand each {label: score} dict into its own set of columns.
    score_columns = data['emotions'].apply(pd.Series).fillna('None')

    combined = pd.concat([data, score_columns], axis=1)
    return combined.drop(columns=['emotions'])

In [38]:
def full_preprocess(data):
    """Run the language step followed by the emotion step on ``data``.

    Args:
        data: DataFrame passed to ``preprocess_language``.

    Returns:
        The DataFrame returned by ``preprocess_emotions``.
    """
    with_language = preprocess_language(data)
    with_emotions = preprocess_emotions(with_language)
    return with_emotions

In [40]:
# Run the full pipeline on the loaded data and report the wall-clock duration
# in seconds.
start = time.time()
preprocessed_data = full_preprocess(data)
end = time.time()

elapsed = end - start
print(f"data full_preprocessing: {elapsed} s")

data full_preprocessing: 33.85306013623873 h


In [41]:
# save dataframe to csv
# Append mode is kept, but the header row is only written when the file is new
# (or empty). Otherwise every re-run of this cell would insert another header
# line in the middle of the CSV.
output_path = '../raw_data/data_lyrics_10k_preprocessed.csv'
write_header = not os.path.exists(output_path) or os.path.getsize(output_path) == 0
preprocessed_data.to_csv(output_path, mode='a', header=write_header)