In [1]:
import pandas as pd
import time
from langdetect import detect
from transformers import pipeline
#from google.cloud import translate_v2 as translate
from joblib import Parallel, delayed
import os
from googletrans import Translator

In [2]:
data = pd.read_csv('../raw_data/data_lyrics_10k.csv')
data_sample = data[:29]

In [6]:
def detect_language(text):
    if text != 'None':
        return detect(text)
    else:
        return 'None'
    
def translate_text(text, language):
    if language != ('en', 'None'):
        translate_client = Translator()
        result = translate_client.translate(text, dest='en')
        return result.text
    else:
        return text


In [17]:
def preprocess_language(data):

    data['language'] = Parallel(n_jobs=os.cpu_count())\
        (delayed(detect_language)(lyric)\
        for lyric in data['lyrics'])
        
    data['translated_lyrics'] = data.apply(lambda x: translate_text(x['lyrics'], x['language']), axis=1)
        
    #data['translated_lyrics'] = Parallel(n_jobs=os.cpu_count())\
    #    (delayed(translate_text)(lyric, language)\
    #    for lyric, language in zip(data['lyrics'], data['language']))
    
    return data

In [18]:
preprocess_language(data[:4])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['language'] = Parallel(n_jobs=os.cpu_count())\
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['translated_lyrics'] = data.apply(lambda x: translate_text(x['lyrics'], x['language']), axis=1)


Unnamed: 0.1,Unnamed: 0,valence,year,acousticness,artists,danceability,duration_ms,energy,explicit,id,...,loudness,mode,name,popularity,release_date,speechiness,tempo,lyrics,language,translated_lyrics
0,0,0.817,2013,0.0158,['Parmalee'],0.551,214933,0.863,0,3Bdqlr7jQLNhITAgcBGQBG,...,-3.506,1,Close Your Eyes,50,2013-12-10,0.0322,143.952,Well take a look at what's left in that sunset...,en,Well take a look at what's left in that sunset...
1,1,0.548,2003,0.00661,['JAY-Z'],0.494,234627,0.887,1,7sLpSWxQazJzDVG6YGzlVs,...,-4.297,0,99 Problems,61,2003-11-14,0.398,89.554,"If you're havin' girl problems, I feel bad for...",en,"If you're havin' girl problems, I feel bad for..."
2,2,0.732,2014,0.0477,['Sam Hunt'],0.59,235507,0.94,0,3BuPop8SzLG2Q88TJcFAjp,...,-4.124,1,Raised On It,54,2014-10-27,0.0409,94.02,Snapbacks and Levi jeans PBR and burnt CDs Run...,en,Snapbacks and Levi jeans PBR and burnt CDs Run...
3,3,0.475,1981,0.000473,['Iron Maiden'],0.34,288947,0.974,0,7EvjTEzuv7TWaIaWY63sWV,...,-5.114,1,Drifter - 2015 Remaster,29,1981-02-02,0.106,101.276,,,


In [37]:
model_path = "j-hartmann/emotion-english-distilroberta-base"
classifier = pipeline("text-classification", model=model_path, tokenizer=model_path, top_k=None, max_length=512, truncation=True)


def get_emotions(lyrics):
    if lyrics != 'None':
        emotions = classifier(lyrics)
        emotions = sorted(emotions[0], key=lambda x: x["label"])
        new_dict = {}
        for dictionary in emotions:
            new_dict[dictionary['label']] = dictionary['score']
        return new_dict
    else:
        emotions = {
            'anger':0,
            'disgust':0,
            'fear':0,
            'joy':0,
            'neutral':0,
            'sadness':0,
            'surprise':0
        }
        return emotions


def preprocess_emotions(data):
    data["emotions"] = data['translated_lyrics'].apply(lambda x : get_emotions(x))
    data = pd.concat([data, data['emotions'].apply(pd.Series).fillna('None')], axis=1)

    return data.drop(columns=['emotions'])

In [38]:
def full_preprocess(data):
    return preprocess_emotions(preprocess_language(data))

In [40]:
start = time.time()
preprocessed_data = full_preprocess(data)     
end = time.time()
print(f"data full_preprocessing: {end - start} s")

data full_preprocessing: 33.85306013623873 h


In [41]:
# save dataframe to csv
preprocessed_data.to_csv('../raw_data/data_lyrics_10k_preprocessed.csv', mode='a')