In [1]:
import pandas as pd
import time
from langdetect import detect
from transformers import pipeline

In [2]:
data_check = pd.read_csv('../raw_data/data.csv')
data_check.shape

(170653, 19)

In [3]:
data = pd.read_csv('../raw_data/data_lyrics.csv')
data.drop(columns=['Unnamed: 0', 'Unnamed: 0.1'], inplace=True)
data.drop([49996, 49997], inplace=True)
data.shape

  exec(code_obj, self.user_global_ns, self.user_ns)


(170650, 20)

In [4]:
# remove podcasts (long duration) and intros (short duration) from data
data['duration_m'] = (data['duration_ms'] /1000)/60
data = data[(data['duration_m'] > 1) & (data['duration_m'] < 10)].reset_index()

# creates column ('index') containing both artist and song name
data['artists'] = data['artists'].apply(lambda x: x.replace("['", '').replace("'", '').replace("]", ''))
data['index'] = data['artists'] + ' - "' + data['name'] + '"'

# sets 'index' as index
#data = data.set_index(data['index']).sort_index(axis=1)

data = data.drop(columns=['artists',
                        'name',
                        'release_date',
                        'duration_ms'])

data = data.drop_duplicates()
data.shape

(166764, 18)

In [5]:
string = 'You might also like'

def clean_lyric(lyric, string=string):
    if string in lyric:
        lyric = lyric.replace(string,'')
    return lyric

In [6]:
data['year'] = data['year'].astype('int64')
data['explicit'] = data['explicit'].astype('int64')
data['key'] = data['key'].astype('int64')
data['popularity'] = data['popularity'].astype('int64')
data['mode'] = data['mode'].astype('int64')
data['lyrics'] = data['lyrics'].astype('string')
data['index'] = data['index'].astype('string')
data['id'] = data['id'].astype('string')
data.fillna('None', inplace=True)
data['lyrics'] = data['lyrics'].apply(lambda x: clean_lyric(x) if x != 'None' else 'None')
data['lyrics'] = data['lyrics'].apply(lambda x: x.replace('', 'None') if x == '' else x)
data.fillna('None', inplace=True)
data.shape

(166764, 18)

In [7]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-mul-en")
model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-mul-en")



In [8]:
def detect_language(text):
    if text == 'None':
        return 'None'
    else:
        try:
            return detect(text)
        except:
            return 'None'

    
def translate_text(text, language):
    if text == 'None':
        return text
    
    elif language == 'en':
        return text
    
    else:
        batch = tokenizer([text[:1300]], return_tensors="pt", max_length=512, truncation=True)
        generated_ids = model.generate(**batch)
        result = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
        #time.sleep(1)
        return result
    
def preprocess_language(data):
    data['language'] = data.apply(lambda x: detect_language(x['lyrics']), axis=1)
    data['translated_lyrics'] = data.apply(lambda x: translate_text(x['lyrics'], x['language']), axis=1)

    return data


In [9]:
model_path = "j-hartmann/emotion-english-distilroberta-base"
classifier = pipeline("text-classification", model=model_path, tokenizer=model_path, top_k=None, max_length=512, truncation=True)


def get_emotions(lyrics):
    if lyrics == 'None':
        emotions = {
            'anger':0,
            'disgust':0,
            'fear':0,
            'joy':0,
            'neutral':0,
            'sadness':0,
            'surprise':0
        }
        return emotions
    
    else:
        emotions = classifier(lyrics)
        emotions = sorted(emotions[0], key=lambda x: x["label"])
        new_dict = {}
        for dictionary in emotions:
            new_dict[dictionary['label']] = dictionary['score']
        return new_dict


def preprocess_emotions(data):
    data["emotions"] = data['translated_lyrics'].apply(lambda x : get_emotions(x))
    data = pd.concat([data, data['emotions'].apply(pd.Series).fillna('None')], axis=1)

    return data.drop(columns=['emotions'])

In [10]:
data.loc[5421]['lyrics']

'1,2,3,4,5,6,7,8'

In [11]:
step = 100
dataset_start = 161200
dataset_end = dataset_start + step
dataset_max = data.shape[0]
iterations = int(dataset_max/step)

In [12]:
start = time.time()

for iteration in range(iterations):
    print('------------------------------------------')
    print(f'Iteration N°{iteration+1}...')
    start_loop = time.time()
    
    if dataset_start <= (dataset_max - step):
        full_data = preprocess_language(data[dataset_start:dataset_end])
        print(f"Language features loaded.")
        
        full_data = preprocess_emotions(full_data)
        print(f"Emotion features loaded.")
        
        if dataset_start == 0:
            full_data.to_csv('../raw_data/data_full.csv', mode='a')
            
        else:
            full_data.to_csv('../raw_data/data_full.csv', mode='a', header=False)
        
        end_loop = time.time()
        
        print(f'From index {dataset_start} to index {dataset_end-1} saved ✅')
        print(f'Elapsed time: {(end_loop - start_loop)/60} m')
        
        dataset_start += step
        dataset_end = dataset_start + step
        print('------------------------------------------')
    else:
        print('done')
        
end = time.time()

print(f"data full features elapsed time: {(end - start)/3600} h")

------------------------------------------
Iteration N°1...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['language'] = data.apply(lambda x: detect_language(x['lyrics']), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['translated_lyrics'] = data.apply(lambda x: translate_text(x['lyrics'], x['language']), axis=1)


Language features loaded.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["emotions"] = data['translated_lyrics'].apply(lambda x : get_emotions(x))


Emotion features loaded.
From index 161200 to index 161299 saved ✅
Elapsed time: 3.2335795323053995 m
------------------------------------------
------------------------------------------
Iteration N°2...
Language features loaded.
Emotion features loaded.
From index 161300 to index 161399 saved ✅
Elapsed time: 2.574806634585063 m
------------------------------------------
------------------------------------------
Iteration N°3...
Language features loaded.
Emotion features loaded.
From index 161400 to index 161499 saved ✅
Elapsed time: 3.5801962018013 m
------------------------------------------
------------------------------------------
Iteration N°4...
Language features loaded.
Emotion features loaded.
From index 161500 to index 161599 saved ✅
Elapsed time: 1.995859718322754 m
------------------------------------------
------------------------------------------
Iteration N°5...
Language features loaded.
Emotion features loaded.
From index 161600 to index 161699 saved ✅
Elapsed time: 

In [13]:
full_data = preprocess_language(data[166700:])

full_data = preprocess_emotions(full_data)


if dataset_start == 0:
    full_data.to_csv('../raw_data/data_full.csv', mode='a')
    
else:
    full_data.to_csv('../raw_data/data_full.csv', mode='a', header=False)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['language'] = data.apply(lambda x: detect_language(x['lyrics']), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['translated_lyrics'] = data.apply(lambda x: translate_text(x['lyrics'], x['language']), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["emotions"] = data['t