In [2]:
# General imports
import os
import pandas as pd
import numpy as np
import spacy
import matplotlib
from wordcloud import WordCloud
from matplotlib import pyplot as plt
from ipywidgets import IntProgress
from IPython.display import display
import json
import re

In [5]:
##load dataset
with open('only_german_lyrics.json', 'r') as f:
    dataset = json.load(f)

## preprocessing the lyrics
1) removing stopwords
2) removing punctuation
3) saving the processed lyrics inside the dictionary under ['processed_lyrics']

In [40]:
nlp = spacy.load("de_core_news_sm", exclude="ner")
stopwords_de = spacy.lang.de.stop_words.STOP_WORDS
f = IntProgress(min=0, max=5992) # instantiate the bar
display(f) # display the bar
word_list = []
i = 0
for key in dataset.keys():
    for idx in dataset[key]:
        song_filtered = []
        for line in dataset[key][idx]['lyrics'].split("\n"):
            line_list = ""
            for word in nlp(str(line)):
                if not word.lemma_ in stopwords_de and word.has_vector:
                    if not word.is_punct:
                    # if word.text != '\n' and word.text != '\n\n' and word.text != '\n\n\n' and word.text != ',' and word.text !=:
                        line_list += word.text + " "
            if line == "":
                continue
            song_filtered.append(line_list)
        dataset[key][idx].update({'processed_lyrics' : song_filtered})
    f.value += 1

IntProgress(value=0, max=5992)

In [42]:
# Save the new dictionary as .json
with open('preprocessed_only_german_lyrics' + '.json', 'w', encoding='utf8') as f:
    json.dump(dataset, f)

## Training Word2Vec
### 2 approaches:
- analyse only nouns
- analyse whatever the previous step gave us

In [64]:
  ## preprocessing ##
## Each sentence be placed in its own index and inside we will have the words of the sentence in a list 
processed_lyrics_lowcase = []
sentences_list_lowcase = []
for key in dataset.keys():
    for idx in dataset[key]:
        for sentence in dataset[key][idx]['processed_lyrics']:
            words_list = []
            words_string = ""
            for word in sentence.split(" "):
                if word == '':
                    continue
                words_list.append(word.lower())
                words_string += word + " "
            sentences_list_lowcase.append(words_list)
            processed_lyrics_lowcase.append(sentence)


In [3]:
# Loading the new dictionary as .json
with open('preprocessed_only_german_lyrics.json', 'r', encoding='utf8') as f:
    new_dataset = json.load(f)

In [4]:
new_dataset['Beginner']['0']

{'title': 'Füchse',
 'album': 'Bambule',
 'album_cover': 'https://images.genius.com/402fea79dcbeed4370b8e9e67362752d.800x793x1.jpg',
 'genius_album_id': 11330,
 'release_date': '1998-11-10',
 'featured_artists': ['Samy Deluxe'],
 'featured_artists_pics': ['https://s3.amazonaws.com/rapgenius/Samy-Deluxe.jpg'],
 'producer_artists': ['Eizi Eiz', 'Platin Martin'],
 'writer_artists': ['Denyo', 'Samy Deluxe', 'Eizi Eiz'],
 'primary_artist_picture': 'https://s3.amazonaws.com/rapgenius/1355220276_photo.jpg',
 'lyrics_path': '/Beginner-fuchse-lyrics',
 'genius_track_id': 52626,
 'lyrics': '\n\nWickeda-MC., mit Flows pur Natur wie Sensi.\nUnd Freestyles näher am Geschehen als jedes Hip-Hop-Fanzine.\nFuchs\' mich in die Materie.,  da es Möglichkeiten unbegrenzt gibt.\nLeider folgen viele falschen Vorbildern und lernen\'s nie.\nDas Publikum zu rocken, da es auf \'ner anderen Frequenz liegt.\nWas uns nicht betrifft, weil man unsere Bühnenpräsenz liebt.\nUnd Rap-Exzellenz sieht, die sich der Vorstel

In [50]:
### Training Word2Vec ###
  ## preprocessing ##

## Each sentence be placed in its own index and inside we will have the words of the sentence in a list 
processed_lyrics = []
sentences_list = []
for key in dataset.keys():
    for idx in dataset[key]:
        for sentence in dataset[key][idx]['processed_lyrics']:
            processed_lyrics.append(sentence)
            words_list = []
            for word in sentence.split(" "):
                if word == '':
                    continue
                words_list.append(word)
            sentences_list.append(words_list)



In [262]:
processed_lyrics[0]
sentences_list[3]

['folgen', 'viele', 'falschen', 'Vorbildern', "lernen's"]

In [107]:
### Training Word2Vec ###
  ## preprocessing ##

## Each sentence be placed in its own index and inside we will have the words of the sentence in a list 
processed_lyrics_lowcase = []
sentences_list_lowcase = []
for key in dataset.keys():
    for idx in dataset[key]:
        for sentence in dataset[key][idx]['processed_lyrics']:
            words_list = []
            words_string = ""
            for word in sentence.split(" "):
                if word == '':
                    continue
                if word[0].isupper():
                    words_list.append(word.lower())
                    words_string += word.lower() + " "
            sentences_list_lowcase.append(words_list)
            processed_lyrics_lowcase.append(words_string)


In [435]:
## Some manipulation of data to have larger 'sentences'
i = 0
temp_string = ""
longer_strings = []
longer_sentences = []
temp_sent = []
for sentence in sentences_list:
    for word in sentence:
        if i < 40:
            temp_sent.append(word.lower())
            temp_string += word.lower() + " "
            i += 1
        else:
            i = 1
            longer_sentences.append(temp_sent)
            longer_strings.append(temp_string)
            temp_sent = []
            temp_string = ""
            temp_string += word + " "
            temp_sent.append(word)

In [305]:
longer_strings[2]
longer_sentences[2]

['knurrt',
 'sau',
 'hau',
 'bau',
 'verschließ',
 'tür',
 'ziehe',
 "durch's",
 'revier',
 'markier',
 'mal',
 'höre',
 'gelaber',
 'schleiche',
 'promenade',
 'zeuge',
 'maskerade',
 'buchstaben',
 'grelle',
 'farben']

In [436]:
## Training ##
from gensim.models.phrases import Phrases, Phraser
from gensim.models import Word2Vec

bigrams = Phrases(longer_sentences, min_count=5, threshold=2)
new_lines = [bigrams[line.split(" ")] for line in longer_strings]

In [231]:
## Creating vocab of top words ##
vocab = [word for sentence in longer_sentences for word in sentence]
counter = Counter(vocab)
vocab = [word[0] for word in counter.most_common(1000)]

In [237]:
vocab[3]

'weiß'

In [376]:
w2v = Word2Vec(sentences=new_lines, min_count=50, vector_size=200, window=10, workers=20)  ### your code ###

In [377]:
w2v.build_vocab(new_lines, min_count = 50) ### your code ###

In [378]:
w2v.train(new_lines, epochs = 100, total_examples=w2v.corpus_count) ### your code ###

(54248914, 103446300)

In [396]:
w2v.wv.similar_by_vector((w2v.wv.get_vector("vater") - w2v.wv.get_vector("tot")), topn=20)

[('vater', 0.6659219264984131),
 ('lag', 0.2421538084745407),
 ('kind', 0.23708118498325348),
 ('gab', 0.23078513145446777),
 ('lehrer', 0.22680450975894928),
 ('bekam', 0.2243422120809555),
 ('mutter', 0.22265377640724182),
 ('eltern', 0.21760931611061096),
 ('denkt', 0.21479246020317078),
 ('job', 0.214352548122406),
 ('höre', 0.21342961490154266),
 ('schule', 0.20592470467090607),
 ('warst', 0.20409569144248962),
 ('sagte', 0.20310145616531372),
 ('jahr', 0.199281245470047),
 ('bringt', 0.1965971291065216),
 ('mom', 0.1916189044713974),
 ('kinder', 0.1887824982404709),
 ('kontakt', 0.18746133148670197),
 ('bezahlt', 0.1863062083721161)]

In [380]:
w2v.wv.most_similar("schlampe", topn=20)

[('fotze', 0.30678847432136536),
 ('kotzen', 0.3059578835964203),
 ('nutte', 0.2999865710735321),
 ('koks', 0.2737177610397339),
 ('lecker', 0.26625657081604004),
 ('zeige', 0.2574453353881836),
 ('pillen', 0.21938812732696533),
 ('clubs', 0.2177010029554367),
 ('Bitch', 0.21735909581184387),
 ('maybach', 0.2152678370475769),
 ('bmw', 0.20855097472667694),
 ('fick', 0.20701313018798828),
 ('yayo', 0.20612511038780212),
 ('’ne', 0.20466090738773346),
 ('bekommt', 0.20399445295333862),
 ('wodka', 0.2034698724746704),
 ('chick', 0.20120780169963837),
 ('Arsch', 0.2009485960006714),
 ('lächerlich', 0.19792573153972626),
 ('ne', 0.19767078757286072)]

In [437]:
w2v2 = Word2Vec(sentences=new_lines, min_count=50, vector_size=200, window=10, workers=20)  ### your code ###
w2v2.build_vocab(new_lines, min_count = 50) ### your code ###
w2v2.train(new_lines, epochs = 100, total_examples=w2v.corpus_count) ### your code ###

(53836894, 102636100)

In [447]:
w2v2.wv.most_similar("dealer", topn=20)

[('beamer', 0.3358168601989746),
 ('kilos', 0.3334430754184723),
 ('haze', 0.26742303371429443),
 ('cannabis', 0.2623513638973236),
 ('filme', 0.258696973323822),
 ('batzen', 0.2508927881717682),
 ('para', 0.24952782690525055),
 ('squad', 0.24947337806224823),
 ('ticker', 0.2430565059185028),
 ('frankfurt', 0.2421850562095642),
 ('ticken', 0.23554347455501556),
 ('kriminell', 0.2344028502702713),
 ('dicka', 0.23384280502796173),
 ('rapper', 0.2307867407798767),
 ('diggi', 0.23056194186210632),
 ('hip-hop', 0.22577889263629913),
 ('straße', 0.22545133531093597),
 ('wooh', 0.22114306688308716),
 ('kiez', 0.22109700739383698),
 ('verpackt', 0.21803018450737)]

In [426]:
w2v2.wv.similar_by_vector(w2v2.wv.get_vector("Frau") + w2v2.wv.get_vector("Straße"), topn=20)

[('Frau', 0.7372536063194275),
 ('Straße', 0.6250594854354858),
 ('Vater', 0.2779926061630249),
 ('kriegst', 0.2638784646987915),
 ('Kahba', 0.2520490884780884),
 ('Kissen', 0.24277158081531525),
 ('Label', 0.23462143540382385),
 ('Faust', 0.22711928188800812),
 ('küsst', 0.2250686138868332),
 ('Mutter', 0.22503824532032013),
 ('Frauen', 0.21073800325393677),
 ('Rechnung', 0.2058262676000595),
 ('Woche', 0.2040182203054428),
 ('Stich', 0.20082958042621613),
 ('Maul', 0.19981902837753296),
 ('fick', 0.19815170764923096),
 ('Ernst', 0.19718725979328156),
 ('Traum', 0.1954738199710846),
 ('Ausländer', 0.18353557586669922),
 ('geklaut', 0.18275777995586395)]

In [422]:
w2v2.wv.most_similar("geld", topn=20)

KeyError: "Key 'geld' not present in vocabulary"