In [None]:
from flair.embeddings import WordEmbeddings, DocumentPoolEmbeddings, StackedEmbeddings, BytePairEmbeddings
from flair.data import Sentence
import torch
from collections import Counter

import pandas as pd
from tqdm.notebook import tqdm
tqdm.pandas()

import re
import numpy as np

from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
from nltk.tokenize import word_tokenize

In [None]:
# Token-level embedder; the word-export loops call .embed() on it directly.
word_embedding = WordEmbeddings('en')

# Word + byte-pair stack.
# NOTE(review): no other cell visible here references stacked_embedding, and it
# instantiates a second WordEmbeddings('en') — confirm it is still needed.
stacked_embedding = StackedEmbeddings(
    [
        WordEmbeddings('en'),
        BytePairEmbeddings('en'),
    ]
)

# Document-level embedder built on word_embedding; get_tensors() uses this one.
embedding = DocumentPoolEmbeddings(
    [word_embedding],
)

In [None]:
# Display the distinct values found in the 'genre' column.
# NOTE(review): no earlier cell visible here assigns `df` (the first read_csv is
# in the following cell), so on a fresh kernel this presumably raises NameError —
# confirm the intended cell order.
df['genre'].unique()

In [None]:
# Read data/all.csv, then keep only the rows whose genre is exactly
# 'jungle / footwork' for the word-level section.
df = pd.read_csv('data/all.csv')
jungle = df.loc[df['genre'].eq('jungle / footwork')]

In [None]:
# Rebind `df` to the grime term list; the loop in the next cell reads its 'Term' column.
# NOTE(review): this replaces the frame loaded from data/all.csv. Later cells that
# use df['description'], 'artist', 'album', ... will see this frame instead when
# the notebook runs top to bottom — confirm which file those cells expect.
df = pd.read_csv('data/new/grime.csv')

In [None]:
# Embed every entry of the 'Term' column with the token-level embedder.
# Vectors that are not all-zero go into `embeddings_list`, and the matching term
# text goes into `meta` at the same position.
embeddings_list = []
meta = []
for term in df['Term'].dropna().tolist():  # missing terms cannot be embedded
    sentence = Sentence(term)
    # Text that tokenises to nothing (e.g. whitespace only) has no sentence[0];
    # skip it instead of raising IndexError.
    if len(sentence) == 0:
        continue
    word_embedding.embed(sentence)
    # Only the first token is read, so a multi-word term is represented by its
    # first word. .detach().cpu() keeps the NumPy conversion working when the
    # tensor lives on a GPU (Tensor.numpy() only accepts CPU tensors).
    embed = sentence[0].embedding.detach().cpu().numpy()
    # All-zero vectors are dropped (presumably out-of-vocabulary words — confirm).
    if np.any(embed):
        embeddings_list.append(embed)
        meta.append(term)

In [None]:
# One embedding vector per line, tab-separated, with no header row or index column.
pd.DataFrame(embeddings_list).to_csv(
    'grime_word_tensors.tsv',
    sep='\t',
    index=False,
    header=False,
)

In [None]:
# One term per line, in the same order as the vectors collected alongside `meta`.
pd.Series(meta, name='word').to_csv(
    'grime_word_meta.tsv',
    sep='\t',
    index=False,
    header=False,
)

## Words: per-token embeddings of the jungle / footwork descriptions

In [None]:
# Replace non-breaking spaces and slashes with plain spaces, then join every
# jungle description into one space-separated string.
all_word = (
    jungle['description']
    .str.replace('\xa0|/', ' ', regex=True)
    .str.cat(sep=' ')
)

In [None]:
# Delete punctuation and newlines from the joined text. A character class
# replaces the original alternation, which ended in a dangling `|` (an empty
# alternative that matched at every position); the resulting string is the same.
# NOTE(review): the closing quote ” is stripped but the opening quote “ is not —
# confirm whether that omission is intended.
clean = re.sub(r'[.,\n‘’!\]\[)(`;”]', '', all_word)

# The 20000 most frequent space-separated tokens, most common first.
# split(' ') yields '' for repeated spaces, so '' can appear in this list.
shortlist = [word for word, _count in Counter(clean.split(' ')).most_common(20000)]

In [None]:
# Build the stop-word lookup once; the original evaluated stopwords.words() again
# for every candidate word. A set also makes each membership test constant-time.
# NOTE(review): no language is passed to stopwords.words(), so this is presumably
# the combined list for every language in the corpus — confirm that is intended
# rather than stopwords.words('english').
stop_words = set(stopwords.words())

# Lower-case each shortlisted token, dropping empty strings and stop words.
# NOTE(review): lower-casing can produce duplicates (e.g. 'Jungle' and 'jungle');
# they are kept here, as in the original.
tokens_without_sw = [
    word.lower()
    for word in shortlist
    if word != '' and word.lower() not in stop_words
]

In [None]:
# Display the filtered, lower-cased token list for inspection.
tokens_without_sw

In [None]:
# Embed every filtered token with the token-level embedder.
# Vectors that are not all-zero go into `embeddings_list`, and the matching token
# goes into `meta` at the same position. Both names are rebound here, replacing
# whatever an earlier cell stored in them.
embeddings_list = []
meta = []
for token_text in tokens_without_sw:
    sentence = Sentence(token_text)
    # A token made only of whitespace-like characters tokenises to nothing, so
    # sentence[0] would raise IndexError; skip it.
    if len(sentence) == 0:
        continue
    word_embedding.embed(sentence)
    # .detach().cpu() keeps the NumPy conversion working when the tensor lives on
    # a GPU (Tensor.numpy() only accepts CPU tensors).
    embed = sentence[0].embedding.detach().cpu().numpy()
    # All-zero vectors are dropped (presumably out-of-vocabulary words — confirm).
    if np.any(embed):
        embeddings_list.append(embed)
        meta.append(token_text)

In [None]:
# Number of tokens kept; should equal the number of vectors in embeddings_list.
len(meta)

In [None]:
# Number of vectors kept; should equal len(meta).
len(embeddings_list)

In [None]:
# One embedding vector per line, tab-separated, with no header row or index column.
# NOTE(review): the last row is dropped and the reason is not evident from this
# notebook; the word_meta.tsv export drops its last entry as well, so the two
# files stay aligned — confirm the intent.
pd.DataFrame(embeddings_list).iloc[:-1].to_csv(
    'word_tensors.tsv',
    sep='\t',
    index=False,
    header=False,
)

In [None]:
# One token per line, in the same order as the vectors collected alongside `meta`.
# NOTE(review): the last entry is dropped, mirroring the word_tensors.tsv export —
# confirm why the final item is excluded.
pd.Series(meta, name='word').iloc[:-1].to_csv(
    'word_meta.tsv',
    sep='\t',
    index=False,
    header=False,
)

## Descriptions: pooled document embeddings, one per row

In [None]:
# Remove, in place, every row whose 'description' is missing so each remaining
# row can be embedded.
# NOTE(review): the most recent assignment to `df` visible above reads
# data/new/grime.csv, while the columns exported at the end of this section
# (artist, album, catalogue_no, genre) suggest data/all.csv — confirm which frame
# is meant. The in-place mutation also makes this cell non-idempotent with
# respect to the originally loaded frame.
df.dropna(subset=['description'], inplace=True)

In [None]:
def get_tensors(x):
    """Return the document embedding of one row's description.

    Parameters
    ----------
    x : mapping-like row (e.g. a pandas Series from ``DataFrame.apply(axis=1)``)
        Must provide a ``'description'`` entry holding the text to embed.

    Returns
    -------
    The value of ``Sentence.get_embedding()`` after the module-level
    ``embedding`` (the document-level embedder) has processed the text.
    """
    description_sentence = Sentence(x['description'])
    # embed() stores the result on the sentence object itself.
    embedding.embed(description_sentence)
    document_vector = description_sentence.get_embedding()
    return document_vector

In [None]:
# Embed every row's description. progress_apply is the tqdm-wrapped DataFrame.apply
# registered by tqdm.pandas() in the imports cell; axis=1 passes one row at a time.
df['embedding'] = df.progress_apply(
    get_tensors,
    axis=1,
)

In [None]:
# Convert each stored torch tensor into a NumPy array.
# .detach() drops any autograd link, and .cpu() is needed because Tensor.numpy()
# raises for tensors that live on a CUDA device.
df['np_embedding'] = df['embedding'].apply(
    lambda tensor: tensor.detach().cpu().numpy()
)

In [None]:
# Replace newlines inside descriptions with spaces (presumably so that every row
# occupies a single line in the exported meta.tsv — confirm).
# regex=False makes the literal replacement explicit instead of relying on the
# pandas-version-dependent default, matching the explicit `regex=` used for the
# jungle descriptions earlier in this notebook.
df['description'] = df['description'].str.replace('\n', ' ', regex=False)

In [None]:
# Row metadata, tab-separated, with a header row and no index column.
df[['artist', 'album', 'catalogue_no', 'genre', 'description']].to_csv(
    'meta.tsv',
    sep='\t',
    index=False,
)

# Matching vectors: each array is expanded into one column per component and
# written in the same row order, with no header row and no index column.
df['np_embedding'].apply(pd.Series).to_csv(
    'tensors.tsv',
    sep='\t',
    index=False,
    header=False,
)