# Reducing dimension of text data: an example of applying PCA and t-SNE to word embeddings 

In this notebook I apply dimensionality reduction techniques to the preprocessed lyrics of songs.
The dataset is a collection of lyrics from 48 different songs, ranging from poetry to Justin Bieber. <br>

I wanted to see how much variance can be retained when the dimensionality of the word vectors is reduced from 300 to 2. <br>
Strictly speaking, I will call these vectors "song vectors", because each of them is the average of the embeddings of all the words in a song.

## Songs preprocessing

In [None]:
# Import libraries
import os
import string

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import spacy
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler
from spacy.lang.en.stop_words import STOP_WORDS
from tqdm import tqdm
# Load spaCy's medium English pipeline once; it is used below both to lemmatize the lyrics and to obtain their vectors
nlp = spacy.load('en_core_web_md')

In [1]:
# Set parameters for plots: black figure canvas around a white plotting area
plt.rcParams.update({
    # white: lines, patch outlines, axes background, axis labels and ticks
    **dict.fromkeys(
        ["lines.color", "patch.edgecolor", "axes.facecolor",
         "axes.labelcolor", "xtick.color", "ytick.color"],
        "white",
    ),
    # black: text and the figure canvas, both on screen and when saved
    **dict.fromkeys(
        ["text.color", "figure.facecolor", "figure.edgecolor",
         "savefig.facecolor", "savefig.edgecolor"],
        "black",
    ),
    # light gray: axes frame and grid lines
    **dict.fromkeys(["axes.edgecolor", "grid.color"], "lightgray"),
})

NameError: name 'plt' is not defined

In [None]:
# Set directory
# The lyrics folder defaults to the author's local path. Set the LYRICS_DATA_DIR
# environment variable to run the notebook against another copy of the data
# without editing this cell.
folder_path = os.environ.get(
    "LYRICS_DATA_DIR",
    "/Volumes/Macintosh HD – dane/GitHub/masters/first semester/Unsupervised Learning/article_2/data",
)
# Later cells list the working directory with os.listdir(), so switch into the data folder
os.chdir(folder_path)

In [None]:
# Function which creates a list, each position of a final list is a list of song lines
def get_lyrics(list_of_songs=None):
    """Read every ``.txt`` file in the current working directory as one song.

    Each file is parsed with one text line per row, and the song is stored as
    the list of those lines.

    Parameters
    ----------
    list_of_songs : list, optional
        Existing list to append the songs to. A fresh list is created when it
        is omitted, so repeated calls do not accumulate songs from earlier calls.

    Returns
    -------
    list
        ``list_of_songs`` with one list of lyric lines appended per ``.txt``
        file, in ``os.listdir()`` order.
    """
    if list_of_songs is None:
        list_of_songs = []

    for file in os.listdir():  # iterate through the working directory
        if not file.endswith(".txt"):
            continue

        # The name comes from listing the working directory, so the file is
        # opened relative to that same directory.
        # sep='\b' is presumably chosen because a backspace never occurs in the
        # lyrics, so every whole line lands in the single 'lines' column;
        # quoting=3 (QUOTE_NONE) keeps quote characters as literal text.
        lyrics = pd.read_csv(file, sep='\b', quoting=3, encoding='utf-8', header=None, names=['lines'])
        list_of_songs.append(lyrics['lines'].tolist())

    return list_of_songs

In [None]:
# Function which outputs a list of vectors created from the lyrics
def lyrics_preprocess(songs_list, stopwords, songs_vectors=None, stop_words_check=None):
    """Turn each song into a single vector.

    For every song the lines are joined, lemmatized with the module-level
    ``nlp`` pipeline, stripped of punctuation tokens, lower-cased and filtered
    against ``stopwords``. The remaining words are joined again and passed
    through ``nlp``; the resulting document vector is stored for the song.

    Parameters
    ----------
    songs_list : iterable of list of str
        One list of lyric lines per song.
    stopwords : container of str
        Lower-case tokens to drop.
    songs_vectors : list, optional
        Existing list to append the vectors to; a fresh list is created when
        omitted so repeated calls do not accumulate earlier results.
    stop_words_check : list, optional
        Collector for stop words that survived filtering; a fresh list is
        created when omitted.

    Returns
    -------
    list
        ``songs_vectors`` with one vector appended per song.

    Raises
    ------
    AssertionError
        If a stop word is still present after filtering.
    """
    if songs_vectors is None:
        songs_vectors = []
    if stop_words_check is None:
        stop_words_check = []

    for song in tqdm(songs_list):

        text = " ".join(song)  # one string per song
        doc = nlp(text)

        tokens = [token.lemma_ for token in doc]  # lemmatize

        # string.punctuation is a str, so this is a substring test: it drops
        # single punctuation characters (and the empty string).
        tokens = [token for token in tokens if token not in string.punctuation]  # remove punctuation
        tokens = [token.lower() for token in tokens]  # lower words
        tokens = [item for item in tokens if item not in stopwords]  # remove stopwords

        for word in tokens:  # sanity check: no stop word may be left at this point

            if word in stopwords:
                stop_words_check.append(word)
            assert len(stop_words_check) == 0, 'Error: not all of the stopwords were deleted from text'

        tokens_concat = " ".join(tokens)  # joining words into one string
        sentence_vec = nlp(tokens_concat)  # vectorizing
        songs_vectors.append(sentence_vec.vector)  # saving song vector

    return songs_vectors

In [None]:
# Function which creates column names from files names in the directory
def create_column_names(list_of_columns=None):
    """Return the names of the ``.txt`` files in the working directory, without extension.

    Only ``.txt`` files are considered, which is the same filter ``get_lyrics``
    applies, so the names line up one-to-one with the songs it reads. Other
    entries (for example ``.DS_Store``) are ignored.

    Parameters
    ----------
    list_of_columns : list, optional
        Existing list to append the names to; a fresh list is created when
        omitted so repeated calls do not accumulate earlier results.

    Returns
    -------
    list
        ``list_of_columns`` with one name appended per ``.txt`` file, in
        ``os.listdir()`` order.
    """
    if list_of_columns is None:
        list_of_columns = []

    for file in os.listdir():
        if file.endswith(".txt"):
            list_of_columns.append(file[:-4])  # drop the ".txt" suffix

    return list_of_columns

In [None]:
# Uploading stop words
# Work on a copy so that spaCy's shared STOP_WORDS set is not modified in place
stop = set(STOP_WORDS)
# Extra tokens to drop: ellipses, the digits 1-5 and chorus/repeat markers
# (presumably artefacts of how the lyrics were transcribed)
stop.update(['...', '....', '1', '2', '3', '4', '5', 'chorus', ':]', '[:'])

In [None]:
# Extracting songs vectors
lyrics = get_lyrics()
lyrics_vectors = lyrics_preprocess(lyrics, stopwords=stop)
df_columns = create_column_names()

# One row per song labelled with its file name, then flipped so that
# each song becomes a column and each row is one vector dimension
songs_df = pd.DataFrame(lyrics_vectors, index=df_columns).T
songs_df.head()

In [None]:
# Saving dataframe
# Persist the song vectors (one column per song) so the analysis below can reload them from disk.
# NOTE(review): absolute, machine-specific path — adjust before running on another machine.
songs_df.to_csv('/Volumes/Macintosh HD – dane/GitHub/masters/first semester/Unsupervised Learning/article_2/songs_vectors.csv')

## Exploring similarities

In [None]:
# Upload dataframe
# index_col=0 restores the row index written by to_csv, leaving one column per song
songs_df = pd.read_csv("/Volumes/Macintosh HD – dane/GitHub/masters/first semester/Unsupervised Learning/article_2/songs_vectors.csv", index_col=0)
# Keep the song names before transposing: after .T they are the row labels, no longer the columns
column_names = songs_df.columns
# From here on songs_df has one row per song and one column per vector dimension
songs_df = songs_df.T

In [None]:
# Calculate cosine similarity between songs
# songs_df holds one song per row at this point (it was transposed right after
# loading), so the song vectors are selected by row label with .loc;
# songs_df[name] would look for a column with that name and raise KeyError.
song_matrix = songs_df.loc[column_names].to_numpy(dtype=float)
song_norms = np.linalg.norm(song_matrix, axis=1)
# Entry (i, j) is dot(song_i, song_j) / (|song_i| * |song_j|)
pairwise_cos = (song_matrix @ song_matrix.T) / np.outer(song_norms, song_norms)
# Flatten row by row: the next cell reshapes this flat list back into a square matrix
cos_sim = pairwise_cos.ravel().tolist()

In [None]:
# Create similarity matrix
# Derive the side length from the number of songs instead of hard-coding 48,
# so the cell keeps working when songs are added to or removed from the dataset
n_songs = len(column_names)
cos_sim_matrix = np.reshape(cos_sim, (n_songs, n_songs))
cos_sim_df = pd.DataFrame(cos_sim_matrix, index = column_names, columns = column_names)

In [None]:
# Plot a heatmap of the song-by-song cosine similarities
plt.rcParams.update({"figure.figsize": (20, 11)})
# Note: despite its name, `fig` receives the Axes object that seaborn draws on
fig = sns.heatmap(data=cos_sim_df)

## PCA

In [None]:
# Scaling data: standardize every vector dimension across the songs before PCA
sc = StandardScaler().fit(songs_df)
df_transformed = sc.transform(songs_df)
df_transformed.shape

In [None]:
# Initialize PCA
# No n_components is given, so all principal components are kept; their variance is inspected in the next cells
pca = PCA()
# df_pca holds the scaled songs projected onto the principal components
df_pca = pca.fit_transform(df_transformed)

In [None]:
# Amount of explained variance
# Fraction of the total variance explained by each principal component
variance_pca = pca.explained_variance_ratio_

In [None]:
# Cumulative explained variance
# Running total of the per-component fractions, i.e. the variance explained by the first k components
cumulative_variance = np.cumsum(variance_pca)

In [None]:
# Share of the total variance captured by the first two principal components only
# (summing every ratio would always give ~100%, whatever the data)
print("Explained variance with 2 components: {}%".format(np.round(np.sum(pca.explained_variance_ratio_[:2]) * 100, 2)))

In [None]:
# Plot scree plot: per-component bars with the cumulative curve drawn on top
plt.rcParams.update({"figure.figsize": (15, 8)})
component_positions = range(len(variance_pca))

plt.bar(component_positions, variance_pca,
        alpha=0.5, align='center',
        label='Explained variance by each single principal component')
plt.step(component_positions, cumulative_variance,
         where='mid',
         label='Cumulative explained variance')

plt.xlabel('Principal component index', fontsize=15)
plt.ylabel('% of explained variance', fontsize=15)
# Only the leading components are shown
plt.xlim(-0.3, 15)
plt.ylim(0, 1)
plt.legend(loc="lower right")
plt.show()

In [None]:
# Compute similarities with information about artists
# songs_df has one song per row here (it was transposed right after loading),
# so each vector is fetched by row label with .loc; songs_df[name] would look
# for a column with that name and raise KeyError.
song_vectors = {name: songs_df.loc[name].to_numpy(dtype=float) for name in column_names}
# Norms are computed once per song instead of once per pair
song_norms = {name: np.linalg.norm(vector) for name, vector in song_vectors.items()}

# One [artist1, artist2, cosine similarity] triple for every ordered pair, including self-pairs
similarities_with_artists = [
    [first, second,
     np.dot(song_vectors[first], song_vectors[second]) / (song_norms[first] * song_norms[second])]
    for first in column_names
    for second in column_names
]

In [None]:
# Peek at the first five [artist1, artist2, similarity] triples
similarities_with_artists[:5]

In [None]:
# Prepare dataframe and delete observations from diagonal of similarity matrix
similarities_df = pd.DataFrame(
    similarities_with_artists,
    columns=['artist1', 'artist2', 'cosine_similarity'],
).reset_index()
similarities_df['cosine_similarity'] = similarities_df['cosine_similarity'].round(7)

# A diagonal entry is a song compared with itself: identical names and a similarity of exactly 1
is_self_pair = (
    (similarities_df['cosine_similarity'] == 1)
    & (similarities_df['artist1'] == similarities_df['artist2'])
)
similarities_df = similarities_df[~is_self_pair]

In [None]:
# Prepare top 10 similarities
top10 = similarities_df.sort_values(by='cosine_similarity', ascending=False).head(20).reset_index()
# Every pair appears twice, as (A, B) and (B, A), with the same similarity, so
# every second row of the sorted top 20 is kept. This assumes the two
# orientations sort next to each other, i.e. no ties between different pairs.
# .copy() makes top10 an independent frame, so adding a column below does not
# trigger pandas' SettingWithCopyWarning.
top10 = top10.iloc[::2, :].copy()
top10['artist1_artist2'] = top10['artist1'] + ' and ' + top10['artist2']

In [None]:
# Plot top 10 similarities: one bar per song pair
plt.rcParams.update({"figure.figsize": (30, 10)})
fig = plt.bar(data=top10, x='artist1_artist2', height='cosine_similarity', width=0.8)
# No horizontal padding between the outer bars and the axes frame
plt.margins(x=0)

In [None]:
# Prepare bottom 10 similarities
bottom10 = similarities_df.sort_values(by='cosine_similarity', ascending=False).tail(20).reset_index()
# Every pair appears twice, as (A, B) and (B, A), so every second row of the
# 20 least similar entries is kept (assumes the two orientations sort next to
# each other). .copy() makes bottom10 an independent frame, so the column
# added in the next cell does not trigger pandas' SettingWithCopyWarning.
bottom10 = bottom10.iloc[::2, :].copy()
bottom10

In [None]:
# Label for each pair ("<artist1> and <artist2>"), used as the x-axis category in the bar plot below
bottom10['artist1_artist2'] = bottom10['artist1'] + ' and ' + bottom10['artist2']

In [None]:
# Plot bottom 10 similarities: one bar per song pair
plt.rcParams.update({"figure.figsize": (30, 10)})
fig = plt.bar(data=bottom10, x='artist1_artist2', height='cosine_similarity', width=0.8)
# No horizontal padding between the outer bars and the axes frame
plt.margins(x=0)

## t-SNE

In [None]:
# Move the song names from the row index into a leading 'index' column.
# Guarded so that re-running this cell does not add a second index column,
# which would push the text column into the numeric slice used for t-SNE below.
if "index" not in songs_df.columns:
    songs_df = songs_df.reset_index()

In [None]:
# Initialize t-SNE
# t-SNE is stochastic; a fixed random_state makes the embedding (and the plots
# built from it) reproducible across runs
tsne = TSNE(n_components=2, perplexity = 5, random_state=42)
# Column 0 holds the song names after reset_index, so only the numeric vector columns are embedded
tsne_fit = tsne.fit_transform(songs_df.iloc[:, 1:])

In [None]:
# Plot reduced points
# One unlabelled point per song in the 2-D t-SNE embedding
fix, ax = plt.subplots()
# (disabled) explicit figure size
#plt.rcParams["figure.figsize"] = (15,8)
ax.scatter(tsne_fit[:, 0], 
            tsne_fit[:, 1])
# (disabled) manual axis limits
#plt.xlim(-20, 25)
#plt.ylim(-15, 25)

In [None]:
# Plot points with songs names
# Set the figure size before creating the figure: rcParams only apply to
# figures created afterwards, so setting it after plt.subplots() had no effect
plt.rcParams["figure.figsize"] = (15,8)
fix, ax = plt.subplots()

xax = tsne_fit[:, 0]
yax = tsne_fit[:, 1]
ax.scatter(xax, yax)
plt.xlim(-250, 250)

# Songs whose label is anchored exactly on the point; every other label is
# shifted slightly up and to the left of its point
not_to_move = {'ludacris', 'prince', 'patti-smith', 'michael-jackson', 'amy-winehouse', 'lin-manuel-miranda', 'disney', 'kanye-west'}

# tsne_fit rows follow the row order of songs_df, which matches column_names
for i, txt in enumerate(column_names):
    if txt in not_to_move:
        ax.annotate(txt, (xax[i], yax[i]))
    else:
        ax.annotate(txt, (xax[i] - 0.3, yax[i] + 0.6))