In [1]:
import csv
import glob
import pandas as pd
import numpy as np
import contractions
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
nltk.download('averaged_perceptron_tagger')
from collections import Counter
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
import gensim
from gensim import corpora

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/cep4u/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [2]:
def _get_csv_file_list(unzipping_output_folder):
    """
    Extract all the csv file paths given the folder path.
    :param unzipping_output_folder: Folder path.
    :return: List of CSV file paths.
    """
    csv_file_list = [i for i in glob.glob(f'{unzipping_output_folder}/**/*.csv', recursive=True)]
    print(f'{len(csv_file_list)} CSV files extracted.')
    return csv_file_list

def _read_and_combine(csv_file_list):
    """
    Read the list of data CSVs and combine them into a single DataFrame.
    Handles CSVs where from the 5th column onwards, data should be merged into one column.
    :param csv_file_list: List of CSV file paths where the data is.
    :return: Combined DataFrame.
    """
    all_data = []
    
    for csv_file_name in csv_file_list:
        with open(csv_file_name, 'r') as file:
            header = next(file).strip().split(",")[:4] + ["lyrics"]
            for line in file:
                fields = line.strip().split(",")
                
                merged_field = " ".join(fields[4:])
                all_data.append(fields[:4] + [merged_field])
                
    combined_df = pd.DataFrame(all_data, columns=header)
    print(f'Total: {len(combined_df)}')
    return combined_df
            
# Root folder holding the raw lyric CSVs.
# NOTE(review): absolute, machine-specific path — adjust before re-running elsewhere.
data_folder = '/Users/cep4u/JingEdward/tunen/data/lyric_rawdata/azlyrics-scraper'
# Find every CSV below the folder, then load them all into a single DataFrame.
csv_file_list = _get_csv_file_list(data_folder)
all_data = _read_and_combine(csv_file_list)

27 CSV files extracted.
Total: 149445


In [3]:
class DataCleaner:
    """
    Text-cleaning pipeline for one string column of a DataFrame.

    ``clean()`` runs the steps in order; each step adds (or rewrites) a column
    derived from ``column_name``:

    * ``RemoveContractions_<col>`` - list of words with contractions expanded
    * ``<col>_string_nocont``      - those words joined back into one string
    * ``tokenized_<col>``          - tokens, lower-cased and filtered
    * ``bigrams_<col>``            - tokens after applying the fitted phrase model
    * ``lemmatized_<col>``         - lemmatized tokens
    """

    # Tokens dropped by token_cleanup. A frozenset gives constant-time lookups.
    _EDGE_CASES = frozenset([
        "``", "’", "''", "image", "title", "alt", "src", "width", "img", "http",
        "cbc", "jpg", "16x9_460", "buzzfeed", "com", "h1", "href", "href=",
        "p", "/p", "/a", "rel", "www", "reuters", "timesofindia", "indiatimes",
        "margin", "nofollow", "8217", "8230",
    ])

    # Tokens dropped after lemmatization.
    _POST_LEMMA_DROP = frozenset(["'s", "--"])

    def __init__(self, df, column_name, stop_words, wnl ):
        """
        :param df: DataFrame containing the text column.
        :param column_name: Name of the text column to clean.
        :param stop_words: Collection of lower-case stop words to remove.
        :param wnl: Lemmatizer object exposing ``lemmatize(word)``.
        """
        self.df = df
        self.column_name = column_name
        self.stop_words = stop_words
        self.wnl = wnl

    def remove_null(self, df, column_name):
        """Return a copy of ``df`` without the rows whose ``column_name`` is null."""
        # .copy() makes the result an independent frame, so the columns added by
        # the later steps are not written onto a slice of the caller's frame.
        return df[df[column_name].notnull()].copy()

    def remove_contractions(self, df, column_name):
        """Split each text on whitespace and run ``contractions.fix`` on every word."""
        df[f'RemoveContractions_{column_name}'] = df[column_name].apply(
            lambda text: [contractions.fix(word) for word in text.split()]
        )
        return df

    def rebuild_string(self, df, column_name):
        """Join each contraction-expanded word list back into a space-separated string."""
        df[f'{column_name}_string_nocont'] = [
            ' '.join(map(str, words)) for words in df[f'RemoveContractions_{column_name}']
        ]
        return df

    def tokenize(self, df, column_name):
        """Tokenize the rebuilt strings with ``word_tokenize``."""
        df[f'tokenized_{column_name}'] = df[f'{column_name}_string_nocont'].apply(word_tokenize)
        return df

    def token_cleanup(self, df, column_name):
        """
        Lower-case every token and drop the unwanted ones, in a single pass.

        A token is dropped when it is a substring of ``string.punctuation``,
        is in ``self.stop_words``, contains ``/``, or is in ``_EDGE_CASES``.
        """
        token_col = f'tokenized_{column_name}'
        stop_words = self.stop_words
        edge_cases = self._EDGE_CASES

        def _keep(word):
            return (
                # Substring test against the punctuation string: removes single
                # punctuation characters.
                word not in string.punctuation
                and word not in stop_words
                and '/' not in word
                and word not in edge_cases
            )

        df[token_col] = df[token_col].apply(
            lambda tokens: [word for word in (tok.lower() for tok in tokens) if _keep(word)]
        )
        return df

    def make_bigrams(self, df, column_name):
        """
        Fit a gensim ``Phrases`` model (``min_count=5``, ``threshold=100``) on the
        tokenized column and store the result of applying it to each row.
        """
        token_col = f'tokenized_{column_name}'
        bigram = gensim.models.Phrases(df[token_col], min_count=5, threshold=100)
        bigram_mod = gensim.models.phrases.Phraser(bigram)

        df[f'bigrams_{column_name}'] = df[token_col].apply(lambda tokens: bigram_mod[tokens])
        return df

    def lemmatize_tokens(self, df, column_name):
        """Lemmatize every token with ``self.wnl`` and drop ``'s`` and ``--``."""
        lemma_col = f'lemmatized_{column_name}'
        drop = self._POST_LEMMA_DROP
        # The filter runs on the lemmatized form, as a second step after lemmatizing.
        df[lemma_col] = df[f'bigrams_{column_name}'].apply(
            lambda tokens: [
                lemma
                for lemma in (self.wnl.lemmatize(word) for word in tokens)
                if lemma not in drop
            ]
        )
        return df

    def clean(self):
        """Run every cleaning step on ``self.df`` and return the resulting DataFrame."""
        df = self.remove_null(self.df, self.column_name)
        df = self.remove_contractions(df, self.column_name)
        df = self.rebuild_string(df, self.column_name)
        df = self.tokenize(df, self.column_name)
        df = self.token_cleanup(df, self.column_name)
        df = self.make_bigrams(df, self.column_name)
        df = self.lemmatize_tokens(df, self.column_name)
        return df

In [4]:
# Lemmatizer and English stop-word set handed to the cleaner.
# NOTE(review): the import cell only downloads 'averaged_perceptron_tagger'; the
# stop-word list, WordNet data and tokenizer models used here and inside
# DataCleaner presumably must already be installed locally — confirm on a fresh environment.
wnl = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Run the full cleaning pipeline on the 'lyrics' column of the combined data.
desc_cleaner_news = DataCleaner(all_data, 'lyrics', stop_words, wnl)
cleaned_df_desc = desc_cleaner_news.clean()

In [7]:
# Output location for the cleaned data.
# NOTE(review): this rebinds `data_folder` (earlier the azlyrics-scraper input
# folder) to its parent directory, and the path is absolute / machine-specific.
data_folder = '/Users/cep4u/JingEdward/tunen/data/lyric_rawdata'

filename = 'cleaned_lyric_df.csv'

file_path = f"{data_folder}/{filename}"

# Write the cleaned frame to CSV without the index column.
# NOTE(review): the list-valued token columns will presumably be stored as their
# string representation and need parsing when the file is read back — confirm.
cleaned_df_desc.to_csv(file_path, index=False)