In [2]:
import lucem_illud_2020 #pip install -U git+git://github.com/Computational-Content-Analysis-2020/lucem_illud_2020.git

#All these packages need to be installed from pip
#For ML
import sklearn
import sklearn.naive_bayes
import sklearn.tree
import sklearn.ensemble
import sklearn.neural_network
import sklearn.decomposition

import nltk #For tokenizing and normalizing
import numpy as np #arrays
import matplotlib.pyplot as plt #Plots
import matplotlib.colors # For nice colours
import seaborn as sns#Makes plots look nice, also heatmaps
import scipy as sp #for interp

#These are from the standard library
import collections
import os
import os.path
import random
import re
import glob
import pandas as pd
import requests
import json
import math

#This 'magic' command makes the plots work better
#in the notebook, don't use it outside of a notebook.
#Also you can ignore the warning
%matplotlib inline

## LGBTQ corpora

In [None]:
import requests
from bs4 import BeautifulSoup #called `beautifulsoup4`, an html parser
import re #for regexs
# LGBT words extraction 
lgbt_content = BeautifulSoup(requests.get("https://lgbtccneworleans.org/useful-vocabulary/").text)
keywords = lgbt_content.body.find('div',class_='entry-content clearfix').findAll('strong')
keywords = [i.text for i in keywords]
keywords

In [None]:
# cleaning
keywords_set = set()
for word in keywords:
    tokens = lucem_illud_2020.word_tokenize(word)
    normalized = lucem_illud_2020.normalizeTokens(tokens)
    for n in normalized:
        keywords_set.add(n)
keywords_set

In [None]:
# manually adjustment
keywords_set.add('bi')
keywords_set.add('les')
keywords_set.add('queen')
keywords_set.add('king')
keywords_set.remove('person')
keywords_set.remove('research')
keywords_set.remove('mis)gendering')
keywords_set.remove('confirm')
keywords_set.remove('system')
keywords_set.remove('pass')
keywords_set.remove('non')
keywords_set.remove('theory')
keywords_set.remove('question')
keywords_set

In [None]:
lgbt_words = pd.DataFrame(keywords_set)
lgbt_words.to_csv('lgbt_words.csv',index=None)

## Cross-Section

In [None]:
# Lyrics Training Data Loading
lyrics_1 = pd.read_csv("380000-lyrics-from-metrolyrics/lyrics.csv")
lyrics_2 = pd.read_csv("lyrics-dataset/lyrics_features.csv")

In [None]:
# filtering data
lyrics_1 = lyrics_1.dropna()
lyrics_1 = lyrics_1.loc[lyrics_1['genre'] != 'Not Available']

In [None]:
lyrics_1.to_csv('cleaned_lyrics_1.csv',index=None,columns=['artist','genre','lyrics'])

In [32]:
lyrics_1 = pd.read_csv("cleaned_lyrics_1.csv")

In [6]:
my_stop_words = [str(i) for i in range(10)]
my_stop_words += ['intro','verse','pre','post','lift','chorus','bridge','outro']
my_stop_words

['0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 'intro',
 'verse',
 'pre',
 'post',
 'lift',
 'chorus',
 'bridge',
 'outro']

In [8]:
from multiprocessing import Pool
p = Pool()
lyrics_1['tokens'] = list(p.map(lucem_illud_2020.word_tokenize,lyrics_1['lyrics'])) 

In [11]:
lyrics_1['normalized'] = list(p.starmap(lucem_illud_2020.normalizeTokens,
                        [(x,my_stop_words) for x in lyrics_1['tokens']]))

In [13]:
lyrics_1['tokens'][:10]

0    [Oh, baby, how, you, doing, You, know, I, 'm, ...
1    [playin, everything, so, easy, it, 's, like, y...
2    [If, you, search, For, tenderness, It, is, n't...
3    [Oh, oh, oh, I, oh, oh, oh, I, Verse, 1, If, I...
4    [Party, the, people, the, people, the, party, ...
5    [I, heard, Church, bells, ringing, I, heard, A...
6    [This, is, just, another, day, that, I, would,...
7    [Waiting, waiting, waiting, waiting, Waiting, ...
8    [Verse, 1, I, read, all, of, the, magazines, w...
9    [N, n, now, honey, You, better, sit, down, and...
Name: tokens, dtype: object

In [14]:
lyrics_1['normalized'][:10]

0    [oh, baby, know, be, gon, na, cut, right, chas...
1    [playin, easy, like, sure, way, not, be, sure,...
2    [search, tenderness, hard, find, love, need, l...
3    [oh, oh, oh, oh, oh, oh, write, book, stand, t...
4    [party, people, people, party, pop, sit, look,...
5    [hear, church, bell, ring, hear, choir, singe,...
6    [day, spend, waitin, right, stare, night, wish...
7    [wait, wait, wait, wait, wait, wait, wait, wai...
8    [read, magazine, wait, say, wait, get, stick, ...
9    [n, n, honey, well, sit, look, because, have, ...
Name: normalized, dtype: object

In [15]:
lyrics_1.to_csv('cleaned_lyrics_tokens.csv',index=None)

## Cross-Time

In [18]:
lyrics_1 = pd.read_csv("lyrics.csv",usecols=['year','artist','genre','lyrics'])
lyrics_1 = lyrics_1.dropna()
lyrics_1 = lyrics_1.loc[lyrics_1['genre'] != 'Not Available']

In [19]:
lyrics_1[:5]

Unnamed: 0,year,artist,genre,lyrics
0,2009,beyonce-knowles,Pop,"Oh baby, how you doing?\nYou know I'm gonna cu..."
1,2009,beyonce-knowles,Pop,"playin' everything so easy,\nit's like you see..."
2,2009,beyonce-knowles,Pop,If you search\nFor tenderness\nIt isn't hard t...
3,2009,beyonce-knowles,Pop,"Oh oh oh I, oh oh oh I\n[Verse 1:]\nIf I wrote..."
4,2009,beyonce-knowles,Pop,"Party the people, the people the party it's po..."


In [25]:
pd.unique(lyrics_1['year'])

array([2009, 2007, 2013, 2010, 2012, 2006, 2016, 2011, 2015, 2008, 2014,
       1998, 2002, 1995, 2004, 1972, 2005, 1978, 1970, 1981, 1994, 1997,
       1993, 1982, 1983, 1986, 1992, 1977, 1989, 1979, 1996, 2001, 1990,
       1987, 2003, 1975, 1973, 1991, 1999, 1974, 1980, 2000, 1984, 1976,
        702, 1971, 1985, 1988,  112, 1968,   67])

In [26]:
lyrics_1 = lyrics_1.loc[lyrics_1['year'].isin([67,112,702,1968]) == False] 

In [30]:
lyrics_1.to_csv('lyrics_year.csv',index=None)

## Reclean

In [45]:
lyrics_1 = pd.read_csv("cleaned_lyrics_tokens.csv")
lyrics_1

Unnamed: 0,artist,genre,lyrics,tokens,normalized
0,beyonce-knowles,Pop,"Oh baby, how you doing?\nYou know I'm gonna cu...","['Oh', 'baby', 'how', 'you', 'doing', 'You', '...","['oh', 'baby', 'know', 'be', 'gon', 'na', 'cut..."
1,beyonce-knowles,Pop,"playin' everything so easy,\nit's like you see...","['playin', 'everything', 'so', 'easy', 'it', ""...","['playin', 'easy', 'like', 'sure', 'way', 'not..."
2,beyonce-knowles,Pop,If you search\nFor tenderness\nIt isn't hard t...,"['If', 'you', 'search', 'For', 'tenderness', '...","['search', 'tenderness', 'hard', 'find', 'love..."
3,beyonce-knowles,Pop,"Oh oh oh I, oh oh oh I\n[Verse 1:]\nIf I wrote...","['Oh', 'oh', 'oh', 'I', 'oh', 'oh', 'oh', 'I',...","['oh', 'oh', 'oh', 'oh', 'oh', 'oh', 'write', ..."
4,beyonce-knowles,Pop,"Party the people, the people the party it's po...","['Party', 'the', 'people', 'the', 'people', 't...","['party', 'people', 'people', 'party', 'pop', ..."
...,...,...,...,...,...
237421,edens-edge,Country,"I gotta say\nBoy, after only just a couple of ...","['I', 'got', 'ta', 'say', 'Boy', 'after', 'onl...","['get', 'ta', 'boy', 'couple', 'date', 'hand',..."
237422,edens-edge,Country,I helped you find her diamond ring\nYou made m...,"['I', 'helped', 'you', 'find', 'her', 'diamond...","['help', 'find', 'diamond', 'ring', 'try', 'to..."
237423,edens-edge,Country,Look at the couple in the corner booth\nLooks ...,"['Look', 'at', 'the', 'couple', 'in', 'the', '...","['look', 'couple', 'corner', 'booth', 'look', ..."
237424,edens-edge,Country,When I fly off this mortal earth\nAnd I'm meas...,"['When', 'I', 'fly', 'off', 'this', 'mortal', ...","['fly', 'mortal', 'earth', 'be', 'measure', 'd..."


In [46]:
import ast
invalid = []
for i in range(len(lyrics_1)):
    word_list = ast.literal_eval(lyrics_1.loc[i,'normalized'])
    if len(word_list) <=10:
        invalid.append(i)
invalid

[85,
 89,
 187,
 356,
 369,
 733,
 759,
 859,
 913,
 936,
 1001,
 1018,
 1046,
 1116,
 1269,
 1280,
 1396,
 1567,
 1595,
 1600,
 1608,
 1722,
 1732,
 1753,
 1834,
 1934,
 1979,
 2169,
 2185,
 2187,
 2189,
 2191,
 2192,
 2202,
 2203,
 2206,
 2207,
 2267,
 2373,
 2376,
 2386,
 2393,
 2395,
 2409,
 2413,
 2418,
 2419,
 2741,
 2765,
 2786,
 2829,
 2957,
 3006,
 3087,
 3119,
 3123,
 3125,
 3143,
 3155,
 3159,
 3166,
 3177,
 3268,
 3269,
 3330,
 3367,
 3407,
 3453,
 3512,
 3576,
 3581,
 3582,
 3584,
 3585,
 3586,
 3661,
 3718,
 3722,
 3740,
 3810,
 3827,
 3828,
 3862,
 3863,
 3865,
 3866,
 3907,
 3913,
 3976,
 4061,
 4068,
 4083,
 4088,
 4089,
 4398,
 4411,
 4426,
 4443,
 4448,
 4452,
 4453,
 4460,
 4467,
 4479,
 4534,
 4714,
 4769,
 4915,
 5004,
 5015,
 5091,
 5098,
 5100,
 5286,
 5313,
 5319,
 5360,
 5477,
 5601,
 5602,
 5603,
 5604,
 5605,
 5606,
 5607,
 5608,
 5609,
 5610,
 5611,
 5612,
 5613,
 5614,
 5615,
 5616,
 5617,
 5618,
 5619,
 5620,
 5621,
 5622,
 5623,
 5624,
 5625,
 5626,
 562

In [47]:
len(invalid)

6153

In [48]:
lyrics_1 = lyrics_1.drop(invalid)
lyrics_1

Unnamed: 0,artist,genre,lyrics,tokens,normalized
0,beyonce-knowles,Pop,"Oh baby, how you doing?\nYou know I'm gonna cu...","['Oh', 'baby', 'how', 'you', 'doing', 'You', '...","['oh', 'baby', 'know', 'be', 'gon', 'na', 'cut..."
1,beyonce-knowles,Pop,"playin' everything so easy,\nit's like you see...","['playin', 'everything', 'so', 'easy', 'it', ""...","['playin', 'easy', 'like', 'sure', 'way', 'not..."
2,beyonce-knowles,Pop,If you search\nFor tenderness\nIt isn't hard t...,"['If', 'you', 'search', 'For', 'tenderness', '...","['search', 'tenderness', 'hard', 'find', 'love..."
3,beyonce-knowles,Pop,"Oh oh oh I, oh oh oh I\n[Verse 1:]\nIf I wrote...","['Oh', 'oh', 'oh', 'I', 'oh', 'oh', 'oh', 'I',...","['oh', 'oh', 'oh', 'oh', 'oh', 'oh', 'write', ..."
4,beyonce-knowles,Pop,"Party the people, the people the party it's po...","['Party', 'the', 'people', 'the', 'people', 't...","['party', 'people', 'people', 'party', 'pop', ..."
...,...,...,...,...,...
237421,edens-edge,Country,"I gotta say\nBoy, after only just a couple of ...","['I', 'got', 'ta', 'say', 'Boy', 'after', 'onl...","['get', 'ta', 'boy', 'couple', 'date', 'hand',..."
237422,edens-edge,Country,I helped you find her diamond ring\nYou made m...,"['I', 'helped', 'you', 'find', 'her', 'diamond...","['help', 'find', 'diamond', 'ring', 'try', 'to..."
237423,edens-edge,Country,Look at the couple in the corner booth\nLooks ...,"['Look', 'at', 'the', 'couple', 'in', 'the', '...","['look', 'couple', 'corner', 'booth', 'look', ..."
237424,edens-edge,Country,When I fly off this mortal earth\nAnd I'm meas...,"['When', 'I', 'fly', 'off', 'this', 'mortal', ...","['fly', 'mortal', 'earth', 'be', 'measure', 'd..."


In [49]:
lyrics_1.to_csv('cleaned_lyrics_tokens.csv',columns=['artist','genre','lyrics',
                                                     'normalized'],
                index=None)

In [50]:
lyrics_1.to_csv('cleaned_lyrics_1.csv',columns=['artist','genre','lyrics'],index=None)

### Drop Duplicate

In [12]:
lyrics_df = pd.read_csv('lyrics_year.csv')
lyrics_df

Unnamed: 0,year,artist,genre,lyrics
0,2009,beyonce-knowles,Pop,"Oh baby, how you doing?\nYou know I'm gonna cu..."
1,2009,beyonce-knowles,Pop,"playin' everything so easy,\nit's like you see..."
2,2009,beyonce-knowles,Pop,If you search\nFor tenderness\nIt isn't hard t...
3,2009,beyonce-knowles,Pop,"Oh oh oh I, oh oh oh I\n[Verse 1:]\nIf I wrote..."
4,2009,beyonce-knowles,Pop,"Party the people, the people the party it's po..."
...,...,...,...,...
242604,2012,edens-edge,Country,"I gotta say\nBoy, after only just a couple of ..."
242605,2012,edens-edge,Country,I helped you find her diamond ring\nYou made m...
242606,2012,edens-edge,Country,Look at the couple in the corner booth\nLooks ...
242607,2012,edens-edge,Country,When I fly off this mortal earth\nAnd I'm meas...


In [13]:
lyrics_df = lyrics_df.drop_duplicates()
lyrics_df

Unnamed: 0,year,artist,genre,lyrics
0,2009,beyonce-knowles,Pop,"Oh baby, how you doing?\nYou know I'm gonna cu..."
1,2009,beyonce-knowles,Pop,"playin' everything so easy,\nit's like you see..."
2,2009,beyonce-knowles,Pop,If you search\nFor tenderness\nIt isn't hard t...
3,2009,beyonce-knowles,Pop,"Oh oh oh I, oh oh oh I\n[Verse 1:]\nIf I wrote..."
4,2009,beyonce-knowles,Pop,"Party the people, the people the party it's po..."
...,...,...,...,...
242604,2012,edens-edge,Country,"I gotta say\nBoy, after only just a couple of ..."
242605,2012,edens-edge,Country,I helped you find her diamond ring\nYou made m...
242606,2012,edens-edge,Country,Look at the couple in the corner booth\nLooks ...
242607,2012,edens-edge,Country,When I fly off this mortal earth\nAnd I'm meas...


In [7]:
my_stop_words = [str(i) for i in range(10)]
my_stop_words += ['intro','verse','pre','post','lift','chorus','bridge','outro',
                  'instrumental']
my_stop_words

['0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 'intro',
 'verse',
 'pre',
 'post',
 'lift',
 'chorus',
 'bridge',
 'outro',
 'instrumental']

In [14]:
import re
def filtering(x):
    strip_blank = re.sub('\n', ' ', x.lower())
    my_stop_words = ['intro','verse','pre','post','lift','chorus','bridge','outro',
                     '\[','\]',':',';'] + [str(i) for i in range(10)]
    regex_command = '|'.join(['('+ w + ')' for w in my_stop_words])
    regex = re.compile(regex_command)
    return re.sub(regex, '', strip_blank)

In [15]:
from multiprocessing import Pool
p = Pool()
lyrics_df['lyrics'] = p.map(filtering, lyrics_df['lyrics'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [17]:
lyrics_df['tokens'] = p.map(lucem_illud_2020.word_tokenize,lyrics_df['lyrics']) 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [18]:
lyrics_df['normalized'] = list(p.starmap(lucem_illud_2020.normalizeTokens,
                        [(x,my_stop_words) for x in lyrics_df['tokens']]))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [27]:
lyrics_df = lyrics_df.reset_index(drop=True)

In [28]:
invalid = []
for i in range(len(lyrics_df)):
    word_list = lyrics_df.loc[i,'normalized']
    if len(word_list) <=10:
        invalid.append(i)
invalid

[83,
 87,
 179,
 345,
 494,
 878,
 904,
 1014,
 1069,
 1162,
 1205,
 1274,
 1429,
 1554,
 1724,
 1752,
 1757,
 1765,
 1886,
 1915,
 1996,
 2097,
 2329,
 2345,
 2347,
 2350,
 2360,
 2422,
 2530,
 2533,
 2543,
 2550,
 2565,
 2569,
 2574,
 2575,
 2898,
 2922,
 2943,
 2981,
 3110,
 3160,
 3241,
 3273,
 3277,
 3279,
 3297,
 3309,
 3313,
 3320,
 3331,
 3443,
 3444,
 3505,
 3541,
 3581,
 3627,
 3685,
 3749,
 3754,
 3755,
 3757,
 3758,
 3759,
 3833,
 3890,
 3894,
 3990,
 3996,
 4024,
 4025,
 4059,
 4061,
 4108,
 4114,
 4175,
 4260,
 4285,
 4286,
 4590,
 4603,
 4618,
 4635,
 4643,
 4650,
 4657,
 4669,
 4724,
 4889,
 4947,
 5094,
 5183,
 5194,
 5272,
 5454,
 5481,
 5487,
 5528,
 5641,
 5760,
 5761,
 5762,
 5763,
 5764,
 5765,
 5766,
 5767,
 5768,
 5769,
 5770,
 5771,
 5772,
 5773,
 5774,
 5775,
 5776,
 5777,
 5778,
 5779,
 5780,
 5781,
 5782,
 5783,
 5784,
 5785,
 5786,
 5787,
 5788,
 5789,
 5790,
 5791,
 5792,
 5793,
 5795,
 5796,
 5797,
 5798,
 5799,
 5800,
 5801,
 5802,
 5803,
 5804,
 5805,
 

In [29]:
len(invalid)

4345

In [30]:
lyrics_df = lyrics_df.drop(invalid)
lyrics_df

Unnamed: 0,year,artist,genre,lyrics,tokens,normalized
0,2009,beyonce-knowles,Pop,"oh baby, how you doing? you know i'm gonna cut...","[oh, baby, how, you, doing, you, know, i, 'm, ...","[oh, baby, know, be, gon, na, cut, right, chas..."
1,2009,beyonce-knowles,Pop,"playin' everything so easy, it's like you seem...","[playin, everything, so, easy, it, 's, like, y...","[playin, easy, like, sure, way, not, be, sure,..."
2,2009,beyonce-knowles,Pop,if you search for tenderness it isn't hard to ...,"[if, you, search, for, tenderness, it, is, n't...","[search, tenderness, hard, find, love, need, l..."
3,2009,beyonce-knowles,Pop,"oh oh oh i, oh oh oh i if i wrote a book abo...","[oh, oh, oh, i, oh, oh, oh, i, if, i, wrote, a...","[oh, oh, oh, oh, oh, oh, write, book, stand, t..."
4,2009,beyonce-knowles,Pop,"party the people, the people the party it's po...","[party, the, people, the, people, the, party, ...","[party, people, people, party, pop, sit, look,..."
...,...,...,...,...,...,...
236130,2012,edens-edge,Country,"i gotta say boy, after only just a couple of d...","[i, got, ta, say, boy, after, only, just, a, c...","[get, ta, boy, couple, date, hand, outright, b..."
236131,2012,edens-edge,Country,i helped you find her diamond ring you made me...,"[i, helped, you, find, her, diamond, ring, you...","[help, find, diamond, ring, try, tomorrow, bec..."
236132,2012,edens-edge,Country,look at the couple in the corner booth looks a...,"[look, at, the, couple, in, the, corner, booth...","[look, couple, corner, booth, look, lot, like,..."
236133,2012,edens-edge,Country,when i fly off this mortal earth and i'm measu...,"[when, i, fly, off, this, mortal, earth, and, ...","[fly, mortal, earth, be, measure, depth, girth..."


In [33]:
lyrics_df = lyrics_df.loc[lyrics_df['genre']!='Other']
lyrics_df

Unnamed: 0,year,artist,genre,lyrics,tokens,normalized
0,2009,beyonce-knowles,Pop,"oh baby, how you doing? you know i'm gonna cut...","[oh, baby, how, you, doing, you, know, i, 'm, ...","[oh, baby, know, be, gon, na, cut, right, chas..."
1,2009,beyonce-knowles,Pop,"playin' everything so easy, it's like you seem...","[playin, everything, so, easy, it, 's, like, y...","[playin, easy, like, sure, way, not, be, sure,..."
2,2009,beyonce-knowles,Pop,if you search for tenderness it isn't hard to ...,"[if, you, search, for, tenderness, it, is, n't...","[search, tenderness, hard, find, love, need, l..."
3,2009,beyonce-knowles,Pop,"oh oh oh i, oh oh oh i if i wrote a book abo...","[oh, oh, oh, i, oh, oh, oh, i, if, i, wrote, a...","[oh, oh, oh, oh, oh, oh, write, book, stand, t..."
4,2009,beyonce-knowles,Pop,"party the people, the people the party it's po...","[party, the, people, the, people, the, party, ...","[party, people, people, party, pop, sit, look,..."
...,...,...,...,...,...,...
236130,2012,edens-edge,Country,"i gotta say boy, after only just a couple of d...","[i, got, ta, say, boy, after, only, just, a, c...","[get, ta, boy, couple, date, hand, outright, b..."
236131,2012,edens-edge,Country,i helped you find her diamond ring you made me...,"[i, helped, you, find, her, diamond, ring, you...","[help, find, diamond, ring, try, tomorrow, bec..."
236132,2012,edens-edge,Country,look at the couple in the corner booth looks a...,"[look, at, the, couple, in, the, corner, booth...","[look, couple, corner, booth, look, lot, like,..."
236133,2012,edens-edge,Country,when i fly off this mortal earth and i'm measu...,"[when, i, fly, off, this, mortal, earth, and, ...","[fly, mortal, earth, be, measure, depth, girth..."


In [42]:
lyrics_df.to_csv('cleaned_lyrics_tokens.csv',columns=['artist','genre','lyrics',
                'normalized'],index=None)

In [43]:
lyrics_df.to_csv('cleaned_lyrics_1.csv',columns=['artist','genre','lyrics'],index=None)

In [46]:
lyrics_df.to_csv('cleaned_lyrics_year.csv',
                 columns=['year','artist','genre','lyrics','normalized'],
                 index=None)

### Language Filtering

In [3]:
lyrics_df = pd.read_csv('cleaned_lyrics_year.csv')
lyrics_df

Unnamed: 0,year,artist,genre,lyrics,normalized
0,2009,beyonce-knowles,Pop,"oh baby, how you doing? you know i'm gonna cut...","['oh', 'baby', 'know', 'be', 'gon', 'na', 'cut..."
1,2009,beyonce-knowles,Pop,"playin' everything so easy, it's like you seem...","['playin', 'easy', 'like', 'sure', 'way', 'not..."
2,2009,beyonce-knowles,Pop,if you search for tenderness it isn't hard to ...,"['search', 'tenderness', 'hard', 'find', 'love..."
3,2009,beyonce-knowles,Pop,"oh oh oh i, oh oh oh i if i wrote a book abo...","['oh', 'oh', 'oh', 'oh', 'oh', 'oh', 'write', ..."
4,2009,beyonce-knowles,Pop,"party the people, the people the party it's po...","['party', 'people', 'people', 'party', 'pop', ..."
...,...,...,...,...,...
226727,2012,edens-edge,Country,"i gotta say boy, after only just a couple of d...","['get', 'ta', 'boy', 'couple', 'date', 'hand',..."
226728,2012,edens-edge,Country,i helped you find her diamond ring you made me...,"['help', 'find', 'diamond', 'ring', 'try', 'to..."
226729,2012,edens-edge,Country,look at the couple in the corner booth looks a...,"['look', 'couple', 'corner', 'booth', 'look', ..."
226730,2012,edens-edge,Country,when i fly off this mortal earth and i'm measu...,"['fly', 'mortal', 'earth', 'be', 'measure', 'd..."


In [4]:
from langdetect import detect, DetectorFactory
DetectorFactory.seed = 0

In [5]:
from multiprocessing import Pool
p = Pool()
lyrics_df['lang'] = p.map(detect, lyrics_df['lyrics']) 
lyrics_df['lang']

0         en
1         en
2         en
3         en
4         en
          ..
226727    en
226728    en
226729    en
226730    en
226731    en
Name: lang, Length: 226732, dtype: object

In [6]:
pd.unique(lyrics_df['lang'])

array(['en', 'fr', 'es', 'so', 'pl', 'de', 'lt', 'sw', 'af', 'pt', 'lv',
       'id', 'sq', 'cy', 'ca', 'it', 'sk', 'hr', 'fi', 'tl', 'et', 'nl',
       'sl', 'ro', 'sv', 'no', 'hu', 'da', 'cs', 'vi', 'tr', 'ja'],
      dtype=object)

In [7]:
lang_groups = lyrics_df.groupby(['lang'])

In [10]:
lyrics_df = lyrics_df.loc[lyrics_df['lang'] == 'en']

In [12]:
lyrics_df = lyrics_df.reset_index(drop=True)

In [25]:
lyrics_df.to_csv('cleaned_lyrics_tokens.csv',columns=['artist','genre','lyrics',
                'normalized'],index=None)
lyrics_df.to_csv('cleaned_lyrics_1.csv',columns=['artist','genre','lyrics'],index=None)
lyrics_df.to_csv('cleaned_lyrics_year.csv',
                 columns=['year','artist','genre','lyrics','normalized'],
                 index=None)

### filtering garbled 

In [23]:
lyrics_df['garbled'] = lyrics_df['lyrics'].str.contains("\\x",regex=False)