In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from collections import Counter
import re


In [11]:
singers_names = ['Harry Styles', 'Zayn Malik', 'Louis Tomlinson', 'Niall Horan', 'Liam Payne']

In [2]:
df = pd.read_csv('df_main.csv')

In [28]:
def get_nouns_verbs(text):
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(text.lower())
    filtered_text = [word for word in word_tokens if word.isalnum() and word not in stop_words]

    lemmatizer = WordNetLemmatizer()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in filtered_text]

    pos_tags = pos_tag(lemmatized_words)
    nouns_verbs = [word for word, tag in pos_tags if tag.startswith('N') or tag.startswith('V')]

    return nouns_verbs

In [29]:
def most_common_word(text):
    nouns_verbs = get_nouns_verbs(text)
    word_counts = Counter(nouns_verbs)
    return word_counts.most_common(1)[0][0]

In [26]:
def most_common_word_count(text):
    nouns_verbs = get_nouns_verbs(text)
    word_counts = Counter(nouns_verbs)
    return word_counts.most_common(1)[0][1]

In [30]:
import string
def get_unique_words(text):
    text = text.translate(str.maketrans('', '', string.punctuation)).lower()
    words = text.split()
    unique_words = set(words)
    num_unique_words = len(unique_words)
    return num_unique_words

In [92]:
df['Unique_Words'] = df['Lyrics'].apply(get_unique_words)

In [31]:
df['Most_Common'] = df['Lyrics'].apply(most_common_word)
df['Most_Common_Count'] = df['Lyrics'].apply(most_common_word_count)

In [51]:
df['Writer(s)'] = df['Writer(s)'].apply(lambda x: re.sub(r'[\'\[\]\\]', '', x))

In [65]:
for singer in singers_names:
    df[singer+'_W'] = df['Writer(s)'].apply(lambda x: True if singer in x else False)

In [53]:
unique_writers = df['Writer(s)'].explode().unique()

In [74]:
writers_dict = {}

for writers in unique_writers:
    writers = writers.split(',')
    for writer in writers:
        writer = writer.strip()
        if writer in writers_dict:
            writers_dict[writer] = writers_dict[writer]+1
        else:
            writers_dict[writer] = 1

In [75]:
writers_dict

{'Ed Sheeran': 4,
 'Oliver Frank': 1,
 'Niall Horan': 17,
 'Liam Payne': 31,
 'Harry Styles': 21,
 'Louis Tomlinson': 34,
 'Ed Drewett': 9,
 'John Ryan': 33,
 'Julian Bunetta': 36,
 'Jamie Scott': 24,
 'Achraf Jannusi': 2,
 'Bilal Hajji': 1,
 'Eric Sanicola': 1,
 'Geo Slam': 1,
 'RedOne Teddy': 1,
 'Sky': 1,
 'Kristoffer Fogelmark': 3,
 'Savan Kotecha': 10,
 'Albin Nedler': 2,
 'Rami Yacoub': 6,
 'Wayne Hector': 10,
 'Carl Falk': 6,
 'Zayn Malik': 11,
 'Sam Martin': 1,
 'Tom Fletcher': 4,
 'Danny Jones': 3,
 'Dougie Poynter': 3,
 'Jacob Kasher': 3,
 'Gamal "LunchMoney"': 1,
 'Lewis John': 1,
 'Ryan': 1,
 'Steve Robson': 6,
 'Nandini Srivastava': 1,
 'Maureen McDonald': 1,
 'Bernhard Lloyd': 1,
 'Marian Gold': 1,
 'Frank Mertens': 1,
 'S. Pages': 1,
 'Mehner': 1,
 'Steve Mac': 1,
 'August Rigo': 1,
 'Lindy Robbins': 4,
 'Shellback Kristian': 1,
 'Lundin': 1,
 'Ina Wroldsen': 1,
 'Ammar Malik': 1,
 'Ross Golan': 1,
 'Johan Carlsson': 2,
 'Cagaanan Rami': 1,
 'Yacoub Carl': 1,
 'Falk Sava

In [78]:
df_writers = pd.DataFrame(list(writers_dict.items()), columns=['Name', 'NumSongs'])

In [83]:
df_writers.to_csv('df_writers.csv', index=False)

In [2]:
df = pd.read_csv('df_main.csv')

In [7]:
len(df.columns)

27

In [14]:
df.head()

Unnamed: 0,Song,Artist(s),Writer(s),Album(s),Year,Lyrics,Views,Lyrics_word_count,All_word_count,All%,...,Niall%,Zayn_word_count,Zayn%,Most_Common,Harry Styles_W,Zayn Malik_W,Louis Tomlinson_W,Niall Horan_W,Liam Payne_W,Unique_Words
0,18,One Direction,"Ed Sheeran, Oliver Frank",Four,2014,"I got a heart and I got a soul Believe me, I w...",70.0,365,155,42.465753,...,15.616438,37,10.136986,love,False,False,False,False,False,96
1,A.M.,One Direction,"Niall Horan, Liam Payne, Harry Styles, Louis T...",Made in the A.M.,2015,Won't you stay till the A.M.? All my favourite...,4.4,365,129,35.342466,...,13.972603,0,0.0,stay,True,False,True,True,True,98
2,Act My Age,One Direction,"John Ryan, Julian Bunetta, Ed Drewett",Four,2014,"One, two, three, four When I'm fat and old and...",20.0,246,119,48.373984,...,15.853659,41,16.666667,act,False,False,False,False,False,74
3,Alive,One Direction,"Julian Bunetta, John Ryan, Jamie Scott, Louis ...",Midnight Memories,2013,My mother told me I should go and get some the...,1.3,306,109,35.620915,...,15.03268,28,9.150327,said,False,False,True,False,False,111
4,Another World,One Direction,"Achraf Jannusi, Bilal Hajji, Eric Sanicola, Ge...",Up All Night,2011,"It's not me, it's not you, there's a reason I'...",1.2,390,195,50.0,...,6.666667,34,8.717949,take,False,False,False,False,False,89


In [20]:
def add_only_writer(writers):
    i = 0
    count = 0
    member = None
    writers = writers.split(', ')
    while i<len(writers) and count<2:
        if writers[i] in singers_names:
            count = count + 1
            member = writers[i]
        i = i+1
    if count!=0 and count<2:
        return member
    return None
    

In [21]:
df['Main_One'] = df['Writer(s)'].apply(add_only_writer)

In [33]:
df.to_excel('df_main.xlsx', index=False)
df.to_csv('df_main.csv', index=False)

In [37]:
most_common_word = df['Most_Common'].tolist()

In [39]:
dict_most_frequent_word = {}
for word in most_common_word:
    dict_most_frequent_word[word] = df['Lyrics'].str.count(word).sum()

In [44]:
from operator import itemgetter
dict_most_frequent_word = dict(sorted(dict_most_frequent_word.items(), key=itemgetter(1), reverse=True))


In [51]:
most_freq_df = pd.DataFrame(dict_most_frequent_word.items(), columns=['Word', 'Count'])

In [53]:
most_freq_df.to_csv('most_freq_df.csv', index=True)

In [2]:
df = pd.read_excel('df_main.xlsx')

In [31]:
df[df['Louis Tomlinson_W']==True]['Song'].count()

37

In [32]:
df[df['Niall Horan_W']==True]['Song'].count()

17

In [36]:
df[df['Main_One']=='Liam Payne']['Song'].count()

1

In [35]:
df[df['Main_One']=='Louis Tomlinson']['Song'].count()

4