In [46]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from collections import Counter
import re


In [64]:
singers_names = ['Harry Styles', 'Zayn Malik', 'Louis Tomlinson', 'Niall Horan', 'Liam Payne']

In [2]:
df = pd.read_csv('df_main.csv')

In [12]:
def get_nouns_verbs(text):
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(text.lower())
    filtered_text = [word for word in word_tokens if word.isalnum() and word not in stop_words]

    lemmatizer = WordNetLemmatizer()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in filtered_text]

    pos_tags = pos_tag(lemmatized_words)
    nouns_verbs = [word for word, tag in pos_tags if tag.startswith('N') or tag.startswith('V')]

    return nouns_verbs

In [23]:
def most_common_word(text):
    nouns_verbs = get_nouns_verbs(text)
    word_counts = Counter(nouns_verbs)
    return word_counts.most_common(1)[0][0]

In [24]:
df['Most_Common'] = df['Lyrics'].apply(most_common_word)

In [51]:
df['Writer(s)'] = df['Writer(s)'].apply(lambda x: re.sub(r'[\'\[\]\\]', '', x))

In [65]:
for singer in singers_names:
    df[singer+'_W'] = df['Writer(s)'].apply(lambda x: True if singer in x else False)

In [53]:
unique_writers = df['Writer(s)'].explode().unique()

In [74]:
writers_dict = {}

for writers in unique_writers:
    writers = writers.split(',')
    for writer in writers:
        writer = writer.strip()
        if writer in writers_dict:
            writers_dict[writer] = writers_dict[writer]+1
        else:
            writers_dict[writer] = 1

In [75]:
writers_dict

{'Ed Sheeran': 4,
 'Oliver Frank': 1,
 'Niall Horan': 17,
 'Liam Payne': 31,
 'Harry Styles': 21,
 'Louis Tomlinson': 34,
 'Ed Drewett': 9,
 'John Ryan': 33,
 'Julian Bunetta': 36,
 'Jamie Scott': 24,
 'Achraf Jannusi': 2,
 'Bilal Hajji': 1,
 'Eric Sanicola': 1,
 'Geo Slam': 1,
 'RedOne Teddy': 1,
 'Sky': 1,
 'Kristoffer Fogelmark': 3,
 'Savan Kotecha': 10,
 'Albin Nedler': 2,
 'Rami Yacoub': 6,
 'Wayne Hector': 10,
 'Carl Falk': 6,
 'Zayn Malik': 11,
 'Sam Martin': 1,
 'Tom Fletcher': 4,
 'Danny Jones': 3,
 'Dougie Poynter': 3,
 'Jacob Kasher': 3,
 'Gamal "LunchMoney"': 1,
 'Lewis John': 1,
 'Ryan': 1,
 'Steve Robson': 6,
 'Nandini Srivastava': 1,
 'Maureen McDonald': 1,
 'Bernhard Lloyd': 1,
 'Marian Gold': 1,
 'Frank Mertens': 1,
 'S. Pages': 1,
 'Mehner': 1,
 'Steve Mac': 1,
 'August Rigo': 1,
 'Lindy Robbins': 4,
 'Shellback Kristian': 1,
 'Lundin': 1,
 'Ina Wroldsen': 1,
 'Ammar Malik': 1,
 'Ross Golan': 1,
 'Johan Carlsson': 2,
 'Cagaanan Rami': 1,
 'Yacoub Carl': 1,
 'Falk Sava

In [78]:
df_writers = pd.DataFrame(list(writers_dict.items()), columns=['Name', 'NumSongs'])

In [80]:
df_writers.to_csv('df_writers.csv', index=False)