In [172]:
import pandas as pd
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer



In [173]:
arabic_lyrics = pd.read_csv("arabicLyrics.csv")
arabic_lyrics.head()

Unnamed: 0,songID,Singer,SongTitle,SongWriter,Composer,LyricsOrder,Lyrics,SingerNationality,SongDialect
0,1537,ابتسام,اروح لاحبابي,ملامح,بندر بن فهد,2,اروح لاحبابي والاقي الفرح ساكن عينهم,Morocco,Meghribi
1,1537,ابتسام,اروح لاحبابي,ملامح,بندر بن فهد,3,ابتسم لافراحهم وانا من الهم احترق,Morocco,Meghribi
2,1537,ابتسام,اروح لاحبابي,ملامح,بندر بن فهد,4,واسأل جروحي من ترى حس بعذابي منهم,Morocco,Meghribi
3,1537,ابتسام,اروح لاحبابي,ملامح,بندر بن فهد,5,وبالحقيقه انصدم محدن معه همي فرق,Morocco,Meghribi
4,1537,ابتسام,اروح لاحبابي,ملامح,بندر بن فهد,6,دورت في كل الوجيه حسيت غربه بينهم,Morocco,Meghribi


In [174]:
arabic_lyrics[["SingerNationality"]].value_counts()

SingerNationality
Egypt                139193
Saudi Arabia          87822
Lebanon               78220
Iraq                  70640
Sudan                 44580
Kuwait                26518
Syria                 23580
UAE                   19263
Morocco               11846
Tunisia                5611
Yemen                  5279
Jordan                 4309
Algeria                3074
Qatar                  2746
Bahrain                2508
Palestine              1116
Oman                   1086
Libya                   505
dtype: int64

In [175]:
moroccan_songs = arabic_lyrics[arabic_lyrics["SingerNationality"] == "Morocco"]
moroccan_songs.head()

Unnamed: 0,songID,Singer,SongTitle,SongWriter,Composer,LyricsOrder,Lyrics,SingerNationality,SongDialect
0,1537,ابتسام,اروح لاحبابي,ملامح,بندر بن فهد,2,اروح لاحبابي والاقي الفرح ساكن عينهم,Morocco,Meghribi
1,1537,ابتسام,اروح لاحبابي,ملامح,بندر بن فهد,3,ابتسم لافراحهم وانا من الهم احترق,Morocco,Meghribi
2,1537,ابتسام,اروح لاحبابي,ملامح,بندر بن فهد,4,واسأل جروحي من ترى حس بعذابي منهم,Morocco,Meghribi
3,1537,ابتسام,اروح لاحبابي,ملامح,بندر بن فهد,5,وبالحقيقه انصدم محدن معه همي فرق,Morocco,Meghribi
4,1537,ابتسام,اروح لاحبابي,ملامح,بندر بن فهد,6,دورت في كل الوجيه حسيت غربه بينهم,Morocco,Meghribi


In [192]:
all_lyrics_by_singer = moroccan_songs.groupby(['songID','Singer','SongTitle','SongWriter','Composer','SingerNationality'])['Lyrics'].apply(lambda x: ' '.join(x)).reset_index()
all_lyrics_by_singer.head()

Unnamed: 0,songID,Singer,SongTitle,SongWriter,Composer,SingerNationality,Lyrics
0,20,جنات,اسمع كلامي,محمد عاطف,شريف اسماعيل,Morocco,أسمع كلامى وصدقه انت اللي روحى بتعشقه كان حلم ...
1,24,جنات,افهمني حبيبي,غير معروف,غيرمعروف,Morocco,عايز تعرف ليه انا بتقل كده .. وانا متاكد ان ان...
2,25,جنات,اكتر من سنه,نادر عبد الله,محمد يحيى,Morocco,اكتر من سنه جنات فات يجي اكتر من سنة و ما نستش...
3,38,جنات,الطفله البريئه,نادر عبد الله,وليد سعد,Morocco,الطفلة البريئة المغمضة بقت من النهاردة مش كده ...
4,52,جنات,اللي بيني وبينك,خالد منير & نادر عبد الله,محمد رحيم,Morocco,اللي بيني وبينك جنات الي بيني وبينك الي بيني و...


### Removing the punctuation

In [195]:
import re
import string

def remove_punctuation(text):
    punctuation = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
    
    table = str.maketrans(' ', ' ', punctuation)
    punctua_bag_words = [w.translate(table) for w in word_tokenize(text)]

    return ' '.join(punctua_bag_words)

all_lyrics_by_singer["Lyrics"] = all_lyrics_by_singer["Lyrics"].apply(lambda x: remove_punctuation(x))

In [209]:
from collections import Counter
from nltk.tokenize import word_tokenize

bag_words = word_tokenize(' '.join(list(all_lyrics_by_singer["Lyrics"])))
cnt = Counter(bag_words)

cnt.most_common()[:10]

[('انا', 1305),
 ('ما', 1131),
 ('من', 1069),
 ('يا', 988),
 ('في', 974),
 ('و', 765),
 ('قلبي', 531),
 ('ولا', 527),
 ('لا', 507),
 ('اللي', 502)]

In [163]:
all_lyrics_by_song = list(all_lyrics_by_singer["Lyrics"])
songs_title = all_lyrics_by_singer['SongTitle']

'أسمع كلامى وصدقه انت اللي روحى بتعشقه كان حلم نفسي احققة اني ابقى ليك قربني منك ضمني قد اما احبك حبيني من غيرك انت يهمني دا انا روحي فيك ملهوفه عليك ومسلمه تقدر تقول مستسلمه حاسه اني طايره في السما واخدني الشوق متحيره متغيره وكأني لسه صغيره لو بحلم انا م الحلم دا مش عايزه أفوق لحظة حنان انا عيشتها لحظة ما شوفتك وقتها وبأعلى صوت انا قولتها هفضل معاك جمبك طريقى هكمله لو لسه عمري فى أوله المستحيل انا هعمله علشان هواك'

In [162]:
all_lyrics_by_singer

metrix_term = pd.DataFrame(columns=)

for 

Unnamed: 0,songID,Singer,SongTitle,SongWriter,Composer,SingerNationality,Lyrics
0,20,جنات,اسمع كلامي,محمد عاطف,شريف اسماعيل,Morocco,أسمع كلامى وصدقه انت اللي روحى بتعشقه كان حلم ...
1,24,جنات,افهمني حبيبي,غير معروف,غيرمعروف,Morocco,عايز تعرف ليه انا بتقل كده وانا متاكد ان انا ...
2,25,جنات,اكتر من سنه,نادر عبد الله,محمد يحيى,Morocco,اكتر من سنه جنات فات يجي اكتر من سنة و ما نستش...
3,38,جنات,الطفله البريئه,نادر عبد الله,وليد سعد,Morocco,الطفلة البريئة المغمضة بقت من النهاردة مش كده ...
4,52,جنات,اللي بيني وبينك,خالد منير & نادر عبد الله,محمد رحيم,Morocco,اللي بيني وبينك جنات الي بيني وبينك الي بيني و...
...,...,...,...,...,...,...,...
704,29942,سميرة سعيد,يلي هواك,خالد البذال,سليمان الملا,Morocco,يلي هواك اصعب مراحل حياتي فيك الرجا ماخاب لوّن...
705,30025,سميرة سعيد,يوم اقابلك فيه,محمد زكي الملاح,ابراهيم رافت,Morocco,وانسي كل الدنيا جنبك واحكي لعيونك وقلبك وانسي ...
706,30064,طاهرة حماميش,يوم ما افترقنا,غير معروف,غير معروف,Morocco,ليه يوم ما افترقنا انا قولتله هعتبره ماضي وهعر...
707,30069,سميرة سعيد,يوم من عمرنا,غير معروف,غير معروف,Morocco,يوم من عمرنا كان زى النهارده من كام سنه يوم جي...


In [164]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(all_lyrics_by_song)

ok = pd.DataFrame(X.toarray().T, index=vectorizer.get_feature_names(), columns=list(songs_title))
ok

Unnamed: 0,اسمع كلامي,افهمني حبيبي,اكتر من سنه,الطفله البريئه,اللي بيني وبينك,أسرارنا,أنا ماشي ساهل,أنا و أنا,إرحمني,ابشرك,...,يايمه حبيته,يحاسبلي,يخليك للى,يعذبني,يلا نفرح,يلي هواك,يوم اقابلك فيه,يوم ما افترقنا,يوم من عمرنا,يوم ورا يوم
100,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ai,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
amore,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
anlarda,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ara,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ﻭﻳﻠﻲ,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ﻳﺎ,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ﻳﺎﺣﺒﻲ,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ﻳﺪﻕ,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# getting the frequency word in all lyrics

In [165]:
len(all_lyrics_by_song)

709

In [166]:
from collections import Counter
from nltk.tokenize import word_tokenize

bag_words = word_tokenize(str(all_lyrics_by_song))
cnt = Counter(bag_words)

cnt.most_common()[:100]

[('انا', 1266),
 ('ما', 1121),
 ('من', 1059),
 ('في', 971),
 ('يا', 962),
 ('و', 765),
 ("'", 710),
 (',', 708),
 ('ولا', 526),
 ('قلبي', 525),
 ('اللي', 498),
 ('لا', 495),
 ('كل', 471),
 ('مش', 463),
 ('انت', 433),
 ('لو', 418),
 ('على', 416),
 ('حبيبي', 375),
 ('يوم', 335),
 ('فى', 332),
 ('ده', 316),
 ('ايه', 301),
 ('ليه', 291),
 ('كان', 290),
 ('وانا', 289),
 ('عليك', 282),
 ('معاك', 262),
 ('كنت', 255),
 ('الله', 235),
 ('اللى', 224),
 ('غير', 215),
 ('بس', 214),
 ('أنا', 207),
 ('الحب', 196),
 ('هو', 195),
 ('وانت', 194),
 ('ليك', 186),
 ('عمري', 185),
 ('عليا', 185),
 ('ليا', 171),
 ('الدنيا', 169),
 ('كده', 167),
 ('لي', 164),
 ('حبك', 161),
 ('فيك', 154),
 ('قلبى', 150),
 ('حب', 142),
 ('تاني', 141),
 ('لك', 138),
 ('خلاص', 137),
 ('لما', 135),
 ('عليه', 134),
 ('او', 133),
 ('عيني', 130),
 ('بيك', 129),
 ('فيه', 127),
 ('آه', 124),
 ('القلب', 118),
 ('ان', 117),
 ('بعد', 113),
 ('فيها', 111),
 ('عشان', 109),
 ('عن', 109),
 ('قلبك', 108),
 ('دي', 107),
 ('اه', 105),
 ('الناس