# Poems Topic Extraction Notebook

This notebook has been used in order to extact measure semantic similarity among poems.

In [1]:
import os
import numpy as np
import unidecode
from tqdm import tqdm
import pickle
import re
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Converts a poem with multiple lines into one singular string

def poem_to_string(poem):
    result = ""
    for i in poem:
        result +=' '+i
    return result

In [3]:
# Iterates over all the nawbas and appends the poems into two lists (original and transliterated versions)

total_list_poems = []
total_list_poems_transliterated = []

test_dict = {}

for filename in os.listdir('outputs/'):
    if filename.endswith('.txt'):
        name = os.path.join('/Users/Morgan/Desktop/UPF/musicology.nosync/outputs', filename)
        with open(name, "rb") as fp:  

            b = pickle.load(fp)
        

        
    for i in range(len(b)):

        if len(b[i])>4 and b[i][-1]=='qaṣīdah':
            #print(filename.split('.txt')[0])
            
            #print(b[i][0])
        #if len(b[i])>4:
        #    if not isinstance(b[i][4], float):
            
            total_list_poems.append(poem_to_string(b[i][4]))
            total_list_poems_transliterated.append(poem_to_string(b[i][5]))
            test_dict[(filename.split('.txt')[0],b[i][0])] = (len(total_list_poems)-1, [])
            
            
            
print('Original size :',len(total_list_poems))
print('Transliterated size :',len(total_list_poems_transliterated))

FileNotFoundError: [Errno 2] No such file or directory: '/Users/Morgan/Desktop/UPF/musicology.nosync/outputs/9b194c42-f974-42f8-89b1-123bb07170b8.txt'

In [4]:
# For both versions, lists all the keywords and builds a vector representation of each poem
# Each component of these vectors corresponds to the presence of each keyword in the corresponding poem 


vectorizer = CountVectorizer(analyzer='word',
                             max_df=0.95, 
                             min_df=0.04,
                             ngram_range=(1,6))

X_original = vectorizer.fit_transform(total_list_poems)

print('Number of keywords original :', len(vectorizer.get_feature_names()))
X_array_original = X_original.toarray()
print(len(np.where(X_array_original==0)[0])/(X_array_original.shape[0]*X_array_original.shape[1])*100,'%')


vectorizer = CountVectorizer(analyzer='word',
                             max_df=0.95,
                             min_df=0.04,
                             lowercase=True,
                             strip_accents='unicode',
                             ngram_range=(1,6))

X = vectorizer.fit_transform(total_list_poems_transliterated)

print('Number of keywords transliterated :', len(vectorizer.get_feature_names()))
X_array = X.toarray()
print(100-np.count_nonzero(X_array)/(X_array.shape[0]*X_array.shape[1])*100,'%')

ValueError: empty vocabulary; perhaps the documents only contain stop words

In [131]:
# For a given poem, finds the closest ones based on the vector representation

def neighbor_graph(array, index):
    neighbors = []
    instance = array[index]
    for i in tqdm(range(len(array))):
        if i!=index:
            # only looks at non-zero components in common from both vectors
            indexes = list(set(np.nonzero(array[i])[0]).intersection(np.nonzero(instance)[0]))
            neighbors.append(len(indexes))
    return np.array(neighbors)

In [132]:
# Code to test the function built above, i denotes the index of the poem (from 0 to 147)
# nb denotes the number of similar poems that are returned

i = 25
nb = 5

neighbors = neighbor_graph(X_array, i)
neighbors_original = neighbor_graph(X_array_original, i)


n = np.argsort(neighbors)[-nb:]
n_original = np.argsort(neighbors_original)[-nb:]

print('Poem = ',i,total_list_poems[i])
print('Closest elements :',n)
for j in n:
    if j == i:
        continue

    print('Score {}'.format(j),':',neighbors[j])
for j in n:
    if j == i:
        continue

    print(total_list_poems[j])
    print('-----------------')
    
    
    
print('\n')
print('Original based :\n')



print('Poem = ',i,total_list_poems[i])
print('\n')
print('Closest elements :',n_original)
for j in n_original:
    if j == i:
        continue

    print('Score {}'.format(j),':',neighbors_original[j])
for j in n_original:
    if j == i:
        continue

    print(total_list_poems[j])
    print('-----------------')    

100%|██████████| 153/153 [00:00<00:00, 64102.34it/s]
100%|██████████| 153/153 [00:00<00:00, 36490.87it/s]

Poem =  25  وَمَنْ تَكُنْ بِرَسُولِ اللَّهِ نُصْرَتُهُ إِنْ تَلْقَهُ الأُسْدُ فِي آجَامِهَا تَجِمِ
Closest elements : [ 59  60 119  92   6]
Score 59 : 3
Score 60 : 3
Score 119 : 4
Score 92 : 4
Score 6 : 4
 جُفُونِي قَادَتْ إِلَى حَيْنِي وَثَارِي عِنْدِي فَمَا أَطْلُبْ دَعُونِي أَقْتَصُّ مِنْ جَفْنِي بِسُهْدِي وَعَبْرَتِي أَسْكُبْ
-----------------

-----------------
 أَبْشِرْ لَقَدْ نِلْتَ مَا تَرْجُو وَتَنْتَظِرُ وَقَدْ جَرَى بِالَّذِي تَخْتَارُهُ القَدَرُ وَسَاعَدَتْكَ مِنَ الأَيَّامِ أَرْبَعَةٌ العِزُّ وَالنَّصْرُ وَالتَّمْكِينُ   وَالظَّفَرُ
-----------------
 أَيَا فَاضِحَ البَدْرِ عِنْدَ التَّمَامْ فَهَلْ لَكَ وَاوٌ وَصَادٌ وَلامْ وَهَلْ لا عَطَفْتَ عَلَى عَاشِقٍ يَهِيمُ بِقَافٍ وَبَاءٍ وَلامْ تَمَلَّكْتَ عَقْلِي بِثَغْرٍ شَنِيبْ وَرِيقٍ كَعَيْنٍ وَسِينٍ وَلامْ
-----------------
 وَمَنْ تَكُنْ بِرَسُولِ اللَّهِ نُصْرَتُهُ إِنْ تَلْقَهُ الأُسْدُ فِي آجَامِهَا تَجِمِ
-----------------


Original based :

Poem =  25  وَمَنْ تَكُنْ بِرَسُولِ اللَّهِ نُصْرَتُهُ إِنْ تَلْقَهُ الأُسْدُ 




In [134]:
import random

# function that returns a random set of dissimilar poems

def get_dissimilar(array, index, nb):
    anti_neighbors = []
    neighbors = neighbor_graph(array, index)
    for i in range(nb):
        j =  index
        while (j in neighbors or j==index) or array[index, j]>0:
            j = random.randint(0,array.shape[1]-1)
            print(j)
        anti_neighbors.append(j)
    return anti_neighbors

In [139]:
# We carry out this process for all the poems:
# Build a list of n semantically similar poems
# Build a list of n semantically dissimilar poems


for i in range(len(total_list_poems)):
    for j in test_dict.keys():
        if test_dict[j][0]==i:
            
            #neighbors = neighbor_graph(X_array, i)
            #neighbors_original = neighbor_graph(X_array_original, i)
            #n = np.argsort(neighbors)[-nb:]
            #n_original = np.argsort(neighbors_original)[-nb:]
            n = get_dissimilar(X_array, i, 5)
            #test_dict[j][1].append(n_original)
            test_dict[j][1].append(n)          
            
            
# final processing of the dictionary to ease its future use
new_dict = {}
for key in list(test_dict.keys()):
    temp = []
    for neighbor_index in test_dict[key][1][0]:
        for key_2 in list(test_dict.keys()):
            if test_dict[key_2][0] == neighbor_index:
                temp.append(key_2)
    new_dict[key] = temp
    
# we save the dictionary in a txt file 
with open("dissimilarity_transliterated.txt", "wb") as fp:   #Pickling
    pickle.dump(new_dict, fp)

100%|██████████| 153/153 [00:00<00:00, 41276.68it/s]
100%|██████████| 153/153 [00:00<00:00, 36366.80it/s]
100%|██████████| 153/153 [00:00<00:00, 35762.85it/s]
100%|██████████| 153/153 [00:00<00:00, 41757.45it/s]
100%|██████████| 153/153 [00:00<00:00, 48326.57it/s]
100%|██████████| 153/153 [00:00<00:00, 45619.43it/s]
100%|██████████| 153/153 [00:00<00:00, 34105.47it/s]
100%|██████████| 153/153 [00:00<00:00, 39311.96it/s]
100%|██████████| 153/153 [00:00<00:00, 32895.66it/s]
100%|██████████| 153/153 [00:00<00:00, 41771.04it/s]
100%|██████████| 153/153 [00:00<00:00, 12566.40it/s]
100%|██████████| 153/153 [00:00<00:00, 24658.16it/s]
100%|██████████| 153/153 [00:00<00:00, 40767.96it/s]
100%|██████████| 153/153 [00:00<00:00, 17219.75it/s]
100%|██████████| 153/153 [00:00<00:00, 19686.13it/s]
100%|██████████| 153/153 [00:00<00:00, 28937.97it/s]
100%|██████████| 153/153 [00:00<00:00, 33663.56it/s]
  0%|          | 0/153 [00:00<?, ?it/s]

108
64
69
189
37
37
9
175
54
20
124
87
136
39
86
11
101
145
176
141
121
50
81
66
160
158
182
183
130
113
147
76
171
90
185
62
24
120
65
3
157
98
62
38
150
154
3
26
55
61
49
156
34
104
49
149
34
13
38
179
187
37
44
86
117
174
48
101
89
102
31
76
129
87
16
159
64
90
165
74
82
3
121
21
147
58
40
115
189
170
49
190
52
124
6
14
57
132
147


100%|██████████| 153/153 [00:00<00:00, 15687.88it/s]
100%|██████████| 153/153 [00:00<00:00, 29148.28it/s]
100%|██████████| 153/153 [00:00<00:00, 12021.89it/s]
100%|██████████| 153/153 [00:00<00:00, 28635.81it/s]
100%|██████████| 153/153 [00:00<00:00, 34397.97it/s]
100%|██████████| 153/153 [00:00<00:00, 52613.64it/s]
100%|██████████| 153/153 [00:00<00:00, 59601.42it/s]
100%|██████████| 153/153 [00:00<00:00, 44029.40it/s]
100%|██████████| 153/153 [00:00<00:00, 60047.58it/s]
100%|██████████| 153/153 [00:00<00:00, 39566.47it/s]
100%|██████████| 153/153 [00:00<00:00, 64475.89it/s]
100%|██████████| 153/153 [00:00<00:00, 34740.61it/s]
100%|██████████| 153/153 [00:00<00:00, 36143.54it/s]
100%|██████████| 153/153 [00:00<00:00, 37296.79it/s]
100%|██████████| 153/153 [00:00<00:00, 33782.30it/s]
100%|██████████| 153/153 [00:00<00:00, 35820.74it/s]
100%|██████████| 153/153 [00:00<00:00, 40830.22it/s]
100%|██████████| 153/153 [00:00<00:00, 17428.33it/s]
100%|██████████| 153/153 [00:00<00:00, 36849.1

83
67
9
145
88
50
152
158
180
162
13
187
40
160
52
51
180
39
158
29
157
46
155
59
19
54
147
135
7
47
13
124
131
65
18
74
162
115
56
71
82
182
172
72
62
34
92
154
190
121
123
67
71
17
128
51
64
44
70
88
191
34
180
73
190
172
118
138
45
76
74
1
17
188
189
76
82
180
190
1
181
46
183
131
189
41
100
92
36
6
18
66
174
31
24
137
9
7
108
21
177
79
113
115
24
61
108
59
96
92
44
48
137
26
50
100


100%|██████████| 153/153 [00:00<00:00, 7825.67it/s]
100%|██████████| 153/153 [00:00<00:00, 28245.09it/s]
100%|██████████| 153/153 [00:00<00:00, 20084.14it/s]
100%|██████████| 153/153 [00:00<00:00, 35140.10it/s]
100%|██████████| 153/153 [00:00<00:00, 31618.47it/s]
100%|██████████| 153/153 [00:00<00:00, 30953.53it/s]
100%|██████████| 153/153 [00:00<00:00, 21664.65it/s]
100%|██████████| 153/153 [00:00<00:00, 29087.50it/s]
100%|██████████| 153/153 [00:00<00:00, 17443.49it/s]
100%|██████████| 153/153 [00:00<00:00, 77316.69it/s]
100%|██████████| 153/153 [00:00<00:00, 36699.56it/s]
100%|██████████| 153/153 [00:00<00:00, 51602.49it/s]
100%|██████████| 153/153 [00:00<00:00, 78422.16it/s]

83
45
165
55
122
162
48
29
179
137
139
172
162
56
105
10
128
85
116
51
48
165
186
99
121
154
25
146
153
172
1
112
155
119
136
51
99
119
117
25
94
12
70
153
13
121
160
120
112
4
101
179
58
111
173
1
45
8
152
59
23
94
184
182
137
64
147
85
9
63
129
177
141
34
36
77
171
160
164
34
22
172
139
180
86
170



100%|██████████| 153/153 [00:00<00:00, 19540.47it/s]
100%|██████████| 153/153 [00:00<00:00, 41962.24it/s]
100%|██████████| 153/153 [00:00<00:00, 67105.36it/s]
100%|██████████| 153/153 [00:00<00:00, 26980.39it/s]
100%|██████████| 153/153 [00:00<00:00, 69760.68it/s]
100%|██████████| 153/153 [00:00<00:00, 59293.03it/s]
100%|██████████| 153/153 [00:00<00:00, 11354.61it/s]
100%|██████████| 153/153 [00:00<00:00, 26184.45it/s]
100%|██████████| 153/153 [00:00<00:00, 62092.74it/s]
100%|██████████| 153/153 [00:00<00:00, 48819.21it/s]
100%|██████████| 153/153 [00:00<00:00, 64391.78it/s]
100%|██████████| 153/153 [00:00<00:00, 25836.56it/s]
100%|██████████| 153/153 [00:00<00:00, 21491.24it/s]
100%|██████████| 153/153 [00:00<00:00, 77466.02it/s]
100%|██████████| 153/153 [00:00<00:00, 51898.79it/s]
100%|██████████| 153/153 [00:00<00:00, 18809.64it/s]
100%|██████████| 153/153 [00:00<00:00, 51777.35it/s]
100%|██████████| 153/153 [00:00<00:00, 15940.79it/s]
100%|██████████| 153/153 [00:00<00:00, 21039.

13
69
19
47
107
8
189
131
163
94
6
8
85
4
78
32
18
93
142
144
190
60
35
78
12
48
177
46
59
158
144
87
188
161
89
6
136
76
121
164
41
36
75
189
43
62
7
136
40
6
1
85
123
186
13
15
161
113
72
101
18
47
48
37
1
172
95
35
115
34
79
109
164
168
135
5
111
123
182
24
78
101
74
30
141
77
191
60
112
169
181
85
183
49
57
71
134
30
188
121
28
78
103
110
161
46
42
108
115
165
165
109
23
40
120
19
182
186
24
154
142
24
130
44
188
105
25
1
160
68
102
42
2
46
25
38
160


100%|██████████| 153/153 [00:00<00:00, 38635.07it/s]
100%|██████████| 153/153 [00:00<00:00, 33871.45it/s]
100%|██████████| 153/153 [00:00<00:00, 42953.72it/s]
100%|██████████| 153/153 [00:00<00:00, 12293.18it/s]
100%|██████████| 153/153 [00:00<00:00, 22527.07it/s]
100%|██████████| 153/153 [00:00<00:00, 45198.51it/s]
100%|██████████| 153/153 [00:00<00:00, 15071.13it/s]
100%|██████████| 153/153 [00:00<00:00, 16898.71it/s]
100%|██████████| 153/153 [00:00<00:00, 24371.60it/s]


19
14
130
170
174
188
135
131
39
163
131
10
96
154
149
38
143
180
75
56
109
186
122
64
23
4
167
91
178
6
47
169
132
170
174
191
60
148
27
168
120
91
50
39
10
181
64
3
97
140
27
17
65
191
95
149
144
171


100%|██████████| 153/153 [00:00<00:00, 11833.90it/s]
100%|██████████| 153/153 [00:00<00:00, 7474.04it/s]
100%|██████████| 153/153 [00:00<00:00, 45198.51it/s]
100%|██████████| 153/153 [00:00<00:00, 9167.68it/s]
100%|██████████| 153/153 [00:00<00:00, 69174.14it/s]
100%|██████████| 153/153 [00:00<00:00, 48527.56it/s]
100%|██████████| 153/153 [00:00<00:00, 22812.15it/s]
100%|██████████| 153/153 [00:00<00:00, 19408.68it/s]
100%|██████████| 153/153 [00:00<00:00, 28867.68it/s]
100%|██████████| 153/153 [00:00<00:00, 33946.71it/s]
100%|██████████| 153/153 [00:00<00:00, 13045.38it/s]
100%|██████████| 153/153 [00:00<00:00, 22273.73it/s]
100%|██████████| 153/153 [00:00<00:00, 25398.90it/s]
100%|██████████| 153/153 [00:00<00:00, 24175.12it/s]


175
54
167
161
175
62
106
166
157
151
82
35
2
86
144
135
177
107
146
66
137
165
189
151
60
43
51
190
60
47
9
29
44
113
29
63
92
42
70
97
16
175
102
191
182
62
22
22
2
114
187
152
121
24
137
53
47
78
124
139
26
145
33
0
119
187
64
96
107
124
112
158
10
183
20
62
59
188
173
32
134
4
23
127
60
18
151


100%|██████████| 153/153 [00:00<00:00, 17997.77it/s]
100%|██████████| 153/153 [00:00<00:00, 8766.90it/s]
100%|██████████| 153/153 [00:00<00:00, 50777.70it/s]
100%|██████████| 153/153 [00:00<00:00, 8722.57it/s]
100%|██████████| 153/153 [00:00<00:00, 39834.17it/s]
100%|██████████| 153/153 [00:00<00:00, 42299.68it/s]
100%|██████████| 153/153 [00:00<00:00, 27523.10it/s]
100%|██████████| 153/153 [00:00<00:00, 19386.98it/s]
100%|██████████| 153/153 [00:00<00:00, 79274.68it/s]
100%|██████████| 153/153 [00:00<00:00, 21983.03it/s]
100%|██████████| 153/153 [00:00<00:00, 47133.93it/s]
100%|██████████| 153/153 [00:00<00:00, 41722.16it/s]
100%|██████████| 153/153 [00:00<00:00, 40744.67it/s]
100%|██████████| 153/153 [00:00<00:00, 54522.39it/s]


146
38
119
48
67
15
146
18
169
95
2
117
98
152
27
174
136
8
128
131
61
151
152
121
106
85
181
53
133
149
16
109
138
72
19
115
8
102
117
152
133
70
37
180
190
104
87
109
27
66
191
89
20
107
35
184
178
173
183
27
186
155
165
187
43
5
66
14
59
189
77
170
106
3
143
50
174
152
114
53
150
181
108
4
64
146
5
131


100%|██████████| 153/153 [00:00<00:00, 30631.43it/s]
100%|██████████| 153/153 [00:00<00:00, 46364.32it/s]
100%|██████████| 153/153 [00:00<00:00, 52008.15it/s]
100%|██████████| 153/153 [00:00<00:00, 46930.56it/s]
100%|██████████| 153/153 [00:00<00:00, 28688.30it/s]
100%|██████████| 153/153 [00:00<00:00, 24696.11it/s]
100%|██████████| 153/153 [00:00<00:00, 74265.54it/s]
100%|██████████| 153/153 [00:00<00:00, 43208.22it/s]
100%|██████████| 153/153 [00:00<00:00, 13418.83it/s]
100%|██████████| 153/153 [00:00<00:00, 27227.65it/s]
100%|██████████| 153/153 [00:00<00:00, 55316.65it/s]
100%|██████████| 153/153 [00:00<00:00, 28116.39it/s]
100%|██████████| 153/153 [00:00<00:00, 48446.97it/s]
100%|██████████| 153/153 [00:00<00:00, 44441.03it/s]
100%|██████████| 153/153 [00:00<00:00, 22530.23it/s]
100%|██████████| 153/153 [00:00<00:00, 51465.92it/s]
  0%|          | 0/153 [00:00<?, ?it/s]

122
59
27
83
145
152
74
12
141
51
18
73
87
158
110
22
96
3
183
91
151
7
174
95
190
120
173
75
169
14
105
117
153
60
16
99
53
76
21
149
172
103
3
165
117
159
179
173
136
185
46
126
115
103
187
79
53
118
135
43
66
54
139
65
124
87
59
36
189
20
166
56
163
169
88
116
17
74
105
188
23
89
76
11
8
131
16
119
94
38
38


100%|██████████| 153/153 [00:00<00:00, 10435.97it/s]
100%|██████████| 153/153 [00:00<00:00, 41284.64it/s]
100%|██████████| 153/153 [00:00<00:00, 76187.64it/s]
100%|██████████| 153/153 [00:00<00:00, 22384.84it/s]
100%|██████████| 153/153 [00:00<00:00, 36360.62it/s]
100%|██████████| 153/153 [00:00<00:00, 55938.68it/s]
100%|██████████| 153/153 [00:00<00:00, 34090.97it/s]
100%|██████████| 153/153 [00:00<00:00, 59907.44it/s]
100%|██████████| 153/153 [00:00<00:00, 46054.87it/s]
100%|██████████| 153/153 [00:00<00:00, 20628.39it/s]
100%|██████████| 153/153 [00:00<00:00, 17531.65it/s]
100%|██████████| 153/153 [00:00<00:00, 30741.49it/s]
100%|██████████| 153/153 [00:00<00:00, 42160.73it/s]
100%|██████████| 153/153 [00:00<00:00, 32546.97it/s]
100%|██████████| 153/153 [00:00<00:00, 34615.06it/s]
100%|██████████| 153/153 [00:00<00:00, 44592.35it/s]
100%|██████████| 153/153 [00:00<00:00, 42252.34it/s]
  0%|          | 0/153 [00:00<?, ?it/s]

129
165
81
56
80
28
114
8
94
5
13
153
170
141
84
146
113
22
52
95
121
38
104
80
184
162
75
15
0
152
37
151
13
28
156
179
102
136
44
172
122
92
50
134
6
185
73
64
35
123
14
137
110
168
83
167
9
90
161
109
48
45
36
16
175
20
75
0
48
159
163
7
149
91
100
170
13
87
153
7
164
35
116
178
68
162
84
15
83
156
104
82
117


100%|██████████| 153/153 [00:00<00:00, 17549.87it/s]
100%|██████████| 153/153 [00:00<00:00, 55671.77it/s]
100%|██████████| 153/153 [00:00<00:00, 60810.06it/s]
100%|██████████| 153/153 [00:00<00:00, 55036.75it/s]

15
137
107
36
88
168
15
82
11
91
148
127
155
9
81
148
7
191
115
66
144
142
43
14
57
38
123
64
100
93
5



