In [1]:
pip install python-docx


Note: you may need to restart the kernel to use updated packages.


In [2]:
#download the data from the sdg website, they have a button that allows u to download all
#take the file u get and put it inside the same directory as the jupyter notebook file
#create an empty file named "corpus"
#at the end you should get a corpus file containing 17 word documents, each for a specific SDG
from docx import Document
import os


for i in range(1,18):
    if(i<10):
        fileName = "Metadata-0" +str(i)
    else:
        fileName = "Metadata-" +str(i)
    print(fileName)
    initDoc = Document()
    for file in os.listdir("./SDG-indicator-metadata"):
        if file.startswith(fileName) and file.endswith(".docx"):  
            source_document = Document("./SDG-indicator-metadata/"+ file)
            for paragraph in source_document.paragraphs:
                text = paragraph.text
                initDoc.add_paragraph(text)
    initDoc.save("./corpus/SDG" + str(i) + ".docx")

Metadata-01
Metadata-02
Metadata-03
Metadata-04
Metadata-05
Metadata-06
Metadata-07
Metadata-08
Metadata-09
Metadata-10
Metadata-11
Metadata-12
Metadata-13
Metadata-14
Metadata-15
Metadata-16
Metadata-17


# Text Preprocessing
we will create some functions that we can use in order to make sure our data is 

In [3]:
import contractions
def decontract_words(text):
# creating an empty list
    expanded_words = []
    for word in text.split():
# using contractions.fix to expand the shortened words
      expanded_words.append(contractions.fix(word))

    expanded_text = ' '.join(expanded_words)
    return expanded_text

In [4]:
#defining the methods we will use

import re


UrlRegex = r'https?://\S+|www.\S+'
HtmlStyleLinkRegex = r'<a\s+(?:[^>]*?\s+)?href=(["])(.*?)\1'
ampRegex = r'&amp;'
brRegex = '<br/> | <br> | </br>'
specialCharRegex = r'[^a-zA-Z0-9-" "]'


def filterRegex(s):
    
    #remove all html style link
    br_regex = re.compile(r'<(?!br).*?>')
    filtered_text = br_regex.sub('', s)
    filtered_text = re.sub(UrlRegex,'', filtered_text)
    filtered_text = re.sub(ampRegex, '', filtered_text)
    
    # remove all <br>
    filtered_text = re.sub(brRegex, '', filtered_text)

    
    # remove all special characters
    filtered_text = re.sub(specialCharRegex,'',filtered_text)
    
    return filtered_text


In [5]:
import nltk
nltk.download('stopwords') 

from nltk.corpus import stopwords
stops = set(stopwords.words('english'))

def remove_stopwords(s):
    words = s.split()
    newWords= []
    for val in words:
        if (val not in stops):
            newWords.append(val)
    
    return " ".join(newWords)



[nltk_data] Error loading stopwords: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>


In [6]:
#suppose we have nonsense written in our document, they will be removed by checking if thr word belongs to nltk.corpus.words
import nltk
nltk.download('words')

words = set(nltk.corpus.words.words())

def remove_nonsense(sent): 
    return " ".join(w for w in nltk.wordpunct_tokenize(sent) if w.lower() in words or not w.isalpha())


    

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\Hp\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


In [7]:
#if you look inside the document, we have enumerations using letters and numbers, for ex a0, a1 etc... so we used this function 
#to remove the digits from them and we then used the function remove_alone_char to remove remaining letters that are alone in the context
# for ex a, b, c , ...
def removeDigitsFromString(s):
    words = s.split()
    fixedWords = []
    for val in words:
        f=filter(str.isalpha,val)
        s1= "".join(f)
        fixedWords.append(s1)
    
    return " ".join(fixedWords)
    

In [8]:
def remove_alone_char(s):
    words = s.split()
    fixedWords = []
    for val in words:
        newVal = val
        if(len(val) == 1 and val.isalpha):
            newVal = ""
        
        fixedWords.append(newVal)
    
    return " ".join(fixedWords)

In [9]:

def preprocessText(s):
    s = s.lower()   #to lower case
    s = filterRegex(s) #filter regex in the beginning to remove all weird formats specified in the function
    s = removeDigitsFromString(s) # to remove digits inside strings , for ex in enumeration of paragraphs: a0, a1..
    s = decontract_words(s)   #to remove decontractions
    s = remove_stopwords(s)    
    s = remove_alone_char(s) # to remove enumeration of paragraphs with letters:
                                #a, b and also remove the remaining string from the function removeDigitsFromString
    s = remove_nonsense(s)  # removes any word that doesnt not make sense
    return s

In [10]:
def preprocessFile(filePath):
    docToChange = Document(filePath)
    for paragraph in docToChange.paragraphs:
         paragraph.text = preprocessText(paragraph.text)
    docToChange.save(filePath)


def preprocessDirectory(directoryName):
    for file in os.listdir("./" + directoryName):
        if  file.endswith(".docx"):
            print("preprocessing "+file +" ...")
            preprocessFile("./" + directoryName + "/"+file)
    print("Done!!")

In [11]:
preprocessDirectory("corpus")

preprocessing SDG1.docx ...
preprocessing SDG10.docx ...
preprocessing SDG11.docx ...
preprocessing SDG12.docx ...
preprocessing SDG13.docx ...
preprocessing SDG14.docx ...
preprocessing SDG15.docx ...
preprocessing SDG16.docx ...
preprocessing SDG17.docx ...
preprocessing SDG2.docx ...
preprocessing SDG3.docx ...
preprocessing SDG4.docx ...
preprocessing SDG5.docx ...
preprocessing SDG6.docx ...
preprocessing SDG7.docx ...
preprocessing SDG8.docx ...
preprocessing SDG9.docx ...
Done!!


# Vectorization using GloVe
we will use GloVe, which is an already read set of mappings between words and their vectors. These mappings were created based on millions of words using unsupervised learning. We will use glove which has each word and its corresponding vector in order to be able to generate an array for the words in our document. At the end, we will need to transform this array of vectors into one vector only for each SDG document.
links used for glove:
https://analyticsindiamag.com/hands-on-guide-to-word-embeddings-using-glove/
https://medium.com/analytics-vidhya/basics-of-using-pre-trained-glove-vectors-in-python-d38905f356db#:~:text=Brief%20Introduction%20to%20GloVe,in%20a%20high%2Ddimensional%20space
from where to download Glove files that we will use to form our dictionary of glove: https://nlp.stanford.edu/projects/glove/

In [12]:
#first let's import the libraries needed
import numpy as np
from scipy import spatial
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

In [13]:
#first of all download the glove zipped file that will contains 3 text files, we used the one with approx 800 mb
#we will use the first text file of 300D (300D is the one with the less dimensions for the vector of each word)

#(note that embedding means the vector corresponding to a word)
#now let's read the text file that we will use to get our pre trained embeddings



embeddings_dict = {}
with open("glove.6B/glove.6B.300d.txt", 'r', encoding="utf-8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], "float32")
        embeddings_dict[word] = vector



In [14]:
#little extra to check how powerful is glove
#this function gives us all related words to a word embedding given
def find_closest_embeddings(embedding):
    return sorted(embeddings_dict.keys(), key=lambda word: spatial.distance.euclidean(embeddings_dict[word], embedding))
embeddings_dict

find_closest_embeddings(embeddings_dict["king"])

['king',
 'queen',
 'monarch',
 'prince',
 'kingdom',
 'reign',
 'ii',
 'iii',
 'brother',
 'crown',
 'uncle',
 'nephew',
 'henry',
 'later',
 'throne',
 'father',
 'son',
 'succeeded',
 'ahrts',
 'cousin',
 'http://www.co.mo.md.us',
 ',',
 'latter',
 'dihg',
 'ruler',
 'however',
 'grandson',
 'likewise',
 'prohertrib',
 'although',
 '_____________________________________________',
 'kings',
 'afterwards',
 'drohs',
 'vi',
 '.',
 '65stk',
 'instead',
 'when',
 'bdb94',
 'k978-1',
 'bulletinyyy',
 'str95bb',
 'k977-1',
 'js94bb',
 'indeed',
 'http://www.mediabynumbers.com',
 'bb96',
 'k587-1',
 'thus',
 'mo95',
 'finally',
 'piyanart',
 'srivalo',
 'hahlt',
 'mentioned',
 'though',
 'grandfather',
 'tehf',
 'appears',
 'iv',
 'followed',
 'brought',
 'instance',
 'presumably',
 'brother-in-law',
 'both',
 'frederick',
 'accompanied',
 'fact',
 '__________________________________',
 'edward',
 'duke',
 'once',
 'whom',
 'and',
 'also',
 'named',
 'interbk',
 'http://www.nwguild.org',
 '

In [15]:
# function used to transform our document into an array of vectors, each vector corresponds for a word in that document

from nltk.tokenize import word_tokenize


def GloVe(directoryName, fileName):
    vectors = []
    if  fileName.endswith(".docx"):
        source_document = Document("./"+ directoryName + "/" + fileName)
        for paragraph in source_document.paragraphs:
            text = paragraph.text
            l = word_tokenize(text)
            embeddedL = []
            for word in l:
                if word in embeddings_dict.keys(): #some words may not be in the dictionary, so we check if the word exists in glove first
                    embeddedL.append(embeddings_dict[word])
            vectors = vectors + embeddedL
    
    return vectors



In [16]:
#use the list of vectors we got from the previous function and transform into one vector
#so instead of having a vector for each word, we will have one vector for the whole document

#in this code, we added the vectors and then divided by the number of words in our document
#note that the number of words in our document is equal to the number of vectors in the list we have

def calculateDocumentVector(listOfVectors):
    finalVector = listOfVectors[0]
    finalVector = np.subtract(finalVector, listOfVectors[0])
    for vector in listOfVectors:
        finalVector = np.add(finalVector, vector)
    numberOfWordsInDoc = len(listOfVectors)
    finalVector = np.divide(finalVector, numberOfWordsInDoc)
    return finalVector


# Pickle
we will be implementing the functions that allow us to store and load the vectors we have in a folder
so we will create 2 functions: one to store the vectors for each SDG document and one to load the vector we want later on

In [17]:
import pickle

#to learn about pickling https://ianlondon.github.io/blog/pickling-basics/

#this function will take the vector and store it as binary file inside the same directory our project is located at
#specify wb in order to write it as binary
def writePickle(fileName, vectorToStore):
   # if(!os.path.exists("./" + directoryName)):
   #     os.mkdir("./" + directoryName)
    with open(fileName, 'wb') as f:
        pickle.dump(vectorToStore, f)


In [18]:
#this function will look in the same directory were the project is, and find the file name given and will return its content
#specify rb in order to write it as binary

def readPickle(fileName):
    with open(fileName, 'rb') as f:
        readVector = pickle.load(f)
    return readVector
    


# Creating the vectors of the SDG documents:
now is the time to apply all the functions we created in order to generate document vectors for each SDG document in the corpus directory

In [19]:
#function that loops over a directory, calculates the vector for each document and saves it as a pickle inside the same directory

def creatingVectors(directory):
    for file in os.listdir("./"+directory):
        if file.endswith(".docx"):
            f = GloVe(directory,file)
            writePickle(file.replace('.docx', '.pickle'),calculateDocumentVector(f))
    

In [20]:
#using the function to calculate all the vectors for each document inside the corpus directory

creatingVectors("corpus")

# Cosine Similarity:

In [21]:
from numpy import dot
from numpy.linalg import norm

def calcSimilarity(a, b):
    cos_sim = dot(a, b)/(norm(a)*norm(b))
    return cos_sim

# Testing with new documents
put the word you want to test inside the tests file in the same repository and it will be cleaned and you will have the vector inside a pickle file 

In [22]:
preprocessDirectory("tests")
creatingVectors("tests")


preprocessing testingHealth.docx ...
Done!!


In [24]:
calcSimilarity(readPickle("SDG3.pickle"), readPickle("testingHealth.pickle"))
#we can see that we will get a high accuracy between the health document and the sdg3 which has a health subject

0.9166996