In [12]:
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import numpy as np

## Task 0 - Dataset
I download a dataset covering the 17 SDGs and filter it to work with SDG 16 only. I also preliminarily remove all unnecessary symbols from the texts.

In [2]:
# Load the OSDG community dataset from a tab-separated file.
# NOTE(review): the path is an absolute path on one local machine; the cell will not run elsewhere as written.
df_osdg = pd.read_csv('C:/Users/Anjou/Downloads/osdg-community-data-v2022-10-01.csv', sep='\t')
# Split the first column on tabs, naming the resulting columns after the tab-separated
# pieces of that column's header, then drop the column that was at position 0.
# NOTE(review): this looks written for a frame that was read as one tab-joined column — confirm it is
# still needed (and still correct) now that read_csv is called with sep='\t'.
df_osdg[df_osdg.columns[0].split('\t')] = df_osdg.iloc[:,0].str.split('\t', expand=True)
df_osdg.drop(df_osdg.columns[0], axis=1, inplace=True)

# Quick look at the loaded frame.
print('Shape:', df_osdg.shape)
print(df_osdg.head())
# Print the rows whose 'sdg' value equals the string '16'.
# NOTE(review): this only prints the subset; df_osdg itself is not filtered, so `data` below is built from every row.
print(df_osdg[df_osdg['sdg']=='16'])

# Collect the texts as a list and replace every run of non-alphanumeric characters with a single space.
data = df_osdg.text.values.tolist()
data = [re.sub('[^A-Za-z0-9]+', ' ', sent) for sent in data]
print(data[:1])

Shape: (37575, 7)
                          doi                           text_id  \
0  10.6027/9789289342698-7-en  00021941702cd84171ff33962197ca1f   
1        10.18356/eca72908-en  00028349a7f9b2485ff344ae44ccfd6b   
2  10.1787/9789264289062-4-en  0004eb64f96e1620cd852603d9cbe4d4   
3     10.1787/5k9b7bn5qzvd-en  0006a887475ccfa5a7f5f51d4ac83d02   
4  10.1787/9789264258211-6-en  0006d6e7593776abbdf4a6f985ea6d95   

                                                text sdg labels_negative  \
0  "From a gender perspective, Paulgaard points o...   5               1   
1  Labour legislation regulates maximum working h...  11               2   
2  The average figure also masks large difference...   3               1   
3  The extent to which they are akin to corruptio...   3               1   
4  A region reporting a higher rate will not earn...   3               2   

  labels_positive           agreement  
0               8  0.7777777777777778  
1               1  0.3333333333333333  
2 

## Task 1 - Topic Modelling and Optimization

For topic modelling I decided to use Latent Dirichlet Allocation (LDA). LDA is a generative probabilistic model that assumes each topic is a mixture over an underlying set of words, and each document is a mixture over a set of topics. The purpose of LDA is to map each document in our corpus to a set of topics that covers a good deal of the words in the document. As it is unsupervised and needs no labelled training data, LDA is a convenient way to get a fast and relatively simple initial approach to the text analysis. LDA gives me a collection of documents that the algorithm has grouped together, as well as the clusters of words and expressions that it used to infer these relations.

In [8]:
# Bag-of-words representation of the cleaned texts.
vectorizer = CountVectorizer(
    analyzer='word',
    min_df=10,                        # drop terms that occur in fewer than 10 documents
    stop_words='english',             # use the built-in English stop-word list
    lowercase=True,
    token_pattern='[a-zA-Z0-9]{3,}',  # a token is a run of 3 or more alphanumeric characters
)
data_vectorized = vectorizer.fit_transform(data)

# Fit a 20-topic LDA model on the document-term matrix and keep the
# per-document topic weights it returns.
lda_model = LatentDirichletAllocation(
    n_components=20,
    max_iter=10,
    learning_method='online',
    random_state=100,                 # fixed seed so reruns give the same topics
    batch_size=128,
    evaluate_every=-1,
    n_jobs=-1,
)
lda_output = lda_model.fit_transform(data_vectorized)
print(lda_model)


LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
 evaluate_every=-1, learning_decay=0.7,
 learning_method="online", learning_offset=10.0,
 max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001,
 n_components=10, n_jobs=-1, perp_tol=0.1,
 random_state=100, topic_word_prior=None,
 total_samples=1000000.0, verbose=0)

# Print the parameter dictionary returned by the estimator's get_params().
print(f'The LDA was launched with params: {lda_model.get_params()}')

LatentDirichletAllocation(learning_method='online', n_components=20, n_jobs=-1,
                          random_state=100)


In [9]:
# Fit metrics, evaluated on the same matrix the model was trained on.
log_likelihood = lda_model.score(data_vectorized)   # higher is better
perplexity = lda_model.perplexity(data_vectorized)  # lower is better
print("Log Likelihood: ", log_likelihood)
print("Perplexity: ", perplexity)

Log Likelihood:  -15291625.518245563
Perplexity:  3206.432822988328


In [14]:
# Build the document-topic matrix: one row per document, one column per topic.
lda_output = lda_model.transform(data_vectorized)

topicnames = [f"Topic{i}" for i in range(lda_model.n_components)]
docnames = [f"Doc{i}" for i in range(len(data))]

# Topic weights rounded to two decimals, for display only.
df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=docnames)
# Dominant topic = column index of the largest weight in each row.
# It is taken from the unrounded weights: after rounding, two topics can tie
# (e.g. 0.334 and 0.331 both become 0.33) and argmax would then return the
# first of them rather than the topic with the truly largest weight.
dominant_topic = np.argmax(lda_output, axis=1)
df_document_topic["dominant_topic"] = dominant_topic

def color(val):
    """Return a CSS text-colour rule for one cell.

    The rule is yellow when ``val`` is strictly greater than 0.1 and blue otherwise.
    """
    shade = "blue"
    if val > .1:
        shade = "yellow"
    return "color: {col}".format(col=shade)
def to_bold(val):
    """Return a CSS font-weight rule for one cell.

    The weight is 700 when ``val`` is strictly greater than 0.1 and 400 otherwise.
    """
    heaviness = 400
    if val > .1:
        heaviness = 700
    return "font-weight: {weight}".format(weight=heaviness)

# Style the first 20 documents cell by cell: text colour first, then font weight.
df_document_topics = (
    df_document_topic
    .head(20)
    .style
    .applymap(color)
    .applymap(to_bold)
)
df_document_topics

Unnamed: 0,Topic0,Topic1,Topic2,Topic3,Topic4,Topic5,Topic6,Topic7,Topic8,Topic9,Topic10,Topic11,Topic12,Topic13,Topic14,Topic15,Topic16,Topic17,Topic18,Topic19,dominant_topic
Doc0,0.0,0.08,0.0,0.06,0.0,0.0,0.0,0.0,0.49,0.0,0.0,0.01,0.02,0.0,0.0,0.3,0.0,0.0,0.0,0.03,8
Doc1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04,0.15,0.12,0.0,0.04,0.0,0.14,0.27,0.0,0.0,0.0,0.0,0.22,14
Doc2,0.0,0.06,0.0,0.0,0.0,0.0,0.0,0.07,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.84,0.0,0.0,17
Doc3,0.09,0.0,0.0,0.0,0.03,0.0,0.0,0.1,0.0,0.08,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.42,19
Doc4,0.36,0.1,0.0,0.26,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05,0.0,0.16,0.05,0.0,0
Doc5,0.1,0.0,0.0,0.0,0.0,0.06,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.0,0.07,0.11,0.0,0.0,0.0,0.62,19
Doc6,0.0,0.0,0.0,0.0,0.0,0.0,0.05,0.0,0.0,0.73,0.03,0.0,0.0,0.0,0.06,0.0,0.0,0.0,0.09,0.03,9
Doc7,0.03,0.0,0.0,0.0,0.11,0.0,0.1,0.58,0.0,0.1,0.0,0.0,0.0,0.03,0.03,0.0,0.0,0.0,0.0,0.0,7
Doc8,0.0,0.06,0.41,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.51,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10
Doc9,0.61,0.0,0.16,0.0,0.0,0.0,0.12,0.0,0.0,0.06,0.0,0.0,0.04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [17]:
# Words per each topic visualisation
def show_topics(vectorizer=vectorizer, lda_model=lda_model, n_words=20):
    """Return the highest-weighted vocabulary terms of every topic.

    Parameters
    ----------
    vectorizer : object exposing ``get_feature_names_out()``
        Supplies the vocabulary, indexed the same way as the topic weights.
    lda_model : object exposing ``components_``
        Each entry of ``components_`` holds one topic's weight per vocabulary term.
    n_words : int
        Number of terms to keep per topic.

    Returns
    -------
    list of numpy.ndarray
        One array per topic, with that topic's terms ordered by descending weight.
    """
    vocabulary = np.array(vectorizer.get_feature_names_out())
    # Sorting the negated weights ascending yields indices in descending-weight order.
    return [
        vocabulary[np.argsort(-weights)[:n_words]]
        for weights in lda_model.components_
    ]
topic_keywords = show_topics(vectorizer=vectorizer, lda_model=lda_model)

# One row per topic, one column per keyword rank.
df_topic_keywords = pd.DataFrame(topic_keywords)
n_rows, n_cols = df_topic_keywords.shape
df_topic_keywords.columns = ['Word ' + str(rank) for rank in range(n_cols)]
df_topic_keywords.index = ['Topic ' + str(row) for row in range(n_rows)]

# Add the topic label as a regular column alongside the keywords.
Topics = ["Topic_" + str(k) for k in range(lda_model.n_components)]
df_topic_keywords["Topics"] = Topics
df_topic_keywords

Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9,...,Word 11,Word 12,Word 13,Word 14,Word 15,Word 16,Word 17,Word 18,Word 19,Topics
Topic 0,cent,countries,average,year,share,united,agricultural,higher,total,2014,...,2013,2011,2012,figure,2008,percentage,africa,period,south,Topic_0
Topic 1,areas,urban,rural,development,services,access,business,local,region,new,...,sector,economic,economy,cities,infrastructure,green,firms,agricultural,regional,Topic_1
Topic 2,food,need,risk,term,policies,long,trade,market,countries,support,...,costs,important,systems,example,improve,needed,cost,risks,reduce,Topic_2
Topic 3,international,law,article,political,security,conflict,state,justice,criminal,theory,...,global,legal,rule,world,politics,power,scholarship,terrorism,war,Topic_3
Topic 4,climate,development,national,management,finance,local,adaptation,change,private,regional,...,support,sector,government,planning,plan,plans,resources,level,funding,Topic_4
Topic 5,energy,electricity,countries,sector,supply,gdp,demand,renewable,production,efficiency,...,sectors,generation,gas,oil,technologies,fuel,investment,low,global,Topic_5
Topic 6,water,land,environmental,use,waste,billion,costs,pollution,resources,supply,...,cost,environment,expected,services,infrastructure,natural,users,benefits,sanitation,Topic_6
Topic 7,health,care,services,quality,communities,treatment,mental,patients,service,courts,...,prevent,diseases,providers,mortality,drug,widely,emergency,life,community,Topic_7
Topic 8,women,gender,work,men,labour,social,family,employment,young,time,...,participation,female,people,age,countries,girls,violence,migration,working,Topic_8
Topic 9,national,government,decision,policy,evaluation,ministry,management,authorities,federal,making,...,regulatory,process,implementation,level,tourism,agency,framework,agencies,commission,Topic_9
