# Motivation

# Python Imports

In [137]:
import pandas as pd

#%matplotlib inline


#import matplotlib
import numpy as np
#import missingno as msno
#import altair as alt
#from vega_datasets import data


from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer


from sklearn import preprocessing

from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression


from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

import re
from sklearn.base import BaseEstimator, TransformerMixin
import nltk
#nltk.download('stopwords')
import nltk.stem
from nltk.corpus import stopwords

import numpy as np
import lda

# Load and Preprocess Data

In [362]:
with open("data/sdg_goal_en.txt", "r") as myfile:
    data = myfile.read().replace('\n', " ")
    data = data.replace('"', '')

splits = data.split('§')
lst = []
for i in splits[0:-1]:
    #print(i)
    entry = i.split('@')
    entry[0] = entry[0].replace('Ziel', '')
    entry[0] = entry[0].replace(':', '')
    lst.append((entry[0].strip(), entry[1]))

# Explore

In [363]:
df = pd.DataFrame(lst, columns =['id', 'text']) 
df.head()

Unnamed: 0,id,text
0,Goal 1,No poverty End poverty in all its forms every...
1,Goal 2,"Zero hunger End hunger, achieve food security..."
2,Goal 3,Good health and well-being for people Ensure ...
3,Goal 4,Quality education Ensure inclusive and equita...
4,Goal 5,Gender equality Achieve gender equality and e...


# Create Features

In [397]:
# function to remove integers
def drop_integers(s):
    return re.sub(r'\d+', '', s)

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    return text

In [303]:
pd.set_option("display.max_rows", 2000)


In [403]:
goals = {
1: "No poverty: End poverty in all its forms everywhere.", 
2: "Zero hunger : End hunger, achieve food security and improved nutrition, and promote sustainable agriculture.",
3: "Good health and well-being for people Ensure healthy lives and promote well-being for all at all ages.",
4: "Quality education: Ensure inclusive and equitable quality education and promote lifelong learning opportunities for all.",
 5: "Gender equality :  Achieve gender equality and empower all women and girls.",
 6: "Clean water and sanitation : Ensure availability and sustainable management of water and sanitation for all.",
 7: "Affordable and clean energy :  Ensure access to affordable, reliable, sustainable and modern energy for all.",
 8: "Decent work and economic growth : Promote sustained, inclusive and sustainable economic growth, full and productive employment and decent work for all.",
 9: "Industry, Innovation, and Infrastructure : Build resilient infrastructure, promote inclusive and sustainable industrialization, and foster innovation.",
 10: "Reducing inequalities : Reduce income inequality within and among countries.",
 11: "Sustainable cities and communities : Make cities and human settlements inclusive, safe, resilient, and sustainable.",
 12: "Responsible consumption and production : Ensure sustainable consumption and production patterns.",
 13: "Climate action : Take urgent action to combat climate change and its impacts by regulating emissions and promoting developments in renewable energy.",
 14:"Life below water : Conserve and sustainably use the oceans, seas and marine resources for sustainable development.",
 15:"Life on land .  Protect, restore and promote sustainable use of terrestrial ecosystems, sustainably manage forests, combat desertification, and halt and reverse land degradation and halt biodiversity loss.",
 16:"Peace, justice and strong institutions: Promote peaceful and inclusive societies for sustainable development, provide access to justice for all and build effective, accountable and inclusive institutions at all levels.",
17:"Partnerships for the goals : Strengthen the means of implementation and revitalize the global partnership for sustainable development."  
}

## Remove Stop Words

In [410]:
stop_words = stopwords.words('english')
stop_words.append('By')
stop_words.append('including')
stop_words.append('dimensions')
l = ['appropriate','terms']
stop_words.extend(l)
print(len(stop_words))

184


In [411]:
vectorizer = CountVectorizer( stop_words=stop_words, preprocessor=drop_integers, ngram_range=(1,1))
X = vectorizer.fit_transform(df.text)
features = vectorizer.get_feature_names()

In [412]:
cv_dataframe=pd.DataFrame(X.toarray(),columns=features)
topic = 1
nrdocs = len(X.toarray())
for i in range(0,nrdocs):
    r = cv_dataframe.loc[i].sort_values(ascending=False)
    terms = r.index[0:6]

    lst = []
    for t in terms:
        lst.append(t) 
    print(f"topic: {topic}  {lst}")
    topic = topic + 1

topic: 1  ['poverty', 'poor', 'resources', 'vulnerable', 'reduce', 'extreme']
topic: 2  ['food', 'agricultural', 'access', 'markets', 'particular', 'ensure']
topic: 3  ['health', 'countries', 'developing', 'access', 'diseases', 'communicable']
topic: 4  ['education', 'countries', 'ensure', 'developing', 'sustainable', 'development']
topic: 5  ['women', 'girls', 'public', 'equality', 'forms', 'resources']
topic: 6  ['water', 'sanitation', 'substantially', 'management', 'reuse', 'achieve']
topic: 7  ['energy', 'technology', 'countries', 'developing', 'modern', 'clean']
topic: 8  ['growth', 'employment', 'labour', 'economic', 'countries', 'work']
topic: 9  ['countries', 'developing', 'development', 'infrastructure', 'sustainable', 'access']
topic: 10  ['countries', 'policies', 'developing', 'financial', 'institutions', 'per']
topic: 11  ['sustainable', 'persons', 'cities', 'safe', 'number', 'inclusive']
topic: 12  ['sustainable', 'countries', 'production', 'consumption', 'taking', 'develo

# TFIDF

better results and on count vector

In [413]:
tf = TfidfTransformer()
X_tfidf = tf.fit_transform(X)

In [414]:
df_tf = pd.DataFrame(X_tfidf.toarray(),columns=features)

In [421]:
topic = 1
for i in range(0,nrdocs):
    r = df_tf.loc[i].sort_values(ascending=False)
    terms = r.index[0:15]

    lst = []
    for t in terms:
        lst.append(t) 
    print(f"SDG Goal: {topic}: {goals[topic]} \n\n Keywords: {lst} \n\n")
    topic = topic + 1

SDG Goal: 1: No poverty: End poverty in all its forms everywhere. 

 Keywords: ['poverty', 'poor', 'living', 'vulnerable', 'extreme', 'resources', 'men', 'everywhere', 'social', 'End', 'forms', 'economic', 'women', 'shocks', 'floors'] 


SDG Goal: 2: Zero hunger : End hunger, achieve food security and improved nutrition, and promote sustainable agriculture. 

 Keywords: ['agricultural', 'food', 'hunger', 'markets', 'export', 'help', 'banks', 'plant', 'maintain', 'internationally', 'genetic', 'extreme', 'productivity', 'production', 'productive'] 


SDG Goal: 3: Good health and well-being for people Ensure healthy lives and promote well-being for all at all ages. 

 Keywords: ['health', 'diseases', 'communicable', 'mortality', 'medicines', 'live', 'births', 'vaccines', 'countries', 'essential', 'deaths', 'Agreement', 'low', 'Health', 'well'] 


SDG Goal: 4: Quality education: Ensure inclusive and equitable quality education and promote lifelong learning opportunities for all. 

 Keyword

# LDA 

In [264]:
vocab = vectorizer.get_feature_names()
#titles = lda.datasets.load_reuters_titles()

AttributeError: 'TfidfTransformer' object has no attribute 'get_feature_names'

In [165]:
X.shape

(17, 5468)

In [261]:
model = lda.LDA(n_topics=17, n_iter=1500, random_state=1)
model.fit(X)  # model.fit_transform(X) is also available
topic_word = model.topic_word_  # model.components_ also works

INFO:lda:n_documents: 17
INFO:lda:vocab_size: 7528
INFO:lda:n_words: 9919
INFO:lda:n_topics: 17
INFO:lda:n_iter: 1500
  if sparse and not np.issubdtype(doc_word.dtype, int):
INFO:lda:<0> log likelihood: -130203
INFO:lda:<10> log likelihood: -117856
INFO:lda:<20> log likelihood: -115957
INFO:lda:<30> log likelihood: -114392
INFO:lda:<40> log likelihood: -113431
INFO:lda:<50> log likelihood: -112953
INFO:lda:<60> log likelihood: -112031
INFO:lda:<70> log likelihood: -112175
INFO:lda:<80> log likelihood: -111930
INFO:lda:<90> log likelihood: -111544
INFO:lda:<100> log likelihood: -111501
INFO:lda:<110> log likelihood: -111337
INFO:lda:<120> log likelihood: -111147
INFO:lda:<130> log likelihood: -110882
INFO:lda:<140> log likelihood: -110558
INFO:lda:<150> log likelihood: -110984
INFO:lda:<160> log likelihood: -110755
INFO:lda:<170> log likelihood: -110134
INFO:lda:<180> log likelihood: -110786
INFO:lda:<190> log likelihood: -110924
INFO:lda:<200> log likelihood: -110666
INFO:lda:<210> log

In [263]:
n_top_words = 8
for i, topic_dist in enumerate(topic_word):
    topic_words = np.array(vocab)[np.argsort(topic_dist)][:-n_top_words:-1]
    print('Topic {}: {}'.format(i, ' '.join(topic_words)))

IndexError: index 5552 is out of bounds for axis 0 with size 5468

In [45]:
# stemmer
english_stemmer = nltk.stem.SnowballStemmer('german')

def stemmer(doc):
    return [porter_stemmer.stem(w) for w in analyzer(doc)]

class StemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        # will need to rewrite if pickled - due to lambda
        return lambda doc: ([english_stemmer.stem(w) for w in analyzer(doc)])
   

In [47]:
vectorizer_s = StemmedCountVectorizer(min_df=5,  preprocessor=drop_integers,
                                      analyzer='word', stop_words='de') 

X = vectorizer_s.fit_transform(df.text)
print(X.shape)
print(vectorizer_s.get_feature_names()[0:20])

ValueError: not a built-in stop list: de