## Prérequis

In [1]:
!pip install git+https://github.com/kerighan/textgraph
!pip install openpyxl
!pip install scikit-learn -U
!pip install bert_embedding
!pip install torch
!pip install git+https://github.com/kerighan/convectors

Collecting git+https://github.com/kerighan/textgraph
  Cloning https://github.com/kerighan/textgraph to /tmp/pip-req-build-ans8ld2m
  Running command git clone --filter=blob:none --quiet https://github.com/kerighan/textgraph /tmp/pip-req-build-ans8ld2m
  Resolved https://github.com/kerighan/textgraph to commit 6312c29eb43ab83858f6e7ae66f602101fe3f04b
  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: textgraph
  Building wheel for textgraph (setup.py) ... [?25ldone
[?25h  Created wheel for textgraph: filename=textgraph-0.0.0-py3-none-any.whl size=4984 sha256=15e18af2951ac22349e5e953cb5f390f4edbf6ecc00c7706e168857b81ffe31d
  Stored in directory: /tmp/pip-ephem-wheel-cache-bqmqwvp4/wheels/0c/77/ea/79e80176903744e8f6bdeda8083a0e0e4fc1331fa5647542fa
Successfully built textgraph
Installing collected packages: textgraph
Successfully installed textgraph-0.0.0
You should consider upgrading via the '/usr/bin/python -m pip install --upgrade pip' comman

In [1]:
import pandas as pd
from textgraph.graph import TextGraph
import networkx as nx
import numpy as np
import regex as re

## Constitution du dataset

In [2]:
xls = pd.ExcelFile('data/afd_snaps_labeled_cibles.xlsx')
df12 = pd.read_excel(xls, 'SDG 12')
df15 = pd.read_excel(xls, 'SDG 15')
df16 = pd.read_excel(xls, 'SDG 16')

In [3]:
unlabelled_df = pd.read_csv("data/raw_filtered.csv")
def get_unlabelled(data, SDG):
    
    data = data[data["SDG"+str(SDG)] == 1].drop(columns = "Unnamed: 0").reset_index(drop = True)
    data.columns = ["Text"] + list(data.columns[1:])
    data = data[["Text"]]
    data.Text = [re.sub("[^a-zA-Z0-9]", " ", text) for text in data.Text]
    empty_cols = ["" for i in range(len(data))]
    data["Manual_1"] =empty_cols
    data["Manual_2 "] = empty_cols
    return(data)

    
unlabelled_df12 = get_unlabelled(unlabelled_df, 12)
unlabelled_df15 = get_unlabelled(unlabelled_df, 15)
unlabelled_df16 = get_unlabelled(unlabelled_df, 16)

## Aggrégation des données scrappées

In [4]:
unlabelled_aug = pd.read_csv("data/unlabeled_snaps.csv", sep = ";")

unlabelled_aug_df12 = get_unlabelled(unlabelled_aug, 12)
unlabelled_aug_df15 = get_unlabelled(unlabelled_aug, 15)
unlabelled_aug_df16 = get_unlabelled(unlabelled_aug, 16)

In [5]:
labelled_desc_12 = pd.read_csv("data/aug_cibles_12.csv").drop(columns = "Unnamed: 0")
labelled_desc_15 = pd.read_csv("data/aug_cibles_15.csv").drop(columns = "Unnamed: 0")
labelled_desc_16 = pd.read_csv("data/aug_cibles_16.csv").drop(columns = "Unnamed: 0")

In [6]:
full_df12 = pd.concat([df12,labelled_desc_12, unlabelled_aug_df12, unlabelled_df12], axis = 0).reset_index(drop = True)
full_df15 = pd.concat([df15,labelled_desc_15, unlabelled_aug_df15, unlabelled_df15], axis = 0).reset_index(drop = True)
full_df16 = pd.concat([df16,labelled_desc_16, unlabelled_aug_df16, unlabelled_df16], axis = 0).reset_index(drop = True)

## Calcul de l'embedding de chaque mot

In [7]:
import pickle
def load_wv():
    with open("wv.pkl", "rb") as f:
        wv = pickle.load(f)
    return wv

In [8]:
import pickle
def save_wv(wv):
    with open("wv.pkl", "wb") as f:
        pickle.dump(wv, f)

In [9]:
from bert_embedding import BertEmbedding

def generate_embeddings(df, first_time = False):
    df.Text = [text.replace(".","").replace(",", "").lower() for text in df.Text]
    if first_time:
        wv = {}
    else:
        wv = load_wv()
    new_words = []
    for text in df["Text"]:
        sent = text.split()
        for word in sent:
            if word in wv or word in new_words:
                continue
            new_words.append(word)
    new_words = list(set(new_words))
    bert_embedding = BertEmbedding()
    result = bert_embedding(new_words)
    for pair in result:

        wv[pair[0][0]] = pair[1][0]
    save_wv(wv)
    return wv

wv = generate_embeddings(full_df12)
print("done with 12")
wv = generate_embeddings(full_df15)
print("done with 15")
wv = generate_embeddings(full_df16)
print("done with 16")

done with 12
done with 15
done with 16


## Création du graph

In [10]:
def generate_graph(df):
    G = TextGraph(wv_threshold=.5, stopwords=["en"]).fit(full_df12["Text"], wv = wv)
    A = nx.adjacency_matrix(G)
    degrees = []
    for d in G.degree(weight=None):
        if d[1] == 0:
            degrees.append(0)
        else:
            degrees.append(d[1]**(-0.5))

    print(len(degrees))
    print(A.shape)
    degrees = np.diag(degrees)
    A_hat = degrees @ A @ degrees
    return G, A_hat

In [11]:
###Modele avec co occurence

#G= TextGraph(wv_threshold=.5, stopwords=["en"]).fit(full_df12["Text"])
#nx.draw(G)
#A = nx.adjacency_matrix(G)

## Création du GCN

In [12]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class gcn(nn.Module):
    def __init__(self, X_size, A_hat, num_classes, bias=True): # X_size = num features
        super(gcn, self).__init__()
        self.A_hat = torch.tensor(A_hat, requires_grad=False).float()
        self.weight = nn.parameter.Parameter(torch.FloatTensor(X_size, hidden_size_1))
        var = 2./(self.weight.size(1)+self.weight.size(0))
        self.weight.data.normal_(0,var)
        self.weight2 = nn.parameter.Parameter(torch.FloatTensor(hidden_size_1, hidden_size_2))
        var2 = 2./(self.weight2.size(1)+self.weight2.size(0))
        self.weight2.data.normal_(0,var2)
        if bias:
            self.bias = nn.parameter.Parameter(torch.FloatTensor(hidden_size_1))
            self.bias.data.normal_(0,var)
            self.bias2 = nn.parameter.Parameter(torch.FloatTensor(hidden_size_2))
            self.bias2.data.normal_(0,var2)
        else:
            self.register_parameter("bias", None)
        self.fc1 = nn.Linear(hidden_size_2, num_classes)
        
    def forward(self, X): ### 2-layer GCN architecture
        X = torch.mm(X, self.weight)
        if self.bias is not None:
            X = (X + self.bias)
        X = F.relu(torch.mm(self.A_hat, X))
        X = torch.mm(X, self.weight2)
        if self.bias2 is not None:
            X = (X + self.bias2)
        X = F.relu(torch.mm(self.A_hat, X))
        return self.fc1(X)

In [13]:
hidden_size_1 = 200
hidden_size_2 = 100

num_epochs=100
lr=0.011
model_no =0




## Entrainement du GCN

In [14]:
def load_pickle(filename):
    completeName = os.path.join("save/",\
                                filename)
    with open(completeName, 'rb') as pkl_file:
        data = pickle.load(pkl_file)
    return data

def save_as_pickle(filename, data):
    completeName = os.path.join("save/",\
                                filename)
    with open(completeName, 'wb') as output:
        pickle.dump(data, output)

In [15]:
def mapper(df):
    map = {}
    df2 = df.copy()
    j = 0
    for i in list(set(df["Manual_1"])):
        map[i] = j
        j+=1
    labelled = df[df["Manual_1"] != ""]
    df2.loc[:(len(labelled)-1),"Manual_1"] = [map[df["Manual_1"][i]] for i in range(len(labelled))]
    return map , df2

In [23]:
import torch.nn as nn
import torch.optim as optim
import os


def evaluate(output, labels_e):
    _, labels = output.max(1)
    labels = labels.numpy()
    return sum([(e) for e in labels_e] == labels)/len(labels)



def train(data):
    map, data = mapper(data)
    print(map)

    G, A_hat = generate_graph(data)

    I = np.eye(A_hat.shape[0])
    net = gcn(A_hat.shape[0], A_hat, num_classes = len(list(set(data["Manual_1"]))) )

    

    f = torch.from_numpy(I).float()
    selected = np.array(range(len(data[data["Manual_1"] != ""]))) # Indices des datas labelisées
    test_idx = np.random.choice(selected, size = 10)
    
    

    selected = np.array(list(set(selected) - set(test_idx)))

    
    labels_selected = data.loc[selected, "Manual_1"]
    labels_not_selected = data.loc[test_idx, "Manual_1"]

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(net.parameters(), lr=lr)
    scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[1000,2000,3000,4000,5000,6000], gamma=0.77)

    start_epoch, best_pred = 0, 0
    losses_per_epoch, evaluation_untrained = [], []

    net.train()

    evaluation_trained = []
    for e in range(start_epoch, num_epochs):
        optimizer.zero_grad()
        output = net(f)
        loss = criterion(output[selected], torch.tensor(labels_selected).long())
        losses_per_epoch.append(loss.item())
        loss.backward()
        optimizer.step()
        if e % 10 == 0:
            ### Evaluate other untrained nodes and check accuracy of labelling
            net.eval()
            with torch.no_grad():
                pred_labels = net(f)
                trained_accuracy = evaluate(output[selected], labels_selected)
                untrained_accuracy = evaluate(pred_labels[test_idxs], labels_not_selected)
            evaluation_trained.append((e, trained_accuracy))
            evaluation_untrained.append((e, untrained_accuracy))
            print("[Epoch ", e, "]: Evaluation accuracy of trained nodes:", trained_accuracy)
            print("[Epoch ", e, "]: Evaluation accuracy of test nodes:", untrained_accuracy)
            #print("Labels of trained nodes: \n", output[selected].max(1)[1])
            net.train()
            if trained_accuracy > best_pred:
                best_pred = trained_accuracy
                torch.save({
                    'epoch': e + 1,\
                    'state_dict': net.state_dict(),\
                    'best_acc': trained_accuracy,\
                    'optimizer' : optimizer.state_dict(),\
                    'scheduler' : scheduler.state_dict(),\
                }, os.path.join("save/" ,\
                "test_model_best_%d.pth.tar" % model_no))
    scheduler.step()
    net(f)
    return output.max(1)[1], map

In [24]:
train(full_df15)

{0.0: 0, '15.c': 1, '': 2, '0.0': 3, 15.6: 4, 15.3: 5, 15.5: 6, 15.1: 7, 15.2: 8, '15.2': 9, '15.6': 10, '15.7': 11, '15.8': 12, '15.5': 13, '15.a': 14, '15.b': 15, '15.1': 16, '15.4': 17, '15.3': 18, '15.9': 19}
5583
(5583, 5583)
[Epoch  0 ]: Evaluation accuracy of trained nodes: 0.047619047619047616
[Epoch  0 ]: Evaluation accuracy of test nodes: 0
[Epoch  10 ]: Evaluation accuracy of trained nodes: 0.23015873015873015
[Epoch  10 ]: Evaluation accuracy of test nodes: 0.03895377781711676
[Epoch  20 ]: Evaluation accuracy of trained nodes: 0.3253968253968254
[Epoch  20 ]: Evaluation accuracy of test nodes: 0.16504250861183756
[Epoch  30 ]: Evaluation accuracy of trained nodes: 0.49206349206349204
[Epoch  30 ]: Evaluation accuracy of test nodes: 0.3216584816047163
[Epoch  40 ]: Evaluation accuracy of trained nodes: 0.5714285714285714
[Epoch  40 ]: Evaluation accuracy of test nodes: 0.4055918912607606
[Epoch  50 ]: Evaluation accuracy of trained nodes: 0.6031746031746031
[Epoch  50 ]: Ev

(tensor([ 0,  0,  7,  ...,  1, 16, 16]),
 {0.0: 0,
  '15.c': 1,
  '': 2,
  '0.0': 3,
  15.6: 4,
  15.3: 5,
  15.5: 6,
  15.1: 7,
  15.2: 8,
  '15.2': 9,
  '15.6': 10,
  '15.7': 11,
  '15.8': 12,
  '15.5': 13,
  '15.a': 14,
  '15.b': 15,
  '15.1': 16,
  '15.4': 17,
  '15.3': 18,
  '15.9': 19})

In [25]:
train(full_df16)

{'16.10': 0, 0: 1, '': 2, '16.4': 3, '16.a': 4, '16.8': 5, '16.2': 6, '0': 7, '16.6': 8, '16.b': 9, '16.5': 10, '16.1': 11, '16.9': 12, '16.7': 13, '16.3': 14}
5583
(5583, 5583)
[Epoch  0 ]: Evaluation accuracy of trained nodes: 0.04411764705882353
[Epoch  0 ]: Evaluation accuracy of test nodes: 0
[Epoch  10 ]: Evaluation accuracy of trained nodes: 0.4485294117647059
[Epoch  10 ]: Evaluation accuracy of test nodes: 0.27531079160977207
[Epoch  20 ]: Evaluation accuracy of trained nodes: 0.7132352941176471
[Epoch  20 ]: Evaluation accuracy of test nodes: 0.54241338736425
[Epoch  30 ]: Evaluation accuracy of trained nodes: 0.8382352941176471
[Epoch  30 ]: Evaluation accuracy of test nodes: 0.6485992464387235
[Epoch  40 ]: Evaluation accuracy of trained nodes: 0.9191176470588235
[Epoch  40 ]: Evaluation accuracy of test nodes: 0.7339660626462265
[Epoch  50 ]: Evaluation accuracy of trained nodes: 0.9191176470588235
[Epoch  50 ]: Evaluation accuracy of test nodes: 0.7203979717140057
[Epoch 

(tensor([ 0,  8, 10,  ...,  0,  9,  9]),
 {'16.10': 0,
  0: 1,
  '': 2,
  '16.4': 3,
  '16.a': 4,
  '16.8': 5,
  '16.2': 6,
  '0': 7,
  '16.6': 8,
  '16.b': 9,
  '16.5': 10,
  '16.1': 11,
  '16.9': 12,
  '16.7': 13,
  '16.3': 14})

In [26]:
train(full_df12)

{0: 0, '': 1, '12.a': 2, '12.1': 3, '12.2': 4, '12.3': 5, '12.8': 6, '12.b': 7, '12.c': 8, '12.5': 9, '12.6': 10, '12.7': 11, '12.4': 12, '0': 13}
5583
(5583, 5583)
[Epoch  0 ]: Evaluation accuracy of trained nodes: 0.04032258064516129
[Epoch  0 ]: Evaluation accuracy of test nodes: 0
[Epoch  10 ]: Evaluation accuracy of trained nodes: 0.49193548387096775
[Epoch  10 ]: Evaluation accuracy of test nodes: 0.31032624747786325
[Epoch  20 ]: Evaluation accuracy of trained nodes: 0.6774193548387096
[Epoch  20 ]: Evaluation accuracy of test nodes: 0.5249016054858532
[Epoch  30 ]: Evaluation accuracy of trained nodes: 0.8548387096774194
[Epoch  30 ]: Evaluation accuracy of test nodes: 0.6727774250003
[Epoch  40 ]: Evaluation accuracy of trained nodes: 0.8790322580645161
[Epoch  40 ]: Evaluation accuracy of test nodes: 0.7242068828498807
[Epoch  50 ]: Evaluation accuracy of trained nodes: 0.9435483870967742
[Epoch  50 ]: Evaluation accuracy of test nodes: 0.7530391033255852
[Epoch  60 ]: Evalua

(tensor([12, 12,  9,  ...,  7,  7,  3]),
 {0: 0,
  '': 1,
  '12.a': 2,
  '12.1': 3,
  '12.2': 4,
  '12.3': 5,
  '12.8': 6,
  '12.b': 7,
  '12.c': 8,
  '12.5': 9,
  '12.6': 10,
  '12.7': 11,
  '12.4': 12,
  '0': 13})

## Testing

In [18]:
import os
files = os.listdir("data/test/")

In [19]:
df_target = pd.read_csv("data/test/processed_odd15.csv", sep = ";").drop(columns = "Unnamed: 0")
empty_cols = ["" for i in range(len(df_target))]

df_target["Manual_1"] =empty_cols
df_target["Manual_2 "] = empty_cols
df_target.columns = ["Text", "Manual_1", "Manual_2 "]

In [20]:
test_df15 = pd.concat([full_df15, df_target]).reset_index(drop = True)

In [21]:
wv = generate_embeddings(test_df15)

In [23]:
out, map = train(test_df15)

{0.0: 0, '': 1, '15.a': 2, '15.c': 3, '15.8': 4, 15.6: 5, 15.3: 6, 15.5: 7, 15.1: 8, 15.2: 9, '15.6': 10, '15.1': 11, '15.b': 12, '15.2': 13, '15.9': 14, '15.7': 15, '15.4': 16, '15.5': 17, '0.0': 18, '15.3': 19}
5583
(5583, 5583)
range(0, 126)
[Epoch 0]: Evaluation accuracy of trained nodes: 0.1111111
Labels of trained nodes: 
 tensor(2.9993, grad_fn=<NllLossBackward0>) tensor([18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
        18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
        18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
        18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
        18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
        18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
        18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18])
[Epoch 10]: Evaluation accuracy of trained nodes: 0.2539683
Label

In [31]:
out

tensor([17, 17,  7,  ..., 14,  9,  9])

In [43]:
for i in range(len(df_target)):
    print(out[-i-1])

tensor(9)
tensor(9)
tensor(16)
tensor(16)
tensor(16)


In [44]:
map

{0.0: 0,
 '15.5': 1,
 '': 2,
 '15.8': 3,
 15.6: 4,
 15.3: 5,
 15.5: 6,
 15.1: 7,
 15.2: 8,
 '15.1': 9,
 '15.3': 10,
 '15.6': 11,
 '15.b': 12,
 '15.7': 13,
 '15.c': 14,
 '15.9': 15,
 '15.a': 16,
 '0.0': 17,
 '15.4': 18,
 '15.2': 19}