In [1]:
import collections
import csv
import random
import signal

import networkx as nx
import numpy as np
import pandas
import pandas as pd
import progressbar
from sklearn.metrics import confusion_matrix, f1_score
from venn import venn
from networkx.exception import NetworkXNoPath, NodeNotFound
import matplotlib.pyplot as plt

from build_graph import build_graph
from helper_functions import read_pickle, get_link_from_article, write_pickle, ner_extract_all, clean_all_ner
from recommendation import get_article_similarity

In [None]:
# Load the pickled article analysis and entity graph. Both files are required inputs.
try:
    articles = read_pickle('./analysis.pickle')
    graph = read_pickle('./graph.pickle')
except FileNotFoundError as missing:
    # Say which input is missing, then stop with the original traceback instead of
    # silently terminating the kernel via exit().
    print(f"Required input file not found: {missing}")
    raise

# Load the two derived lookup tables, rebuilding and caching both if either pickle is missing:
#   article_dict        : link returned by get_link_from_article("RTVSLO", article) -> article
#   node_to_article_map : first element of each "per"/"org" record -> list of indices into
#                         `articles` (an index is appended once per record, so it may repeat)
try:
    node_to_article_map = read_pickle('./node_to_article_map.pickle')
    article_dict = read_pickle('./article_dict.pickle')
except FileNotFoundError:
    node_to_article_map = {}
    article_dict = {}
    for index, article in enumerate(articles):
        article_dict[get_link_from_article("RTVSLO", article)] = article
        for entity_kind in ("per", "org"):
            for entity in article[entity_kind]:
                # entity[0] is used as the graph node key — presumably the entity name.
                node_to_article_map.setdefault(entity[0], []).append(index)
    write_pickle(article_dict, './article_dict.pickle')
    write_pickle(node_to_article_map, './node_to_article_map.pickle')

In [None]:
    # Worked example: compare the entities of two RTV SLO articles about Sophia Loren.
    # Produces (1) a Venn diagram of the graph neighbourhoods of one entity from each article,
    # (2) a three-layer drawing of those two entities and their common neighbours, and
    # (3) a circular drawing of all pairwise-weighted entities of both articles.
    x = article_dict["https://www.rtvslo.si/moja-generacija/80-letna-sophia-loren-gleda-na-zivljenje-pozitivno/346813"]
    y = article_dict["https://www.rtvslo.si/zabava-in-slog/znani/foto-80-let-sophie-loren-najboljsega-italijanskega-izvoza-po-pasti/346712"]
    # Reduce each article to (entity keys, article name); an entity key is the first element
    # of every "per" and "org" record.
    x = ([f[0] for f in x["per"]] + [f[0] for f in x["org"]], x["name"])
    y = ([f[0] for f in y["per"]] + [f[0] for f in y["org"]], y["name"])
    nodes1 = x[0]
    nodes2 = y[0]
    nodes = nodes1 + nodes2

    # The two entities whose neighbourhoods are visualised: the first entity of article 1 and
    # the second entity of article 2 (article 2 therefore needs at least two entities).
    seed_1 = nodes1[0]
    seed_2 = nodes2[1]

    # Layered view: seed 1 (layer 1) -- common neighbours (layer 2) -- seed 2 (layer 3).
    visual = nx.Graph()
    visual.add_node(seed_1, layer=1)
    visual.add_node(seed_2, layer=3)
    neighbours_1 = set(nx.neighbors(graph, seed_1))
    neighbours_2 = set(nx.neighbors(graph, seed_2))
    for common in neighbours_1 & neighbours_2:
        visual.add_node(common, layer=2)
        visual.add_edge(seed_2, common)
        visual.add_edge(seed_1, common)

    # Venn diagram of the two neighbourhoods, each including its own seed entity.
    ven_dict = {
        "N_i": neighbours_1 | {seed_1},
        "N_j": neighbours_2 | {seed_2},
    }
    print(ven_dict)
    venn(ven_dict)
    plt.show()

    # Neighbour set and total "count" edge weight depend only on the node, so compute them
    # once per distinct entity instead of once per pair.
    neighbour_sets = {}
    count_totals = {}
    for node in nodes:
        if node in neighbour_sets:
            continue
        neighbour_sets[node] = set(nx.neighbors(graph, node))
        count_totals[node] = sum(graph[node][n]["count"] for n in neighbour_sets[node])

    # Weight every pair of distinct entities by 1 - (average normalised overlap of their
    # neighbourhoods), floored at 1e-16.
    # NOTE(review): the numerator reads the "occ" edge attribute while the normaliser sums the
    # "count" attribute — confirm this mix is intended.
    subgraph = nx.Graph()
    for i, node_i in enumerate(nodes):
        for node_j in nodes[i + 1:]:
            if node_i == node_j:
                continue
            try:
                tmp = 0
                for inter in neighbour_sets[node_i] & neighbour_sets[node_j]:
                    tmp += graph[node_i][inter]["occ"] / count_totals[node_i]
                    tmp += graph[node_j][inter]["occ"] / count_totals[node_j]
            except ZeroDivisionError:
                # A zero "count" total with shared neighbours: skip the pair.
                continue
            cond_prob = max(1 - (tmp / 2.0), 1e-16)
            subgraph.add_edge(node_i, node_j, weight=cond_prob)

    # Empty plot() call kept from the original cell — presumably it creates the axes that
    # nx.draw then draws into; verify before removing.
    plt.plot()
    pos = nx.multipartite_layout(visual, subset_key="layer")
    nx.draw(visual, pos)
    plt.show()

    # Colour entities by origin: green = in both articles, blue = article 1 only,
    # red = article 2 only.
    in_first = set(nodes1)
    in_second = set(nodes2)
    colormap = []
    for node in subgraph:
        if node in in_first and node in in_second:
            colormap.append("green")
        elif node in in_first:
            colormap.append("blue")
        elif node in in_second:
            colormap.append("red")
    plt.plot()
    pos = nx.circular_layout(subgraph)
    nx.draw(subgraph, pos, node_color=colormap)
    plt.show()

In [None]:
    # CNRec evaluation, part 1: load (or build and cache) the CNRec articles, entity graph and
    # similarity scores, then derive the thresholded prediction / correctness columns.
    # NOTE(review): this cell rebinds `articles` and `article_dict`, which the RTV SLO cells
    # also use — cells that run afterwards see the CNRec values.
    CNREC_PATH = './data/CNRec'
    # One path for both reading and writing the cached ground-truth frame, so that a cache
    # miss actually populates the cache that is read on the next run.
    GROUND_TRUTH_CACHE = CNREC_PATH + '/ground_truth_069.pickle'


    def _entities_and_name(article):
        """Return ([first element of each "per" then each "org" record], article["name"])."""
        return ([f[0] for f in article["per"]] + [f[0] for f in article["org"]], article["name"])


    # Articles: cached cleaned analysis -> cached raw NER output -> full NER extraction.
    try:
        articles = read_pickle(CNREC_PATH + '/analysis_cnrec.pickle')
    except FileNotFoundError:
        try:
            ner_data = read_pickle(CNREC_PATH + '/ner_data_cnrec.pickle')
        except FileNotFoundError:
            ner_data = ner_extract_all(path=CNREC_PATH + "/CNRec_RawText", gpu=True, website="other", language="en")
            write_pickle(ner_data, CNREC_PATH + '/ner_data_cnrec.pickle')
        articles = clean_all_ner(ner_data)
        write_pickle(articles, CNREC_PATH + '/analysis_cnrec.pickle')

    # Entity graph: cached, or built from the articles (second return value is not needed here).
    try:
        g = read_pickle(CNREC_PATH + '/graph.pickle')
    except FileNotFoundError:
        g, _ = build_graph(articles)
        write_pickle(g, CNREC_PATH + '/graph.pickle')

    # articleToID.csv rows are pairs; the second column is matched against article["name"]
    # below and mapped to the first column (presumably the article id).
    article_mapper = {}
    with open(CNREC_PATH + '/articleToID.csv', newline='') as file:
        for k, v in csv.reader(file):
            article_mapper[v] = k

    # Re-key the article list by that id.
    article_dict = {}
    for article in articles:
        article_dict[article_mapper[article["name"]]] = article

    # Ground-truth pairs with our similarity score in "ourSim"; computed once, then cached.
    try:
        ground_truth = pd.read_pickle(GROUND_TRUTH_CACHE)
    except FileNotFoundError:
        ground_truth = pd.read_csv(CNREC_PATH + '/CNRec_groundTruth.csv')
        ground_truth["ourSim"] = 0.0
        for i, row in ground_truth.iterrows():
            # Ids are rounded and stringified to match the string keys of article_dict.
            x = _entities_and_name(article_dict[str(round(row["art1"]))])
            y = _entities_and_name(article_dict[str(round(row["art2"]))])
            ground_truth.loc[i, "ourSim"] = get_article_similarity(x, y, g)[0]
        ground_truth.to_pickle(GROUND_TRUTH_CACHE)

    # Halve the human rating — presumably to bring it onto the same scale as ourSim; confirm.
    ground_truth["meanSimRating"] = ground_truth["meanSimRating"] / 2

    # Binary predictions at the 0.5 and 0.75 similarity thresholds.
    # NOTE(review): the "Diversity" prediction columns use exactly the same rule as the
    # "goodRating" ones — confirm that is intended.
    ground_truth["goodRatingOur"] = (ground_truth["ourSim"] >= 0.5).astype(int)
    ground_truth["goodRatingOur75"] = (ground_truth["ourSim"] >= 0.75).astype(int)
    ground_truth["goodRatingDiversityOur"] = (ground_truth["ourSim"] >= 0.5).astype(int)
    ground_truth["goodRatingDiversityOur75"] = (ground_truth["ourSim"] >= 0.75).astype(int)

    # 1 where our prediction agrees with the corresponding ground-truth label, else 0.
    ground_truth["correct50"] = (ground_truth["goodRatingOur"] == ground_truth["GoodR+AF8-50"]).astype(int)
    ground_truth["correct75"] = (ground_truth["goodRatingOur75"] == ground_truth["GoodR+AF8-75"]).astype(int)
    ground_truth["correctDiv50"] = (
        ground_truth["goodRatingDiversityOur"] == ground_truth["diversity+AF8-50"]).astype(int)
    ground_truth["correctDiv75"] = (
        ground_truth["goodRatingDiversityOur75"] == ground_truth["diversity+AF8-75"]).astype(int)



    def get_preds(threshold, probabilities):
        """Binarise scores: 1 where a score is strictly greater than ``threshold``, else 0.

        Returns a list with one 0/1 entry per element of ``probabilities``, in order.
        """
        labels = []
        for prob in probabilities:
            if prob > threshold:
                labels.append(1)
            else:
                labels.append(0)
        return labels
    # CNRec evaluation, part 2: F1 score of the thresholded similarity against each
    # ground-truth label column, swept over 100 evenly spaced thresholds in [0, 1].
    label_columns = {
        "GoodR 50": "GoodR+AF8-50",
        "GoodR 75": "GoodR+AF8-75",
        "Diversity 50": "diversity+AF8-50",
        "Diversity 75": "diversity+AF8-75",
    }
    thresholds = np.linspace(0, 1, 100)
    f1_curves = {label: [] for label in label_columns}

    # Best threshold by the mean F1 of the two "GoodR" labels. Named best_* rather than
    # `max` so the builtin max() stays usable in the rest of the notebook.
    best_f1 = 0
    best_threshold = 0
    for thresh in thresholds:
        preds = get_preds(thresh, ground_truth["ourSim"])
        scores = {
            label: f1_score(ground_truth[column], preds)
            for label, column in label_columns.items()
        }
        for label, score in scores.items():
            f1_curves[label].append(score)
        mean_good_f1 = (scores["GoodR 50"] + scores["GoodR 75"]) / 2
        if best_f1 < mean_good_f1:
            best_f1 = mean_good_f1
            best_threshold = thresh
    print(best_f1, " : ", best_threshold)

    # F1 (y axis) against prediction threshold (x axis), one curve per label.
    fig, ax = plt.subplots(figsize=(10, 7))
    for label, curve in f1_curves.items():
        ax.plot(thresholds, curve, label=label)
    # NOTE(review): hard-coded marker position (48/99, one of the swept thresholds) —
    # presumably the best threshold printed above on the original data; confirm or replace
    # with best_threshold.
    ax.axvline(x=0.4848484848, label="Optimal threshold")
    ax.set_ylabel('F1 Score', fontsize=16)
    ax.set_xlabel('Prediction threshold', fontsize=16)
    ax.legend(fontsize=12)

    # Re-evaluate the "GoodR 50" agreement with the prediction threshold moved to 0.6.
    ground_truth["goodRatingOur"] = (ground_truth["ourSim"] >= 0.6).astype(int)
    ground_truth["goodRatingOur75"] = (ground_truth["ourSim"] >= 0.75).astype(int)
    ground_truth["correct50"] = (ground_truth["goodRatingOur"] == ground_truth["GoodR+AF8-50"]).astype(int)
    print("Accurate recommendation GR50 new: ", ground_truth["correct50"].mean())

In [None]:
# Score every article pair listed in skupen.csv with the graph-based similarity.
df = pd.read_csv('./skupen.csv')
# NOTE(review): row 43 is overridden by hand; the reason is not recorded here. The
# "scoreAlgo" assignment is overwritten for every row by the loop below.
df.loc[43, "recommendPerson"] = 0.0
df.loc[43, "scoreAlgo"] = 0.0

# NOTE(review): entries of this pickle are used as indices into `articles` below. Confirm
# that the pickle on disk holds indices and that `articles` still refers to the RTV SLO
# article list at this point (the CNRec cell rebinds `articles`).
article_dict = read_pickle('./article_dict.pickle')
for i, row in df.iterrows():
    x = articles[article_dict[row["article1"]]]
    x = ([f[0] for f in x["per"]] + [f[0] for f in x["org"]], x["name"])
    # Look up the second article of the pair; the original read "article1" again here, which
    # scored every article against itself. Column name "article2" is assumed — verify
    # against the CSV header.
    y = articles[article_dict[row["article2"]]]
    y = ([f[0] for f in y["per"]] + [f[0] for f in y["org"]], y["name"])
    df.loc[i, "scoreAlgo"] = get_article_similarity(x, y, graph)[0]

def get_preds(threshold, probabilities):
    """Map each score to 1 if it is strictly above ``threshold`` and to 0 otherwise.

    The result is a list in the same order as ``probabilities``.
    """
    return list(map(lambda prob: 1 if prob > threshold else 0, probabilities))
# F1 score of the thresholded algorithm score against the human "recommendPerson" label,
# swept over 100 evenly spaced thresholds in [0, 1].
thresholds = np.linspace(0, 1, 100)
f1_scores = []
# Named best_* rather than `max` so the builtin max() stays usable in the rest of the notebook.
best_f1 = 0
best_threshold = 0
for thresh in thresholds:
    preds = get_preds(thresh, df["scoreAlgo"])
    score = f1_score(df["recommendPerson"], preds)
    f1_scores.append(score)
    # Strict comparison: on ties the lowest threshold reaching the best F1 is kept.
    if best_f1 < score:
        best_f1 = score
        best_threshold = thresh

print(best_f1, " : ", best_threshold)

# F1 (y axis) against prediction threshold (x axis).
fig, ax = plt.subplots(figsize=(10, 7))
ax.plot(thresholds, f1_scores, label="Good recommendation")
ax.set_ylabel('F1 Score', fontsize=16)
ax.set_xlabel('Prediction threshold', fontsize=16)
ax.legend(fontsize=12)