In [1]:
import random
import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn import preprocessing
import nltk
import csv
from collections import Counter
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
import keras
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from matplotlib import pyplot as plt
import datetime
import time
import lightgbm
import spacy
# Both spaCy pipelines are loaded with the same set of components disabled.
_spacy_disabled = ("tagger", "parser", "ner", "textcat")
nlp_fr, nlp_en = (
    spacy.load(model_name, disable=list(_spacy_disabled))
    for model_name in ('fr_core_news_md', 'en_core_web_md')
)
from tqdm import tqdm
import os
import networkx as nx
import pickle
from hyperopt import STATUS_OK, Trials, hp, space_eval, tpe, fmin
import sklearn.feature_extraction.text as fe

Using TensorFlow backend.


In [2]:
# NLTK resources: 'punkt' is needed for tokenization, 'stopwords' provides the
# word lists read just below.
for _resource in ('punkt', 'stopwords'):
    nltk.download(_resource)

stemmer = nltk.stem.PorterStemmer()

# NLTK stop words, one set per language.
stpwds_en, stpwds_fr = (
    set(nltk.corpus.stopwords.words(language))
    for language in ("english", "french")
)

# spaCy stop words, plus a combined set covering both languages.
spacy_stopwords_en = spacy.lang.en.stop_words.STOP_WORDS
spacy_stopwords_fr = spacy.lang.fr.stop_words.STOP_WORDS
spacy_stopwords_union = set(spacy_stopwords_en) | set(spacy_stopwords_fr)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\fvice\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\fvice\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
def _read_node_pairs(path, columns):
    """Read a whitespace-separated text file into a DataFrame of strings.

    Parameters
    ----------
    path : str
        File to read; one record per line, fields separated by whitespace.
    columns : list of str
        Column names for the resulting frame.

    Returns
    -------
    pandas.DataFrame
        One row per non-blank line. Values are kept as strings (no numeric
        conversion is performed).
    """
    with open(path, "r") as f:
        # Split each line directly instead of going through csv.reader:
        # blank lines are skipped rather than raising IndexError, and a
        # stray comma or repeated space can no longer corrupt a record.
        rows = [line.split() for line in f if line.strip()]
    return pd.DataFrame(rows, columns=columns)


training = _read_node_pairs(r"training.txt", ['Node1', 'Node2', 'Link'])
print("Training examples shape: {}".format(training.shape))

testing = _read_node_pairs(r"testing.txt", ['Node1', 'Node2'])
print("Testing examples shape: {}".format(testing.shape))

Training examples shape: (453797, 3)
Testing examples shape: (113450, 2)


In [5]:
corpus_path = r"pickles/reduced_corpus.PICKLE"
ids_path = r"pickles/IDs.PICKLE"

# Use the cache only when BOTH pickles are present; otherwise rebuild both so
# that `corpus` and `ids` always stay aligned element by element.
if os.path.exists(corpus_path) and os.path.exists(ids_path):
    # NOTE(review): pickle.load can execute arbitrary code from the file;
    # only load cache files that were produced by this notebook.
    with open(corpus_path, 'rb') as f:
        corpus = pickle.load(f)
    with open(ids_path, 'rb') as f:
        ids = pickle.load(f)
else:
    directory = r"node_information/text/"
    corpus = []
    ids = []
    for filename in tqdm(os.listdir(directory)):
        with open(os.path.join(directory, filename), 'r',
                  encoding='UTF-8', errors='ignore') as f:
            doc_string = []
            for line in f:
                # Lower-case, split on single spaces and drop stop words.
                doc_string.extend(
                    token for token in line.lower().strip().split(" ")
                    if token not in spacy_stopwords_union
                )
        corpus.append(' '.join(doc_string))
        # The document id is the file name without its extension.
        ids.append(os.path.splitext(filename)[0])

    # Make sure the cache directories exist before writing into them.
    for cache_path in (corpus_path, ids_path):
        cache_dir = os.path.dirname(cache_path)
        if cache_dir:
            os.makedirs(cache_dir, exist_ok=True)
    with open(corpus_path, 'wb') as f:
        pickle.dump(corpus, f)
    with open(ids_path, 'wb') as f:
        pickle.dump(ids, f)

node_info = pd.DataFrame({'ID': ids, 'Corpus': corpus})
print("Training node info shape: {}".format(node_info.shape))

Training node info shape: (33226, 2)


In [6]:
# Hold out a fixed fraction of the labelled pairs for validation.
VAL_FRACTION = 0.05
SPLIT_SEED = 42

# A dedicated seeded generator makes the split identical on every run without
# touching the global `random` state.
keep_indices = random.Random(SPLIT_SEED).sample(
    range(len(training)), k=int(round(len(training) * VAL_FRACTION))
)
data_train_val = training.iloc[keep_indices]
# `keep_indices` are row positions, so the complement is taken by dropping the
# index labels of the rows that were actually selected.
data_train = training.drop(index=data_train_val.index)

In [7]:
# Keep the two endpoint columns of every training row whose 'Link' value is
# '1', then write them as a header-less, space-separated edge list.
is_link = data_train['Link'] == '1'
linked_nodes = data_train.loc[is_link, ['Node1', 'Node2']]
linked_nodes.to_csv('linked_nodes.txt', sep=' ', index=False, header=False)

In [8]:
# Build an undirected graph from the edge list file; node ids are read as strings.
G = nx.read_edgelist(
    'linked_nodes.txt',
    nodetype=str,
    create_using=nx.Graph(),
)

## Training

In [None]:
# Katz centrality of every node in G; the result is used below as a
# node -> score mapping.
# NOTE(review): Katz centrality is only meaningful when alpha is below the
# inverse of the adjacency matrix's largest eigenvalue -- confirm that 0.1
# satisfies this for G.
katz_nd = nx.katz_centrality_numpy(
    G,
    beta=1.0,
    alpha=0.1,
)

In [None]:
def katz_centrality(index, dataset, centrality=None):
    """Return the mean Katz centrality of the two nodes in one row.

    Parameters
    ----------
    index : hashable
        Index label of the row to look up in `dataset`.
    dataset : pandas.DataFrame
        Frame with 'Node1' and 'Node2' columns holding node identifiers.
    centrality : mapping, optional
        Node -> score mapping. When omitted, the notebook-level `katz_nd`
        mapping is used.

    Returns
    -------
    float
        Average of the two node scores, or 0.0 when either node is missing
        from the mapping.
    """
    scores = katz_nd if centrality is None else centrality
    node1 = dataset.loc[index, 'Node1']
    node2 = dataset.loc[index, 'Node2']
    if node1 in scores and node2 in scores:
        return (scores[node1] + scores[node2]) / 2
    # At least one endpoint has no score in the mapping.
    return 0.0

# Compute the pair-level Katz score for every row referenced by
# 'original_index' and store it on the validation / test feature frames.
# NOTE(review): `perf_val_data` and `perf_test_data` are not defined in the
# visible part of the notebook -- confirm they exist before this cell runs.
# NOTE(review): the column is named 'page_rank' although it holds Katz scores.
perf_val_data['page_rank'] = [
    katz_centrality(i, training) for i in tqdm(perf_val_data['original_index'])
]
perf_test_data['page_rank'] = [
    katz_centrality(i, testing) for i in tqdm(perf_test_data['original_index'])
]

In [None]:
# TODO: switch to fit_transform for TF-IDF

In [None]:
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn.cluster import KMeans, MiniBatchKMeans

# Tunable LSA settings, kept together so they are easy to find and change.
N_LSA_COMPONENTS = 800
LSA_SEED = 42

print("Performing dimensionality reduction using LSA")
# TruncatedSVD uses a randomized solver by default, so the seed is fixed to
# make the reduced matrix reproducible across runs.
svd = TruncatedSVD(N_LSA_COMPONENTS, random_state=LSA_SEED)
# copy=False lets the normalizer rescale the SVD output in place.
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)

# NOTE(review): `corpus_tfidf_matrix` is not defined in the visible part of
# the notebook -- confirm it is built before this cell runs.
X = lsa.fit_transform(corpus_tfidf_matrix)

explained_variance = svd.explained_variance_ratio_.sum()
print("Explained variance of the SVD step: {}%".format(
    int(explained_variance * 100)))


In [None]:
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn import metrics

# #############################################################################
# Do the actual clustering

# NOTE(review): `opts`, `true_k` and `labels` are not defined in the visible
# part of the notebook -- confirm they are set before this cell runs.
if opts.minibatch:
    km = MiniBatchKMeans(n_clusters=true_k, init='k-means++', n_init=1,
                         init_size=1000, batch_size=1000, verbose=opts.verbose)
else:
    km = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1,
                verbose=opts.verbose)

print("Clustering sparse data with %s" % km)
# `time` is imported as a module in this notebook, so the clock has to be
# read with time.time(); calling the module itself raises TypeError.
t0 = time.time()
km.fit(X)
print("done in %0.3fs" % (time.time() - t0))
print()

# Metrics comparing the cluster assignment with the reference `labels`.
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))
print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
print("Adjusted Rand-Index: %.3f"
      % metrics.adjusted_rand_score(labels, km.labels_))
# The silhouette score is computed on a 1000-point sample of X.
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(X, km.labels_, sample_size=1000))

print()

In [None]:
from sklearn.feature_selection import SelectKBest, chi2
# Keep the n_k features with the highest chi-squared score against y_train.
# The selector is fitted on the training matrix only and then applied to both
# the training and the test matrix.
# NOTE(review): chi2 expects non-negative feature values -- confirm X_train
# satisfies this.
n_k = 1000
print("Extracting %d best features by a chi-squared test" % n_k)
ch2 = SelectKBest(score_func=chi2, k=n_k).fit(X_train, y_train)
X_train, X_test = ch2.transform(X_train), ch2.transform(X_test)


In [9]:
import stellargraph