In [70]:
import os
import pickle
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
import numpy as np
import networkx as nx
from collections import OrderedDict
from itertools import combinations
import math
from tqdm import tqdm
import logging
import torch
import matplotlib.pyplot as plt
import re

In [52]:
logging.basicConfig(format='%(asctime)s [%(levelname)s]: %(message)s', \
                    datefmt='%m/%d/%Y %I:%M:%S %p', level=logging.INFO)
logger = logging.getLogger(os.path.join(os.path.abspath(''), 'preprocessing.ipynb'))
dataset = 'ectf'
data_dir = os.path.abspath(os.path.join('..', '..', 'data'))
dataset_dir = os.path.abspath(os.path.join('..', '..', 'datasets'))
train_data = os.path.join(dataset_dir, dataset, 'train3.csv')
test_ratio = 0.2
max_vocab_len = 10000
display(data_dir)
display(dataset_dir)
display(train_data)

'c:\\Users\\nafta\\Desktop\\Project\\TweetAnalyzer.git\\backend\\our_bgsrd\\data'

'c:\\Users\\nafta\\Desktop\\Project\\TweetAnalyzer.git\\backend\\our_bgsrd\\datasets'

'c:\\Users\\nafta\\Desktop\\Project\\TweetAnalyzer.git\\backend\\our_bgsrd\\datasets\\ectf\\train3.csv'

In [53]:
def load_pickle(filename):
    completeName = os.path.join(data_dir, filename)
    with open(completeName, 'rb') as pkl_file:
        data = pickle.load(pkl_file)
    return data

def save_as_pickle(filename, data):
    completeName = os.path.join(data_dir, filename)
    with open(completeName, 'wb') as output:
        pickle.dump(data, output)

In [85]:
### remove stopwords and non-words from tokens list
def filter_tokens(tokens):
    stopwords = list(set(nltk.corpus.stopwords.words("english")))
    non_alphbetical = [".",",",";","&","'s", ":", "?", "!","(",")", "@","'","'m","'no","***","--","...","[","]", "#", "%", "''", "$", "+"]
    clean_tokens = []
    for token in tokens:
        token = token.lower()
        if token not in stopwords and token not in non_alphbetical:
            token = re.sub('[\',.;]', '', token)
            if len(token.strip()) > 0:
                clean_tokens.append(token)
    return clean_tokens

def dummy_fun(doc):
    return doc

# binomial coefficient - number of options to choose
def nCr(n,r):
    f = math.factorial
    return int(f(n)/(f(r)*f(n-r)))

def word_word_edges(p_ij):
    word_word = []
    cols = list(p_ij.columns)
    cols = [str(word) for word in cols]
    for w1, w2 in tqdm(combinations(cols, 2), total=nCr(len(cols), 2)):
        if (p_ij.loc[w1,w2] > 0):
            word_word.append((w1,w2,{"weight":p_ij.loc[w1,w2]}))
    return word_word

In [90]:
def generate_text_graph(train_data, max_vocab_len, window=20):
    """ generates graph based on text corpus (columns = (text, label)); window = sliding window size to calculate point-wise mutual information between words """
    logger.info("Preparing data...")
    df = pd.read_csv(train_data)
    df.dropna(inplace=True)

    ### tokenize & remove funny characters
    df["text"] = df["text"].apply(lambda x: nltk.word_tokenize(x)).apply(lambda x: filter_tokens(x))
    save_as_pickle("df_data.pkl", df)
    
    ### Tfidf
    logger.info("Calculating Tf-idf...")
    vectorizer = TfidfVectorizer(input="content", max_features=max_vocab_len, tokenizer=dummy_fun, preprocessor=dummy_fun)
    vectorizer.fit(df["text"])
    # initalize tfidf matrix, at this point the matrix contains only zeros
    df_tfidf = vectorizer.transform(df["text"])
    df_tfidf = df_tfidf.toarray()
    
    vocab = vectorizer.get_feature_names()
    vocab = np.array(vocab)
    df_tfidf = pd.DataFrame(df_tfidf, columns=vocab)
    del vectorizer
    
    #### Build graph
    logger.info("Building graph (No. of document: %d, word nodes: %d)..." %(len(df_tfidf.index), len(vocab)))
    G = nx.Graph()
    logger.info("Adding document nodes to graph...")
    G.add_nodes_from(df_tfidf.index) ## document nodes
    logger.info("Adding word nodes to graph...")
    G.add_nodes_from(vocab) ## word nodes
    ### build edges between document-word pairs
    logger.info("Building document-word edges...")
    for doc in tqdm(df_tfidf.index, total = len(df_tfidf.index)):
        for word in df_tfidf.columns:
            G.add_edge(doc, word, weight = df_tfidf.loc[doc,word])
    #print(df_tfidf.apply(lambda col: col.unique()))
    del df_tfidf


    #elarge = [(u, v) for (u, v, d) in G.edges(data=True) if d["weight"] > 0.5]
    #esmall = [(u, v) for (u, v, d) in G.edges(data=True) if d["weight"] <= 0.5]
    #pos = nx.spring_layout(G, seed=7)
    #nx.draw_networkx_nodes(G, pos, node_size=700)
    ## edges
    #nx.draw_networkx_edges(G, pos, edgelist=elarge, width=6)
    #nx.draw_networkx_edges(G, pos, edgelist=esmall, width=6, alpha=0.5, edge_color="b", style="dashed")
#
    ## node labels
    #nx.draw_networkx_labels(G, pos, font_size=20, font_family="sans-serif")
    ## edge weight labels
    #edge_labels = nx.get_edge_attributes(G, "weight")
    #nx.draw_networkx_edge_labels(G, pos, edge_labels)
    #
    #ax = plt.gca()
    #ax.margins(0.08)
    #plt.axis("off")
    #plt.tight_layout()
    #plt.show()

    
   ### PMI between words
    n_i  = OrderedDict((name, 0) for name in vocab)
    word2index = OrderedDict( (name,index) for index,name in enumerate(vocab) )
    display(word2index)
    occurrences = np.zeros( (len(vocab),len(vocab)) ,dtype=np.int32)
    # Find the co-occurrences:
    number_of_windows = 0 
    logger.info("Calculating co-occurences...")
    for l in tqdm(df["text"], total = len(df["text"])):
        for i in range(len(l) - window):
            number_of_windows += 1
            doc = set(l[i:(i + window)])
            for word in doc:
                n_i[word] += 1
            for w1,w2 in combinations(doc,2):
                i1 = word2index[w1]
                i2 = word2index[w2]
                occurrences[i1][i2] += 1
                occurrences[i2][i1] += 1
    
    #display(occurrences)
    #display(occurrences.apply(lambda col: col.unique()))
    #display(np.unique(occurrences))
    #display(len(np.unique(occurrences)))
    del df, word2index
    logger.info("Calculating PMI*...")
    ### convert to PMI
    p_ij = pd.DataFrame(occurrences, index = vocab, columns = vocab) / number_of_windows
    p_i = pd.Series(n_i, index=n_i.keys()) / number_of_windows
    del occurrences, n_i, vocab
    
    for col in p_ij.columns:
        p_ij[col] = p_ij[col]/p_i[col]
    for row in p_ij.index:
        p_ij.loc[row,:] = p_ij.loc[row,:]/p_i[row]
    p_ij = p_ij + 1E-9
    for col in p_ij.columns:
        p_ij[col] = p_ij[col].apply(lambda x: math.log(x))
    
        
    logger.info("Building word-word edges...")
    word_word = word_word_edges(p_ij)
    save_as_pickle("word_word_edges.pkl", word_word)
    G.add_edges_from(word_word)
    save_as_pickle("text_graph.pkl", {"graph": G})
    logger.info("Done and saved!")

In [91]:
logger.info("Loading data...")
df_data_path = os.path.join(data_dir, "df_data.pkl")
graph_path = os.path.join(data_dir, "text_graph.pkl")
if not os.path.isfile(df_data_path) or not os.path.isfile(graph_path):
    logger.info("Building datasets and graph from raw data... Note this will take quite a while...")
    generate_text_graph(train_data, max_vocab_len)

12/30/2022 03:01:43 PM [INFO]: Loading data...
12/30/2022 03:01:43 PM [INFO]: Building datasets and graph from raw data... Note this will take quite a while...
12/30/2022 03:01:43 PM [INFO]: Preparing data...
12/30/2022 03:01:49 PM [INFO]: Calculating Tf-idf...
12/30/2022 03:01:50 PM [INFO]: Building graph (No. of document: 400, word nodes: 2985)...
12/30/2022 03:01:50 PM [INFO]: Adding document nodes to graph...
12/30/2022 03:01:50 PM [INFO]: Adding word nodes to graph...
12/30/2022 03:01:50 PM [INFO]: Building document-word edges...
100%|██████████| 400/400 [03:39<00:00,  1.82it/s]


OrderedDict([('+', 0),
             ('-', 1),
             ('-3', 2),
             ('-dr', 3),
             ('-health', 4),
             ('-local', 5),
             ('-public', 6),
             ('/', 7),
             ('//tco/0b9g0ykb5q', 8),
             ('//tco/0c53jd5wf0', 9),
             ('//tco/0d…', 10),
             ('//tco/0nygfdvuel', 11),
             ('//tco/0xvn7zlmh6', 12),
             ('//tco/16vfgbnskt', 13),
             ('//tco/1bbs6vu4py', 14),
             ('//tco/1eaobmly9e', 15),
             ('//tco/1ifchvq9jm', 16),
             ('//tco/1lcs3lci2o', 17),
             ('//tco/1zquky66jn', 18),
             ('//tco/26kls2w0ba', 19),
             ('//tco/2a2djyjn6w', 20),
             ('//tco/2arv…', 21),
             ('//tco/2e8ilk708i', 22),
             ('//tco/2iw57qvgsb', 23),
             ('//tco/2kseuxvipp', 24),
             ('//tco/2l4jnt7xkl', 25),
             ('//tco/3apctk5ohb', 26),
             ('//tco/3brfn2bl0g', 27),
             ('//tco/3eoskxqby

12/30/2022 03:05:30 PM [INFO]: Calculating co-occurences...
100%|██████████| 400/400 [00:01<00:00, 362.63it/s]
12/30/2022 03:05:31 PM [INFO]: Calculating PMI*...
12/30/2022 03:15:26 PM [INFO]: Building word-word edges...
100%|██████████| 4453620/4453620 [02:54<00:00, 25503.27it/s]


686

[('+', '2-14', {'weight': 4.333653050402785}),
 ('+', 'appear', {'weight': 4.333653050402785}),
 ('+', 'associated', {'weight': 3.9281879423011805}),
 ('+', 'breath', {'weight': 4.228292534746417}),
 ('+', 'care', {'weight': 2.975529566287492}),
 ('+', 'check', {'weight': 2.541893581240328}),
 ('+', 'cough', {'weight': 3.5351453542010485}),
 ('+', 'covid-19', {'weight': 1.289130612941753}),
 ('+', 'covid19', {'weight': 0.08040720650444959}),
 ('+', 'days', {'weight': 2.7932080095037417}),
 ('+', 'diarrhea', {'weight': 4.333653050402785}),
 ('+', 'exposure', {'weight': 3.6405058698559594}),
 ('+', 'fever', {'weight': 4.228292534746417}),
 ('+', 'include', {'weight': 4.228292534746417}),
 ('+', 'infographic', {'weight': 4.228292534746417}),
 ('+', 'may', {'weight': 2.4878263599744255}),
 ('+', 'medical', {'weight': 3.2350407617609145}),
 ('+', 'seek', {'weight': 4.151331493611455}),
 ('+', 'seeking', {'weight': 4.228292534746417}),
 ('+', 'shortness', {'weight': 4.228292534746417}),
 ('+

OSError: [Errno 28] No space left on device

In [96]:
df_data = load_pickle(os.path.join(data_dir, "df_data.pkl"))
G_dict = load_pickle(os.path.join(data_dir, "text_graph.pkl"))
G = G_dict["graph"]
display(df_data)
display(G)

Unnamed: 0,text,label
0,"[1/6, covid19, key, concerns, 🇨🇦, today, goc, ...",1
1,"[2/2, sustainable, changes, including, appropr...",1
2,"[rt, scmpnews, china, coronavirus, hong, kong,...",0
3,"[today, 2:30, secretarylevine, provide, update...",1
4,"[yo, literally, racialize, politicize, coronav...",0
...,...,...
395,"[rt, nineralex, ever, drank, ’, immune, corona...",0
396,"[moteging, became, clear, trying, scatter, us,...",0
397,"[rt, peterzeihan, clearly, immune, https, //tc...",0
398,"[``, vaccine, still, least, 18, months, away, ...",1


Unnamed: 0,text,label
0,"[1/6, covid19, key, concerns, 🇨🇦, today, goc, ...",1
1,"[2/2, sustainable, changes, including, appropr...",1
2,"[rt, scmpnews, china, coronavirus, hong, kong,...",0
3,"[today, 2:30, secretarylevine, provide, update...",1
4,"[yo, literally, racialize, politicize, coronav...",0
...,...,...
395,"[rt, nineralex, ever, drank, ’, immune, corona...",0
396,"[moteging, became, clear, trying, scatter, us,...",0
397,"[rt, peterzeihan, clearly, immune, https, //tc...",0
398,"[``, vaccine, still, least, 18, months, away, ...",1


<networkx.classes.graph.Graph at 0x27eb3b40790>

In [113]:
logger.info("Building adjacency and degree matrix...")
A = nx.to_numpy_matrix(G, weight="weight")
# A = degree matrix
A = A + np.eye(G.number_of_nodes())
degrees = []
for d in G.degree(weight=None):
    if d == 0:
        degrees.append(0)
    else:
        degrees.append(d[1]**(-0.5))

# D^-1/2
degrees = np.diag(degrees)
display(np.count_nonzero(degrees - np.diag(np.diagonal(degrees))))
# X = identity matrix
X = np.eye(G.number_of_nodes()) # features are just identity matrix
# A_hat = D^-1/2 * A * D^-1/2
A_hat = degrees @ A @ degrees
display(A_hat)
display(np.count_nonzero(A_hat - np.diag(np.diagonal(A_hat))))
f = X # (n X n) X (n X n) x (n X n) X (n X n) input of net

12/30/2022 04:18:35 PM [INFO]: Building adjacency and degree matrix...


0

matrix([[0.00033344, 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.00033344, 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.00033344, ..., 0.        , 0.        ,
         0.        ],
        ...,
        [0.        , 0.        , 0.        , ..., 0.0025    , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.0025    ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.00239808]])

65764

In [143]:
def check_symmetric(a, rtol=1e-05, atol=1e-08):
    return np.allclose(a, a.T, rtol=rtol, atol=atol)

display(check_symmetric(A_hat))
display(check_symmetric(A))

True

True

In [None]:
logger.info("Splitting labels for training and inferring...")
### stratified test samples
test_idxs = []
for b_id in df_data["label"].unique():
    dum = df_data[df_data["label"] == b_id]
    if len(dum) >= 4:
        test_idxs.extend(list(np.random.choice(dum.index, size = round(test_ratio * len(dum)), replace = False)))
save_as_pickle("test_idxs.pkl", test_idxs)
# select only certain labelled nodes for semi-supervised GCN
selected = []
for i in range(len(df_data)):
    if i not in test_idxs:
        selected.append(i)
save_as_pickle("selected.pkl", selected)

f_selected = f[selected] 
f_selected = torch.from_numpy(f_selected).float()
f_not_selected = f[test_idxs]
f_not_selected = torch.from_numpy(f_not_selected).float()
labels_selected = list(df_data.loc[selected]['label'])
labels_not_selected = list(df_data.loc[test_idxs]['label'])
    
f = torch.from_numpy(f).float()
save_as_pickle("labels_selected.pkl", labels_selected)
save_as_pickle("labels_not_selected.pkl", labels_not_selected)
logger.info("Split into %d train and %d test labels." % (len(labels_selected), len(labels_not_selected)))
#return f, X, A_hat, selected, labels_selected, labels_not_selected, test_idxs

In [None]:
display(f)
display(X)
display(A_hat)
display(selected)
display(labels_selected)
display(labels_not_selected)
display(test_idxs)

In [115]:
def parse_index_file(filename):
    """Parse index file."""
    index = []
    for line in open(filename):
        index.append(int(line.strip()))
    return index


def sample_mask(idx, l):
    """Create mask."""
    mask = np.zeros(l)
    mask[idx] = 1
    return np.array(mask, dtype=np.bool)

In [117]:
old_model_datadir = os.path.abspath(os.path.join('..', '..', '..','BGSRD', 'data'))
old_model_datadir
import scipy.sparse as sp


In [139]:
dataset_str = 'botometer'
names = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'adj']
objects = []
for i in range(len(names)):
    with open(os.path.join(old_model_datadir,"ind.{}.{}".format(dataset_str, names[i])), 'rb') as f:
        objects.append(pickle.load(f, encoding='latin1'))
x, y, tx, ty, allx, ally, adj = tuple(objects)
print(x.shape, y.shape, tx.shape, ty.shape, allx.shape, ally.shape)

(302, 300) (302, 2) (111, 300) (111, 2) (3618, 300) (3618, 2)


In [140]:
features = sp.vstack((allx, tx))
features = features.tolil()
display(features)
df = pd.DataFrame(features)
display(df)
labels = np.vstack((ally, ty))
display(labels)
print(len(labels))
#train_idx_orig = parse_index_file("data/{}.train.index".format(dataset_str))
#train_size = len(train_idx_orig)
#val_size = train_size - x.shape[0]
#test_size = tx.shape[0]
#idx_train = range(len(y))
#idx_val = range(len(y), len(y) + val_size)
#idx_test = range(allx.shape[0], allx.shape[0] + test_size)
#train_mask = sample_mask(idx_train, labels.shape[0])
#val_mask = sample_mask(idx_val, labels.shape[0])
#test_mask = sample_mask(idx_test, labels.shape[0])
#y_train = np.zeros(labels.shape)
y_val = np.zeros(labels.shape)
#y_test = np.zeros(labels.shape)
#y_train[train_mask, :] = labels[train_mask, :]
#y_val[val_mask, :] = labels[val_mask, :]
#y_test[test_mask, :] = labels[test_mask, :]
adj = adj + adj.T.multiply(adj.T > adj) - adj.multiply(adj.T > adj)
display(adj.toarray())
print(sp.csr_matrix.transpose(adj) != adj) # if  not (sp.csc_matrix.transpose(a) == a).all():
display(adj.shape)
display(features.shape)

not zero
not zero
not zero
not zero
not zero
not zero
not zero
not zero
not zero
not zero
not zero
not zero
not zero
not zero
not zero
not zero
not zero
not zero
not zero
not zero
not zero
not zero
not zero
not zero
not zero
not zero
not zero
not zero
not zero
not zero
not zero
not zero
not zero
not zero
not zero
not zero
not zero
not zero
not zero
not zero
not zero
not zero
not zero
not zero
not zero
not zero
not zero
not zero
not zero
not zero
not zero
not zero
not zero
not zero
not zero
not zero
not zero
not zero
not zero
not zero
not zero
not zero
not zero
not zero
not zero
not zero
not zero
not zero
not zero
not zero
not zero
not zero
not zero
not zero
not zero
not zero
not zero
not zero
not zero
not zero
not zero
not zero
not zero
not zero
not zero
not zero
not zero
not zero
not zero
not zero
not zero
not zero
not zero
not zero
not zero
not zero
not zero
not zero
not zero
not zero
not zero
not zero
not zero
not zero
not zero
not zero
not zero
not zero
not zero
not zero
not zero
n

<3729x300 sparse matrix of type '<class 'numpy.float64'>'
	with 1118700 stored elements in List of Lists format>

Unnamed: 0,0
0,"(0, 0)\t0.0\n (0, 1)\t0.0\n (0, 2)\t0.0\n ..."
1,"(0, 0)\t0.0\n (0, 1)\t0.0\n (0, 2)\t0.0\n ..."
2,"(0, 0)\t0.0\n (0, 1)\t0.0\n (0, 2)\t0.0\n ..."
3,"(0, 0)\t0.0\n (0, 1)\t0.0\n (0, 2)\t0.0\n ..."
4,"(0, 0)\t0.0\n (0, 1)\t0.0\n (0, 2)\t0.0\n ..."
...,...
3724,"(0, 0)\t0.0\n (0, 1)\t0.0\n (0, 2)\t0.0\n ..."
3725,"(0, 0)\t0.0\n (0, 1)\t0.0\n (0, 2)\t0.0\n ..."
3726,"(0, 0)\t0.0\n (0, 1)\t0.0\n (0, 2)\t0.0\n ..."
3727,"(0, 0)\t0.0\n (0, 1)\t0.0\n (0, 2)\t0.0\n ..."


array([[1, 0],
       [1, 0],
       [1, 0],
       ...,
       [1, 0],
       [1, 0],
       [1, 0]])

3729


array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])




(3729, 3729)

(3729, 300)

In [141]:
adj2 = adj.todense()
display(check_symmetric(adj2))# if  not (sp.csc_matrix.transpose(a) == a).all():
display(adj2.shape)
features2 = features.todense()
display(features2.shape)

True

(3729, 3729)

(3729, 300)