# 加载数据 

In [1]:
# %load load_data.py
import numpy as np
from text_analyse import text_embedding

def row_normalize(mx):
    """Symmetrically normalise a square matrix by its row sums.

    Despite the name, this is not a plain row normalisation: the result is
    ``D^-1/2 @ mx.T @ D^-1/2`` where ``D`` is the diagonal matrix of the row
    sums of ``mx``. For a symmetric adjacency matrix this is the usual
    ``D^-1/2 A D^-1/2`` normalisation; for an asymmetric input the transpose
    is kept so results match the previous implementation.

    Args:
        mx: Square 2-D array-like. Objects exposing ``toarray()`` (e.g.
            SciPy sparse matrices) are densified first.

    Returns:
        A new dense ``numpy.ndarray`` of floats; ``mx`` is not modified.
        Rows/columns whose row sum is zero come out as all zeros.
    """
    dense = mx.toarray() if hasattr(mx, 'toarray') else np.asarray(mx)

    # Accumulate in float64 so small integer dtypes (e.g. int8) cannot overflow.
    rowsum = dense.sum(axis=1, dtype=np.float64)

    # A zero row sum yields inf here; silence the warning and zero it below.
    with np.errstate(divide='ignore'):
        r_inv = np.power(rowsum, -0.5)
    r_inv[np.isinf(r_inv)] = 0.

    # Broadcasting scales row i and column j by r_inv[i] * r_inv[j] without
    # materialising an N x N diagonal matrix or doing two dense matmuls.
    return dense.T * r_inv[:, np.newaxis] * r_inv[np.newaxis, :]


def load_data_1(dataset='cora'):
    """Load a pre-vectorised graph dataset from the ``data_1`` directory.

    Three whitespace-separated text files sharing the prefix
    ``data_1/<dataset>`` are read:

    * ``.feature`` - one node per line, integer feature values;
    * ``.label``   - one node per line, a single integer class id;
    * ``.edge``    - one edge per line, two integer node indices.

    Blank lines are skipped. The array shapes and the adjacency matrix are
    printed as progress output.

    Args:
        dataset: File-name prefix of the dataset inside ``data_1``.

    Returns:
        Tuple ``(feature_arr, label_arr, adj)``. ``feature_arr`` has one row
        per feature line, ``label_arr`` has shape ``(num_nodes, 1)`` and
        ``adj`` is a ``(num_nodes, num_nodes)`` int8 matrix with
        ``adj[a, b] = 1`` for each edge line ``a b`` (the reverse direction
        is not filled in).
    """
    # A forward slash is accepted on every platform; the previous hard-coded
    # backslash only resolved to a sub-directory on Windows.
    path = 'data_1/{}'.format(dataset)

    with open(path + '.feature', 'r', encoding='utf-8') as f:
        feature = [[int(k) for k in line.split()] for line in f if line.strip()]
    feature_arr = np.array(feature)
    print(feature_arr.shape)

    with open(path + '.label', 'r', encoding='utf-8') as f:
        label = [[int(line.strip())] for line in f if line.strip()]
    label_arr = np.array(label)
    print(label_arr.shape)

    # The node count is taken from the label file.
    num_nodes = label_arr.shape[0]
    adj = np.zeros((num_nodes, num_nodes), dtype=np.int8)
    with open(path + '.edge', 'r', encoding='utf-8') as f:
        for line in f:
            nodes = [int(k) for k in line.split()]
            if not nodes:
                continue
            adj[nodes[0], nodes[1]] = 1
    print(adj, adj.shape)

    return feature_arr, label_arr, adj


def load_data_2(dataset='cora'):
    """Load a raw-text graph dataset from the ``data_2`` directory.

    Three text files sharing the prefix ``data_2/<dataset>`` are read:

    * ``.text``  - one node per line; the first whitespace-separated token
      is dropped and the remainder is the document text;
    * ``.label`` - one node per line; the second token is the integer
      class id;
    * ``.edge``  - one edge per line, two integer node indices.

    Documents are turned into feature vectors by ``text_embedding``. Blank
    lines are skipped. The array shapes and the adjacency matrix are printed
    as progress output.

    Args:
        dataset: File-name prefix of the dataset inside ``data_2``.

    Returns:
        Tuple ``(feature_arr, label_arr, adj)``. ``feature_arr`` is whatever
        ``text_embedding`` returns for the documents, ``label_arr`` has
        shape ``(num_nodes, 1)`` and ``adj`` is a ``(num_nodes, num_nodes)``
        int8 matrix with ``adj[a, b] = 1`` for each edge line ``a b`` (the
        reverse direction is not filled in).
    """
    # A forward slash is accepted on every platform; the previous hard-coded
    # backslash only resolved to a sub-directory on Windows.
    path = 'data_2/{}'.format(dataset)

    text = []
    with open(path + '.text', 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split(maxsplit=1)
            if not parts:
                continue
            # A line holding only the leading token becomes an empty document
            # so that rows stay aligned with the label file.
            text.append(parts[1] if len(parts) > 1 else '')
    feature_arr = text_embedding(text)
    print(feature_arr.shape)

    with open(path + '.label', 'r', encoding='utf-8') as f:
        label = [[int(line.split()[1])] for line in f if line.strip()]
    label_arr = np.array(label)
    print(label_arr.shape)

    # The node count is taken from the label file.
    num_nodes = label_arr.shape[0]
    adj = np.zeros((num_nodes, num_nodes), dtype=np.int8)
    with open(path + '.edge', 'r', encoding='utf-8') as f:
        for line in f:
            nodes = [int(k) for k in line.split()]
            if not nodes:
                continue
            adj[nodes[0], nodes[1]] = 1
    print(adj, adj.shape)

    return feature_arr, label_arr, adj


if __name__ == "__main__":
    # Manual smoke test: running this module directly loads the default
    # dataset through the raw-text loader (the pre-vectorised loader call is
    # left here, disabled, for quick switching).
    # load_data_1()
    load_data_2()

ModuleNotFoundError: No module named 'sklearn'

# 模型

In [4]:
# %load model.py
from collections import OrderedDict

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F


class MLP(nn.Module):
    """Two-layer perceptron: Linear -> ReLU -> Linear, returning raw logits.

    ``num_layers`` and ``dropout`` are accepted by the constructor but are
    not used: the network always consists of exactly two linear layers and
    applies no dropout.
    """

    def __init__(self, in_size, hidden_size, out_size, num_layers=3, dropout=0.5):
        super().__init__()

        self.layer_1 = nn.Linear(in_size, hidden_size, bias=True)
        self.layer_2 = nn.Linear(hidden_size, out_size, bias=True)

        self.weights_init()

    def weights_init(self):
        """Re-initialise every linear layer: Xavier-uniform weights, zero biases."""
        linear_layers = (m for m in self.modules() if isinstance(m, nn.Linear))
        for layer in linear_layers:
            nn.init.xavier_uniform_(layer.weight)
            if layer.bias is not None:
                nn.init.zeros_(layer.bias)

    def forward(self, feature):
        """Return unnormalised class scores for ``feature`` (no softmax applied)."""
        hidden = self.layer_1(feature).relu()
        return self.layer_2(hidden)


class GNN(nn.Module):
    """Two-layer graph network that aggregates over ``adj`` before each layer.

    The forward pass computes
    ``layer_2(adj @ relu(layer_1(adj @ x)))`` and returns raw logits.
    """

    def __init__(self, in_size, hidden_size, out_size):
        super(GNN, self).__init__()
        self.layer_1 = nn.Linear(in_size, hidden_size, bias=True)
        self.layer_2 = nn.Linear(hidden_size, out_size, bias=True)

        # weights_init was defined but never invoked, so the layers silently
        # kept PyTorch's default initialisation; apply it as MLP does.
        self.weights_init()

    def weights_init(self):
        """Re-initialise every linear layer: Xavier-uniform weights, zero biases."""
        for m in self.modules():
            if isinstance(m, nn.Linear):
                torch.nn.init.xavier_uniform_(m.weight.data)
                if m.bias is not None:
                    m.bias.data.fill_(0.0)

    def forward(self, x, adj):
        """Return unnormalised class scores for every node.

        Args:
            x: 2-D node feature tensor with ``in_size`` columns.
            adj: 2-D tensor multiplied from the left onto ``x`` and onto the
                hidden activations (``torch.mm``), so it must be square with
                one row per node.
        """
        x_agg = torch.mm(adj, x)            # aggregate input features
        h = F.relu(self.layer_1(x_agg))

        h_agg = torch.mm(adj, h)            # aggregate hidden activations
        out = self.layer_2(h_agg)

        return out

# 文本分析

In [5]:
# %load text_analyse.py
import warnings
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

warnings.filterwarnings('ignore')

# Keyword arguments passed to CountVectorizer (see text_embedding).
params_count = {
    'analyzer': 'word',     # 'word' tokenises at word level, 'char' at character level (n-grams may span a space, e.g. "he is"), 'char_wb' at character level within word boundaries; default is 'word'
    'binary': False,        # boolean; if True every non-zero count is set to 1 (tf is only 0 or 1, i.e. absent / present)
    'decode_error': 'strict',
    'dtype': np.float64,    # numeric type of the output matrix
    'encoding': 'utf-8',
    'input': 'content',     # 'filename': items are names of files holding the text; 'file': items must have a 'read' method that is called to fetch the bytes; 'content': items are the text strings themselves
    'lowercase': True,      # boolean; whether to convert all characters to lowercase before counting
    'max_df': 1.0,          # ignore terms whose document frequency is above this value; a float in [0, 1] is a proportion of documents, an integer (>1) is an absolute document count; ignored when vocabulary is set
    'min_df': 1,            # ignore terms whose document frequency is below this value; a float in [0, 1] is a proportion of documents, an integer (>1) is an absolute document count; ignored when vocabulary is set
    'max_features': None,   # int or None (default); with an int, build the vocabulary from only the top max_features terms ordered by term frequency; ignored when vocabulary is set
    'ngram_range': (1, 2),  # lower and upper bound of n for the extracted n-grams, min_n <= n <= max_n
    'preprocessor': None,   # overrides the preprocessing (string transformation) stage while keeping tokenisation and n-gram generation; only applies when analyzer is not callable
    'stop_words': 'english',    # only applies when analyzer='word'; 'english' uses the built-in English stop-word list, a list supplies custom stop words, None (default) disables stop-word handling
    'strip_accents': None,
    'token_pattern': '(?u)\\b\\w\\w+\\b',   # tokenisation regex; the default selects tokens of 2 or more alphanumeric characters (punctuation acts as a separator); only used when analyzer='word'
    'tokenizer': None,      # overrides the string tokenisation step while keeping preprocessing and n-gram generation; only applies when analyzer='word'
    'vocabulary': None,     # custom vocabulary (a dict may be given); if not given, the vocabulary is determined from the input documents
}
# Keyword arguments passed to TfidfTransformer (see text_embedding).
params_tfidf = {
    'norm': 'l2',           # output normalisation; 'l2': squared vector elements sum to 1 (the dot product of two such vectors is their cosine similarity); 'l1': absolute values of vector elements sum to 1
    'smooth_idf': True,     # smooth idf by adding 1 to document frequencies, which avoids a zero denominator
    'sublinear_tf': True,  # apply sublinear tf scaling, i.e. replace tf with 1 + log(tf)
    'use_idf': True,        # boolean; whether to compute idf, with False idf=1
}


def text_embedding(data):
    """Convert raw documents into dense TF-IDF feature vectors.

    The documents are first counted with ``CountVectorizer`` configured from
    the module-level ``params_count`` plus the overrides ``max_features=500``,
    ``max_df=0.8`` and ``min_df=0.01``, then re-weighted with
    ``TfidfTransformer`` configured from ``params_tfidf``. The learned
    vocabulary is printed.

    A custom tokenizer (for example a lemmatising one) can be plugged in by
    setting ``params_count['tokenizer']`` before calling this function.

    Args:
        data: Iterable of document strings.

    Returns:
        Dense 2-D ``numpy.ndarray`` with one row per document and one column
        per vocabulary term.
    """
    # Build a per-call copy so the shared module-level defaults are not
    # permanently altered by calling this function.
    count_kwargs = dict(params_count, max_features=500, max_df=0.8, min_df=0.01)

    cv = CountVectorizer(**count_kwargs)
    x_cv = cv.fit_transform(data)

    vocabulary = cv.get_feature_names_out()
    print(vocabulary)

    # TfidfTransformer accepts the sparse count matrix directly, so there is
    # no need to densify it first.
    tt = TfidfTransformer(**params_tfidf)
    x_tfidf = tt.fit_transform(x_cv)

    return x_tfidf.toarray()


if __name__ == "__main__":
    # Manual smoke test: embed four tiny documents and print the vocabulary.
    train_data = ["Chinese Beijing Chinese",
                  "Chinese Chinese Shanghai",
                  "Chinese Macao",
                  "Tokyo Japan Chinese"]
    text_embedding(train_data)


# gnn

In [6]:
# %load train_gnn.py
import numpy as np
import torch
import torch.nn.functional as F
import torch.optim as optim

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from load_data import load_data_1, load_data_2, row_normalize
from model import GNN

# ---- Data -------------------------------------------------------------
dataset = 'cora'
feature, label, adj = load_data_2(dataset)
# NOTE(review): cross_entropy below requires class ids in 0..num_classes-1;
# this assumes the label file is encoded that way - confirm.
num_classes = len(np.unique(label))
adj = row_normalize(adj)

# 60/40 split over node indices; the whole graph is always fed to the model
# and the split only selects which rows enter the loss / the metric.
idx_train, idx_test, _, _ = train_test_split(
    torch.LongTensor(np.arange(label.shape[0])), label, test_size=0.4, random_state=2333)
print(idx_train, idx_test)

adj = torch.FloatTensor(adj)
feature = torch.FloatTensor(feature)
label = torch.LongTensor(label).flatten()

# ---- Model ------------------------------------------------------------
model = GNN(in_size=feature.shape[1], hidden_size=64, out_size=num_classes)
optimizer = optim.Adam(model.parameters(), lr=0.01)

# ---- Training (full-batch) ---------------------------------------------
for epoch in range(100):
    model.train()
    optimizer.zero_grad()

    output = model(feature, adj)
    loss_train = F.cross_entropy(output[idx_train], label[idx_train])

    # Predictions are only used for the metric, so detach them from the
    # graph and hand plain NumPy arrays to scikit-learn.
    _, pred = torch.max(output.detach(), dim=1)
    acc_train = accuracy_score(label[idx_train].numpy(), pred[idx_train].numpy())

    print('epoch:{:3d}: | loss:{:1.5f} | acc:{:.3f}'.format(epoch, loss_train.item(), acc_train))
    loss_train.backward()
    optimizer.step()

# ---- Evaluation ---------------------------------------------------------
model.eval()
with torch.no_grad():   # no gradients are needed for inference
    output = model(feature, adj)

_, output = torch.max(output, dim=1)
acc_test = accuracy_score(label[idx_test].numpy(), output[idx_test].numpy())
print(acc_test)



# mlp 

In [7]:
# %load train_mlp.py
import numpy as np
import torch
import torch.nn.functional as F
import torch.optim as optim

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from load_data import load_data_1, load_data_2
from model import MLP

# ---- Data -------------------------------------------------------------
dataset = 'cora'
feature, label, _ = load_data_2(dataset)   # the adjacency matrix is not used by the MLP
# NOTE(review): cross_entropy below requires class ids in 0..num_classes-1;
# this assumes the label file is encoded that way - confirm.
num_classes = len(np.unique(label))

# 60/40 split of the feature rows and their labels.
x_train, x_test, y_train, y_test = train_test_split(feature, label, test_size=0.4, random_state=2333)
print(y_train.shape, y_test.shape)
y_train = y_train.flatten()
y_test = y_test.flatten()

x_train, x_test = [torch.FloatTensor(k) for k in [x_train, x_test]]
y_train, y_test = [torch.LongTensor(k) for k in [y_train, y_test]]
print(y_train, y_test)

# ---- Model ------------------------------------------------------------
model = MLP(in_size=x_train.shape[1], hidden_size=64, out_size=num_classes)
optimizer = optim.Adam(model.parameters(), lr=0.01)

# ---- Training (full-batch) ---------------------------------------------
for epoch in range(100):
    model.train()
    optimizer.zero_grad()

    output = model(x_train)
    loss_train = F.cross_entropy(output, y_train)

    # Predictions are only used for the metric, so detach them from the
    # graph and hand plain NumPy arrays to scikit-learn.
    _, pred = torch.max(output.detach(), dim=1)
    acc_train = accuracy_score(y_train.numpy(), pred.numpy())

    print('epoch:{:3d}: | loss:{:1.5f} | acc:{:.3f}'.format(epoch, loss_train.item(), acc_train))
    loss_train.backward()
    optimizer.step()

# ---- Evaluation ---------------------------------------------------------
model.eval()
with torch.no_grad():   # no gradients are needed for inference
    output = model(x_test)

_, output = torch.max(output, dim=1)
acc_test = accuracy_score(y_test.numpy(), output.numpy())
print(acc_test)

