In [1]:
import numpy as np
import torch
import seaborn as sns
from torch_geometric.data import Data
import torch_geometric.transforms as T
import pandas as pd
import torch_geometric
import torch.nn.functional as F
from sklearn.preprocessing import StandardScaler, MinMaxScaler

### **Чтение данных**

In [2]:
dataframe = pd.read_csv('train.csv')

In [3]:
att = pd.read_csv('attr.csv')

### **Проверка набора данных**

In [4]:
dataframe.head()

Unnamed: 0,ego_id,u,v,t,x1,x2,x3
0,0,131,84,148.0,5.6692e-07,0.0,0.0
1,0,135,164,396.7,0.06246274,0.0,0.0
2,0,47,15,,0.0,0.0,1.0
3,0,5,4,594.5,0.04962974,0.0,0.0
4,0,176,219,45.5,1.237935,0.0,0.0


In [5]:
att.head()

Unnamed: 0,ego_id,u,age,city_id,sex,school,university
0,0,227,68,-1,1,778293348,-1
1,0,45,38,237065842,1,82803468,238500268
2,0,142,60,237065842,1,196560139,-1
3,0,280,66,-1,2,963209731,720783270
4,0,41,18,-1,2,308862409,-1


In [4]:
att['u'] = att['u'].astype(str)
att['ego_id'] = att['ego_id'].astype(str)
att['u'] = att['u'] + '_' + att['ego_id']
att['u'] = att['u'].astype('category').cat.codes

### **Изменение индексов для категорий набора данных attribute (уменьшение числового диапазона)**

In [5]:
att['school'] = att['school'].astype(str).astype('category').cat.codes
att['university'] = att['university'].astype(str).astype('category').cat.codes
att['city_id'] = att['city_id'].astype(str).astype('category').cat.codes

### **Проверка на nan в тренировочном наборе данных**

In [7]:
print('ego_id nulls: ' + str(dataframe['ego_id'].isnull().sum()))
print('u nulls: ' + str(dataframe['u'].isnull().sum()))
print('v nulls: ' + str(dataframe['v'].isnull().sum()))
print('t nulls: ' + str(dataframe['t'].isnull().sum()))
print('x1 nulls: ' + str(dataframe['x1'].isnull().sum()))
print('x2 nulls: ' + str(dataframe['x2'].isnull().sum()))
print('x3 nulls: ' + str(dataframe['x3'].isnull().sum()))

ego_id nulls: 0
u nulls: 0
v nulls: 0
t nulls: 18717218
x1 nulls: 0
x2 nulls: 0
x3 nulls: 0


### **Заполнение nan признака t средним значением**

In [8]:
dataframe['t'] = dataframe['t'].fillna(dataframe['t'].mean())

### **Проверка на nan в наборе данных с атрибутами**

In [9]:
print('ego_id nulls: ' + str(att['ego_id'].isnull().sum()))
print('u nulls: ' + str(att['u'].isnull().sum()))
print('age nulls: ' + str(att['age'].isnull().sum()))
print('city_id nulls: ' + str(att['city_id'].isnull().sum()))
print('sex nulls: ' + str(att['sex'].isnull().sum()))
print('school nulls: ' + str(att['school'].isnull().sum()))
print('university nulls: ' + str(att['university'].isnull().sum()))

ego_id nulls: 0
u nulls: 0
age nulls: 0
city_id nulls: 0
sex nulls: 0
school nulls: 0
university nulls: 0


### **Создаём набор данных для обучения (все данные взять не получается из-за недостатка RAM)**

In [10]:
#2002538
train_dataframe = dataframe.iloc[0:3000000, ::]

### **Изменение ID для U и V (делаем уникальные неповторяющиеся айди всех эко-графов)**

In [11]:
train_dataframe['u'] = train_dataframe['u'].astype(str)
train_dataframe['v'] = train_dataframe['v'].astype(str)
train_dataframe['ego_id'] = train_dataframe['ego_id'].astype(str)

train_dataframe['u'] = train_dataframe['u'] + '_' + train_dataframe['ego_id']
train_dataframe['v'] = train_dataframe['v'] + '_' + train_dataframe['ego_id']

train_dataframe['u'] = train_dataframe['u'].astype('category').cat.codes
train_dataframe['v'] = train_dataframe['v'].astype('category').cat.codes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_dataframe['u'] = train_dataframe['u'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_dataframe['v'] = train_dataframe['v'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_dataframe['ego_id'] = train_dataframe['ego_id'].astype(str)
A value is trying to be set

### **Добавим вершины edge_index, признаки x_features для вершин и целевое значение y**

In [12]:
train_dataframe = train_dataframe.sort_values(by=['u'])
x_features=(att[att['u'].isin(pd.concat([train_dataframe['u'], train_dataframe['v']]).drop_duplicates().values)].sort_values(by=['u'])).loc[::, ['age', 'city_id', 'sex', 'school', 'university']].values
x_features = torch.tensor(x_features, dtype=torch.float)

In [13]:
#x_features = torch.tensor(train_dataframe.iloc[::, 7::].values, dtype=torch.float)
edge_index = torch.tensor(train_dataframe.iloc[::, 1:3].values, dtype=torch.long).permute(1, 0)
y_train = torch.tensor(train_dataframe.iloc[::, 4].values)
data = Data(x=x_features, edge_index=edge_index, y=y_train)

### **Добавим признаки для рёбер**

In [14]:
scaler = MinMaxScaler()
scaler.fit(train_dataframe['t'].values.reshape(-1,1))
train_dataframe['t'] = scaler.transform(train_dataframe['t'].values.reshape(-1,1))

In [15]:
data.edge_attr = torch.tensor(train_dataframe.loc[::, ['t', 'x2', 'x3']].values)

In [16]:
data.edge_attr.shape

torch.Size([3000000, 3])

In [17]:
# data = T.ToUndirected()(data)

train_data, val_data, test_data = T.RandomLinkSplit(
    num_val=0.1,
    num_test=0.1,
    neg_sampling_ratio=0.0)(data)

In [18]:
train_data

Data(x=[297736, 5], edge_index=[2, 2400000], y=[2400000], edge_attr=[2400000, 3], edge_label=[2400000], edge_label_index=[2, 2400000])

Модель GNN с использованием Attention


In [19]:
class GNNModel(torch.nn.Module):
    def __init__(self, input_features, hidden_channels):
        super().__init__()
        self.conv1 = torch_geometric.nn.GATConv(input_features, hidden_channels, heads=1)
        self.conv2 = torch_geometric.nn.GATConv(hidden_channels, hidden_channels, heads=1)
        self.conv3 = torch_geometric.nn.GATConv(hidden_channels, hidden_channels, heads=1)
        self.dropout = torch.nn.Dropout(0.5)
        self.lin1 = torch.nn.Linear(hidden_channels, 2*hidden_channels)
        self.lin2 = torch.nn.Linear(2*hidden_channels, 1)

    def forward(self, x, edge_index, edge_attr):
        x = self.conv1(x, edge_index, edge_attr).relu()
        x = self.conv2(x, edge_index, edge_attr).relu()
        features = self.conv3(x, edge_index, edge_attr).relu()
        row, column = edge_index
        features = features[row]
        x = self.dropout(features)
        x = self.lin1(x).relu()
        x = self.lin2(x)
        return x.view(-1)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = GNNModel(input_features=5, hidden_channels=32).to(device)
model = model.float()
print(model)

GNNModel(
  (conv1): GATConv(5, 32, heads=1)
  (conv2): GATConv(32, 32, heads=1)
  (conv3): GATConv(32, 32, heads=1)
  (dropout): Dropout(p=0.5, inplace=False)
  (lin1): Linear(in_features=32, out_features=64, bias=True)
  (lin2): Linear(in_features=64, out_features=1, bias=True)
)


In [20]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)

def train():
    model.train()
    optimizer.zero_grad()
    pred = model(train_data.x, train_data.edge_index, train_data.edge_attr)
    target = train_data.y.float()
    loss = torch.sqrt(F.mse_loss(pred, target))
    loss.backward()
    optimizer.step()
    return float(loss)

@torch.no_grad()
def test(data):
    data = data.to(device)
    model.eval()
    pred = model(data.x, data.edge_index, train_data.edge_attr)
    target = data.y
    loss = torch.sqrt(F.mse_loss(pred, target))
    return float(loss)

for epoch in range(1, 501):
    train_data = train_data.to(device)
    loss = train()
    train_rmse = test(train_data)
    val_rmse = test(val_data)
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Train: {train_rmse:.4f}, '
          f'Val: {val_rmse:.4f}')

Epoch: 001, Loss: 83855.0859, Train: 539845.0189, Val: 539845.0189
Epoch: 002, Loss: 589733.5000, Train: 147560.1688, Val: 147560.1688
Epoch: 003, Loss: 179277.5312, Train: 25868.2613, Val: 25868.2613
Epoch: 004, Loss: 39906.9922, Train: 1912.9634, Val: 1912.9634
Epoch: 005, Loss: 3023.2668, Train: 220.3778, Val: 220.3778
Epoch: 006, Loss: 588.7136, Train: 1395.2102, Val: 1395.2102
Epoch: 007, Loss: 2135.9824, Train: 19.7890, Val: 19.7890
Epoch: 008, Loss: 497.7990, Train: 51.8293, Val: 51.8293
Epoch: 009, Loss: 168.8239, Train: 1.5793, Val: 1.5793
Epoch: 010, Loss: 1.5793, Train: 1.5795, Val: 1.5795
Epoch: 011, Loss: 1.5795, Train: 1.5754, Val: 1.5754
Epoch: 012, Loss: 1.5754, Train: 1.5648, Val: 1.5648
Epoch: 013, Loss: 1.5648, Train: 1.5493, Val: 1.5493
Epoch: 014, Loss: 1.5493, Train: 1.5301, Val: 1.5301
Epoch: 015, Loss: 1.5301, Train: 1.5087, Val: 1.5087
Epoch: 016, Loss: 1.5087, Train: 1.4859, Val: 1.4859
Epoch: 017, Loss: 1.4859, Train: 1.4628, Val: 1.4628
Epoch: 018, Loss: 1.4

In [17]:
torch.save(model.state_dict(), 'model_weights.pth')