In [117]:
# pytorch mlp for multiclass classification
from numpy import vstack
from numpy import argmax
from pandas import read_csv
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_absolute_error
from torch import Tensor
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.utils.data import random_split
from torch.nn import Linear
from torch.nn import ReLU
from torch.nn import Softmax
from torch.nn import Module
from torch.optim import SGD
from torch.nn import CrossEntropyLoss
from torch.nn import L1Loss
from torch.nn import functional
from torch.nn import InstanceNorm1d
from torch.nn.init import kaiming_uniform_
from torch.nn.init import xavier_uniform_
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer


In [59]:
# train_data = pd.read_csv('data/train.csv')
# X_train, X_test, y_train, y_test = scsplit(train_data, train_data['retweet_count'], stratify=train_data['retweet_count'], train_size=0.7, test_size=0.3)
# X_train = X_train.drop(['text','hashtags','urls','user_mentions','retweet_count'], axis=1) # only numerical values
# X_test = X_test.drop(['text','hashtags','urls','user_mentions','retweet_count'], axis=1) # only numerical values

In [129]:
# dataset definition
class CSVDataset(Dataset):
    # load the dataset
    def __init__(self):
        # load the csv file as a dataframe
#         df = read_csv(path, header=None)
        df = read_csv('data/train.csv')
        # store the inputs and outputs
#         self.X = df.values[:, :-1]
#         self.y = df.values[:, -1]
#         vectorizer = TfidfVectorizer(max_features=100, stop_words='english')
#         self.X = vectorizer.fit_transform(df['text'])
        self.X = df.drop(['text','hashtags','urls','user_mentions','retweet_count'], axis=1)
        self.y = df['retweet_count']
        print(self.X)
        # ensure input data is floats
#         self.X = self.X.astype('float32')
#         self.y = self.y.astype('float32')
        self.X = self.X.values.astype(np.float32)
        self.y = self.y.values.astype(np.int)
#         print(self.y.shape)
#         print(self.y)
        # label encode target and ensure the values are floats
        self.y = self.y.reshape((len(self.y), 1))
        print("y:",self.y)
 
    # number of rows in the dataset
    def __len__(self):
        return len(self.X)
 
    # get a row at an index
    def __getitem__(self, idx):
        return [self.X[idx], self.y[idx]]
 
    # get indexes for train and test rows
    def get_splits(self, n_test=0.33):
        # determine sizes
        test_size = round(n_test * len(self.X))
        train_size = len(self.X) - test_size
        # calculate the split
        return random_split(self, [train_size, test_size])

# model definition
class MLP(Module):
    # define model elements
    def __init__(self, n_inputs):
        super(MLP, self).__init__()
        # input to first hidden layer
        self.hidden1 = Linear(n_inputs, 8)
#         xavier_uniform_(self.hidden1.weight)
        self.act1 = ReLU()
        # second hidden layer
        self.hidden2 = Linear(8, 4)
#         xavier_uniform_(self.hidden2.weight)
        self.act2 = ReLU()
        # third hidden layer and output
        self.hidden3 = Linear(4, 1)
#         xavier_uniform_(self.hidden3.weight)
 
    # forward propagate input
    def forward(self, X):
        # input to first hidden layer
        X = self.hidden1(X)
        X = self.act1(X)
         # second hidden layer
        X = self.hidden2(X)
        X = self.act2(X)
        # third hidden layer and output
        X = self.hidden3(X)
        return X
 
# prepare the dataset
def prepare_data():
    # load the dataset
    dataset = CSVDataset()
    # calculate split
    train, test = dataset.get_splits()
    # prepare data loaders
    train_dl = DataLoader(train, batch_size=10000, shuffle=True)
    test_dl = DataLoader(test, batch_size=1024, shuffle=False)
    return train_dl, test_dl
 
# train the model
def train_model(train_dl, model):
    # define the optimization
#     criterion = MSELoss()
    criterion = L1Loss()
    optimizer = SGD(model.parameters(), lr=0.01, momentum=0.9)
    # enumerate epochs
    for epoch in range(10):
        # enumerate mini batches
        print("Epoch: ", epoch)
        for i, (inputs, targets) in enumerate(train_dl):
#             print(inputs)
            # clear the gradients
            optimizer.zero_grad()
            # compute the model output
            yhat = model(inputs)
#             print(targets,yhat)
            # calculate loss
            loss = criterion(yhat, targets)
#             print(loss,'\n')
            # credit assignment
            loss.backward()
            # update model weights
            optimizer.step()
 
# evaluate the model
def evaluate_model(test_dl, model):
    predictions, actuals = list(), list()
    for i, (inputs, targets) in enumerate(test_dl):
        # evaluate the model on the test set
        yhat = model(inputs)
#         print(yhat)
        # retrieve numpy array
        yhat = yhat.detach().numpy()
        actual = targets.numpy()
        actual = actual.reshape((len(actual), 1))
        # store
        predictions.append(yhat)
        actuals.append(actual)
    predictions, actuals = vstack(predictions), vstack(actuals)
    # calculate mse
#     mse = mean_squared_error(actuals, predictions)
#     return mse
#     loss = L1Loss()
    print(actuals.shape,predictions.shape)
    print(actuals[:40],predictions[:40])
    mae = mean_absolute_error(actuals,predictions)
    return mae
 
# make a class prediction for one row of data
def predict(row, model):
    # convert row to data
    row = Tensor([row])
    # make prediction
    yhat = model(row)
    # retrieve numpy array
    yhat = yhat.detach().numpy()
    return yhat

# make a single prediction (expect class=1)
# row = [0.00632,18.00,2.310,0,0.5380,6.5750,65.20,4.0900,1,296.0,15.30,396.90,4.98]
# yhat = predict(row, model)
# print('Predicted: %.3f' % yhat)

In [125]:
train_dl, test_dl = prepare_data()
print(len(train_dl.dataset), len(test_dl.dataset))

model = MLP(6)

train_model(train_dl, model)

            id      timestamp  user_verified  user_statuses_count  \
0            0  1588696955143          False                68460   
1            1  1588464948124          False                  309   
2            2  1588634673360          False                 3241   
3            3  1588433158672          False                32327   
4            4  1588582751599          False                  581   
...        ...            ...            ...                  ...   
665772  665772  1588412684317          False                65355   
665773  665773  1588324521711          False                 1807   
665774  665774  1588353174952          False                  888   
665775  665775  1588691378352          False                  452   
665776  665776  1588432578764          False                  590   

        user_followers_count  user_friends_count  
0                       1101                1226  
1                         51                 202  
2                 

In [131]:
# evaluate the model
mae = evaluate_model(test_dl, model)
print('MAE: %.3f' % (mse))

(219706, 1) (219706, 1)
[[  0]
 [  1]
 [  0]
 [  6]
 [  0]
 [  0]
 [  0]
 [  0]
 [  0]
 [  7]
 [  0]
 [  0]
 [  3]
 [  1]
 [  0]
 [  0]
 [195]
 [  0]
 [  0]
 [  0]
 [  0]
 [  0]
 [  0]
 [ 78]
 [ 15]
 [  0]
 [  0]
 [312]
 [553]
 [ 22]
 [  7]
 [  0]
 [  0]
 [  0]
 [  0]
 [  0]
 [  0]
 [  0]
 [  0]
 [  0]] [[-0.00366375]
 [-0.00366375]
 [-0.00366375]
 [-0.00366375]
 [-0.00366375]
 [-0.00366375]
 [-0.00366375]
 [-0.00366375]
 [-0.00366375]
 [-0.00366375]
 [-0.00366375]
 [-0.00366375]
 [-0.00366375]
 [-0.00366375]
 [-0.00366375]
 [-0.00366375]
 [-0.00366375]
 [-0.00366375]
 [-0.00366375]
 [-0.00366375]
 [-0.00366375]
 [-0.00366375]
 [-0.00366375]
 [-0.00366375]
 [-0.00366375]
 [-0.00366375]
 [-0.00366375]
 [-0.00366375]
 [-0.00366375]
 [-0.00366375]
 [-0.00366375]
 [-0.00366375]
 [-0.00366375]
 [-0.00366375]
 [-0.00366375]
 [-0.00366375]
 [-0.00366375]
 [-0.00366375]
 [-0.00366375]
 [-0.00366375]]
MAE: 87.737


In [102]:
### ONLINE TEST

from numpy import vstack
from numpy import sqrt
from pandas import read_csv
from sklearn.metrics import mean_squared_error
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.utils.data import random_split
from torch import Tensor
from torch.nn import Linear
from torch.nn import Sigmoid
from torch.nn import Module
from torch.optim import SGD
from torch.nn import MSELoss
from torch.nn.init import xavier_uniform_

# dataset definition
class CSVDataset(Dataset):
    # load the dataset
    def __init__(self, path):
        # load the csv file as a dataframe
        df = read_csv(path, header=None)
        # store the inputs and outputs
        self.X = df.values[:, :-1].astype('float32')
        self.y = df.values[:, -1].astype('float32')
        # ensure target has the right shape
        print(self.X)
        print(self.y)
        self.y = self.y.reshape((len(self.y), 1))

    # number of rows in the dataset
    def __len__(self):
        return len(self.X)

    # get a row at an index
    def __getitem__(self, idx):
        return [self.X[idx], self.y[idx]]

    # get indexes for train and test rows
    def get_splits(self, n_test=0.33):
        # determine sizes
        test_size = round(n_test * len(self.X))
        train_size = len(self.X) - test_size
        # calculate the split
        return random_split(self, [train_size, test_size])

# model definition
class MLP(Module):
    # define model elements
    def __init__(self, n_inputs):
        super(MLP, self).__init__()
        # input to first hidden layer
        self.hidden1 = Linear(n_inputs, 10)
        xavier_uniform_(self.hidden1.weight)
        self.act1 = Sigmoid()
        # second hidden layer
        self.hidden2 = Linear(10, 8)
        xavier_uniform_(self.hidden2.weight)
        self.act2 = Sigmoid()
        # third hidden layer and output
        self.hidden3 = Linear(8, 1)
        xavier_uniform_(self.hidden3.weight)

    # forward propagate input
    def forward(self, X):
        # input to first hidden layer
        X = self.hidden1(X)
        X = self.act1(X)
         # second hidden layer
        X = self.hidden2(X)
        X = self.act2(X)
        # third hidden layer and output
        X = self.hidden3(X)
        return X

# prepare the dataset
def prepare_data(path):
    # load the dataset
    dataset = CSVDataset(path)
    # calculate split
    train, test = dataset.get_splits()
    # prepare data loaders
    train_dl = DataLoader(train, batch_size=32, shuffle=True)
    test_dl = DataLoader(test, batch_size=1024, shuffle=False)
    return train_dl, test_dl

# train the model
def train_model(train_dl, model):
    # define the optimization
    criterion = MSELoss()
    optimizer = SGD(model.parameters(), lr=0.01, momentum=0.9)
    # enumerate epochs
    for epoch in range(100):
        # enumerate mini batches
        print("epoch: ",epoch)
        for i, (inputs, targets) in enumerate(train_dl):
            # clear the gradients
            optimizer.zero_grad()
            # compute the model output
            yhat = model(inputs)
            # calculate loss
#             print(targets,yhat)
            loss = criterion(yhat, targets)
            # credit assignment
            loss.backward()
            # update model weights
            optimizer.step()

# evaluate the model
def evaluate_model(test_dl, model):
    predictions, actuals = list(), list()
    for i, (inputs, targets) in enumerate(test_dl):
        # evaluate the model on the test set
        yhat = model(inputs)
        # retrieve numpy array
        yhat = yhat.detach().numpy()
        actual = targets.numpy()
        actual = actual.reshape((len(actual), 1))
        # store
        predictions.append(yhat)
        actuals.append(actual)
    predictions, actuals = vstack(predictions), vstack(actuals)
    # calculate mse
    mse = mean_squared_error(actuals, predictions)
    return mse

# make a class prediction for one row of data
def predict(row, model):
    # convert row to data
    row = Tensor([row])
    # make prediction
    yhat = model(row)
    # retrieve numpy array
    yhat = yhat.detach().numpy()
    return yhat

# prepare the data
path = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/housing.csv'
train_dl, test_dl = prepare_data(path)
print(len(train_dl.dataset), len(test_dl.dataset))

# define the network
model = MLP(13)
# train the model
train_model(train_dl, model)
# evaluate the model
mse = evaluate_model(test_dl, model)
print('MSE: %.3f, RMSE: %.3f' % (mse, sqrt(mse)))
# make a single prediction (expect class=1)
row = [0.00632,18.00,2.310,0,0.5380,6.5750,65.20,4.0900,1,296.0,15.30,396.90,4.98]
yhat = predict(row, model)
print('Predicted: %.3f' % yhat)

[[6.3200e-03 1.8000e+01 2.3100e+00 ... 1.5300e+01 3.9690e+02 4.9800e+00]
 [2.7310e-02 0.0000e+00 7.0700e+00 ... 1.7800e+01 3.9690e+02 9.1400e+00]
 [2.7290e-02 0.0000e+00 7.0700e+00 ... 1.7800e+01 3.9283e+02 4.0300e+00]
 ...
 [6.0760e-02 0.0000e+00 1.1930e+01 ... 2.1000e+01 3.9690e+02 5.6400e+00]
 [1.0959e-01 0.0000e+00 1.1930e+01 ... 2.1000e+01 3.9345e+02 6.4800e+00]
 [4.7410e-02 0.0000e+00 1.1930e+01 ... 2.1000e+01 3.9690e+02 7.8800e+00]]
[24.  21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 15.  18.9 21.7 20.4
 18.2 19.9 23.1 17.5 20.2 18.2 13.6 19.6 15.2 14.5 15.6 13.9 16.6 14.8
 18.4 21.  12.7 14.5 13.2 13.1 13.5 18.9 20.  21.  24.7 30.8 34.9 26.6
 25.3 24.7 21.2 19.3 20.  16.6 14.4 19.4 19.7 20.5 25.  23.4 18.9 35.4
 24.7 31.6 23.3 19.6 18.7 16.  22.2 25.  33.  23.5 19.4 22.  17.4 20.9
 24.2 21.7 22.8 23.4 24.1 21.4 20.  20.8 21.2 20.3 28.  23.9 24.8 22.9
 23.9 26.6 22.5 22.2 23.6 28.7 22.6 22.  22.9 25.  20.6 28.4 21.4 38.7
 43.8 33.2 27.5 26.5 18.6 19.3 20.1 19.5 19.5 20.4 19.8 19.