# Summary
> The whole project can be seperated to two parts: Data preprocessing, model training and testing.

## Data Preprocessing
The first step is to select related features manually. We find target are two engagement features: favorite_count and retweet_count (quote_count and reply_count are all missing). And the features we need are mainly three parts: 1)Tweet attributes, like the text content, the account information. 2)Retweet information. 3) Quote information.

The Second step is data transformation. Like we change binary label (True,False) to (0,1). And normalization.

The Third step is to transform text by sentiment analysis. Here I use Textblob library.

## Model training
I use a 4 layers fully-connected neuron network. Use L2 regurilization as loss function and SGDM methon as optimizer.





In [1]:
import torch 
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import os 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import csv
import re
from collections import Counter



myseed = 10  # set a random seed for reproducibility
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
np.random.seed(myseed)
torch.manual_seed(myseed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(myseed)

Features:'created_at','screen_name','text'(SA),'display_text_width','lang', 'is_quote', 'is_retweet','quoted_text'(SA),'quoted_favorite_count','quoted_retweet_count','quoted_followers_count','quoted_friends_count','quoted_statuses_count','quoted_verified','retweet_text'(SA),'retweet_favorite_count', 'retweet_retweet_count','retweet_followers_count', 'retweet_friends_count', 'retweet_statuses_count', 'retweet_verified','followers_count', 'friends_count', 'listed_count', 'statuses_count','verified'


Target: 'favorite_count','retweet_count','quote_count'(All Missing),'reply_count'(All Missing),'favourites_count'

Features:'created_at','screen_name','text'(SA),'display_text_width','lang', 'is_quote', 'is_retweet','quoted_text'(SA),'quoted_favorite_count','quoted_retweet_count','quoted_followers_count','quoted_friends_count','quoted_statuses_count','quoted_verified','retweet_text'(SA),'retweet_favorite_count', 'retweet_retweet_count','retweet_followers_count', 'retweet_friends_count', 'retweet_statuses_count', 'retweet_verified','followers_count', 'friends_count', 'listed_count', 'statuses_count','verified'


Target: 'favorite_count','retweet_count','quote_count'(All Missing),'reply_count'(All Missing),'favourites_count'

In [12]:
full_data=pd.read_csv('random_sample.csv')
use_label=['text','display_text_width','lang', 'is_quote', 'is_retweet','quoted_text','quoted_favorite_count','quoted_retweet_count','quoted_followers_count','quoted_friends_count','quoted_statuses_count','quoted_verified','retweet_text','retweet_favorite_count', 'retweet_retweet_count','retweet_followers_count', 'retweet_friends_count', 'retweet_statuses_count', 'retweet_verified','followers_count', 'friends_count', 'listed_count', 'statuses_count','verified']
use_data=full_data[use_label]
binary_label=['is_quote','is_retweet','verified']

Data Preprocessing:
Text (text,quoted_text,retweet_text):

    1)NaN to 0

    2)T F to 1 0

    3)Remove emoji

    4)Removes all special characters and numericals leaving the alphabets

    5)Tokenization

    6)Converting each token into a tuple having the form

    7)Remove stopwords

    8)Obtaining the stem words

In [14]:
use_data=use_data.fillna(0)
use_data[binary_label]=use_data[binary_label].astype(int)

In [25]:
lang_onehot=y = pd.get_dummies(use_data['lang'], prefix='lang')

In [35]:
onehot_column=list(lang_onehot.columns)

In [45]:
def emoji_remove(text):
    emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags=re.UNICODE)
    if text == 0:
        return 0
    return emoji_pattern.sub(r'', text)

use_data['text']=use_data['text'].apply(emoji_remove)
use_data['quoted_text']=use_data['quoted_text'].apply(emoji_remove)
use_data['retweet_text']=use_data['retweet_text'].apply(emoji_remove)

In [55]:
from nltk.tokenize import word_tokenize
def token_stop_pos(text):
    return word_tokenize(text)

import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk import pos_tag
nltk.download('stopwords')
from nltk.corpus import stopwords
nltk.download('wordnet')
from nltk.corpus import wordnet

# POS tagger dictionary
pos_dict = {'J':wordnet.ADJ, 'V':wordnet.VERB, 'N':wordnet.NOUN, 'R':wordnet.ADV}

def token_stop_pos(text):
    if text == 0:
        return 0
    tags = pos_tag(word_tokenize(text))
    newlist = []
    for word, tag in tags:
        if word.lower() not in set(stopwords.words('english')):
            newlist.append(tuple([word, pos_dict.get(tag[0])]))
    return newlist

from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
def lemmatize(pos_data):
    if pos_data == 0:
        return 0
    lemma_rew = " "
    for word, pos in pos_data:
        if not pos:
            lemma = word
            lemma_rew = lemma_rew + " " + lemma
        else:
            lemma = wordnet_lemmatizer.lemmatize(word, pos=pos)
            lemma_rew = lemma_rew + " " + lemma
    return lemma_rew

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\RmmLeo10\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\RmmLeo10\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\RmmLeo10\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [58]:
use_data['text']=use_data['text'].apply(token_stop_pos).apply(lemmatize)
use_data['quoted_text']=use_data['quoted_text'].apply(token_stop_pos).apply(lemmatize)
use_data['retweet_text']=use_data['retweet_text'].apply(token_stop_pos).apply(lemmatize)

Get the sentiment: 

    +1 for Positive

    0 for Neutral

    -1 for Negative

In [63]:
from textblob import TextBlob
def getPolarity(lemma):
    if lemma==0:
        return 0
    return TextBlob(lemma).sentiment.polarity

# function to analyze the reviews
def analysis(score):
    if score < 0:
        return -1
    elif score == 0:
        return 0
    else:
        return 1
    
use_data['text'] = use_data['text'].apply(getPolarity) 
use_data['text'] = use_data['text'].apply(analysis)
use_data['quoted_text'] = use_data['quoted_text'].apply(getPolarity) 
use_data['quoted_text'] = use_data['quoted_text'].apply(analysis)
use_data['retweet_text'] = use_data['retweet_text'].apply(getPolarity) 
use_data['retweet_text'] = use_data['retweet_text'].apply(analysis)


In [101]:
lang_onehot=y = pd.get_dummies(use_data['lang'], prefix='lang')
onehot_column=list(lang_onehot.columns)
final_data = pd.concat([use_data, lang_onehot], axis=1)
final_column=use_label+onehot_column
final_column.remove('lang')
final_data=final_data[final_column].astype(int)

In [94]:
final_column=use_label+onehot_column

In [95]:
final_column.remove('lang')

In [116]:
target_label=['favorite_count','retweet_count','favourites_count'] 
target_data=full_data[target_label]
target_data['favorite_count']=target_data['favorite_count']+target_data['favourites_count']
target_label=['favorite_count','retweet_count'] 
target_data=target_data[target_label]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  target_data['favorite_count']=target_data['favorite_count']+target_data['favourites_count']


In [119]:
final_column=final_column+target_label
final_data = pd.concat([final_data, target_data], axis=1)

Split to Train and Test, test_size=0.2
save to csv file

In [124]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(final_data, test_size=0.2)

In [127]:
train.to_csv('train.csv',index=False)  
test.to_csv('test.csv',index=False)  

DNN

In [5]:
tr_path = 'train.csv'  # path to training data
tt_path = 'test.csv'   # path to testing data

In [113]:
def get_device():
    ''' Get device (if GPU is available, use GPU) '''
    return 'cuda' if torch.cuda.is_available() else 'cpu'

def plot_learning_curve(loss_record, title=''):
    ''' Plot learning curve of your DNN (train & dev loss) '''
    total_steps = len(loss_record['train'])
    x_1 = range(total_steps)
    x_2 = x_1[::len(loss_record['train']) // len(loss_record['dev'])]
    figure(figsize=(6, 4))
    plt.plot(x_1, loss_record['train'], c='tab:red', label='train')
    plt.plot(x_2, loss_record['dev'], c='tab:cyan', label='dev')
    plt.ylim(0.0, 5000)
    plt.xlabel('Training steps')
    plt.ylabel('MSE loss')
    plt.title('Learning curve of {}'.format(title))
    plt.legend()
    plt.show()


def plot_pred(dv_set, model, device, lim=35., preds=None, targets=None):
    ''' Plot prediction of your DNN '''
    if preds is None or targets is None:
        model.eval()
        preds, targets = [], []
        for x, y in dv_set:
            x, y = x.to(device), y.to(device)
            with torch.no_grad():
                pred = model(x)
                preds.append(pred.detach().cpu())
                targets.append(y.detach().cpu())
        preds = torch.cat(preds, dim=0).numpy()
        targets = torch.cat(targets, dim=0).numpy()

    figure(figsize=(5, 5))
    plt.scatter(targets, preds, c='r', alpha=0.5)
    plt.plot([-0.2, lim], [-0.2, lim], c='b')
    plt.xlim(-0.2, lim)
    plt.ylim(-0.2, lim)
    plt.xlabel('ground truth value')
    plt.ylabel('predicted value')
    plt.title('Ground Truth v.s. Prediction')
    plt.show()

In [78]:
class TweetDataset(Dataset):
    ''' Dataset for loading and preprocessing the COVID19 dataset '''
    def __init__(self,
                 path,
                 mode='train',
                 target_only=False):
        self.mode = mode

        # Read data into numpy arrays
        with open(path, 'r') as fp:
            data = list(csv.reader(fp))
            data = np.array(data[1:])[:,:].astype(float)
        
        if not target_only:
            feats = list(range(40))
        """else:
            TODO: Using 40 states & 2 tested_positive features (indices = 57 & 75)
            pass"""

        if mode == 'test':
            # Testing data
            # data: 893 x 93 (40 states + day 1 (18) + day 2 (18) + day 3 (17))
            data = data[:, feats]
            self.data = torch.FloatTensor(data)
        else:
            # Training data (train/dev sets)
            # data: 2700 x 94 (40 states + day 1 (18) + day 2 (18) + day 3 (18))
            target = data[:, [40,41]]
            data = data[:, feats]
            
            # Splitting training data into train & dev sets
            if mode == 'train':
                indices = [i for i in range(len(data)) if i % 10 != 0]
            elif mode == 'dev':
                indices = [i for i in range(len(data)) if i % 10 == 0]
            
            # Convert data into PyTorch tensors
            self.data = torch.FloatTensor(data[indices])
            self.target = torch.FloatTensor(target[indices])

        # Normalize features (you may remove this part to see what will happen)
        num_features=[1,5,6,7,8,9,12,13,14,15,16,18,19,20,21]
        for i in num_features:
             self.data[:, i] = \
            (self.data[:, i] - self.data[:, i].mean(dim=0, keepdim=True)) \
            / self.data[:, i].std(dim=0, keepdim=True)

        self.dim = self.data.shape[1]

        print('Finished reading the {} set of Tweet Dataset ({} samples found, each dim = {})'
              .format(mode, len(self.data), self.dim))

    def __getitem__(self, index):
        # Returns one sample at a time
        if self.mode in ['train', 'dev']:
            # For training
            return self.data[index], self.target[index]
        else:
            # For testing (no target)
            return self.data[index]

    def __len__(self):
        # Returns the size of the dataset
        return len(self.data)

DataLoader

In [79]:
def prep_dataloader(path, mode, batch_size, n_jobs=0, target_only=False):
    ''' Generates a dataset, then is put into a dataloader. '''
    dataset = TweetDataset(path, mode=mode, target_only=target_only)  # Construct dataset
    dataloader = DataLoader(
        dataset, batch_size,
        shuffle=(mode == 'train'), drop_last=False,
        num_workers=n_jobs, pin_memory=True)                            # Construct dataloader
    return dataloader

DNN

In [128]:
class NeuralNet(nn.Module):
    ''' A simple fully-connected deep neural network '''
    def __init__(self, input_dim):
        super(NeuralNet, self).__init__()

        # Define your neural network here
        # TODO: How to modify this model to achieve better performance?
        self.net = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 2),
        )

        # Mean squared error loss
        self.criterion = nn.MSELoss(reduction='mean')

    def forward(self, x):
        ''' Given input of size (batch_size x input_dim), compute output of the network '''
        return self.net(x).squeeze(1)

    def cal_loss(self, pred, target):
        ''' Calculate loss '''
        regularization_loss = 0
        for param in model.parameters():
        # L2 Regularization
            regularization_loss += torch.sum(param ** 2)
        return self.criterion(pred, target) + 0.00075 * regularization_loss

Training

In [130]:
def train(tr_set, dv_set, model, config, device):
    ''' DNN training '''

    n_epochs = config['n_epochs']  # Maximum number of epochs

    # Setup optimizer
    optimizer = getattr(torch.optim, config['optimizer'])(
        model.parameters(), **config['optim_hparas'])

    min_mse = 1000.
    loss_record = {'train': [], 'dev': []}      # for recording training loss
    early_stop_cnt = 0
    epoch = 0
    while epoch < n_epochs:
        model.train()                           # set model to training mode
        for x, y in tr_set:                     # iterate through the dataloader
            optimizer.zero_grad()               # set gradient to zero
            x, y = x.to(device), y.to(device)
            print(x)# move data to device (cpu/cuda)
            pred = model(x)                     # forward pass (compute output)
            mse_loss = model.cal_loss(pred, y)  # compute loss
            mse_loss.backward()                 # compute gradient (backpropagation)
            optimizer.step()                    # update model with optimizer
            loss_record['train'].append(mse_loss.detach().cpu().item())

        # After each epoch, test your model on the validation (development) set.
        dev_mse = dev(dv_set, model, device)
        if dev_mse < min_mse:
            # Save model if your model improved
            min_mse = dev_mse
            print('Saving model (epoch = {:4d}, loss = {:.4f})'
                .format(epoch + 1, min_mse))
            torch.save(model.state_dict(), config['save_path'])  # Save model to specified path
            early_stop_cnt = 0
        else:
            early_stop_cnt += 1

        epoch += 1
        loss_record['dev'].append(dev_mse)
        if early_stop_cnt > config['early_stop']:
            # Stop training if your model stops improving for "config['early_stop']" epochs.
            break

    print('Finished training after {} epochs'.format(epoch))
    return min_mse, loss_record

Validation

In [81]:
def dev(dv_set, model, device):
    model.eval()                                # set model to evalutation mode
    total_loss = 0
    for x, y in dv_set:                         # iterate through the dataloader
        x, y = x.to(device), y.to(device)       # move data to device (cpu/cuda)
        with torch.no_grad():                   # disable gradient calculation
            pred = model(x)                     # forward pass (compute output)
            mse_loss = model.cal_loss(pred, y)  # compute loss
        total_loss += mse_loss.detach().cpu().item() * len(x)  # accumulate loss
    total_loss = total_loss / len(dv_set.dataset)              # compute averaged loss

    return total_loss

Testing

In [82]:
def test(tt_set, model, device):
    model.eval()                                # set model to evalutation mode
    preds = []
    for x in tt_set:                            # iterate through the dataloader
        x = x.to(device)                        # move data to device (cpu/cuda)
        with torch.no_grad():                   # disable gradient calculation
            pred = model(x)                     # forward pass (compute output)
            preds.append(pred.detach().cpu())   # collect prediction
    preds = torch.cat(preds, dim=0).numpy()     # concatenate all predictions and convert to a numpy array
    return preds

Hyperparameters

In [None]:
device = get_device()                 # get the current available device ('cpu' or 'cuda')
os.makedirs('models', exist_ok=True)  # The trained model will be saved to ./models/
target_only = False                   # TODO: Using 40 states & 2 tested_positive features

# TODO: How to tune these hyper-parameters to improve your model's performance?
config = {
    'n_epochs': 5000,                # maximum number of epochs
    'batch_size': 200,               # mini-batch size for dataloader
    'optimizer': 'SGD',              # optimization algorithm (optimizer in torch.optim)
    'optim_hparas': {                # hyper-parameters for the optimizer (depends on which optimizer you are using)
        'lr': 0.0001,                 # learning rate of SGD
        'momentum': 0.9,
        'weight_decay': 1e-4
    },
    'early_stop': 300,               # early stopping epochs (the number epochs since your model's last improvement)
    'save_path': 'models/model.pth'  # your model will be saved here
}

In [87]:
tr_set = prep_dataloader(tr_path, 'train', config['batch_size'], target_only=target_only)
dv_set = prep_dataloader(tr_path, 'dev', config['batch_size'], target_only=target_only)
tt_set = prep_dataloader(tt_path, 'test', config['batch_size'], target_only=target_only)

Finished reading the train set of Tweet Dataset (18000 samples found, each dim = 40)
Finished reading the dev set of Tweet Dataset (2000 samples found, each dim = 40)
Finished reading the test set of Tweet Dataset (5000 samples found, each dim = 40)


In [131]:
model = NeuralNet(tr_set.dataset.dim).to(device)  # Construct model and move to device

In [124]:
model

NeuralNet(
  (net): Sequential(
    (0): Linear(in_features=40, out_features=64, bias=True)
    (1): ReLU()
    (2): Linear(in_features=64, out_features=128, bias=True)
    (3): ReLU()
    (4): Linear(in_features=128, out_features=64, bias=True)
    (5): ReLU()
    (6): Linear(in_features=64, out_features=2, bias=True)
  )
  (criterion): MSELoss()
)

In [None]:
model_loss, model_loss_record = train(tr_set, dv_set, model, config, device)

In [None]:
plot_learning_curve(model_loss_record, title='deep model')

In [105]:
def save_pred(preds, file):
    ''' Save predictions to specified file '''
    print('Saving results to {}'.format(file))
    with open(file, 'w') as fp:
        writer = csv.writer(fp)
        writer.writerow(['id', 'tested_positive'])
        for i, p in enumerate(preds):
            writer.writerow([i, p])

preds = test(tt_set, model, device)  # predict engagement with your model
save_pred(preds, 'pred.csv')         # save prediction file to pred.csv

Saving results to pred.csv
