<a href="https://colab.research.google.com/github/gnoziere/cs230-ak-troll-detector/blob/main/Troll_Detector_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Load data from CSV files

In [15]:
!pip3 install transformers tweet-preprocessor emoji



In [16]:
import math
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

from transformers import AutoModel, AutoTokenizer

In [17]:
device = "cuda"

In [18]:
# Define paths
positive_data_path = "/content/drive/MyDrive/turkey_052020_tweets_csv_hashed_2011.csv"
negative_data_path = "/content/drive/MyDrive/turkey_052020_tweets_csv_hashed_2020_01.csv"
model_path = "dbmdz/distilbert-base-turkish-cased"

In [19]:
positive_raw_data = pd.read_csv(positive_data_path)
positive_raw_data["label"] = 1.0

negative_raw_data = pd.read_csv(negative_data_path)
negative_raw_data["label"] = 0.0

  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


In [20]:
positive_raw_data.columns

Index(['tweetid', 'userid', 'user_display_name', 'user_screen_name',
       'user_reported_location', 'user_profile_description',
       'user_profile_url', 'follower_count', 'following_count',
       'account_creation_date', 'account_language', 'tweet_language',
       'tweet_text', 'tweet_time', 'tweet_client_name', 'in_reply_to_userid',
       'in_reply_to_tweetid', 'quoted_tweet_tweetid', 'is_retweet',
       'retweet_userid', 'retweet_tweetid', 'latitude', 'longitude',
       'quote_count', 'reply_count', 'like_count', 'retweet_count', 'hashtags',
       'urls', 'user_mentions', 'label'],
      dtype='object')

In [39]:
positive_raw_data.shape

(35100, 31)

In [40]:
negative_raw_data.shape

(859516, 31)

In [99]:
merged_data = pd.concat([positive_raw_data, negative_raw_data], ignore_index=True)
merged_data = merged_data.sample(frac=1)[:100000]

In [100]:
merged_data.shape

(100000, 31)

# Pre-process data

In [101]:
merged_data = merged_data.drop([
    'latitude', # always 'absent' in positive dataset
    'longitude',
    'user_profile_url', # always 'NaN in positive dataset
    'in_reply_to_userid',
    'in_reply_to_tweetid',
    'quoted_tweet_tweetid',
    'retweet_userid',
    'retweet_tweetid',
    'quote_count', # can’t access these without a premium developer account
    'reply_count' 
], axis = 1)

In [102]:
merged_data.columns

Index(['tweetid', 'userid', 'user_display_name', 'user_screen_name',
       'user_reported_location', 'user_profile_description', 'follower_count',
       'following_count', 'account_creation_date', 'account_language',
       'tweet_language', 'tweet_text', 'tweet_time', 'tweet_client_name',
       'is_retweet', 'like_count', 'retweet_count', 'hashtags', 'urls',
       'user_mentions', 'label'],
      dtype='object')

In [103]:
merged_data.shape

(100000, 21)

In [104]:
import preprocessor as p
import emoji
import string

# remove emojis  
def remove_emoji(text):
    return emoji.get_emoji_regexp().sub(u'', text)

# remove url, smiley, hashtag, mention and reserved
p.set_options(p.OPT.URL, p.OPT.SMILEY, p.OPT.HASHTAG, p.OPT.MENTION, p.OPT.RESERVED)
for i in merged_data.index:
  text = merged_data.at[i, 'tweet_text']
  text = p.clean(str(text))
  text = remove_emoji(text)
  text = text.lower().replace('[^\w\s]',' ').replace('\s\s+', ' ').replace('\r', ' ').replace('\n', ' ');
  merged_data.at[i, 'tweet_text'] = text

  text = merged_data.at[i, 'user_display_name']
  text = p.clean(str(text))
  text = remove_emoji(text)
  text = text.lower().replace('[^\w\s]',' ').replace('\s\s+', ' ').replace('\r', ' ').replace('\n', ' ');
  merged_data.at[i, 'user_display_name'] = text

  text = merged_data.at[i, 'user_profile_description']
  text = p.clean(str(text))
  text = remove_emoji(text)
  text = text.lower().replace('[^\w\s]',' ').replace('\s\s+', ' ').replace('\r', ' ').replace('\n', ' ');
  merged_data.at[i, 'user_profile_description'] = text

In [105]:
# remove [,], ', NaN from hashtags, urls, and user_mentions
merged_data['hashtags'] = merged_data['hashtags'].str.replace('[','').str.replace(']','').str.replace("'",'').str.lower()
merged_data['urls'] = merged_data['urls'].str.replace('[','').str.replace(']','').str.replace("'",'')
merged_data['user_mentions'] = merged_data['user_mentions'].str.replace('[','').str.replace(']','').str.replace("'",'')

merged_data = merged_data.fillna('')

In [106]:
merged_data.shape

(100000, 21)

# Split sets

In [107]:
def split_data(data, train=0.98, test=0.01, eval=0.01):
  assert train + test + eval == 1.0

  num_examples = data.shape[0]
  data = data.sample(frac=1)

  train_data = data[:int(num_examples * train)]
  test_data = data[int(num_examples * train):int(num_examples * (train+test))]
  eval_data = data[int(num_examples * (train+test)):]  # sacred data

  return (train_data, test_data, eval_data)

In [108]:
train_data, test_data, eval_data = split_data(merged_data, train=0.8, test=0.1, eval=0.1)

In [109]:
train_data.shape

(80000, 21)

In [110]:
test_data.shape

(10000, 21)

In [111]:
eval_data.shape

(10000, 21)

In [112]:
# TODO: Transform columns into encodings
# TODO: Drop columns we still don't use

In [113]:
train_data.columns  # ground truth columns

Index(['tweetid', 'userid', 'user_display_name', 'user_screen_name',
       'user_reported_location', 'user_profile_description', 'follower_count',
       'following_count', 'account_creation_date', 'account_language',
       'tweet_language', 'tweet_text', 'tweet_time', 'tweet_client_name',
       'is_retweet', 'like_count', 'retweet_count', 'hashtags', 'urls',
       'user_mentions', 'label'],
      dtype='object')

# Define features

In [None]:
# Feature info
float_features_names = [
  "follower_count",
  "following_count",
  "is_retweet",
  "like_count",
  "retweet_count",
]

In [None]:
# Data pre-processing
tokenizer = AutoTokenizer.from_pretrained(model_path)

Downloading:   0%|          | 0.00/60.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/410 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/245k [00:00<?, ?B/s]

# Define model steps

In [None]:
def process_data(batch):
  feature_dict = tokenizer(
      batch["tweet_text"].tolist(),
      padding=True,
      truncation=True,
      return_tensors="pt",
  )

  numerical_data = batch[float_features_names].astype(float)
  feature_dict["float_features"] = torch.from_numpy(numerical_data.values).float()

  labels = batch["label"].astype(float)
  feature_dict["labels"] = torch.from_numpy(labels.values).float()

  return feature_dict

In [None]:
# class BertTweetClassifier(nn.Module):
#   def __init__(self, hidden_size, dense_size, numeric_feature_size, output_size, dropout=0.1):
#     super().__init__()
#     self.hidden_size = hidden_size
#     self.output_size = output_size

#     # Use pre-trained BERT model
#     self.bert = AutoModel.from_pretrained(
#         model_path,
#         output_hidden_states=True,
#         output_attentions=True,
#     )

#     for param in self.bert.parameters():
#         param.requires_grad = False  # No backprop here for now

#     self.weights = nn.Parameter(torch.rand(13, 1))
#     self.dropout = nn.Dropout(dropout)
#     self.fc1 = nn.Linear(hidden_size, dense_size)
#     self.fc2 = nn.Linear(dense_size + numeric_feature_size, output_size)
#     self.relu = nn.ReLU()
#     self.sigmoid = nn.Sigmoid()

#   def forward(self, feature_dict):
#     times = []
#     t0 = time.time()

#     input_ids = feature_dict["input_ids"]
#     float_features = feature_dict["float_features"]

#     torch.cuda.synchronize()
#     times.append(time.time() - t0)

#     all_hidden_states, all_attentions = self.bert(input_ids)[-2:]
#     batch_size = input_ids.shape[0]

#     torch.cuda.synchronize()
#     times.append(time.time() - t0)

#     ht_cls = torch.cat(all_hidden_states)[:, :1, :].view(13, batch_size, 1, self.hidden_size)
#     atten = torch.sum(ht_cls * self.weights.view(13, 1, 1, 1), dim=[1, 3])

#     torch.cuda.synchronize()
#     times.append(time.time() - t0)

#     atten = F.softmax(atten.view(-1), dim=0)
#     feature = torch.sum(ht_cls * atten.view(13, 1, 1, 1), dim=[0, 2])

#     torch.cuda.synchronize()
#     times.append(time.time() - t0)

#     dense_out = self.fc1(self.dropout(feature))
#     activ_out = self.relu(dense_out)

#     torch.cuda.synchronize()
#     times.append(time.time() - t0)

#     concat_layer = torch.cat((activ_out, float_features), 1)
#     out = self.fc2(concat_layer)
#     prediction = self.sigmoid(out)

#     torch.cuda.synchronize()
#     times.append(time.time() - t0)

#     print(times)

#     return prediction

In [None]:
# class BertTweetClassifier(nn.Module):
#   def __init__(self, hidden_size, dense_size, numeric_feature_size, output_size, dropout=0.1):
#     super().__init__()
#     self.hidden_size = hidden_size
#     self.output_size = output_size

#     # Use pre-trained BERT model
#     self.bert = AutoModel.from_pretrained(
#         model_path,
#         output_hidden_states=True,
#         output_attentions=True,
#     )

#     for param in self.bert.parameters():
#         param.requires_grad = False  # No backprop here for now

#     self.weights = nn.Parameter(torch.rand(7, 1))
#     self.dropout = nn.Dropout(dropout)
#     self.fc1 = nn.Linear(hidden_size, dense_size)
#     self.fc2 = nn.Linear(dense_size + numeric_feature_size, output_size)
#     self.relu = nn.ReLU()
#     self.sigmoid = nn.Sigmoid()

#   def forward(self, feature_dict):
#     times = []
#     t0 = time.time()

#     input_ids = feature_dict["input_ids"]
#     float_features = feature_dict["float_features"]

#     torch.cuda.synchronize()
#     times.append(time.time() - t0)

#     all_hidden_states, all_attentions = self.bert(input_ids, output_attentions=True, output_hidden_states=True)[-2:]
#     batch_size = input_ids.shape[0]

#     torch.cuda.synchronize()
#     times.append(time.time() - t0)

#     ht_cls = torch.cat(all_hidden_states)[:, :1, :].view(7, batch_size, 1, self.hidden_size)
#     atten = torch.sum(ht_cls * self.weights.view(7, 1, 1, 1), dim=[1, 3])

#     torch.cuda.synchronize()
#     times.append(time.time() - t0)

#     atten = F.softmax(atten.view(-1), dim=0)
#     feature = torch.sum(ht_cls * atten.view(7, 1, 1, 1), dim=[0, 2])

#     torch.cuda.synchronize()
#     times.append(time.time() - t0)

#     dense_out = self.fc1(self.dropout(feature))
#     activ_out = self.relu(dense_out)

#     torch.cuda.synchronize()
#     times.append(time.time() - t0)

#     concat_layer = torch.cat((activ_out, float_features), 1)
#     out = self.fc2(concat_layer)
#     prediction = self.sigmoid(out)

#     torch.cuda.synchronize()
#     times.append(time.time() - t0)

#     print(times)

#     return prediction

In [None]:
class BertTweetClassifier(nn.Module):
  def __init__(self, hidden_size, dense_size, numeric_feature_size, output_size, dropout=0.1):
    super().__init__()
    self.hidden_size = hidden_size
    self.output_size = output_size

    # Use pre-trained BERT model
    self.bert = AutoModel.from_pretrained(
        model_path,
        output_hidden_states=True,
        output_attentions=True,
    )

    for param in self.bert.parameters():
        param.requires_grad = False  # No backprop here for now

    self.weights = nn.Parameter(torch.rand(7, 1))
    self.dropout = nn.Dropout(dropout)
    self.fc1 = nn.Linear(hidden_size, dense_size)
    self.fc2 = nn.Linear(dense_size + numeric_feature_size, output_size)
    self.relu = nn.ReLU()
    self.sigmoid = nn.Sigmoid()

  def forward(self, feature_dict):
    input_ids = feature_dict["input_ids"]
    float_features = feature_dict["float_features"]

    all_hidden_states, all_attentions = self.bert(input_ids, output_attentions=True, output_hidden_states=True)[-2:]
    batch_size = input_ids.shape[0]

    ht_cls = torch.cat(all_hidden_states)[:, :1, :].view(7, batch_size, 1, self.hidden_size)
    atten = torch.sum(ht_cls * self.weights.view(7, 1, 1, 1), dim=[1, 3])

    atten = F.softmax(atten.view(-1), dim=0)
    feature = torch.sum(ht_cls * atten.view(7, 1, 1, 1), dim=[0, 2])

    dense_out = self.fc1(self.dropout(feature))
    activ_out = self.relu(dense_out)

    concat_layer = torch.cat((activ_out, float_features), 1)
    out = self.fc2(concat_layer)
    prediction = self.sigmoid(out)

    return prediction

In [None]:
model = BertTweetClassifier(768, 32, 5, 1).to(device)

Downloading:   0%|          | 0.00/260M [00:00<?, ?B/s]

Some weights of the model checkpoint at dbmdz/distilbert-base-turkish-cased were not used when initializing DistilBertModel: ['vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
model

BertTweetClassifier(
  (bert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(32000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(in

# Train model

In [None]:
# for epoch_num in range(num_epochs):
#   train_data = train_data.sample(frac=1)  # Shuffle the batches each epoch
#   for batch_num in range(0, num_examples, batch_size):
#     batch_data = train_data[batch_num : min(batch_num + batch_size, num_examples)]
#     feature_dict = process_data(batch_data)
#     predictions = np.squeeze(model(feature_dict))
#     labels = feature_dict["labels"]
#     loss = criterion(predictions, labels)
#     print(loss)

In [None]:
import time
import gc
gc.collect()

567

In [None]:
def report_eval_metrics(model, test_data, batch_size=256):
  num_examples = test_data.shape[0]

  torch.cuda.empty_cache()
  model.zero_grad()

  with torch.no_grad():
    for batch_start_index in range(0, num_examples, batch_size):
      batch_num = int(batch_start_index / batch_size)
      if batch_num % 100 == 0:
        print(f"Batch {batch_num} started")

      batch_data = test_data[batch_num : min(batch_num + batch_size, num_examples)]

      feature_dict = process_data(batch_data).to(device)
      predictions = np.squeeze(model(feature_dict))  ## TODO: Speed up
      labels = feature_dict["labels"]

      loss = criterion(predictions, labels)
      total_loss += loss.item()

      del predictions
      torch.cuda.empty_cache()

  return total_loss / num_examples

In [None]:
report_eval_metrics(model, test_data)

Batch 0 started


RuntimeError: ignored

In [None]:
batch_size = 64
learning_rate = 1e-5
num_epochs = 3
num_examples = train_data.shape[0]
criterion = nn.BCELoss()

torch.cuda.empty_cache()
model.zero_grad()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

batch_count = math.ceil(num_examples / batch_size)

for epoch_num in range(num_epochs):
  print(f"Epoch {epoch_num} started")
  train_data = train_data.sample(frac=1)  # Shuffle the batches each epoch
  total_epoch_loss = 0

  ### 0.4 seconds total per loop ###
  for batch_start_index in range(0, num_examples, batch_size):
    model.zero_grad()

    batch_num = int(batch_start_index / batch_size)
    if batch_num % 100 == 0:
      print(f"Batch {batch_num} started")

    batch_data = train_data[batch_num : min(batch_num + batch_size, num_examples)]

    feature_dict = process_data(batch_data).to(device)
    predictions = np.squeeze(model(feature_dict))  ## TODO: Speed up
    labels = feature_dict["labels"]

    loss = criterion(predictions, labels)
    total_epoch_loss += loss.item()

    loss.backward()
    optimizer.step()

    ### 0.1 seconds start ###
    del predictions
    torch.cuda.empty_cache()
    ### 0.1 seconds end ###

  print(f"Epoch {epoch_num} finished")
  print("Average batch loss: " + str(total_epoch_loss / batch_count))

Epoch 0 started
Batch 0 started
Batch 100 started
Batch 200 started
Batch 300 started
Batch 400 started
Batch 500 started
Batch 600 started
Batch 700 started
Epoch 0 finished
Average batch loss: 3.007789291671845
Epoch 1 started
Batch 0 started
Batch 100 started
Batch 200 started
Batch 300 started
Batch 400 started
Batch 500 started
Batch 600 started
Batch 700 started
Epoch 1 finished
Average batch loss: 3.158608843241777
Epoch 2 started
Batch 0 started
Batch 100 started
Batch 200 started
Batch 300 started
Batch 400 started
Batch 500 started
Batch 600 started
Batch 700 started
Epoch 2 finished
Average batch loss: 3.4898591280646025
Epoch 3 started
Batch 0 started
Batch 100 started
Batch 200 started
Batch 300 started
Batch 400 started
Batch 500 started
Batch 600 started
Batch 700 started
Epoch 3 finished
Average batch loss: 3.1809251138435344
Epoch 4 started
Batch 0 started
Batch 100 started
Batch 200 started
Batch 300 started
Batch 400 started
Batch 500 started
Batch 600 started
Batch 

# Saving objects

In [114]:
train_data_path = "/content/drive/MyDrive/train_data.zip"
test_data_path = "/content/drive/MyDrive/test_data.zip"
eval_data_path = "/content/drive/MyDrive/eval_data.zip"

In [115]:
train_data.to_csv(path_or_buf=train_data_path, index=False)
test_data.to_csv(path_or_buf=test_data_path, index=False)
eval_data.to_csv(path_or_buf=eval_data_path, index=False)

In [116]:
test_read = pd.read_csv(train_data_path)

In [117]:
train_data.shape

(80000, 21)

In [118]:
test_read.shape

(80000, 21)

In [119]:
train_data.iloc[8169]

tweetid                                                   1217514045751930882
userid                            vzE2Thz2OP5EUbQsVtqv90Ri7KiFTUHafGgb+aJxag=
user_display_name                 vze2thz2op5eubqsvtqv90ri7kiftuhafggb+ajxag=
user_screen_name                  vzE2Thz2OP5EUbQsVtqv90Ri7KiFTUHafGgb+aJxag=
user_reported_location                                                       
user_profile_description                                                  nan
follower_count                                                            528
following_count                                                          1106
account_creation_date                                              2016-02-20
account_language                                                           tr
tweet_language                                                             tr
tweet_text                  : hiçbir esnaf ticari itibarını i̇smini çocuğu...
tweet_time                                                   202

In [120]:
test_read['tweetid'].iloc[8169]

1217514045751930882

In [124]:
i = 0
j = 0
bad_j_vals = []

while (i < train_data.shape[0]):
  if str(train_data['tweetid'].iat[i]) != test_read['tweetid'].iat[j]:
    bad_j_vals.append(j)
    j += 1
  else:
    i += 1
    j += 1

bad_j_vals

IndexError: ignored

# Debugging

In [None]:
times

In [None]:
torch.cuda.memory_summary(device="cuda", abbreviated=False)



In [None]:
for obj in gc.get_objects():
    try:
        if torch.is_tensor(obj) or (hasattr(obj, 'data') and torch.is_tensor(obj.data)):
            print(type(obj), obj.size())
    except:
        pass

  This is separate from the ipykernel package so we can avoid doing imports until
  This is separate from the ipykernel package so we can avoid doing imports until
  This is separate from the ipykernel package so we can avoid doing imports until


<class 'torch.nn.parameter.Parameter'> torch.Size([32000, 768])
<class 'torch.nn.parameter.Parameter'> torch.Size([512, 768])
<class 'torch.nn.parameter.Parameter'> torch.Size([2, 768])
<class 'torch.nn.parameter.Parameter'> torch.Size([768])
<class 'torch.nn.parameter.Parameter'> torch.Size([768])
<class 'torch.nn.parameter.Parameter'> torch.Size([768, 768])
<class 'torch.nn.parameter.Parameter'> torch.Size([768])
<class 'torch.nn.parameter.Parameter'> torch.Size([3072, 768])
<class 'torch.nn.parameter.Parameter'> torch.Size([3072])
<class 'torch.nn.parameter.Parameter'> torch.Size([768, 3072])
<class 'torch.nn.parameter.Parameter'> torch.Size([768])
<class 'torch.nn.parameter.Parameter'> torch.Size([768])
<class 'torch.nn.parameter.Parameter'> torch.Size([768])
<class 'torch.nn.parameter.Parameter'> torch.Size([3072, 768])
<class 'torch.nn.parameter.Parameter'> torch.Size([3072])
<class 'torch.nn.parameter.Parameter'> torch.Size([768, 3072])
<class 'torch.nn.parameter.Parameter'> tor