# Tweet Turing Test: Detecting Disinformation on Twitter  

|          | Group #2 - Disinformation Detectors                     |
|---------:|---------------------------------------------------------|
| Members  | John Johnson, Katy Matulay, Justin Minnion, Jared Rubin |
| Notebook | `xx_modelA_nlp_preprocess.ipynb`                        |
| Purpose  | NLP-specific preprocessing for BERT base classification model test                 |

(todo: description)

# 1 - Setup

In [1]:
# imports from Python standard library

# imports requiring installation
#   connection to Google Cloud Storage
from google.cloud import storage            # pip install google-cloud-storage
from google.oauth2 import service_account   # pip install google-auth

#  data science packages
import numpy as np                          # pip install numpy
import pandas as pd                         # pip install pandas

In [2]:
!pip install demoji
import pandas as pd 
import os
import numpy as np
import matplotlib.pyplot as plt
import demoji
import seaborn as sns
import nltk
from datetime import date
import regex as re
import pyarrow

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting demoji
  Downloading demoji-1.1.0-py3-none-any.whl (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.9/42.9 KB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: demoji
Successfully installed demoji-1.1.0


In [3]:
from google.colab import drive

drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [4]:
# pandas options
pd.set_option('display.max_colwidth', None)

## Local or Cloud?

Decide here whether to run notebook with local data or GCP bucket data
 - if the working directory of this notebook has a "../data/" folder with data loaded (e.g. working on local computer or have data files loaded to a cloud VM) then use the "local files" option and comment out the "gcp bucket files" option
 - if this notebook is being run from a GCP VM (preferrably in the `us-central1` location) then use the "gcp bucket files" option and comment out the "local files" option

In [None]:
# option: local files
local_or_cloud: str = "local"   # comment/uncomment this line or next

# option: gcp bucket files
#local_or_cloud: str = "cloud"   # comment/uncomment this line or previous

# don't comment/uncomment for remainder of cell
if (local_or_cloud == "local"):
    data_paths = local_data_paths
    snapshot_paths = local_snapshot_paths
elif (local_or_cloud == "cloud"):
    data_paths = gcp_data_paths
    snapshot_paths = gcp_snapshot_paths
else:
    raise ValueError("Variable 'local_or_cloud' can only take on one of two values, 'local' or 'cloud'.")
    # subsequent cells will not do this final "else" check

In [None]:
# this cell only needs to run its code if local_or_cloud=="cloud"
#   (though it is harmless if run when local_or_cloud=="local")
gcp_storage_client: storage.Client = None
gcp_bucket: storage.Bucket = None

if (local_or_cloud == "cloud"):
    gcp_storage_client = tur.get_gcp_storage_client(project_name=gcp_project_name, key_file=gcp_key_file)
    gcp_bucket = tur.get_gcp_bucket(storage_client=gcp_storage_client, bucket_name=gcp_bucket_name)

# 2 - Load Dataset

Sampled dataset, as prepared by prior notebook `04_nlp_preprocess.ipynb`, will be loaded as "`data`".

In [5]:
#Google Drive url for data file
file_folder = '/content/gdrive/MyDrive/Grad School/DSCI592/data/'
#Read in the csv 
data = pd.read_parquet(file_folder+'2data_sample_ten_percent.parquet.gz')
#data = pd.read_parquet('/content/gdrive/MyDrive/Grad School/DSCI592/data/2data_sample_ten_percent.parquet.gz')

In [None]:
data['class'].value_counts()

Troll       211440
Verified    150874
Name: class, dtype: int64

In [None]:
data['account_category'].value_counts()

Verified_User    147275
RightTroll        70296
NewsFeed          59439
LeftTroll         42297
HashtagGamer      23569
Commercial        11353
Unknown            4233
NonEnglish         2692
Fearmonger         1160
Name: account_category, dtype: int64

# 2 - Pre-process for BERT


In [None]:
test="I bet you didn't know that 🙋, 🙋‍♂️, and 🙋‍♀️ are three different emojis."
test_replaced = demoji.replace_with_desc(test, "'") 

In [None]:
test_replaced

"I bet you didn't know that 'person raising hand', 'man raising hand', and 'person raising hand'\u200d♀️ are three different emojis."

## 2a - Test on mini dataset

In [None]:
df_mini = data[data['emoji_count']>1][:5]
df_mini

Unnamed: 0,external_author_id,author,content,region,language,following,followers,updates,post_type,is_retweet,...,tco1_step1,data_source,has_url,emoji_text,emoji_count,publish_date,class,following_ratio,class_numeric,RUS_lett_count
24,42638627,CycloneFB,All work is easy work. 🌪💪 #RaiseTheStandard https://t.co/pg5AJleNxY,"Ames, Iowa",en,280,152067,88,,0.0,...,https://twitter.com/CycloneFB/status/880516718011056128/photo/1,verified_random,1,"[tornado, flexed biceps]",2,2017-06-29 20:01:59+00:00,Verified,0.001841,0,0
48,2196922086,oneplus,"@robcrilly 😮 That is amazing! You're amazing! Keep doing cool things, sir 👍 https://t.co/46AFSw4R7O",,en,52,2476117,5,replied_to,0.0,...,https://twitter.com/oneplus/status/843656999568990208/photo/1,verified_random,1,"[face with open mouth, thumbs up]",2,2017-03-20 02:54:37+00:00,Verified,2.1e-05,0,0
106,743167000000000000,COVFEFENATIONUS,AMERICA IS BACK!! What a difference a year makes. I'm also with her as she has much more class and is stunning. 👍🇺🇸🇺🇸🇺🇸 https://t.co/quBBLC4QwB,United States,en,245,2081,142343,,0.0,...,https://twitter.com/CovfefeNationUS/status/928851894549614592/photo/1,Troll,1,"[thumbs up, flag: United States, flag: United States, flag: United States]",4,2017-11-10 05:08:00+00:00,Troll,0.117675,1,0
173,895000000000000000,ANAAISLEC,KazmierskiR TajJacks DCNative01 bdcousins donnaharmon16 207_Melissa jsand123123 N_AmerSolutions MaryEBarnes it's beautiful here🌲🐟🌲🦌,Unknown,en,37,62,1501,,0.0,...,,Troll,0,"[evergreen tree, fish, evergreen tree, deer]",4,2017-08-15 04:02:00+00:00,Troll,0.587302,1,0
291,126733638,pmnewsnigeria,RT @intellectual316: Confederation is the solution 💯💯 https://t.co/pvzIoq3Riw,"Lagos, Nigeria",en,240,776096,1,retweeted,1.0,...,https://twitter.com/intellectual316/status/880472436839510021/photo/1,verified_random,1,"[hundred points, hundred points]",2,2017-06-29 21:54:01+00:00,Verified,0.000309,0,0


In [None]:
def convert_emoji_text(tweet_series: pd.Series) -> str:
    ''' The following converts an emoji in a text string to a str enclosed with ''. '''
    ##return demoji.replace_with_desc(tweet_series['content'], "'") 
    return demoji.replace_with_desc(tweet_series['content'], " ") 

In [None]:
# apply convert_emoji_text
new_column = df_mini.apply(convert_emoji_text, axis='columns')
df_mini.loc[:, 'content2'] = new_column

In [None]:
df_mini[['content','content2']]

Unnamed: 0,content,content2
24,All work is easy work. 🌪💪 #RaiseTheStandard https://t.co/pg5AJleNxY,All work is easy work. tornado flexed biceps \n\n#RaiseTheStandard https://t.co/pg5AJleNxY
48,"@robcrilly 😮 That is amazing! You're amazing! Keep doing cool things, sir 👍 https://t.co/46AFSw4R7O","@robcrilly face with open mouth That is amazing! You're amazing! Keep doing cool things, sir thumbs up https://t.co/46AFSw4R7O"
106,AMERICA IS BACK!! What a difference a year makes. I'm also with her as she has much more class and is stunning. 👍🇺🇸🇺🇸🇺🇸 https://t.co/quBBLC4QwB,AMERICA IS BACK!! What a difference a year makes. I'm also with her as she has much more class and is stunning. thumbs up flag: United States flag: United States flag: United States https://t.co/quBBLC4QwB
173,KazmierskiR TajJacks DCNative01 bdcousins donnaharmon16 207_Melissa jsand123123 N_AmerSolutions MaryEBarnes it's beautiful here🌲🐟🌲🦌,KazmierskiR TajJacks DCNative01 bdcousins donnaharmon16 207_Melissa jsand123123 N_AmerSolutions MaryEBarnes it's beautiful here evergreen tree fish evergreen tree deer
291,RT @intellectual316: Confederation is the solution 💯💯 https://t.co/pvzIoq3Riw,RT @intellectual316: Confederation is the solution hundred points hundred points https://t.co/pvzIoq3Riw


## 2b - Converting class text to binary label

In [None]:
df_mini['label']=df_mini['class'].apply(lambda x: 1 if x== 'Troll' else 0)
df_mini['label'].value_counts()

0    3
1    2
Name: label, dtype: int64

In [None]:
df_mini[['class','label']]

Unnamed: 0,class,label
24,Verified,0
48,Verified,0
106,Troll,1
173,Troll,1
291,Verified,0


Only retain text and label for BERT

In [None]:
df_test= df_mini[['content2','label']]
df_test

Unnamed: 0,content2,label
24,All work is easy work. tornado flexed biceps \n\n#RaiseTheStandard https://t.co/pg5AJleNxY,0
48,"@robcrilly face with open mouth That is amazing! You're amazing! Keep doing cool things, sir thumbs up https://t.co/46AFSw4R7O",0
106,AMERICA IS BACK!! What a difference a year makes. I'm also with her as she has much more class and is stunning. thumbs up flag: United States flag: United States flag: United States https://t.co/quBBLC4QwB,1
173,KazmierskiR TajJacks DCNative01 bdcousins donnaharmon16 207_Melissa jsand123123 N_AmerSolutions MaryEBarnes it's beautiful here evergreen tree fish evergreen tree deer,1
291,RT @intellectual316: Confederation is the solution hundred points hundred points https://t.co/pvzIoq3Riw,0


Split into train test datasets

In [None]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(df_test['content2'],df_test['label'])


#3 - Apply to full dataset

In [None]:
# apply convert_emoji_text
df = data[['content','class']]
new_column = df.apply(convert_emoji_text, axis='columns')
df.loc[:, 'content2'] = new_column

# map class to binary
df['label']=df['class'].apply(lambda x: 1 if x== 'Troll' else 0)
df_test= df[['content2','label']]

In [None]:
df_test.head()

Unnamed: 0,content2,label
0,"To live dangerously on Friday the 13th, we're doing the radio show from the UNLUCKIEST place on earth! The @TennesseeTitans Locker Room!",0
1,@legsanity I like it. Almost like a free Gio. Pujols is still about as good of a bet as Gonzalez the rest of the way.,0
2,Man servants can have a good purpose as long as they come with cash and don't touch me ever.,1
3,"Naked, dancing woman in Houston slows traffic #news",1
4,"""Coercing guilty pleas still a problem in Senate sentencing bill"" https://t.co/MOYwzcrNIm https://t.co/QAVtcI9cu6",0


In [None]:
#export for use later as parquet
df_test.to_parquet('/content/gdrive/MyDrive/Grad School/DSCI592/data/04_sampledf_text_label.parquet.gz', engine='pyarrow', index=False, compression='gzip')

In [6]:
df_test=pd.read_parquet('/content/gdrive/MyDrive/Grad School/DSCI592/data/04_sampledf_text_label.parquet.gz')

In [7]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(df_test['content2'],df_test['label'])

#3 - BERT Classification

https://www.analyticsvidhya.com/blog/2021/12/text-classification-using-bert-and-tensorflow/

In [8]:
##Must install same version prior to importing or will get errors in colab
!pip install -U "tensorflow==2.8.*"
!pip install -U "tensorflow-text==2.8.*"

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tensorflow==2.8.*
  Downloading tensorflow-2.8.4-cp38-cp38-manylinux2010_x86_64.whl (498.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m498.0/498.0 MB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
Collecting tensorflow-estimator<2.9,>=2.8
  Downloading tensorflow_estimator-2.8.0-py2.py3-none-any.whl (462 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m462.3/462.3 KB[0m [31m51.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting keras<2.9,>=2.8.0rc0
  Downloading keras-2.8.0-py2.py3-none-any.whl (1.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m84.3 MB/s[0m eta [36m0:00:00[0m
Collecting tensorboard<2.9,>=2.8
  Downloading tensorboard-2.8.0-py3-none-any.whl (5.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.8/5.8 MB[0m [31m33.8 MB/s[0m eta [36m0:00:00[0m
Installing collec

In [9]:
import tensorflow as tf
import tensorflow_text as text
import functools
import tensorflow_hub as hub


In [None]:
bert_preprocess = hub.KerasLayer('https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3')
bert_encoder = hub.KerasLayer('https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4')

In [None]:
# Bert layers
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
preprocessed_text = bert_preprocess(text_input)
outputs = bert_encoder(preprocessed_text)
# Neural network layers
l = tf.keras.layers.Dropout(0.1, name="dropout")(outputs['pooled_output'])
l = tf.keras.layers.Dense(1, activation='sigmoid', name="output")(l)
# Use inputs and outputs to construct a final model
model = tf.keras.Model(inputs=[text_input], outputs = [l])

In [None]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(x_train, y_train, epochs=2, batch_size = 32)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f826be8ebb0>

In [10]:
#load saved model
model = tf.keras.models.load_model('/content/gdrive/MyDrive/Grad School/DSCI592/data/my_model3.h5')

ValueError: ignored

In [None]:
y_predicted = model.predict(x_test)
y_predicted = y_predicted.flatten()
print(y_predicted)

[0.2249369  0.6432007  0.15587582 ... 0.9549607  0.45183754 0.7817259 ]


In [None]:
model.summary()

In [None]:
model.evaluate(x_test,y_test,verbose=2)
print("Accuracy:",acc*100)

2831/2831 - 932s - loss: 0.4958 - accuracy: 0.7675 - 932s/epoch - 329ms/step


NameError: ignored

In [None]:
!pip install pyyaml h5py
import os

from tensorflow import keras

print(tf.version.VERSION)


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
2.8.4


In [None]:
tf.keras.Model.save('/content/gdrive/MyDrive/Grad School/DSCI592/data/my_model')

TypeError: ignored

In [None]:
tf.keras.model.save('/content/gdrive/MyDrive/Grad School/DSCI592/data/my_model2')

AttributeError: ignored

In [None]:
model.save('/content/gdrive/MyDrive/Grad School/DSCI592/data/my_model3.h5')

# 4 - Transformer Testing

Testing transformer on '`df_mini`'

In [None]:
#tensorflow
import tensorflow as tf
from tensorflow import keras
from keras.layers import *
from keras.models import *
from tensorflow.keras.utils import load_img

In [None]:
#PyTorch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms

In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.26.0-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m73.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m75.4 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.12.0-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3/190.3 KB[0m [31m23.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.12.0 tokenizers-0.13.2 transformers-4.26.0


In [None]:
import transformers

In [None]:
from transformers import pipeline

classifier = pipeline("sentiment-analysis")
classifier("I've been waiting for a HuggingFace course my whole life.")

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


Downloading (…)lve/main/config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/268M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

[{'label': 'POSITIVE', 'score': 0.9598049521446228}]

In [None]:
from transformers import AutoTokenizer

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
inputs = tokenizer(str(df_mini['content2'][:1]), padding=True, truncation=True, return_tensors="pt")
print(inputs)

{'input_ids': tensor([[  101,  2484,  2035,  2147,  2003,  3733,  2147,  1012, 11352, 24244,
         27947,  1032,  1050,  1032,  1050,  1001,  5333, 20515, 13832,  4103,
         16770,  1024,  1013,  1013,  1056,  1012,  2522,  1013, 18720,  2629,
         13006,  7770, 18037,  2171,  1024,  4180,  2475,  1010, 26718, 18863,
          1024,  4874,   102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


In [None]:
from transformers import BertTokenizer

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
input_ids=tokenizer.encode(str(df_mini['content2'][:1]),add_special_tokens=True)


In [None]:
print(len(input_ids))

43


In [None]:
print(input_ids)

[101, 2484, 2035, 2147, 2003, 3733, 2147, 1012, 11352, 24244, 27947, 1032, 1050, 1032, 1050, 1001, 5333, 20515, 13832, 4103, 16770, 1024, 1013, 1013, 1056, 1012, 2522, 1013, 18720, 2629, 13006, 7770, 18037, 2171, 1024, 4180, 2475, 1010, 26718, 18863, 1024, 4874, 102]
