In [None]:
#!pip install tensorflow==2.9.2
import os
import shutil
import pandas as pd
import numpy as np

!pip install tensorflow
!pip install tensorflow-text # A dependency of the preprocessing for BERT inputs
!pip install -q tf-models-official # For the AdamW optimizer from tensorflow/models
!pip install bert-for-tf2
!pip install sentencepiece
!pip install numpy

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
import re
from official.nlp import optimization  # to create AdamW optmizer
tf.get_logger().setLevel('ERROR')
from tqdm import tqdm

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Embedding, Dense, Dropout

from drive.MyDrive.features_africa.src.utils.general import *
from drive.MyDrive.features_africa.src.utils.clean import *
from drive.MyDrive.features_africa.src.utils.funcs import *

# Cleaning the frame for BERT
def clean_data(df, text_column):
  """Add a cleaned copy of a text column and return the frame re-indexed.

  Args:
    df: DataFrame holding the raw text. It is modified in place: a
      ``text_clean`` column is added (or overwritten).
    text_column: Name of the column whose values are passed, one by one,
      to ``clean_text``.

  Returns:
    A new DataFrame with the same columns as the (now modified) ``df`` and
    a fresh 0..n-1 index. The index of ``df`` itself is left untouched.
  """
  cleaned_text = df[text_column].apply(clean_text)
  # Assigning on ``df`` itself is deliberate: callers read ``text_clean``
  # back from the frame they passed in, not only from the returned one.
  df['text_clean'] = cleaned_text
  return df.reset_index(drop=True)

def lang_detect_na(tweet):
    """Detect the language of ``tweet``, falling back to ``'NA'`` on failure.

    Args:
        tweet: Value handed to ``detect`` unchanged.

    Returns:
        Whatever ``detect(tweet)`` returns, or the string ``'NA'`` when
        ``detect`` raises an ordinary exception.
    """
    try:
        return detect(tweet)
    except Exception:
        # Best-effort fallback for undetectable input. ``Exception`` (rather
        # than a bare ``except``) lets KeyboardInterrupt / SystemExit through,
        # so a long ``.apply`` over many rows can still be interrupted.
        return 'NA'

TAG_RE = re.compile(r'<[^>]+>')
def preprocess_text(sen):
  """Normalise a sentence to lower-case ASCII letters separated by spaces.

  Steps, applied in this order:
    1. remove HTML-like tags (``<...>``);
    2. replace every character that is not an ASCII letter with a space
       (this removes digits and punctuation);
    3. replace a single letter that has whitespace on both sides with one
       space;
    4. collapse each run of whitespace into one space;
    5. lower-case the result.

  Leading/trailing spaces are not stripped.

  Args:
    sen: Input string.

  Returns:
    The normalised string.
  """
  substitutions = (
      ('[^a-zA-Z]', ' '),          # punctuation and numbers
      (r"\s+[a-zA-Z]\s+", ' '),    # single characters
      (r'\s+', ' '),               # multiple spaces
  )
  cleaned = TAG_RE.sub('', sen)    # html tags
  for pattern, replacement in substitutions:
    cleaned = re.sub(pattern, replacement, cleaned)
  return cleaned.lower()

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from official.nlp import optimization
tf.get_logger().setLevel('ERROR')

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Embedding, Dense, Dropout

# TF Hub handles: the small-BERT encoder and the preprocessing model that
# provides its tokenizer / input packer.
tfhub_handle_encoder = "https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/2"
tfhub_handle_preprocess = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"

# Build the neural network on top of BERT.

# One scalar string tensor per example; kept in a list because the
# tokenize / pack steps below operate on a list of text segments.
text_inputs = [tf.keras.layers.Input(shape=(), dtype=tf.string)]

# Load the preprocessing model once and reuse its two sub-functions.
# The handle defined above is used here instead of repeating the URL, so the
# preprocessing version is set in exactly one place.
preprocessor = hub.load(tfhub_handle_preprocess)

tokenize = hub.KerasLayer(preprocessor.tokenize)
tokenized_inputs = [tokenize(segment) for segment in text_inputs]

# Sequence length handed to bert_pack_inputs.
seq_length = 256

bert_pack_inputs = hub.KerasLayer(
    preprocessor.bert_pack_inputs,
    arguments=dict(seq_length=seq_length))

encoder_inputs = bert_pack_inputs(tokenized_inputs)

# BERT embedding and encoding; trainable=True so the encoder weights are
# part of the model's trainable variables.
encoder = hub.KerasLayer(tfhub_handle_encoder, trainable=True, name='BERT_encoder')
embedded = encoder(encoder_inputs)

# Classification head: dropout followed by a single-unit dense layer.
# activation=None means the model emits a raw score per example, not a
# probability.
net = embedded['pooled_output']
net = tf.keras.layers.Dropout(0.1)(net)
net = tf.keras.layers.Dense(1, activation=None, name='classifier')(net)

model_BERT = tf.keras.Model(text_inputs, net)

#### Functions to predict on a text column of a data frame

def _predict_binary(df_import, column, model_path, out_column, threshold=0.5):
    """Shared implementation of ``predictions_bert`` / ``predictions_bert2``.

    Cleans ``column`` with ``clean_data``, normalises each cleaned sentence
    with ``preprocess_text``, scores it with ``model_BERT`` (after loading the
    weights at ``model_path``) and stores ``score > threshold`` as 0/1 in
    ``out_column``.

    The work is done on a copy of ``df_import``: callers pass filtered slices
    of a larger frame, and assigning columns on such a slice triggers pandas'
    SettingWithCopyWarning. The returned frame keeps the input's index and
    also carries the ``text_clean`` column.
    """
    result = df_import.copy()
    cleaned = clean_data(result, column)
    # Callers drop ``text_clean`` from the returned frame, so set it
    # explicitly instead of relying on clean_data mutating its argument.
    # ``cleaned`` has a reset index, hence the positional assignment.
    result['text_clean'] = cleaned['text_clean'].to_numpy()

    sentences = cleaned['text_clean'].tolist()
    if not sentences:
        # Nothing to score: skip the weight load and the predict call
        # (Keras raises on an empty batch) and emit an empty label column.
        labels = np.zeros(0, dtype=np.int64)
    else:
        model_BERT.load_weights(model_path)
        encoded = np.array(
            [str.encode(preprocess_text(sen)) for sen in sentences], dtype=object
        )
        scores = model_BERT.predict(encoded).flatten()
        # NOTE(review): the classifier head is Dense(1, activation=None), so
        # ``scores`` are raw outputs rather than probabilities. The 0.5
        # cut-off is kept exactly as before - confirm it matches how the
        # checkpoints were trained and evaluated.
        labels = (scores > threshold).astype(np.int64)

    result[out_column] = labels
    return result


def predictions_bert(df_import, column = "text",
                     model_path = 'drive/MyDrive/features_africa/models/cp7.cpkt'):
    """Predict verifiability for each row of ``df_import``.

    Args:
        df_import: DataFrame containing the text to classify. Not modified.
        column: Name of the text column (default ``"text"``).
        model_path: Checkpoint loaded into ``model_BERT`` before predicting.

    Returns:
        A copy of ``df_import`` with two extra columns: ``text_clean`` and
        the 0/1 prediction in ``verifiability``.
    """
    return _predict_binary(df_import, column, model_path, 'verifiability')


def predictions_bert2(df_import, column = "text",
                      model_path = 'drive/MyDrive/features_africa/models/cp2.cpkt'):
    """Same pipeline as ``predictions_bert`` with a different checkpoint.

    Args:
        df_import: DataFrame containing the text to classify. Not modified.
        column: Name of the text column (default ``"text"``).
        model_path: Checkpoint loaded into ``model_BERT`` before predicting.

    Returns:
        A copy of ``df_import`` with two extra columns: ``text_clean`` and
        the 0/1 prediction in ``true``.
    """
    return _predict_binary(df_import, column, model_path, 'true')

def compute_engagement_tw(engage):
    """Add engagement totals to a tweets frame, modifying it in place.

    Args:
        engage: DataFrame with ``public_metrics.*`` columns; it must contain
            ``public_metrics.reply_count`` and ``public_metrics.quote_count``.

    Returns:
        The same ``engage`` object, with
        * ``total_reactions``: row-wise sum of every column whose name
          contains ``"public_metrics"``;
        * ``total_comments``: row-wise sum of the reply and quote counts;
        * ``public_metrics.retweet_count`` renamed to ``total_shares``.
    """
    # Resolved before any column is added or renamed, so the total covers
    # the original metric columns (retweet count included).
    metric_columns = [name for name in engage.columns if "public_metrics" in name]
    comment_columns = ["public_metrics.reply_count", "public_metrics.quote_count"]

    engage["total_reactions"] = engage[metric_columns].sum(axis=1)
    engage["total_comments"] = engage[comment_columns].sum(axis=1)
    engage.rename(
        columns={"public_metrics.retweet_count": "total_shares"}, inplace=True
    )
    return engage

def has_items_tw(df):
    """Run ``PreprocessTweets`` on ``df`` and drop the raw URL/media columns.

    Args:
        df: DataFrame passed to ``PreprocessTweets``.

    Returns:
        The frame returned by ``PreprocessTweets(df).preprocess()`` with the
        columns listed below removed in place. All of them must be present,
        otherwise ``drop`` raises ``KeyError``.
    """
    columns_to_drop = [
        'entities.urls', 'description', 'display_url',
        'end', 'expanded_url', 'images', 'media_key', 'start',
        'status', 'title', 'unwound_url', 'url',
    ]
    has_items = PreprocessTweets(df).preprocess()
    has_items.drop(columns=columns_to_drop, inplace=True)
    return has_items




In [None]:
import time

country = 'SA'

# Score every April batch: verifiability first, then the second classifier
# on the rows predicted verifiable. compute_engagement_tw / has_items_tw are
# not applied in this cell.
for w in tqdm(range(0, 18)):
  df = pd.read_parquet(f'drive/MyDrive/features_africa/data/{country}/intermediate/april_{w}.parquet.gzip')

  # Bert Verifiable prediction: only English rows that have text are scored.
  # The remainder is taken as the exact complement of the mask so no row can
  # fall into neither subset (e.g. when has_text is neither 0 nor 1).
  is_english_text = (df['lang'] == 'en') & (df['has_text'] == 1)
  # .copy(): the prediction helpers assign new columns on the frame they
  # receive; passing a bare slice raises SettingWithCopyWarning.
  df_bert = predictions_bert(df[is_english_text].copy())
  df1 = df[~is_english_text]
  df = pd.concat([df1, df_bert]).reset_index(drop=True).drop(columns=['text_clean'])

  # Bert Fake True Pred: rows with verifiability == 1 only (rows that were
  # not scored above hold NaN there and stay in the remainder).
  is_verifiable = df['verifiability'] == 1
  df_bert = predictions_bert2(df[is_verifiable].copy())
  df1 = df[~is_verifiable]
  df = pd.concat([df_bert, df1]).reset_index(drop=True).drop(columns=['text_clean'])

  df.to_parquet(f'drive/MyDrive/features_africa/data/{country}/predicted/april_{w}.parquet.gzip',
                compression = 'gzip')

  # Drop the reference to the batch before loading the next one.
  df = pd.DataFrame()
  # NOTE(review): the reason for this pause is not evident from the code -
  # presumably it lets the Drive write settle; confirm before removing.
  time.sleep(1)

In [None]:
# Nolang: this batch is filtered on the 'lang2' column instead of 'lang'.
import time

country = 'SA'

# NOTE(review): the file stem here is 'may_batch2{w}' (no underscore before
# the number) whereas the baseline cell uses 'baseline_batch2_{w}' - confirm
# the name is intended.
for w in tqdm(range(4, 5)):
  df = pd.read_parquet(f'drive/MyDrive/features_africa/data/{country}/intermediate/may_batch2{w}.parquet.gzip')

  # 'lang2' must already be present in the file. Presumably it was produced
  # by casting 'text' to str and applying lang_detect_na (that code was
  # commented out in this cell) - verify against the upstream step.

  # Bert Verifiable prediction: English rows only.
  is_english = df['lang2'] == 'en'
  # .copy(): the prediction helpers assign new columns on the frame they
  # receive; passing a bare slice raises SettingWithCopyWarning.
  df_bert = predictions_bert(df[is_english].copy())
  df1 = df[~is_english]
  df = pd.concat([df1, df_bert]).reset_index(drop=True).drop(columns=['text_clean'])

  # Bert Fake True Pred: rows with verifiability == 1 only (rows that were
  # not scored above hold NaN there and stay in the remainder).
  is_verifiable = df['verifiability'] == 1
  df_bert = predictions_bert2(df[is_verifiable].copy())
  df1 = df[~is_verifiable]
  df = pd.concat([df_bert, df1]).reset_index(drop=True).drop(columns=['text_clean'])

  df.to_parquet(f'drive/MyDrive/features_africa/data/{country}/predicted/may_batch2{w}.parquet.gzip',
                compression = 'gzip')

  # Drop the reference to the batch before loading the next one.
  df = pd.DataFrame()
  # NOTE(review): the reason for this pause is not evident from the code -
  # presumably it lets the Drive write settle; confirm before removing.
  time.sleep(1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['text_clean'] = df[text_column].apply(clean_text)




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_import['verifiability'] = y_pred_nn.tolist()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['text_clean'] = df[text_column].apply(clean_text)




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_import['true'] = y_pred_nn.tolist()
100%|██████████| 1/1 [47:57<00:00, 2877.47s/it]


In [None]:
# Baseline batch 2, files 5-9. Unlike the 'Nolang' cell, this one filters on
# the 'lang' and 'has_text' columns.
import time

country = 'SA'

for w in tqdm(range(5, 10)):
  df = pd.read_parquet(f'drive/MyDrive/features_africa/data/{country}/intermediate/baseline_batch2_{w}.parquet.gzip')

  # Bert Verifiable prediction: only English rows that have text are scored.
  # The remainder is taken as the exact complement of the mask so no row can
  # fall into neither subset (e.g. when has_text is missing).
  is_english_text = (df['lang'] == 'en') & (df['has_text'] == True)
  # .copy(): the prediction helpers assign new columns on the frame they
  # receive; passing a bare slice raises SettingWithCopyWarning.
  df_bert = predictions_bert(df[is_english_text].copy())
  df1 = df[~is_english_text]
  df = pd.concat([df1, df_bert]).reset_index(drop=True).drop(columns=['text_clean'])

  # Bert Fake True Pred: rows with verifiability == 1 only (rows that were
  # not scored above hold NaN there and stay in the remainder).
  is_verifiable = df['verifiability'] == 1
  df_bert = predictions_bert2(df[is_verifiable].copy())
  df1 = df[~is_verifiable]
  df = pd.concat([df_bert, df1]).reset_index(drop=True).drop(columns=['text_clean'])

  df.to_parquet(f'drive/MyDrive/features_africa/data/{country}/predicted/baseline_batch2_{w}.parquet.gzip',
                compression = 'gzip')

  # Drop the reference to the batch before loading the next one.
  df = pd.DataFrame()
  # NOTE(review): the reason for this pause is not evident from the code -
  # presumably it lets the Drive write settle; confirm before removing.
  time.sleep(1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['text_clean'] = df[text_column].apply(clean_text)




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_import['verifiability'] = y_pred_nn.tolist()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['text_clean'] = df[text_column].apply(clean_text)




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_import['true'] = y_pred_nn.tolist()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['text_clean'] = df[text_column].apply(clean_text)




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_import['verifiability'] = y_pred_nn.tolist()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['text_clean'] = df[text_column].apply(clean_text)




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_import['true'] = y_pred_nn.tolist()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['text_clean'] = df[text_column].apply(clean_text)




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_import['verifiability'] = y_pred_nn.tolist()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['text_clean'] = df[text_column].apply(clean_text)




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_import['true'] = y_pred_nn.tolist()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['text_clean'] = df[text_column].apply(clean_text)




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_import['verifiability'] = y_pred_nn.tolist()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['text_clean'] = df[text_column].apply(clean_text)




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_import['true'] = y_pred_nn.tolist()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['text_clean'] = df[text_column].apply(clean_text)




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_import['verifiability'] = y_pred_nn.tolist()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['text_clean'] = df[text_column].apply(clean_text)




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_import['true'] = y_pred_nn.tolist()
100%|██████████| 5/5 [2:57:39<00:00, 2131.97s/it]


In [None]:
# Done: december0 and december0_abs
# Imported here so the cell does not depend on an earlier cell having
# imported ``time``.
import time

country = 'SA'
weeks = ['january0_2',
         'january1_1', 'january1_2', 'february0']

# NOTE(review): this cell writes each result back to the same endline path it
# read from, so the input file is overwritten and a re-run starts from the
# already-predicted file. Confirm this is intended (the other cells write to
# a separate 'predicted' folder). compute_engagement_tw / has_items_tw are
# not applied in this cell.
for w in tqdm(weeks):
  df = pd.read_parquet(f'drive/MyDrive/features_africa/data/{country}/endline/{w}.parquet.gzip')

  # Bert Verifiable prediction: only English rows that have text are scored.
  # The remainder is taken as the exact complement of the mask so no row can
  # fall into neither subset (e.g. when has_text is neither 0 nor 1).
  is_english_text = (df['lang'] == 'en') & (df['has_text'] == 1)
  # .copy(): the prediction helpers assign new columns on the frame they
  # receive; passing a bare slice raises SettingWithCopyWarning.
  df_bert = predictions_bert(df[is_english_text].copy())
  df1 = df[~is_english_text]
  df = pd.concat([df1, df_bert]).reset_index(drop=True).drop(columns=['text_clean'])

  # Bert Fake True Pred: rows with verifiability == 1 only (rows that were
  # not scored above hold NaN there and stay in the remainder).
  is_verifiable = df['verifiability'] == 1
  df_bert = predictions_bert2(df[is_verifiable].copy())
  df1 = df[~is_verifiable]
  df = pd.concat([df_bert, df1]).reset_index(drop=True).drop(columns=['text_clean'])

  df.to_parquet(f'drive/MyDrive/features_africa/data/{country}/endline/{w}.parquet.gzip',
                compression = 'gzip')

  # Drop the reference to the batch before loading the next one.
  df = pd.DataFrame()
  # NOTE(review): the reason for this pause is not evident from the code -
  # presumably it lets the Drive write settle; confirm before removing.
  time.sleep(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['text_clean'] = df[text_column].apply(clean_text)




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_import['verifiability'] = y_pred_nn.tolist()




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_import['true'] = y_pred_nn.tolist()
 25%|██▌       | 1/4 [52:10<2:36:30, 3130.02s/it]



 50%|█████     | 2/4 [1:48:14<1:48:55, 3267.78s/it]



 75%|███████▌  | 3/4 [2:46:03<55:59, 3359.82s/it]  



100%|██████████| 4/4 [3:37:14<00:00, 3258.73s/it]


In [None]:
import numpy as np