In [None]:
# Mount Google Drive so the CSV files and checkpoints referenced below
# (all under /content/gdrive) are reachable from this Colab runtime.
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
!pip install bert-tensorflow
!pip install tensorflow==1.15


In [None]:
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

import tensorflow as tf
import tensorflow_hub as hub
from datetime import datetime
from sklearn.preprocessing import LabelEncoder

In [None]:
import bert
from bert import run_classifier
from bert import optimization
from bert import tokenization

In [None]:
# Output directory for the estimator's checkpoints and summaries.
# Optionally, set a GCP bucket location.

OUTPUT_DIR = '/content/gdrive/My Drive/Università/Tesi_magistrale/b_t4sa_imgs_old/bert_output_pos_neg'#@param {type:"string"}
#@markdown Whether or not to clear/delete the directory and create a new one
DO_DELETE = False #@param {type:"boolean"}
#@markdown Set USE_BUCKET and BUCKET if you want to (optionally) store model output on GCP bucket.
USE_BUCKET = False #@param {type:"boolean"}
BUCKET = 'BUCKET_NAME' #@param {type:"string"}

# The startswith guard keeps the cell idempotent: re-running it must not
# prefix an already-built gs:// URL a second time.
if USE_BUCKET and not OUTPUT_DIR.startswith('gs://'):
  # Drop leading slashes so an absolute local path does not produce
  # "gs://bucket//content/...".
  OUTPUT_DIR = 'gs://{}/{}'.format(BUCKET, OUTPUT_DIR.lstrip('/'))
  from google.colab import auth
  auth.authenticate_user()

if DO_DELETE:
  try:
    tf.gfile.DeleteRecursively(OUTPUT_DIR)
  except tf.errors.NotFoundError:
    # Doesn't matter if the directory didn't exist; any other failure
    # (permissions, I/O) should surface instead of being swallowed.
    pass
#tf.gfile.MakeDirs(OUTPUT_DIR)
print('***** Model output directory: {} *****'.format(OUTPUT_DIR))

#Dataframe


In [None]:
# Load the three pre-split CSV files (train / validation / test), in that order.
df_train, df_val, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/gdrive/My Drive/Università/Tesi_magistrale/b_t4sa_imgs_old/df_train_pos_neg.csv',
        '/content/gdrive/My Drive/Università/Tesi_magistrale/b_t4sa_imgs_old/df_val_pos_neg.csv',
        '/content/gdrive/My Drive/Università/Tesi_magistrale/b_t4sa_imgs_old/df_test_pos_neg.csv',
    )
)


In [None]:
# Remap the positive class from 2 to 1 so the labels are {0, 1}.
# This is a plain value replacement: `regex=True` is only meaningful (and only
# documented as valid) when `to_replace` is a string pattern, so it is omitted.
df_test['label']  = df_test['label'].replace(to_replace=2, value=1)
df_val['label']   = df_val['label'].replace(to_replace=2, value=1)
df_train['label'] = df_train['label'].replace(to_replace=2, value=1)

In [None]:
# Remove links from the tweets.
# The pattern is a raw string (no invalid "\S" escape), the dot after "www" is
# escaped so it only matches a literal ".", and regex=True is passed explicitly
# because the default of `Series.str.replace` differs across pandas versions.
URL_PATTERN = r'http\S+|www\.\S+'
df_train['Text'] = df_train['Text'].str.replace(URL_PATTERN, '', case=False, regex=True)
df_val['Text']   = df_val['Text'].str.replace(URL_PATTERN, '', case=False, regex=True)
df_test['Text']  = df_test['Text'].str.replace(URL_PATTERN, '', case=False, regex=True)

In [None]:
# Pull the text and label columns of each split out as arrays.
X_train, y_train = df_train['Text'].values, df_train['label'].values
X_val, y_val = df_val['Text'].values, df_val['label'].values
X_test, y_test = df_test['Text'].values, df_test['label'].values

# Remove emoji from a tweet
def deEmojify(inputString):
    """Return `inputString` with every non-ASCII character dropped.

    Emoji are removed this way, but so is any other non-ASCII character
    (accented letters, typographic quotes, ...).
    """
    ascii_only = inputString.encode('ascii', errors='ignore')
    return ascii_only.decode('ascii')

# Clean every split element by element, writing the result back into the
# same array object.
for texts in (X_train, X_val, X_test):
    for pos, raw_text in enumerate(texts):
        texts[pos] = deEmojify(raw_text)

In [None]:
# Store the ASCII-only texts back into the frames.
# NOTE(review): X_* came from `.values` and were modified in place, so the
# frames may already reflect the cleaning — presumably this is a safety net.
df_train['Text']=X_train
df_val['Text']=X_val
df_test['Text']=X_test

In [None]:
# Column names used by the frames handed to BERT (the CSVs use 'Text').
DATA_COLUMN = 'text'
LABEL_COLUMN = 'label'

In [None]:
# Two-column (text, label) frames for each split, with the text column renamed
# from the CSV's 'Text' to DATA_COLUMN.
df_t = pd.DataFrame(
    {DATA_COLUMN: df_train['Text'], LABEL_COLUMN: df_train[LABEL_COLUMN]},
    columns=[DATA_COLUMN, LABEL_COLUMN])
df_v = pd.DataFrame(
    {DATA_COLUMN: df_val['Text'], LABEL_COLUMN: df_val[LABEL_COLUMN]},
    columns=[DATA_COLUMN, LABEL_COLUMN])
df_te = pd.DataFrame(
    {DATA_COLUMN: df_test['Text'], LABEL_COLUMN: df_test[LABEL_COLUMN]},
    columns=[DATA_COLUMN, LABEL_COLUMN])

In [None]:
# Concatenate the validation and test frames (validation rows first).
# The original row indices are kept, so index labels repeat in `val_test`.
frames = [df_v, df_te]
val_test = pd.concat(frames)

#Train

In [None]:
# TF-Hub handle of the uncased (all lowercase) BERT-Base model.
BERT_MODEL_HUB = "https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1"

def create_tokenizer_from_hub_module():
  """Build a FullTokenizer from the vocab file and casing flag of the Hub module."""
  # A throw-away graph keeps the Hub module out of the default graph.
  with tf.Graph().as_default():
    hub_module = hub.Module(BERT_MODEL_HUB)
    info = hub_module(signature="tokenization_info", as_dict=True)
    fetches = [info["vocab_file"], info["do_lower_case"]]
    with tf.Session() as session:
      vocab_path, lower_case = session.run(fetches)

  return bert.tokenization.FullTokenizer(
      vocab_file=vocab_path, do_lower_case=lower_case)

tokenizer = create_tokenizer_from_hub_module()

In [None]:
# Arrays used from here on. The validation arrays are not rebuilt:
# X_test / y_test are taken from `val_test`, i.e. validation and test rows
# combined, and replace the test-only arrays defined earlier.
X_train=df_t[DATA_COLUMN].values
y_train=df_t[LABEL_COLUMN].values
#X_val=df_v[DATA_COLUMN].values
#y_val=df_v[LABEL_COLUMN].values
X_test=val_test[DATA_COLUMN].values
y_test=val_test[LABEL_COLUMN].values

In [None]:
from tensorflow import keras
import os
import re

# Fresh frames with a clean 0..n-1 index, built from the arrays.
train_df = pd.DataFrame(
    {DATA_COLUMN: X_train, LABEL_COLUMN: y_train},
    columns=[DATA_COLUMN, LABEL_COLUMN])

# No separate validation frame is built here.

test_df = pd.DataFrame(
    {DATA_COLUMN: X_test, LABEL_COLUMN: y_test},
    columns=[DATA_COLUMN, LABEL_COLUMN])

In [None]:
# Wrap every row in the InputExample class from BERT's run_classifier code.
def _row_to_input_example(row):
  """Turn one (text, label) row into a single-sentence InputExample."""
  return bert.run_classifier.InputExample(
      guid=None,  # globally unique ID for bookkeeping, unused here
      text_a=row[DATA_COLUMN],
      text_b=None,  # no second segment
      label=row[LABEL_COLUMN])

train_InputExamples = train_df.apply(_row_to_input_example, axis=1)

# No validation examples are built here.

test_InputExamples = test_df.apply(_row_to_input_example, axis=1)


In [None]:
# Sequences are capped at 128 tokens.
MAX_SEQ_LENGTH = 128
# The two class ids after the 2 -> 1 remapping.
label_list = [0, 1]
# Convert the train and test examples to the InputFeatures that BERT understands.
# The validation conversion is commented out, so `val_features` is not created here.
train_features = bert.run_classifier.convert_examples_to_features(train_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)
#val_features = bert.run_classifier.convert_examples_to_features(val_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)
test_features = bert.run_classifier.convert_examples_to_features(test_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)

In [None]:
def create_model(is_predicting, input_ids, input_mask, segment_ids, labels,
                 num_labels):
  """Creates a classification model on top of the BERT Hub module.

  Args:
    is_predicting: if True, return predictions only (no loss, no dropout).
    input_ids, input_mask, segment_ids: tensors fed to the module's "tokens"
      signature (presumably shaped [batch, seq_len] — confirm against caller).
    labels: integer class ids; used only when `is_predicting` is False.
    num_labels: number of output classes.

  Returns:
    `(predicted_labels, log_probs)` when `is_predicting` is True, otherwise
    `(loss, predicted_labels, log_probs)`. `log_probs` are log-softmax scores.
  """

  bert_module = hub.Module(
      BERT_MODEL_HUB,
      trainable=True)
  bert_inputs = dict(
      input_ids=input_ids,
      input_mask=input_mask,
      segment_ids=segment_ids)
  bert_outputs = bert_module(
      inputs=bert_inputs,
      signature="tokens",
      as_dict=True)

  # Use "pooled_output" for classification tasks on an entire sentence.
  # Use "sequence_outputs" for token-level output.
  output_layer = bert_outputs["pooled_output"]

  hidden_size = output_layer.shape[-1].value

  # Classification layer fine-tuned on the sentiment labels.
  output_weights = tf.get_variable(
      "output_weights", [num_labels, hidden_size],
      initializer=tf.truncated_normal_initializer(stddev=0.02))

  output_bias = tf.get_variable(
      "output_bias", [num_labels], initializer=tf.zeros_initializer())

  with tf.variable_scope("loss"):

    # Dropout helps prevent overfitting, but must not be active at prediction
    # time: it would randomly zero features and make predictions noisy.
    # NOTE(review): evaluation still goes through this branch because only
    # `is_predicting` is available here.
    if not is_predicting:
      output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)

    logits = tf.matmul(output_layer, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)
    log_probs = tf.nn.log_softmax(logits, axis=-1)

    # argmax over the class axis already yields one label per example.
    # No tf.squeeze: it would drop the batch axis for a batch of one.
    predicted_labels = tf.argmax(log_probs, axis=-1, output_type=tf.int32)
    # If we're predicting, we want predicted labels and the log-probabilities.
    if is_predicting:
      return (predicted_labels, log_probs)

    # Convert labels into one-hot encoding
    one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)

    # If we're train/eval, compute loss between predicted and actual label
    per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
    loss = tf.reduce_mean(per_example_loss)
    return (loss, predicted_labels, log_probs)

In [None]:
# model_fn_builder actually creates our model function
# using the passed parameters for num_labels, learning_rate, etc.
def model_fn_builder(num_labels, learning_rate, num_train_steps,
                     num_warmup_steps):
  """Returns a `model_fn` closure for `tf.estimator.Estimator`.

  Args:
    num_labels: number of output classes, forwarded to `create_model`.
    learning_rate: learning rate passed to the BERT optimizer.
    num_train_steps: total number of training steps (for the LR schedule).
    num_warmup_steps: number of warmup steps (for the LR schedule).

  Returns:
    A function `model_fn(features, labels, mode, params)` that returns an
    `EstimatorSpec` for TRAIN, EVAL or PREDICT.
  """

  def metric_fn(label_ids, predicted_labels):
    """Builds the evaluation metrics from true and predicted class ids."""
    accuracy = tf.metrics.accuracy(label_ids, predicted_labels)
    f1_score = tf.contrib.metrics.f1_score(
        label_ids,
        predicted_labels)
    auc = tf.metrics.auc(
        label_ids,
        predicted_labels)
    recall = tf.metrics.recall(
        label_ids,
        predicted_labels)
    precision = tf.metrics.precision(
        label_ids,
        predicted_labels)
    true_pos = tf.metrics.true_positives(
        label_ids,
        predicted_labels)
    true_neg = tf.metrics.true_negatives(
        label_ids,
        predicted_labels)
    false_pos = tf.metrics.false_positives(
        label_ids,
        predicted_labels)
    false_neg = tf.metrics.false_negatives(
        label_ids,
        predicted_labels)
    return {
        "eval_accuracy": accuracy,
        "f1_score": f1_score,
        "auc": auc,
        "precision": precision,
        "recall": recall,
        "true_positives": true_pos,
        "true_negatives": true_neg,
        "false_positives": false_pos,
        "false_negatives": false_neg
    }

  def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
    """The `model_fn` for the Estimator; labels are read from `features`."""

    input_ids = features["input_ids"]
    input_mask = features["input_mask"]
    segment_ids = features["segment_ids"]
    label_ids = features["label_ids"]

    is_predicting = (mode == tf.estimator.ModeKeys.PREDICT)

    if is_predicting:
      (predicted_labels, log_probs) = create_model(
        is_predicting, input_ids, input_mask, segment_ids, label_ids, num_labels)

      # The 'probabilities' key is kept for existing callers; the values are
      # the log-softmax scores returned by create_model.
      predictions = {
          'probabilities': log_probs,
          'labels': predicted_labels
      }
      return tf.estimator.EstimatorSpec(mode, predictions=predictions)

    # TRAIN and EVAL share the loss computation.
    (loss, predicted_labels, log_probs) = create_model(
      is_predicting, input_ids, input_mask, segment_ids, label_ids, num_labels)

    if mode == tf.estimator.ModeKeys.TRAIN:
      # The optimizer is only needed (and only built) when training.
      train_op = bert.optimization.create_optimizer(
          loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu=False)
      return tf.estimator.EstimatorSpec(mode=mode,
        loss=loss,
        train_op=train_op)

    # EVAL: metrics are only needed (and only built) when evaluating.
    eval_metrics = metric_fn(label_ids, predicted_labels)
    return tf.estimator.EstimatorSpec(mode=mode,
      loss=loss,
      eval_metric_ops=eval_metrics)

  # Return the actual model function in the closure
  return model_fn

In [None]:
# Training hyperparameters.
# These hyperparameters are copied from this colab notebook (https://colab.sandbox.google.com/github/tensorflow/tpu/blob/master/tools/colab/bert_finetuning_with_cloud_tpus.ipynb)
BATCH_SIZE = 32
LEARNING_RATE = 2e-5
NUM_TRAIN_EPOCHS = 3.0
# Warmup is a period of time where the learning rate
# is small and gradually increases--usually helps training.
WARMUP_PROPORTION = 0.1
# Model configs: how often (in steps) checkpoints and summaries are written.
SAVE_CHECKPOINTS_STEPS = 5000
SAVE_SUMMARY_STEPS = 1000

In [None]:
# Number of training steps = examples / batch size * epochs (truncated to int);
# the warmup covers the first WARMUP_PROPORTION of those steps.
num_train_steps = int(len(train_features) / BATCH_SIZE * NUM_TRAIN_EPOCHS)
num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)

In [None]:
# Specify the output directory and how often checkpoints and summaries are saved
run_config = tf.estimator.RunConfig(
    model_dir=OUTPUT_DIR,
    save_summary_steps=SAVE_SUMMARY_STEPS,
    save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS)

In [None]:
# Build the model function for the binary task and wrap it in an Estimator.
model_fn = model_fn_builder(
  num_labels=2,
  learning_rate=LEARNING_RATE,
  num_train_steps=num_train_steps,
  num_warmup_steps=num_warmup_steps)

# "batch_size" is passed through `params` to the input functions.
estimator = tf.estimator.Estimator(
  model_fn=model_fn,
  config=run_config,
  params={"batch_size": BATCH_SIZE})

In [None]:
# Create an input function for training. drop_remainder = True for using TPUs.
# is_training=True makes this input function suitable for training only.
train_input_fn = bert.run_classifier.input_fn_builder(
    features=train_features,
    seq_length=MAX_SEQ_LENGTH,
    is_training=True,
    drop_remainder=False)

In [None]:
# Fine-tune for num_train_steps steps and report the wall-clock duration.
print('Beginning Training!')
current_time = datetime.now()
estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
elapsed = datetime.now() - current_time
print("Training took time ", elapsed)

In [None]:
# Evaluate on the training set with a dedicated, non-training input function.
# `train_input_fn` was built with is_training=True, which makes the dataset
# repeat indefinitely, so evaluating it with steps=None would never finish.
train_eval_input_fn = bert.run_classifier.input_fn_builder(
    features=train_features,
    seq_length=MAX_SEQ_LENGTH,
    is_training=False,
    drop_remainder=False)
estimator.evaluate(input_fn=train_eval_input_fn, steps=None)

#Validation

In [None]:
# Build the validation features here: the earlier cells only create train and
# test features, so `val_features` would otherwise be undefined.
val_InputExamples = df_v.apply(
    lambda x: bert.run_classifier.InputExample(guid=None,
                                               text_a=x[DATA_COLUMN],
                                               text_b=None,
                                               label=x[LABEL_COLUMN]),
    axis=1)
val_features = bert.run_classifier.convert_examples_to_features(
    val_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)

val_input_fn = run_classifier.input_fn_builder(
    features=val_features,
    seq_length=MAX_SEQ_LENGTH,
    is_training=False,
    drop_remainder=False)

In [None]:
# Evaluate on the validation set using one specific, hard-coded checkpoint
# (step 23036) instead of the latest checkpoint in the model directory.
estimator.evaluate(input_fn=val_input_fn, checkpoint_path='/content/gdrive/My Drive/Università/Tesi_magistrale/b_t4sa_imgs_old/bert_output_pos_neg/model.ckpt-23036')

#Test

In [None]:
# Input function over `test_features` (built from test_df, i.e. the combined
# validation + test rows); is_training=False for inference.
test_input_fn = run_classifier.input_fn_builder(
    features=test_features,
    seq_length=MAX_SEQ_LENGTH,
    is_training=False,
    drop_remainder=False)

In [None]:
# Predict with the hard-coded checkpoint (step 23036). Each prediction is a
# dict with the keys 'probabilities' and 'labels' defined in model_fn.
result = estimator.predict(input_fn=test_input_fn, checkpoint_path='/content/gdrive/My Drive/Università/Tesi_magistrale/b_t4sa_imgs_old/bert_output_pos_neg/model.ckpt-23036')

In [None]:
import numpy as np 
# Collect the 'probabilities' entry (log-softmax scores) of every prediction.
preds = [prediction['probabilities'] for prediction in result]

#Results


In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
# `preds` holds one vector of log-softmax scores per example; the metrics need
# class ids, so take the index of the highest score in each vector.
preds_arg = np.argmax(preds, axis=1)
print("Accuracy of BERT is:",accuracy_score(y_test,preds_arg))
print(classification_report(y_test,preds_arg))

In [None]:
# `preds` holds score vectors; reduce them to class ids before comparing.
confusion_matrix(y_test, np.argmax(preds, axis=1), labels=[0,1])

#Extract embeddings

In [None]:
! git clone https://github.com/google-research/bert.git

In [None]:
# Settings for the embedding-extraction section.
# NOTE(review): the extract_features.py commands in this notebook pass their
# own literal flags (--layers=-1, paths under /content/tmp), so these
# constants look informational only — confirm.
LAYERS = [-1,-2,-3,-4]
#NUM_TPU_CORES = 8
MAX_SEQ_LENGTH = 128

BERT_CONFIG = '/content/gdrive/My Drive/Università/Tesi_magistrale/b_t4sa_imgs_old/bert_output_pos_neg/bert_config.json'
CHKPT_DIR = '/content/gdrive/My Drive/Università/Tesi_magistrale/b_t4sa_imgs_old/bert_output_pos_neg/model.ckpt-23036'
VOCAB_FILE = '/content/gdrive/My Drive/Università/Tesi_magistrale/b_t4sa_imgs_old/bert_output_pos_neg/vocab.txt'
INIT_CHECKPOINT = '/content/gdrive/My Drive/Università/Tesi_magistrale/b_t4sa_imgs_old/bert_output_pos_neg/model.ckpt-23036'
# Re-assigns the BATCH_SIZE used for training (same value).
BATCH_SIZE = 32

In [None]:
import shutil, os
import json

# Copy the fine-tuned checkpoint, vocabulary and config to local disk, where
# extract_features.py reads them from.
check1='/content/gdrive/My Drive/Università/Tesi_magistrale/b_t4sa_imgs_old/bert_output_pos_neg/model.ckpt-23036.data-00000-of-00001'
check2='/content/gdrive/My Drive/Università/Tesi_magistrale/b_t4sa_imgs_old/bert_output_pos_neg/model.ckpt-23036.index'
check3='/content/gdrive/My Drive/Università/Tesi_magistrale/b_t4sa_imgs_old/bert_output_pos_neg/model.ckpt-23036.meta'
files = [check1,check2,check3, VOCAB_FILE, BERT_CONFIG]
# The target must exist as a directory: otherwise shutil.copy treats
# '/content/tmp' as a file name and each copy overwrites the previous one.
os.makedirs('/content/tmp', exist_ok=True)
for f in files:
    shutil.copy(f, '/content/tmp')

In [None]:
# Empty frame that will receive the extracted embeddings.
# (The bare `df_t` / `df_v` / `df_te` expressions were removed: only the last
# expression of a cell is displayed, so they had no effect.)
df_emb=pd.DataFrame()

In [None]:
# Split the training frame into five consecutive chunks of 49145 rows (the
# last one takes whatever remains). The boundaries are hard-coded.
# NOTE(review): presumably each chunk is embedded separately to limit memory
# and runtime — confirm.
data1=df_t[0:49145]
data2=df_t[49145:98290]
data3=df_t[98290:147435]
data4=df_t[147435:196580]
data5=df_t[196580:]

In [None]:
import time

In [None]:
!pip install jsonlines
import jsonlines


In [None]:
# Write the tweets of the current training chunk, one per line, as input for
# extract_features.py.
# - Iterate over the chunk's own rows: a fixed range(245724) runs past the end
#   of a 49145-row chunk.
# - Open with 'w' so re-running the cell does not append duplicate lines.
# - Flatten embedded line breaks so each tweet stays on exactly one line and
#   the output rows stay aligned with the input rows.
# NOTE(review): switch `data1` to data2..data5 to process the other chunks.
with open('/content/tmp/inputTrain.txt', 'w') as the_file:
    for tweet in data1['text']:
        the_file.write(' '.join(tweet.splitlines()) + "\n")


In [None]:
# Write every validation tweet, one per line, as input for extract_features.py.
# All rows are written (no hard-coded row count), the file is opened with 'w'
# so re-runs do not append duplicates, and embedded line breaks are flattened
# so each tweet stays on exactly one line.
with open('/content/tmp/inputVal.txt', 'w') as the_file:
    for tweet in df_v['text']:
        the_file.write(' '.join(tweet.splitlines()) + "\n")


In [None]:
# Write every test tweet, one per line, as input for extract_features.py.
# All rows are written (no hard-coded row count), the file is opened with 'w'
# so re-runs do not append duplicates, and embedded line breaks are flattened
# so each tweet stays on exactly one line.
with open('/content/tmp/inputTest.txt', 'w') as the_file:
    for tweet in df_te['text']:
        the_file.write(' '.join(tweet.splitlines()) + "\n")


In [None]:
!python /content/bert/extract_features.py \
    --input_file=/content/tmp/inputTrain.txt \
    --output_file=/content/tmp/output.jsonl \
    --vocab_file=/content/tmp/vocab.txt \
    --bert_config_file=/content/tmp/bert_config.json \
    --init_checkpoint=/content/tmp/model.ckpt-23036 \
    --layers=-1 \
    --max_seq_length=128 \
    --batch_size=32


In [None]:
# Read the extracted features of the training chunk into one frame
# (one row per example) and print the elapsed seconds.
start = time.time()
# Collect the vectors first and build the frame once: DataFrame.append inside
# the loop copies the whole frame each time and no longer exists in pandas 2.
embedding_rows = []
with jsonlines.open('/content/tmp/output.jsonl') as f:
    for line in f.iter():
        # First token of the example (presumably [CLS] — confirm), first
        # (and only requested) layer.
        embedding_rows.append(line['features'][0]['layers'][0]['values'])
df_emb = pd.DataFrame(embedding_rows)
end = time.time()
print(end - start)
# NOTE(review): no visible cell saves this frame before it is overwritten —
# presumably it is written to embedding_text_sentiment_train<k>.csv; confirm.

In [None]:
import os
# Delete the feature file so the next extraction run starts from a fresh file.
os.remove("/content/tmp/output.jsonl")
#print("File Removed!")

In [None]:
!python /content/bert/extract_features.py \
    --input_file=/content/tmp/inputVal.txt \
    --output_file=/content/tmp/output.jsonl \
    --vocab_file=/content/tmp/vocab.txt \
    --bert_config_file=/content/tmp/bert_config.json \
    --init_checkpoint=/content/tmp/model.ckpt-23036 \
    --layers=-1 \
    --max_seq_length=128 \
    --batch_size=32

In [None]:
# Read the extracted features of the validation set into one frame
# (one row per example) and print the elapsed seconds.
start = time.time()
# Collect the vectors first and build the frame once: DataFrame.append inside
# the loop copies the whole frame each time and no longer exists in pandas 2.
embedding_rows = []
with jsonlines.open('/content/tmp/output.jsonl') as f:
    for line in f.iter():
        # First token of the example (presumably [CLS] — confirm), first
        # (and only requested) layer.
        embedding_rows.append(line['features'][0]['layers'][0]['values'])
df_emb = pd.DataFrame(embedding_rows)
end = time.time()
print(end - start)

In [None]:
# Persist the validation embeddings to Drive as a header-less CSV.
np.savetxt("/content/gdrive/My Drive/Università/Tesi_magistrale/b_t4sa_imgs_old/bert_output_pos_neg/embedding_text_sentiment_val.csv", df_emb, delimiter=",")


In [None]:
# Delete the feature file so the next extraction run starts from a fresh file.
os.remove("/content/tmp/output.jsonl")
print("File Removed!")

In [None]:
!python /content/bert/extract_features.py \
    --input_file=/content/tmp/inputTest.txt \
    --output_file=/content/tmp/output.jsonl \
    --vocab_file=/content/tmp/vocab.txt \
    --bert_config_file=/content/tmp/bert_config.json \
    --init_checkpoint=/content/tmp/model.ckpt-23036 \
    --layers=-1 \
    --max_seq_length=128 \
    --batch_size=32

In [None]:
# Read the extracted features of the test set into one frame
# (one row per example) and print the elapsed seconds.
start = time.time()
# Collect the vectors first and build the frame once: DataFrame.append inside
# the loop copies the whole frame each time and no longer exists in pandas 2.
embedding_rows = []
with jsonlines.open('/content/tmp/output.jsonl') as f:
    for line in f.iter():
        # First token of the example (presumably [CLS] — confirm), first
        # (and only requested) layer.
        embedding_rows.append(line['features'][0]['layers'][0]['values'])
df_emb = pd.DataFrame(embedding_rows)
end = time.time()
print(end - start)

In [None]:
# Persist the test embeddings to Drive as a header-less CSV.
np.savetxt("/content/gdrive/My Drive/Università/Tesi_magistrale/b_t4sa_imgs_old/bert_output_pos_neg/embedding_text_sentiment_test.csv", df_emb, delimiter=",")


In [None]:
# Delete the feature file now that the test embeddings are saved.
os.remove("/content/tmp/output.jsonl")
print("File Removed!")

In [None]:
# Load the five per-chunk training embedding files (header-less CSVs).
train1 = pd.read_csv('/content/gdrive/My Drive/Università/Tesi_magistrale/b_t4sa_imgs_old/bert_output_pos_neg/embedding_text_sentiment_train1.csv', header=None)
train2 = pd.read_csv('/content/gdrive/My Drive/Università/Tesi_magistrale/b_t4sa_imgs_old/bert_output_pos_neg/embedding_text_sentiment_train2.csv', header=None)
train3 = pd.read_csv('/content/gdrive/My Drive/Università/Tesi_magistrale/b_t4sa_imgs_old/bert_output_pos_neg/embedding_text_sentiment_train3.csv', header=None)
train4 = pd.read_csv('/content/gdrive/My Drive/Università/Tesi_magistrale/b_t4sa_imgs_old/bert_output_pos_neg/embedding_text_sentiment_train4.csv', header=None)
train5 = pd.read_csv('/content/gdrive/My Drive/Università/Tesi_magistrale/b_t4sa_imgs_old/bert_output_pos_neg/embedding_text_sentiment_train5.csv', header=None)


In [None]:
# Stack the chunks in order into one frame with a fresh 0..n-1 index.
train= pd.concat([train1,train2,train3,train4,train5], ignore_index=True)

In [None]:
# Persist the full training embeddings to Drive as a single header-less CSV.
np.savetxt("/content/gdrive/My Drive/Università/Tesi_magistrale/b_t4sa_imgs_old/bert_output_pos_neg/embedding_text_sentiment_train.csv", train, delimiter=",")


#Error analysis


In [None]:
# Load previously saved test-set predictions (header-less CSV).
d1 = pd.read_csv('/content/gdrive/My Drive/Università/Tesi_magistrale/b_t4sa_imgs_old/bert_prediction/predict_test_text.csv', header=None)


In [None]:
# Cast the first column (used as the predicted label below) to int64.
d1[0]=d1[0].astype(np.int64)
#d1[0]
#d1=d1.reset_index()
d1

In [None]:
# Attach the predictions to the test frame. The assignment aligns on index
# labels. NOTE(review): assumes df_te and d1 share the same 0..n-1 index — confirm.
df_te['predicted']=d1[0]

In [None]:
#from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

from collections import Counter
import itertools
import string
from nltk import wordpunct_tokenize
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem import WordNetLemmatizer

from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer


In [None]:
def text_process(dataframe):
  """Print the 10 most frequent words of a collection of tweets.

  The texts are lower-cased, tokenized with NLTK's TweetTokenizer, filtered
  against stop words and punctuation, and lemmatized before counting.

  Args:
    dataframe: despite the name, a pandas Series of strings (the `.str`
      accessor is used on it).

  Returns:
    None. The `(word, count)` pairs are printed.
  """
  tokening = TweetTokenizer()
  word_lemmatizer = WordNetLemmatizer()
  punctuation = string.punctuation

  # Extra stop words added after inspecting the tweets.
  extra_stop = [
      "The", "And", "I", "J", "K", "I'd", "That's", "\x81", "It", "I'm",
      "...", "\x89", "ĚĄ", "it's", "ă", "\x9d", "âÂĺ", "Ě", "˘", "Â", "âÂ",
      "Ň", "http", "https", "co", "000", "de", "rt", "RT", "..", "i'm", "im",
  ]
  # The tweets are lower-cased before filtering, so the stop words are
  # lower-cased too; otherwise entries such as "RT" or "J" could never match.
  # A set keeps each membership test O(1).
  stop = {word.lower()
          for word in itertools.chain(stopwords.words('english'), extra_stop)}

  tweets_tokenized = dataframe.str.lower().apply(tokening.tokenize)
  # Same order as before: drop stop words, drop punctuation, then lemmatize.
  # `not in punctuation` is a substring test against string.punctuation.
  tweets_tokenized_new_lem = tweets_tokenized.apply(
      lambda tokens: [word_lemmatizer.lemmatize(item)
                      for item in tokens
                      if item not in stop and item not in punctuation])

  flat_list = [item for sublist in tweets_tokenized_new_lem for item in sublist]
  c = Counter(flat_list)
  print(c.most_common(10))




In [None]:
# Various experiments: print every misclassified test tweet and collect those
# that mention photo / camera / social-media vocabulary.

foto=["image","pic","pics","picture","images","olympus","canon","kodak","sigma","nikon", "reflex","sony",
      "fujifilm","pentax","panasonic","iphone","samsung", "phone", "iphoneography", "instagram", "square", 
      "twitter", "tweet","fb", "facebook", "follow", "device","devices","adv","advertising","amazon","technolog","innovation",
      "makro", "macro", "mm", "f/","f1","f3","f2","mark","5d", "4d","3d","flickr","filter","nofilter","d300","35mm","58mm","80mm",
      "100mm","2/100mm","dpi", "app", "apps", "application","polaroid","portrait","iphoto"]
# Not used in this cell; kept because other cells may rely on them.
error = {0:0,1:0}
df_no_error=pd.DataFrame()

# Gather the matching rows in a list and build the frame once: appending to a
# DataFrame inside the loop is quadratic and DataFrame.append no longer exists
# in pandas 2.
photo_error_rows = []
for index,row in df_te.iterrows():
  if(row['label']!= row['predicted']):
    print(row['text'])
    lowered_text = row['text'].lower()
    # Substring match: short keywords such as "app" or "mm" also match inside
    # longer words.
    if any(word in lowered_text for word in foto):
      photo_error_rows.append(row)
df_error=pd.DataFrame(photo_error_rows)

#text_process(df_error['text'])

#text_process(df_error['text'])

# Fuse result

In [None]:
# Load the text-model score matrix used for fusion (header-less CSV, so the
# columns are named 0, 1, ...).
d_text = pd.read_csv('/content/gdrive/My Drive/Università/Tesi_magistrale/b_t4sa_imgs_old/matrix_to_fuse.csv', header=None)

In [None]:
# Text-only prediction: index of the highest score in each row.
pred_t = [np.argmax(row) for _, row in d_text.iterrows()]

In [None]:
# Load the image-model score matrix used for fusion (header-less file).
d_img = pd.read_csv('/content/gdrive/My Drive/Università/Tesi_magistrale/b_t4sa_imgs_old/efnet_output/predict_giugno_att', header=None)


In [None]:
# Copy of the image scores with a fresh 0..n-1 index (old index discarded).
df_res = d_img.reset_index(drop=True)
df_res

# Max Fusion


In [None]:
# Max fusion: for every example keep the label of whichever model (text or
# image) has the higher top score; ties go to the text model.
def _row_peaks(score_frame):
  """Return two lists: the maximum of each row and the argmax of each row."""
  peaks, winners = [], []
  for _, scores in score_frame.iterrows():
    peaks.append(np.max(scores))
    winners.append(np.argmax(scores))
  return peaks, winners

img, label_i = _row_peaks(df_res)
text, label_t = _row_peaks(d_text)

max_df=pd.DataFrame(columns=['text','label_t','img','label_i'])
max_df['text']=text
max_df['img']=img
max_df['label_t']=label_t
max_df['label_i']=label_i

result = [
    int(row['label_t']) if row['text'] >= row['img'] else int(row['label_i'])
    for _, row in max_df.iterrows()
]


In [None]:
# Metrics for the max-fusion labels in `result`.
# NOTE(review): the printed message says "BERT" although `result` holds the
# fused predictions.
# NOTE(review): y_test was built from val_test (validation + test rows) —
# confirm it lines up row-for-row with the fused predictions.
print("Accuracy of BERT is:",accuracy_score(y_test,result))
print(classification_report(y_test,result))

# Mean Fusion


In [None]:
# Mean fusion: average the two models' scores per class, then take the class
# with the higher averaged score.
from statistics import mean, median, mode, stdev

# Image-model scores for class 0 and class 1.
label_0 = [row[0] for _, row in df_res.iterrows()]
label_1 = [row[1] for _, row in df_res.iterrows()]

# Text-model scores for class 0 and class 1.
label_0_t = [row[0] for _, row in d_text.iterrows()]
label_1_t = [row[1] for _, row in d_text.iterrows()]

result_0 = [mean(pair) for pair in zip(label_0, label_0_t)]
result_1 = [mean(pair) for pair in zip(label_1, label_1_t)]

media_df=pd.DataFrame()
media_df[0]=result_0
media_df[1]=result_1

# Fused label = index of the larger averaged score.
res_mean = [np.argmax(row) for _, row in media_df.iterrows()]


In [None]:
# Metrics for the mean-fusion labels in `res_mean`.
# NOTE(review): the printed message says "BERT" although `res_mean` holds the
# fused predictions.
# NOTE(review): y_test was built from val_test (validation + test rows) —
# confirm it lines up row-for-row with the fused predictions.
print("Accuracy of BERT is:",accuracy_score(y_test,res_mean))
print(classification_report(y_test,res_mean))

# Confusion matrix

In [None]:
# NOTE(review): d_text is loaded with header=None in this notebook, so a
# 'label' column is not expected to exist — confirm where the true labels
# should come from (the other metric cells use y_test).
true_val=d_text['label']
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
# NOTE(review): `res` is not assigned in any visible cell (the fusion cells
# produce `result` and `res_mean`) — confirm which predictions are meant.
print("Accuracy of BERT is:",accuracy_score(true_val,res))
print(classification_report(true_val,res))

In [None]:
# Confusion matrix over classes 0 and 1.
# NOTE(review): `res` is not assigned in any visible cell — confirm which
# predictions are meant.
confusion_matrix(true_val, res, labels=[0,1])
