#### Copyright 2021 Google LLC

In [None]:
#@title Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Recognizing Multimodal Entailment dataset v1

## Overview

How information is created, shared and consumed has changed rapidly in recent decades, in part thanks to new social platforms and technologies on the web. With ever-larger amounts of unstructured data and limited labels, organizing and reconciling information from different sources and modalities is a central challenge in machine learning.

This colab introduces the first version of the Recognizing Multimodal Entailment dataset, aiming to further encourage research on the topic.

## Install and import dependencies

In [None]:
!pip install -U --quiet tensorflow-addons
!pip install --quiet twint
!pip install --quiet whatthelang
!pip install --quiet TwitterAPI
!pip install --quiet Tweepy

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import tensorflow_addons as tfa

import os
import tweepy
import requests

import pandas as pd
import numpy as np

from IPython.display import Image

from tweepy import OAuthHandler
from tqdm import tqdm
from bs4 import BeautifulSoup as bs
from urllib.parse import urljoin, urlparse
from requests_oauthlib import OAuth1

## Dataset description

The dataset consists of related social media URL pairs and their corresponding entailment label.

## Download dataset

Read the (url_1, url_2, label) CSV file from GitHub.

In [None]:
# Raw GitHub URL of the v1 dataset CSV; it is passed to pd.read_csv below.
DATASET_PATH = 'https://raw.githubusercontent.com/google-research-datasets/recognizing-multimodal-entailment/main/dataset_v1.csv'

In [None]:
# Load the (url_1, url_2, label) rows into a DataFrame and display it.
df = pd.read_csv(DATASET_PATH)
df

## Download contents with Twitter API

Uses the Twitter API (through Tweepy) to retrieve the tweet contents and to filter out pairs that would skew the data, e.g., tweets that are no longer available.

In [None]:
# Twitter API credentials. Supply them through environment variables so real
# secrets are never saved in the notebook; the placeholder defaults below are
# not valid credentials and must otherwise be replaced with your own values.
consumer_key = os.environ.get('TWITTER_CONSUMER_KEY', 'xxxx')
consumer_secret = os.environ.get('TWITTER_CONSUMER_SECRET', 'yyyy')
access_token = os.environ.get('TWITTER_ACCESS_TOKEN', '0000-aaaa')
access_secret = os.environ.get('TWITTER_ACCESS_SECRET', 'zzzz')

# OAuth1 signer built from the four credentials above.
# NOTE: the next cell rebinds `auth` to a Tweepy OAuthHandler.
auth = OAuth1(consumer_key, consumer_secret,
              access_token, access_secret)

In [None]:
# Authenticate with Tweepy; this rebinds `auth` to an OAuthHandler.
auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)
 
# Client object used for the tweet lookups in the cells below.
api = tweepy.API(auth)

In [None]:
def get_tweet_id(url):
  """Extracts the tweet id from a tweet URL.

  The id is the path segment that follows '/status/'. Anything after it, such
  as a query string ('?s=20'), a fragment, or a trailing path ('/photo/1'),
  is dropped so that the result can be passed directly to the API.

  Args:
    url: Tweet URL, e.g. 'https://twitter.com/<user>/status/<id>'.

  Returns:
    The id as a string, or None (after printing a message) if `url` is not a
    string or contains no id.
  """
  # Missing CSV cells are read by pandas as NaN (a float), not as a string.
  if not isinstance(url, str):
    print('Error getting id from url', url)
    return None

  url_split = url.split('/status/')
  if len(url_split) < 2:
    print('Error getting id from url', url)
    return None

  tweet_id = url_split[-1]
  # Keep only the id segment itself.
  for separator in ('?', '#', '/'):
    tweet_id = tweet_id.split(separator, 1)[0]
  tweet_id = tweet_id.strip()

  if not tweet_id:
    print('Error getting id from url', url)
    return None
  return tweet_id

In [None]:
def _tweepy_error_type():
  """Returns the base exception class of the installed Tweepy version."""
  # Tweepy 4 renamed TweepError to TweepyException; support both.
  return getattr(tweepy, 'TweepyException', None) or tweepy.TweepError


def _describe_tweepy_error(error):
  """Returns the printable parts of a Tweepy error.

  Gives (code, message) when the error carries a list of API error dicts as
  its first argument, and (error,) for any other payload (e.g. a plain string).
  """
  try:
    first = error.args[0][0]
    return first['code'], first['message']
  except (IndexError, KeyError, TypeError):
    return (error,)


def text_and_image_urls(api, tweet_id):
  """Fetches the full text and the media URLs of a tweet.

  For a retweet, the text and media of the original (retweeted) status are
  returned instead of those of the retweet itself.

  Args:
    api: Object exposing `get_status(tweet_id, tweet_mode=...)`, such as a
      `tweepy.API` instance.
    tweet_id: Id of the tweet to look up.

  Returns:
    A `(text, media_urls)` tuple, where `media_urls` is the list of
    'media_url' values found under the status entities' 'media' key (empty if
    there is none), or None if the lookup raised a Tweepy error (a message is
    printed in that case).
  """
  try:
    tweet = api.get_status(tweet_id, tweet_mode="extended")
  # The except expression is only evaluated when an exception is raised.
  except _tweepy_error_type() as e:
    print('TweepError for', tweet_id, ':', *_describe_tweepy_error(e))
    return None

  # Retweets carry the original status in `retweeted_status`.
  status = getattr(tweet, 'retweeted_status', tweet)
  media = status.entities.get('media', [])
  return status.full_text, [m['media_url'] for m in media]

In [None]:
def _single_image_tweet(tweet_id, url):
  """Fetches one tweet and returns (text, image_url), or None if unusable.

  A tweet is unusable when it cannot be retrieved (its URL is printed) or when
  it does not have exactly one media URL (a message with its id is printed).
  Uses the module-level `api` client.
  """
  tweet = text_and_image_urls(api, tweet_id)
  if tweet is None:
    print(url)
    return None
  text, image_urls = tweet
  if len(image_urls) != 1:
    print('Tweet', tweet_id, 'without exactly one image')
    return None
  return text, image_urls[0]


def retrieve_tweets(df):
  """Retrieves text and image URL for every tweet pair in `df`.

  Pairs are skipped when either id cannot be extracted from its URL, when
  either tweet cannot be retrieved, or when either tweet does not have exactly
  one image.

  Args:
    df: DataFrame with 'url_1', 'url_2' and 'label' columns.

  Returns:
    A list of dicts with keys 'id_1', 'text_1', 'image_1', 'id_2', 'text_2',
    'image_2' and 'label', one per fully retrieved pair.
  """
  tweets = []
  for _, row in df.iterrows():
    id_1, id_2 = get_tweet_id(row['url_1']), get_tweet_id(row['url_2'])
    if id_1 is None or id_2 is None:
      continue

    # The second tweet is only requested once the first one is usable.
    first = _single_image_tweet(id_1, row['url_1'])
    if first is None:
      continue
    second = _single_image_tweet(id_2, row['url_2'])
    if second is None:
      continue

    tweets.append({
        'id_1': id_1,
        'text_1': first[0],
        'image_1': first[1],
        'id_2': id_2,
        'text_2': second[0],
        'image_2': second[1],
        'label': row['label'],
    })

  return tweets

In [None]:
# Fetch text and image URL for every pair in the dataset; pairs that cannot be
# fully retrieved are skipped by retrieve_tweets.
tweets = retrieve_tweets(df)

## Download images

In [None]:
def download(url, path, filename):
  """Downloads `url` into `path`, showing a progress bar.

  The file is saved as `filename` plus an extension made of a dot and whatever
  follows the last '.' in `url`.

  Args:
    url: URL of the file to download.
    path: Destination directory; created if it does not exist.
    filename: Name of the destination file, without extension.
  """
  os.makedirs(path, exist_ok=True)
  extension = '.' + url.split('.')[-1]
  full_filename = os.path.join(path, filename + extension)

  # The context manager releases the streamed connection when done.
  with requests.get(url, stream=True) as response:
    file_size = int(response.headers.get("Content-Length", 0))
    # The bar is advanced manually by the number of bytes written; wrapping
    # the chunk iterator as well would count every chunk twice.
    with tqdm(desc=f"Downloading {full_filename}",
              total=file_size or None,  # None: size unknown.
              unit="B",
              unit_scale=True,
              unit_divisor=1024) as progress:
      with open(full_filename, "wb") as f:
        for data in response.iter_content(1024):
          f.write(data)
          progress.update(len(data))

In [None]:
# Directory where the downloaded tweet images are stored.
IMAGE_PATH = '/tmp/tweet_images/'

# Save the image of both tweets of every pair, named after the tweet id.
for tweet in tweets:
  for index in ('1', '2'):
    download(tweet['image_' + index], IMAGE_PATH, tweet['id_' + index])

## Visualize texts and images from a sample tweet pair and corresponding entailment label



In [None]:
def visualize_tweet(tweet, index):
  """Prints the text of one tweet of a pair and returns its image.

  Args:
    tweet: Dict holding 'text_<index>', 'id_<index>' and 'image_<index>'.
    index: Which tweet of the pair to show (used as the key suffix).

  Returns:
    An IPython `Image` for the file '<id>.<extension>' under IMAGE_PATH, where
    the extension is whatever follows the last '.' in the image URL.
  """
  suffix = str(index)
  print(tweet['text_' + suffix])

  tweet_id = tweet['id_' + suffix]
  extension = tweet['image_' + suffix].split('.')[-1]
  return Image(os.path.join(IMAGE_PATH, tweet_id + '.' + extension))

In [None]:
# Pick one retrieved tweet pair at random (no seed is set, so the choice
# differs between runs).
tweet = tweets[np.random.randint(0, len(tweets))]

In [None]:
# Show the text and image of the first tweet of the sampled pair.
visualize_tweet(tweet, 1)

In [None]:
# Show the text and image of the second tweet of the sampled pair.
visualize_tweet(tweet, 2)

In [None]:
# Entailment label of the sampled pair.
print(tweet['label'])