In [None]:
!pip install transformers datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
!pip install tensorflow

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import os
import time
from datetime import timedelta
from os.path import isfile

import numpy as np
import pandas as pd
from google.colab import drive, runtime
from transformers import pipeline

In [None]:
# Attaches Google Drive to the Colab filesystem; the CSV files used below live there
mount_point = '/content/gdrive'
drive.mount(mount_point)

Mounted at /content/gdrive


In [None]:
# Loads the pipe-delimited CSV file with the classified tweets.
# The three ID columns are forced to strings (presumably so that the long
# numeric identifiers are kept exactly as written — confirm).
# NOTE: the path is relative, so it only resolves while the working directory
# is /content, where the drive was mounted.
df = pd.read_csv(
    'gdrive/My Drive/DW - Climate Protests/8.dfs_for_sentiment_analysis/protest_tweets/protest-tweets.csv',
    sep="|",
    dtype=dict.fromkeys(["tweet_id", "conversation_id", "in_reply_to"], str),
)

In [None]:
# Checks that no tweet is missing its ID: the counts should only contain False
df["tweet_id"].isna().value_counts()

False    63862
Name: tweet_id, dtype: int64

In [None]:
# How many entries do we have? Displayed as (rows, columns)
df.shape

(63862, 11)

In [None]:
# Loads a pre-trained language detector so we can select only the tweets that are in English.
# Language detection is a text-classification task; "sentiment-analysis" is merely
# an alias of that task name, so the canonical name is used to avoid confusion.
classifier = pipeline(task="text-classification", model="papluca/xlm-roberta-base-language-detection")

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/502 [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

In [None]:
# Sanity check: is an English sentence labelled as English ('en')?
classifier("This is a test")

[{'label': 'en', 'score': 0.9789774417877197}]

In [None]:
# Sanity check: is a Portuguese sentence labelled as Portuguese ('pt')?
classifier("Isso é um teste")

[{'label': 'pt', 'score': 0.9927494525909424}]

In [None]:
# Does it also accept a list of texts? It should return one prediction per item, in the same order
classifier(["Isso é um teste", "This is a test"])

[{'label': 'pt', 'score': 0.9927494525909424},
 {'label': 'en', 'score': 0.9789774417877197}]

In [None]:
def lang_detect(data, classifier):
  """Adds the detected language of every tweet to a chunk of the dataset.

  Args:
    data: DataFrame with a `raw_content` column holding the texts to classify.
    classifier: Callable that receives a list of texts and returns one
      `{"label": ..., "score": ...}` dict per text, in the same order.

  Returns:
    A new DataFrame with a fresh 0..n-1 index, holding the columns of `data`
    followed by `lang_label` and `lang_score`.

  Raises:
    ValueError: If the classifier does not return exactly one prediction per row.
  """
  texts = data.raw_content.tolist()

  # Nothing to classify in an empty chunk, so the classifier is not called at all
  predictions = list(classifier(texts)) if texts else []

  # The predictions are joined to the tweets by position, so a length mismatch
  # would silently pair tweets with the wrong language or pad rows with NaN.
  if len(predictions) != len(texts):
    raise ValueError(
        f"classifier returned {len(predictions)} predictions for {len(texts)} texts"
    )

  # Naming the columns explicitly keeps them in the output even when the chunk is empty
  result = pd.DataFrame(predictions, columns=["label", "score"])
  result = result.rename(columns={"label": "lang_label", "score": "lang_score"})

  return pd.concat([data.reset_index(drop=True), result], axis=1)

In [None]:
# We split the dataframe into smaller chunks so we don't lose all progress
# if anything goes wrong.
# The row positions are split instead of the frame itself: np.array_split on a
# DataFrame relies on DataFrame.swapaxes, which recent pandas versions deprecate.
# The resulting chunks have the same sizes and order either way.
N_CHUNKS = 100
dfs_split = [df.iloc[rows] for rows in np.array_split(np.arange(len(df)), N_CHUNKS)]

In [None]:
# Classifies every chunk and stores each result in its own CSV file.
# Chunks whose file already exists are skipped, so the loop can simply be
# re-run after an interruption.
output_dir = 'gdrive/My Drive/DW - Climate Protests/9.high_engagement_language_detection'
os.makedirs(output_dir, exist_ok=True)

for index, chunk in enumerate(dfs_split):

  fname = f'{output_dir}/chunk-{index}.csv'

  if isfile(fname):
    continue

  start = time.perf_counter()

  result = lang_detect(chunk, classifier=classifier)

  # Writes under a temporary name and renames only after the write succeeded.
  # Otherwise an interrupted write would leave a truncated chunk file behind,
  # and the isfile() check above would skip it on the next run.
  tmp_fname = f'{fname}.tmp'
  result.to_csv(tmp_fname)
  os.replace(tmp_fname, fname)

  elapsed = time.perf_counter() - start
  print(f'Chunk {index} processing time:', str(timedelta(seconds=elapsed)))

In [None]:
# Releases the Colab runtime once every chunk has been processed.
# NOTE(review): presumably done so the session stops consuming resources
# after the long-running loop — confirm.
runtime.unassign()
