In [1]:
!pip install transformers datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.4-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m47.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.11.0-py3-none-any.whl (468 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m468.7/468.7 KB[0m [31m21.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m49.4 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.3-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.8/199.8 KB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[?25hCol

In [2]:
!pip install tensorflow

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
import os
import time
from datetime import timedelta
from os.path import isfile

import numpy as np
import pandas as pd
from google.colab import drive, runtime
from transformers import pipeline

In [5]:
# Mounts Google Drive at /content/gdrive so the project's CSV files
# can be read from and written to by the notebook
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [6]:
# Reads the pipe-separated CSV file with the classified tweets.
# The path is absolute so that it matches the '/content/gdrive' mount point
# regardless of the notebook's current working directory.
df = pd.read_csv(
    '/content/gdrive/My Drive/DW - Climate Protests/8.dfs_for_sentiment_analysis/random_sample/random-sample.csv',
    sep="|",
)

In [7]:
# How many entries do we have? Displays the frame's (rows, columns) tuple.
df.shape

(33504, 11)

In [8]:
# Loads a pre-trained language detector so we can select only the tweets that are in English.
# "text-classification" is the canonical name of the task; "sentiment-analysis" is only an
# alias for it and was misleading here, since the model predicts a language, not a sentiment.
classifier = pipeline(task="text-classification", model="papluca/xlm-roberta-base-language-detection")

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/502 [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

In [9]:
# Runs the language detector through all tweets of a dataframe
def lang_detect(data, classifier):
  """Classifies the language of every row in ``data``.

  Args:
    data: DataFrame with a ``raw_content`` column holding the tweet text.
    classifier: Callable that receives a list of strings (plus the keyword
      argument ``truncation=True``) and returns one dict with ``label`` and
      ``score`` keys per input, e.g. a Hugging Face text-classification pipeline.

  Returns:
    A new DataFrame, indexed 0..n-1, with the columns of ``data`` followed by the
    classifier output, where ``label``/``score`` are renamed to
    ``lang_label``/``lang_score``. ``data`` itself is not modified.
  """

  # Missing tweets come out of pandas as NaN floats, which a tokenizer cannot
  # process; send them to the classifier as empty strings instead.
  texts = data.raw_content.fillna("").astype(str).tolist()

  # truncation=True asks the tokenizer to cut inputs that exceed the model's
  # maximum length rather than erroring out on them.
  result = classifier(texts, truncation=True)
  result = pd.DataFrame(result)
  result = result.rename(columns={"label":"lang_label", "score":"lang_score"})

  # Both sides get a fresh positional index so the rows line up one-to-one
  # even when ``data`` is a slice taken from the middle of a larger frame.
  return pd.concat([data.reset_index(drop=True), result.reset_index(drop=True)], axis=1)

In [10]:
# We will split the dataframe in smaller chunks
# so we don't lose all progress if anything goes wrong.
# The row positions are split rather than the frame itself: np.array_split on a
# DataFrame goes through DataFrame.swapaxes, which newer pandas releases deprecate.
# The resulting chunk sizes are the same as np.array_split(df, N_CHUNKS) would give.
N_CHUNKS = 100
dfs_split = [df.iloc[rows] for rows in np.array_split(np.arange(len(df)), N_CHUNKS)]

In [11]:
# Classifies every chunk that has no output file yet, so the cell can simply be
# re-run after an interruption and will resume where it stopped.
out_dir = '/content/gdrive/My Drive/DW - Climate Protests/11.control_language_detect'
os.makedirs(out_dir, exist_ok=True)  # first run: the output folder may not exist yet

for index, chunk in enumerate(dfs_split):

  fname = f'{out_dir}/chunk-{index}.csv'

  # Already processed in a previous run
  if isfile(fname):
    continue

  # perf_counter is monotonic, so the measured duration cannot be skewed by clock adjustments
  start = time.perf_counter()

  result = lang_detect(chunk, classifier=classifier)

  # Write under a temporary name and rename once the file is complete. If the run
  # dies mid-write, only the .tmp file is left behind; a truncated chunk-N.csv
  # would otherwise pass the isfile() check above and be skipped forever.
  tmp_fname = f'{fname}.tmp'
  result.to_csv(tmp_fname)
  os.replace(tmp_fname, fname)

  end = time.perf_counter()
  print(f'Chunk {index} processing time:', str(timedelta(seconds=end-start)))

Chunk 3 processing time: 0:00:53.951182
Chunk 4 processing time: 0:00:53.705876
Chunk 5 processing time: 0:00:54.211636
Chunk 6 processing time: 0:00:47.703025
Chunk 7 processing time: 0:00:43.603549
Chunk 8 processing time: 0:00:43.599389
Chunk 9 processing time: 0:00:40.308613
Chunk 10 processing time: 0:00:55.927955
Chunk 11 processing time: 0:00:54.338591
Chunk 12 processing time: 0:00:50.524878
Chunk 13 processing time: 0:00:48.566780
Chunk 14 processing time: 0:00:45.475253
Chunk 15 processing time: 0:00:45.359700
Chunk 16 processing time: 0:00:51.675863
Chunk 17 processing time: 0:00:52.961322
Chunk 18 processing time: 0:00:49.914379
Chunk 19 processing time: 0:00:47.971517
Chunk 20 processing time: 0:00:51.998909
Chunk 21 processing time: 0:00:47.188664
Chunk 22 processing time: 0:00:51.841902
Chunk 23 processing time: 0:00:49.564022
Chunk 24 processing time: 0:00:52.666014
Chunk 25 processing time: 0:01:03.010327
Chunk 26 processing time: 0:00:59.851424
Chunk 27 processing tim

In [12]:
# NOTE(review): presumably releases the Colab runtime once all chunks are written,
# so the session stops consuming resources — confirm against the google.colab docs.
runtime.unassign()
