In [47]:
import sys
import os

# Add the project source directory to the import path; presumably the local
# modules imported below (newsapiscraper, newscollector, ...) live there.
# NOTE(review): absolute path tied to this machine's layout.
sys.path.append("/home/jupyter/news/src")
from newsapiscraper import NewsApiScraper
from newscollector import NewsCollector
from googletranslation import GoogleTranslate
from clustering import Clustering
from googleembeddings import GoogleEmbeddings
from strike_rag import StrikeRAG
from fire_rag import FireRAG
from filtering import Filter
from matching import Matching
from configuration import Configuration
import sectors_to_discard
from utils import split_liste, groupByName, generate_description, geoloc
import strike_relevancy


import pandas as pd
import numpy as np
import json
import requests
from copy import deepcopy
from geopy.geocoders import Nominatim
from dotenv import load_dotenv

In [48]:
# Load variables from a .env file into the process environment.
load_dotenv()
NEWS_API_KEY = os.getenv('NEWS_API_KEY')

# Re-export the proxy settings. os.getenv returns None for an unset variable
# and assigning None into os.environ raises TypeError, so only copy the
# variables that are actually defined.
for _proxy_var in ('http_proxy', 'https_proxy', 'HTTP_PROXY', 'HTTPS_PROXY'):
    _proxy_value = os.getenv(_proxy_var)
    if _proxy_value is not None:
        os.environ[_proxy_var] = _proxy_value

# Creating config file for the first time

## Generating the initial config dict
We build a dict that is used to generate the final config file. The final config file only needs to be generated once and can then be reused for every run, as long as no new keys are added.

In [4]:
# Initial configuration, later written to JSON and expanded into the final
# config file.
dict_config = {
    # English search keywords.
    'keywords': ['strike', 'picket line', 'employee protest'],
    # One entry per country with the language(s) to search in. A country with
    # several official languages lists each of them.
    'country_lang': [
        {'country': country_code, 'lang': languages}
        for country_code, languages in (
            ('BE', ['fr']),  # Belgium
            ('CH', ['fr', 'de', 'it']),  # several official languages
            ('BG', ['bg']),  # Bulgaria
            ('BR', ['pt']),
            ('CL', ['es']),
            ('CN', ['zh']),
            ('CO', ['es']),
            ('CZ', ['cs']),
            ('DE', ['de']),
            ('DZ', ['ar']),
            ('EE', ['et']),
            ('ES', ['es']),
            ('FR', ['fr']),
            ('GB', ['en']),
            ('HU', ['hu']),
            ('ID', ['id']),
            ('IN', ['hi']),
            ('IT', ['it']),
            ('JP', ['ja']),
            ('KR', ['ko']),
            ('LT', ['lt']),
            ('MA', ['ar']),
            ('MC', ['fr']),
            ('MX', ['es']),
            ('MY', ['ms']),
            ('NL', ['nl']),
            ('PL', ['pl']),
            ('PT', ['pt']),
            ('RO', ['ro']),
            ('SI', ['sl']),
            ('SK', ['sk']),
            ('TH', ['th']),
            ('TN', ['ar']),
            ('TR', ['tr']),
            ('UA', ['uk']),
            ('VN', ['vi']),
            ('SE', ['sv']),
            ('SV', ['es']),
            ('CA', ['fr']),
            ('LU', ['fr']),
        )
    ],
    # Only needed when the News API service is used.
    # NOTE(review): this stores the key in the config dict, which is later
    # dumped to disk in plain text -- confirm this is intended.
    'NEWS_API_KEY': NEWS_API_KEY,
    # NOTE(review): the key name 'rag_cong' looks like a typo for 'rag_conf';
    # kept as is because other code may read it by this name -- confirm.
    'rag_cong': {
        'vertexai_llm': 'gemini-1.5-flash',
        'vertexai_embedding_name': 'text-embedding-004',
        'chunk_size': 2000,
        'chunk_overlap': 10,
        'max_doc': 5,
        'retry': 1,
    },
    'decision_function_args': {
        # Or specify your own list of sectors to discard.
        'sectors_to_discard': sectors_to_discard.SECTORS_TO_DISCARD,
        'desirable_temporalities': ['upcoming', 'ongoing', 'unknown'],
    },
    'project_id': "irn-67050-lab-65",
}
                          

In [5]:
# Preview the configuration without echoing the API key into the saved
# notebook output (cell outputs are stored with the notebook and can leak secrets).
{key: value for key, value in dict_config.items() if key != 'NEWS_API_KEY'}

{'keywords': ['strike', 'picket line', 'employee protest'],
 'country_lang': [{'country': 'BE', 'lang': ['fr']},
  {'country': 'CH', 'lang': ['fr', 'de', 'it']},
  {'country': 'BG', 'lang': ['bg']},
  {'country': 'BR', 'lang': ['pt']},
  {'country': 'CL', 'lang': ['es']},
  {'country': 'CN', 'lang': ['zh']},
  {'country': 'CO', 'lang': ['es']},
  {'country': 'CZ', 'lang': ['cs']},
  {'country': 'DE', 'lang': ['de']},
  {'country': 'DZ', 'lang': ['ar']},
  {'country': 'EE', 'lang': ['et']},
  {'country': 'ES', 'lang': ['es']},
  {'country': 'FR', 'lang': ['fr']},
  {'country': 'GB', 'lang': ['en']},
  {'country': 'HU', 'lang': ['hu']},
  {'country': 'ID', 'lang': ['id']},
  {'country': 'IN', 'lang': ['hi']},
  {'country': 'IT', 'lang': ['it']},
  {'country': 'JP', 'lang': ['ja']},
  {'country': 'KR', 'lang': ['ko']},
  {'country': 'LT', 'lang': ['lt']},
  {'country': 'MA', 'lang': ['ar']},
  {'country': 'MC', 'lang': ['fr']},
  {'country': 'MX', 'lang': ['es']},
  {'country': 'MY', 'lan

In [6]:
# Write the initial configuration dict to disk as indented JSON.
initial_config_file = "/home/jupyter/news/config/initial_config_file.json"
with open(initial_config_file, mode='w') as config_handle:
    json.dump(dict_config, fp=config_handle, indent=4)

## Generating final config file from initial config file
This final config file is the one that will be used to get news.

In [7]:
# Paths of the initial config (input) and the final config (output).
initial_config_file = "/home/jupyter/news/config/initial_config_file.json"
final_config_file = "/home/jupyter/news/config/final_config_file.json"

# NOTE(review): instantiating Configuration presumably generates the final
# config file as a side effect -- `config` itself is not used afterwards; confirm.
config = Configuration(
    initial_config_file=initial_config_file,
    final_config_file=final_config_file,
)

# Loading config file

In [8]:
# Load the final configuration. JSON is read explicitly as UTF-8 so that the
# non-ASCII translated queries decode the same way regardless of the locale.
config_file = '/home/jupyter/news/config/final_config_file.json'
with open(config_file, 'r', encoding='utf-8') as file:
    conf = json.load(file)

# The decision function is a Python callable, so it is attached after loading
# rather than stored in the JSON file.
conf['decision_function'] = strike_relevancy.strike_relevancy

# Preview the config without echoing the API key into the saved notebook output.
{key: value for key, value in conf.items() if key != 'NEWS_API_KEY'}

{'keywords': ['strike', 'picket line', 'employee protest'],
 'country_lang': [{'country': 'BE',
   'lang': 'fr',
   'queries': ['grève', 'piquet de grève', 'protestation des employés']},
  {'country': 'CH',
   'lang': 'fr',
   'queries': ['grève', 'piquet de grève', 'protestation des employés']},
  {'country': 'CH',
   'lang': 'de',
   'queries': ['schlagen', 'Streikposten', 'Mitarbeiterprotest']},
  {'country': 'CH',
   'lang': 'it',
   'queries': ['sciopero',
    'cordone di scioperanti',
    'protesta dei dipendenti']},
  {'country': 'BG',
   'lang': 'bg',
   'queries': ['стачка', 'пикет линия', 'протест на служител']},
  {'country': 'BR',
   'lang': 'pt',
   'queries': ['batida', 'linha de piquete', 'protesto de funcionários']},
  {'country': 'CL',
   'lang': 'es',
   'queries': ['huelga', 'piquete', 'protesta de empleados']},
  {'country': 'CN',
   'lang': 'zh',
   'queries': ['strike', 'picket line', 'employee protest']},
  {'country': 'CO',
   'lang': 'es',
   'queries': ['huelg

# Instantiating NewsApiScraper

In [9]:
# Scraper configured with the API key from the loaded config and the
# 2024-08-22 to 2024-08-23 date range.
scrapper = NewsApiScraper(
    api_key=conf['NEWS_API_KEY'],
    start_date='2024-08-22',
    end_date='2024-08-23',
)

# Collecting news via a NewsCollector instance

In [10]:
# Build the collector from the per-country/language entries of the config and
# the scraper above, with the CSV path it is given for saving, then run it.
collector = NewsCollector(
    config=conf['country_lang'],
    scraper=scrapper,
    path_to_save='/home/jupyter/news/data/strike_newsapi_vendre_23_aout.csv',
)
collector.collect_news()

  0%|          | 0/42 [00:00<?, ?it/s]

{'country': 'BE', 'lang': 'fr', 'queries': ['grève', 'piquet de grève', 'protestation des employés']}
search ended !



  0%|          | 0/15 [00:00<?, ?it/s][A
  7%|▋         | 1/15 [00:00<00:04,  3.04it/s][A
 13%|█▎        | 2/15 [00:00<00:03,  3.76it/s][A
 20%|██        | 3/15 [00:00<00:02,  4.05it/s][A
 27%|██▋       | 4/15 [00:00<00:02,  4.26it/s][A
 33%|███▎      | 5/15 [00:01<00:02,  3.36it/s][A
 40%|████      | 6/15 [00:01<00:02,  3.72it/s][A
 47%|████▋     | 7/15 [00:02<00:02,  3.17it/s][A
 53%|█████▎    | 8/15 [00:02<00:02,  2.66it/s][A
 60%|██████    | 9/15 [00:02<00:02,  2.54it/s][A
 67%|██████▋   | 10/15 [00:05<00:05,  1.04s/it][A
 73%|███████▎  | 11/15 [00:05<00:03,  1.20it/s][A
 80%|████████  | 12/15 [00:06<00:01,  1.50it/s][A
 87%|████████▋ | 13/15 [00:06<00:01,  1.74it/s][A
 93%|█████████▎| 14/15 [00:06<00:00,  2.01it/s][A
100%|██████████| 15/15 [00:07<00:00,  2.10it/s][A


News collection ended ! 
BE fr grève
 data_.shape :(15, 6)
search ended !



0it [00:00, ?it/s][A


News collection ended ! 
BE fr piquet de grève
search ended !



0it [00:00, ?it/s][A

News collection ended ! 



  2%|▏         | 1/42 [00:08<06:08,  8.99s/it]

BE fr protestation des employés
{'country': 'CH', 'lang': 'fr', 'queries': ['grève', 'piquet de grève', 'protestation des employés']}
search ended !



0it [00:00, ?it/s][A

News collection ended ! 
CH fr grève





search ended !



0it [00:00, ?it/s][A

News collection ended ! 
CH fr piquet de grève





search ended !



0it [00:00, ?it/s][A
  5%|▍         | 2/42 [00:10<03:14,  4.86s/it]

News collection ended ! 
CH fr protestation des employés
{'country': 'CH', 'lang': 'de', 'queries': ['schlagen', 'Streikposten', 'Mitarbeiterprotest']}
search ended !



  0%|          | 0/77 [00:00<?, ?it/s][A
  1%|▏         | 1/77 [00:00<00:16,  4.49it/s][A
  3%|▎         | 2/77 [00:00<00:21,  3.55it/s][A
  4%|▍         | 3/77 [00:01<00:33,  2.19it/s][A
  5%|▌         | 4/77 [00:01<00:40,  1.80it/s][A
  6%|▋         | 5/77 [00:02<00:50,  1.43it/s][A
  8%|▊         | 6/77 [00:03<00:49,  1.44it/s][A
  9%|▉         | 7/77 [00:03<00:38,  1.82it/s][A
 10%|█         | 8/77 [00:04<00:32,  2.13it/s][A
 12%|█▏        | 9/77 [00:04<00:24,  2.78it/s][A
 13%|█▎        | 10/77 [00:04<00:23,  2.82it/s][A
 14%|█▍        | 11/77 [00:04<00:24,  2.66it/s][A
 16%|█▌        | 12/77 [00:05<00:23,  2.74it/s][A
 17%|█▋        | 13/77 [00:05<00:22,  2.82it/s][A
 18%|█▊        | 14/77 [00:06<00:23,  2.63it/s][A
 19%|█▉        | 15/77 [00:06<00:24,  2.54it/s][A
 21%|██        | 16/77 [00:06<00:23,  2.56it/s][A
 22%|██▏       | 17/77 [00:07<00:28,  2.11it/s][A
 23%|██▎       | 18/77 [00:08<00:27,  2.16it/s][A
 25%|██▍       | 19/77 [00:08<00:23,  2.43it/s]

News collection ended ! 
CH de schlagen
 data_.shape :(77, 6)
search ended !



0it [00:00, ?it/s][A


News collection ended ! 
CH de Streikposten
search ended !



0it [00:00, ?it/s][A
  7%|▋         | 3/42 [00:42<11:00, 16.95s/it]

News collection ended ! 
CH de Mitarbeiterprotest
{'country': 'CH', 'lang': 'it', 'queries': ['sciopero', 'cordone di scioperanti', 'protesta dei dipendenti']}
search ended !



  0%|          | 0/8 [00:00<?, ?it/s][A
 12%|█▎        | 1/8 [00:00<00:01,  3.73it/s][A
 25%|██▌       | 2/8 [00:00<00:01,  4.56it/s][A
 38%|███▊      | 3/8 [00:00<00:01,  4.06it/s][A
 50%|█████     | 4/8 [00:00<00:00,  4.37it/s][A
 62%|██████▎   | 5/8 [00:01<00:01,  2.03it/s][A
 75%|███████▌  | 6/8 [00:02<00:00,  2.31it/s][A
 88%|████████▊ | 7/8 [00:02<00:00,  2.55it/s][A
100%|██████████| 8/8 [00:03<00:00,  2.48it/s][A


News collection ended ! 
CH it sciopero
 data_.shape :(8, 6)
search ended !



0it [00:00, ?it/s][A


News collection ended ! 
CH it cordone di scioperanti
search ended !



  0%|          | 0/1 [00:00<?, ?it/s][A
100%|██████████| 1/1 [00:00<00:00,  4.61it/s][A
 10%|▉         | 4/42 [00:47<07:48, 12.32s/it]

News collection ended ! 
CH it protesta dei dipendenti
 data_.shape :(1, 6)
{'country': 'BG', 'lang': 'bg', 'queries': ['стачка', 'пикет линия', 'протест на служител']}
search ended !



0it [00:00, ?it/s][A

News collection ended ! 
BG bg стачка





search ended !



0it [00:00, ?it/s][A

News collection ended ! 
BG bg пикет линия





search ended !



0it [00:00, ?it/s][A
 12%|█▏        | 5/42 [00:48<05:10,  8.38s/it]

News collection ended ! 
BG bg протест на служител
{'country': 'BR', 'lang': 'pt', 'queries': ['batida', 'linha de piquete', 'protesto de funcionários']}
search ended !



  0%|          | 0/9 [00:00<?, ?it/s][A
 11%|█         | 1/9 [00:01<00:11,  1.40s/it][A
 22%|██▏       | 2/9 [00:02<00:06,  1.05it/s][A
 33%|███▎      | 3/9 [00:04<00:08,  1.46s/it][A
 44%|████▍     | 4/9 [00:04<00:05,  1.04s/it][A
 56%|█████▌    | 5/9 [00:06<00:04,  1.24s/it][A
 67%|██████▋   | 6/9 [00:06<00:03,  1.01s/it][A
 78%|███████▊  | 7/9 [00:07<00:01,  1.20it/s][A
 89%|████████▉ | 8/9 [00:07<00:00,  1.42it/s][A
100%|██████████| 9/9 [00:08<00:00,  1.08it/s][A


News collection ended ! 
BR pt batida
 data_.shape :(9, 6)
search ended !



0it [00:00, ?it/s][A

News collection ended ! 
BR pt linha de piquete





search ended !



  0%|          | 0/4 [00:00<?, ?it/s][A
 25%|██▌       | 1/4 [00:00<00:01,  2.78it/s][A
 50%|█████     | 2/4 [00:01<00:01,  1.28it/s][A
 75%|███████▌  | 3/4 [00:03<00:01,  1.35s/it][A
100%|██████████| 4/4 [00:04<00:00,  1.13s/it][A
 14%|█▍        | 6/42 [01:03<06:23, 10.65s/it]

News collection ended ! 
BR pt protesto de funcionários
 data_.shape :(4, 6)
{'country': 'CL', 'lang': 'es', 'queries': ['huelga', 'piquete', 'protesta de empleados']}
search ended !



  0%|          | 0/38 [00:00<?, ?it/s][A
  3%|▎         | 1/38 [00:00<00:10,  3.60it/s][A
  5%|▌         | 2/38 [00:00<00:10,  3.37it/s][A
  8%|▊         | 3/38 [00:00<00:09,  3.55it/s][A
 11%|█         | 4/38 [00:01<00:09,  3.45it/s][A
 13%|█▎        | 5/38 [00:01<00:15,  2.09it/s][A
 16%|█▌        | 6/38 [00:02<00:13,  2.40it/s][A
 18%|█▊        | 7/38 [00:02<00:11,  2.68it/s][A
 21%|██        | 8/38 [00:02<00:10,  2.76it/s][A
 24%|██▎       | 9/38 [00:03<00:10,  2.80it/s][A
 26%|██▋       | 10/38 [00:03<00:09,  3.09it/s][A
 29%|██▉       | 11/38 [00:03<00:09,  2.91it/s][A
 32%|███▏      | 12/38 [00:04<00:14,  1.82it/s][A
 34%|███▍      | 13/38 [00:05<00:16,  1.47it/s][A
 37%|███▋      | 14/38 [00:06<00:13,  1.79it/s][A
 39%|███▉      | 15/38 [00:06<00:12,  1.91it/s][A
 42%|████▏     | 16/38 [00:07<00:12,  1.81it/s][A
 45%|████▍     | 17/38 [00:07<00:10,  1.91it/s][A
 47%|████▋     | 18/38 [00:07<00:08,  2.25it/s][A
 50%|█████     | 19/38 [00:08<00:07,  2.60it/s]

News collection ended ! 
CL es huelga
 data_.shape :(38, 6)
search ended !



0it [00:00, ?it/s][A

News collection ended ! 
CL es piquete





search ended !



  0%|          | 0/1 [00:00<?, ?it/s][A
100%|██████████| 1/1 [00:01<00:00,  1.37s/it][A
 17%|█▋        | 7/42 [01:27<08:36, 14.75s/it]

News collection ended ! 
CL es protesta de empleados
 data_.shape :(1, 6)
{'country': 'CN', 'lang': 'zh', 'queries': ['strike', 'picket line', 'employee protest']}
search ended !



  0%|          | 0/4 [00:00<?, ?it/s][ABuilding prefix dict from /opt/conda/lib/python3.10/site-packages/jieba/dict.txt ...
Loading model from cache /var/tmp/jieba.cache
Loading model cost 1.1132879257202148 seconds.
Prefix dict has been built succesfully.

 25%|██▌       | 1/4 [00:01<00:03,  1.32s/it][A
 50%|█████     | 2/4 [00:04<00:04,  2.21s/it][A
 75%|███████▌  | 3/4 [00:06<00:02,  2.17s/it][A
100%|██████████| 4/4 [00:07<00:00,  1.76s/it][A


News collection ended ! 
CN zh strike
 data_.shape :(4, 6)
search ended !



0it [00:00, ?it/s][A


News collection ended ! 
CN zh picket line
search ended !



0it [00:00, ?it/s][A
 19%|█▉        | 8/42 [01:35<07:07, 12.56s/it]

News collection ended ! 
CN zh employee protest
{'country': 'CO', 'lang': 'es', 'queries': ['huelga', 'piquete', 'protesta de empleados']}
search ended !



0it [00:00, ?it/s][A

News collection ended ! 
CO es huelga





search ended !



0it [00:00, ?it/s][A

News collection ended ! 
CO es piquete





search ended !



0it [00:00, ?it/s][A
 21%|██▏       | 9/42 [01:37<05:10,  9.41s/it]

News collection ended ! 
CO es protesta de empleados
{'country': 'CZ', 'lang': 'cs', 'queries': ['stávka', 'hlídková linie', 'protest zaměstnanců']}
search ended !



  0%|          | 0/2 [00:00<?, ?it/s][A
 50%|█████     | 1/2 [00:00<00:00,  2.78it/s][A
100%|██████████| 2/2 [00:00<00:00,  2.82it/s][A


News collection ended ! 
CZ cs stávka
 data_.shape :(2, 6)
search ended !



0it [00:00, ?it/s][A


News collection ended ! 
CZ cs hlídková linie
search ended !



0it [00:00, ?it/s][A
 24%|██▍       | 10/42 [01:39<03:51,  7.22s/it]

News collection ended ! 
CZ cs protest zaměstnanců
{'country': 'DE', 'lang': 'de', 'queries': ['schlagen', 'Streikposten', 'Mitarbeiterprotest']}
search ended !



0it [00:00, ?it/s][A


News collection ended ! 
DE de schlagen
search ended !



0it [00:00, ?it/s][A

News collection ended ! 
DE de Streikposten





search ended !



0it [00:00, ?it/s][A
 26%|██▌       | 11/42 [01:41<02:48,  5.45s/it]

News collection ended ! 
DE de Mitarbeiterprotest
{'country': 'DZ', 'lang': 'ar', 'queries': ['يضرب', 'خط الاعتصام', 'احتجاج الموظفين']}
search ended !



0it [00:00, ?it/s][A


News collection ended ! 
DZ ar يضرب
search ended !



0it [00:00, ?it/s][A


News collection ended ! 
DZ ar خط الاعتصام
search ended !



0it [00:00, ?it/s][A
 29%|██▊       | 12/42 [01:42<02:01,  4.05s/it]

News collection ended ! 
DZ ar احتجاج الموظفين
{'country': 'EE', 'lang': 'et', 'queries': ['streikima', 'piketijoon', 'töötajate protest']}
search ended !



0it [00:00, ?it/s][A


News collection ended ! 
EE et streikima
search ended !



0it [00:00, ?it/s][A

News collection ended ! 
EE et piketijoon





search ended !



0it [00:00, ?it/s][A
 31%|███       | 13/42 [01:43<01:31,  3.17s/it]

News collection ended ! 
EE et töötajate protest
{'country': 'ES', 'lang': 'es', 'queries': ['huelga', 'piquete', 'protesta de empleados']}
search ended !



0it [00:00, ?it/s][A

News collection ended ! 
ES es huelga





search ended !



0it [00:00, ?it/s][A

News collection ended ! 
ES es piquete





search ended !



0it [00:00, ?it/s][A
 33%|███▎      | 14/42 [01:44<01:14,  2.68s/it]

News collection ended ! 
ES es protesta de empleados
{'country': 'FR', 'lang': 'fr', 'queries': ['grève', 'piquet de grève', 'protestation des employés']}
search ended !



0it [00:00, ?it/s][A


News collection ended ! 
FR fr grève
search ended !



0it [00:00, ?it/s][A

News collection ended ! 
FR fr piquet de grève





search ended !



0it [00:00, ?it/s][A
 36%|███▌      | 15/42 [01:45<01:00,  2.23s/it]

News collection ended ! 
FR fr protestation des employés
{'country': 'GB', 'lang': 'en', 'queries': ['strike', 'picket line', 'employee protest']}
search ended !



  0%|          | 0/90 [00:00<?, ?it/s][A
  1%|          | 1/90 [00:00<01:00,  1.48it/s][A
  2%|▏         | 2/90 [00:01<01:10,  1.24it/s][A
  3%|▎         | 3/90 [00:02<01:04,  1.34it/s][A
  4%|▍         | 4/90 [00:02<00:44,  1.92it/s][A
  6%|▌         | 5/90 [00:02<00:33,  2.53it/s][A
  7%|▋         | 6/90 [00:02<00:29,  2.83it/s][A
  8%|▊         | 7/90 [00:03<00:38,  2.18it/s][A
  9%|▉         | 8/90 [00:03<00:32,  2.51it/s][A
 10%|█         | 9/90 [00:03<00:26,  3.11it/s][A
 11%|█         | 10/90 [00:04<00:34,  2.29it/s][A
 12%|█▏        | 11/90 [00:04<00:30,  2.63it/s][A
 13%|█▎        | 12/90 [00:05<00:25,  3.03it/s][A
 14%|█▍        | 13/90 [00:05<00:27,  2.85it/s][A
 16%|█▌        | 14/90 [00:05<00:22,  3.35it/s][A
 17%|█▋        | 15/90 [00:06<00:30,  2.48it/s][A
 18%|█▊        | 16/90 [00:06<00:26,  2.79it/s][A
 19%|█▉        | 17/90 [00:06<00:23,  3.09it/s][A
 20%|██        | 18/90 [00:07<00:23,  3.10it/s][A
 21%|██        | 19/90 [00:07<00:20,  3.42it/s]

News collection ended ! 
GB en strike
 data_.shape :(90, 6)
search ended !



  0%|          | 0/17 [00:00<?, ?it/s][A
  6%|▌         | 1/17 [00:00<00:02,  6.32it/s][A
 12%|█▏        | 2/17 [00:00<00:04,  3.60it/s][A
 18%|█▊        | 3/17 [00:00<00:04,  3.42it/s][A
 24%|██▎       | 4/17 [00:01<00:03,  3.31it/s][A
 29%|██▉       | 5/17 [00:01<00:05,  2.04it/s][A
 35%|███▌      | 6/17 [00:02<00:04,  2.35it/s][A
 41%|████      | 7/17 [00:02<00:03,  2.74it/s][A
 47%|████▋     | 8/17 [00:02<00:02,  3.00it/s][A
 53%|█████▎    | 9/17 [00:03<00:02,  3.01it/s][A
 65%|██████▍   | 11/17 [00:03<00:01,  3.50it/s][A
 71%|███████   | 12/17 [00:03<00:01,  3.29it/s][A
 76%|███████▋  | 13/17 [00:04<00:01,  2.72it/s][A
 82%|████████▏ | 14/17 [00:05<00:01,  1.79it/s][A
 88%|████████▊ | 15/17 [00:06<00:01,  1.35it/s][A
 94%|█████████▍| 16/17 [00:07<00:00,  1.65it/s][A
100%|██████████| 17/17 [00:07<00:00,  2.24it/s][A


News collection ended ! 
GB en picket line
 data_.shape :(17, 6)
search ended !



  0%|          | 0/8 [00:00<?, ?it/s][A
 12%|█▎        | 1/8 [00:00<00:02,  3.40it/s][A
 25%|██▌       | 2/8 [00:00<00:02,  2.19it/s][A
 38%|███▊      | 3/8 [00:01<00:02,  1.80it/s][A
 50%|█████     | 4/8 [00:02<00:02,  1.38it/s][A
 62%|██████▎   | 5/8 [00:02<00:01,  1.71it/s][A
 75%|███████▌  | 6/8 [00:03<00:01,  1.65it/s][A
 88%|████████▊ | 7/8 [00:03<00:00,  2.16it/s][A
100%|██████████| 8/8 [00:04<00:00,  1.60it/s][A
 38%|███▊      | 16/42 [03:08<11:24, 26.33s/it]

News collection ended ! 
GB en employee protest
 data_.shape :(8, 6)
{'country': 'HU', 'lang': 'hu', 'queries': ['sztrájk', 'harcelőőrsök vonala', 'alkalmazottak tiltakozása']}
search ended !



  0%|          | 0/2 [00:00<?, ?it/s][A
 50%|█████     | 1/2 [00:00<00:00,  2.04it/s][A
100%|██████████| 2/2 [00:01<00:00,  1.01it/s][A


News collection ended ! 
HU hu sztrájk
 data_.shape :(2, 6)
search ended !



0it [00:00, ?it/s][A


News collection ended ! 
HU hu harcelőőrsök vonala
search ended !



0it [00:00, ?it/s][A
 40%|████      | 17/42 [03:11<08:03, 19.32s/it]

News collection ended ! 
HU hu alkalmazottak tiltakozása
{'country': 'ID', 'lang': 'id', 'queries': ['memukul', 'garis piket', 'protes karyawan']}
search ended !



0it [00:00, ?it/s][A


News collection ended ! 
ID id memukul
search ended !



0it [00:00, ?it/s][A


News collection ended ! 
ID id garis piket
search ended !



0it [00:00, ?it/s][A
 43%|████▎     | 18/42 [03:12<05:30, 13.78s/it]

News collection ended ! 
ID id protes karyawan
{'country': 'IN', 'lang': 'hi', 'queries': ['हड़ताल', 'प्रहरियों की पंक्ति', 'कर्मचारी विरोध प्रदर्शन']}
search ended !



  0%|          | 0/17 [00:00<?, ?it/s][A
  6%|▌         | 1/17 [00:00<00:04,  3.48it/s][A
 12%|█▏        | 2/17 [00:00<00:07,  2.14it/s][A
 18%|█▊        | 3/17 [00:01<00:05,  2.64it/s][A
 24%|██▎       | 4/17 [00:01<00:04,  2.90it/s][A
 29%|██▉       | 5/17 [00:01<00:03,  3.30it/s][A
 35%|███▌      | 6/17 [00:02<00:07,  1.56it/s][A
 41%|████      | 7/17 [00:03<00:05,  1.92it/s][A
 47%|████▋     | 8/17 [00:03<00:04,  1.98it/s][A
 53%|█████▎    | 9/17 [00:04<00:03,  2.25it/s][A
 59%|█████▉    | 10/17 [00:05<00:05,  1.27it/s][A
 65%|██████▍   | 11/17 [00:07<00:06,  1.02s/it][A
 71%|███████   | 12/17 [00:08<00:05,  1.17s/it][A
 76%|███████▋  | 13/17 [00:08<00:03,  1.11it/s][A
 82%|████████▏ | 14/17 [00:09<00:02,  1.43it/s][A
 88%|████████▊ | 15/17 [00:10<00:01,  1.05it/s][A
 94%|█████████▍| 16/17 [00:10<00:00,  1.34it/s][A
100%|██████████| 17/17 [00:11<00:00,  1.46it/s][A


News collection ended ! 
IN hi हड़ताल
 data_.shape :(17, 6)
search ended !



0it [00:00, ?it/s][A

News collection ended ! 
IN hi प्रहरियों की पंक्ति





search ended !



  0%|          | 0/2 [00:00<?, ?it/s][A
 50%|█████     | 1/2 [00:01<00:01,  1.57s/it][A
100%|██████████| 2/2 [00:02<00:00,  1.24s/it][A
 45%|████▌     | 19/42 [03:27<05:26, 14.20s/it]

News collection ended ! 
IN hi कर्मचारी विरोध प्रदर्शन
 data_.shape :(2, 6)
{'country': 'IT', 'lang': 'it', 'queries': ['sciopero', 'cordone di scioperanti', 'protesta dei dipendenti']}
search ended !



0it [00:00, ?it/s][A

News collection ended ! 
IT it sciopero





search ended !



0it [00:00, ?it/s][A

News collection ended ! 
IT it cordone di scioperanti





search ended !



0it [00:00, ?it/s][A
 48%|████▊     | 20/42 [03:28<03:48, 10.39s/it]

News collection ended ! 
IT it protesta dei dipendenti
{'country': 'JP', 'lang': 'ja', 'queries': ['ストライク', 'ピケライン', '従業員の抗議']}
search ended !



0it [00:00, ?it/s][A

News collection ended ! 
JP ja ストライク





search ended !



0it [00:00, ?it/s][A


News collection ended ! 
JP ja ピケライン
search ended !



0it [00:00, ?it/s][A
 50%|█████     | 21/42 [03:29<02:38,  7.54s/it]

News collection ended ! 
JP ja 従業員の抗議
{'country': 'KR', 'lang': 'ko', 'queries': ['스트라이크', '피켓 라인', '직원 시위']}
search ended !



0it [00:00, ?it/s][A


News collection ended ! 
KR ko 스트라이크
search ended !



0it [00:00, ?it/s][A

News collection ended ! 
KR ko 피켓 라인





search ended !



0it [00:00, ?it/s][A
 52%|█████▏    | 22/42 [03:30<01:50,  5.53s/it]

News collection ended ! 
KR ko 직원 시위
{'country': 'LT', 'lang': 'lt', 'queries': ['streikuoti', 'piketo linija', 'darbuotojų protestas']}
search ended !



0it [00:00, ?it/s][A

News collection ended ! 





LT lt streikuoti
search ended !



0it [00:00, ?it/s][A

News collection ended ! 
LT lt piketo linija





search ended !



0it [00:00, ?it/s][A
 55%|█████▍    | 23/42 [03:31<01:18,  4.13s/it]

News collection ended ! 
LT lt darbuotojų protestas
{'country': 'MA', 'lang': 'ar', 'queries': ['يضرب', 'خط الاعتصام', 'احتجاج الموظفين']}
search ended !



0it [00:00, ?it/s][A


News collection ended ! 
MA ar يضرب
search ended !



0it [00:00, ?it/s][A


News collection ended ! 
MA ar خط الاعتصام
search ended !



0it [00:00, ?it/s][A
 57%|█████▋    | 24/42 [03:32<00:55,  3.07s/it]

News collection ended ! 
MA ar احتجاج الموظفين
{'country': 'MC', 'lang': 'fr', 'queries': ['grève', 'piquet de grève', 'protestation des employés']}
search ended !



0it [00:00, ?it/s][A

News collection ended ! 
MC fr grève





search ended !



0it [00:00, ?it/s][A


News collection ended ! 
MC fr piquet de grève
search ended !



0it [00:00, ?it/s][A
 60%|█████▉    | 25/42 [03:32<00:41,  2.41s/it]

News collection ended ! 
MC fr protestation des employés
{'country': 'MX', 'lang': 'es', 'queries': ['huelga', 'piquete', 'protesta de empleados']}
search ended !



0it [00:00, ?it/s][A

News collection ended ! 
MX es huelga





search ended !



0it [00:00, ?it/s][A

News collection ended ! 
MX es piquete





search ended !



0it [00:00, ?it/s][A
 62%|██████▏   | 26/42 [03:34<00:34,  2.19s/it]

News collection ended ! 
MX es protesta de empleados
{'country': 'MY', 'lang': 'ms', 'queries': ['mogok', 'barisan piket', 'bantahan pekerja']}
search ended !



0it [00:00, ?it/s][A


News collection ended ! 
MY ms mogok
search ended !



0it [00:00, ?it/s][A

News collection ended ! 
MY ms barisan piket





search ended !



0it [00:00, ?it/s][A
 64%|██████▍   | 27/42 [03:35<00:28,  1.87s/it]

News collection ended ! 
MY ms bantahan pekerja
{'country': 'NL', 'lang': 'nl', 'queries': ['staking', 'piketlijn', 'werknemersprotest']}
search ended !



  0%|          | 0/10 [00:00<?, ?it/s][A
 10%|█         | 1/10 [00:00<00:01,  5.67it/s][A
 20%|██        | 2/10 [00:00<00:01,  5.95it/s][A
 30%|███       | 3/10 [00:00<00:01,  3.74it/s][A
 40%|████      | 4/10 [00:01<00:02,  2.32it/s][A
 50%|█████     | 5/10 [00:02<00:02,  1.96it/s][A
 60%|██████    | 6/10 [00:02<00:02,  1.81it/s][A
 70%|███████   | 7/10 [00:02<00:01,  2.28it/s][A
 80%|████████  | 8/10 [00:03<00:01,  1.85it/s][A
100%|██████████| 10/10 [00:04<00:00,  2.22it/s][A


News collection ended ! 
NL nl staking
 data_.shape :(10, 6)
search ended !



0it [00:00, ?it/s][A


News collection ended ! 
NL nl piketlijn
search ended !



0it [00:00, ?it/s][A
 67%|██████▋   | 28/42 [03:41<00:41,  2.99s/it]

News collection ended ! 
NL nl werknemersprotest
{'country': 'PL', 'lang': 'pl', 'queries': ['strajk', 'linia pikiet', 'protest pracowniczy']}
search ended !



0it [00:00, ?it/s][A


News collection ended ! 
PL pl strajk
search ended !



0it [00:00, ?it/s][A


News collection ended ! 
PL pl linia pikiet
search ended !



0it [00:00, ?it/s][A
 69%|██████▉   | 29/42 [03:41<00:29,  2.28s/it]

News collection ended ! 
PL pl protest pracowniczy
{'country': 'PT', 'lang': 'pt', 'queries': ['batida', 'linha de piquete', 'protesto de funcionários']}
search ended !



0it [00:00, ?it/s][A


News collection ended ! 
PT pt batida
search ended !



0it [00:00, ?it/s][A

News collection ended ! 
PT pt linha de piquete





search ended !



0it [00:00, ?it/s][A
 71%|███████▏  | 30/42 [03:42<00:22,  1.89s/it]

News collection ended ! 
PT pt protesto de funcionários
{'country': 'RO', 'lang': 'ro', 'queries': ['grevă', 'linie de pichet', 'protestul angajatului']}
search ended !



0it [00:00, ?it/s][A

News collection ended ! 
RO ro grevă





search ended !



0it [00:00, ?it/s][A

News collection ended ! 
RO ro linie de pichet





search ended !



0it [00:00, ?it/s][A
 74%|███████▍  | 31/42 [03:44<00:18,  1.66s/it]

News collection ended ! 
RO ro protestul angajatului
{'country': 'SI', 'lang': 'sl', 'queries': ['stavka', 'odbojna linija', 'protest zaposlenih']}
search ended !



0it [00:00, ?it/s][A


News collection ended ! 
SI sl stavka
search ended !



0it [00:00, ?it/s][A


News collection ended ! 
SI sl odbojna linija
search ended !



0it [00:00, ?it/s][A
 76%|███████▌  | 32/42 [03:44<00:14,  1.43s/it]

News collection ended ! 
SI sl protest zaposlenih
{'country': 'SK', 'lang': 'sk', 'queries': ['štrajk', 'piketová čiara', 'protest zamestnancov']}
search ended !



0it [00:00, ?it/s][A

News collection ended ! 
SK sk štrajk





search ended !



0it [00:00, ?it/s][A

News collection ended ! 
SK sk piketová čiara





search ended !



0it [00:00, ?it/s][A
 79%|███████▊  | 33/42 [03:46<00:12,  1.42s/it]

News collection ended ! 
SK sk protest zamestnancov
{'country': 'TH', 'lang': 'th', 'queries': ['โจมตี', 'แนวป้องกัน', 'การประท้วงของพนักงาน']}
search ended !



0it [00:00, ?it/s][A


News collection ended ! 
TH th โจมตี
search ended !



0it [00:00, ?it/s][A

News collection ended ! 
TH th แนวป้องกัน





search ended !



0it [00:00, ?it/s][A
 81%|████████  | 34/42 [03:46<00:09,  1.17s/it]

News collection ended ! 
TH th การประท้วงของพนักงาน
{'country': 'TN', 'lang': 'ar', 'queries': ['يضرب', 'خط الاعتصام', 'احتجاج الموظفين']}
search ended !



0it [00:00, ?it/s][A

News collection ended ! 
TN ar يضرب





search ended !



0it [00:00, ?it/s][A


News collection ended ! 
TN ar خط الاعتصام
search ended !



0it [00:00, ?it/s][A
 83%|████████▎ | 35/42 [03:47<00:07,  1.08s/it]

News collection ended ! 
TN ar احتجاج الموظفين
{'country': 'TR', 'lang': 'tr', 'queries': ['çarpmak', 'grev hattı', 'çalışan protestosu']}
search ended !



0it [00:00, ?it/s][A


News collection ended ! 
TR tr çarpmak
search ended !



0it [00:00, ?it/s][A


News collection ended ! 
TR tr grev hattı
search ended !



0it [00:00, ?it/s][A
 86%|████████▌ | 36/42 [03:48<00:05,  1.07it/s]

News collection ended ! 
TR tr çalışan protestosu
{'country': 'UA', 'lang': 'uk', 'queries': ['страйк', 'пікет', 'протест працівника']}
search ended !



  0%|          | 0/1 [00:00<?, ?it/s][A
100%|██████████| 1/1 [00:00<00:00,  4.80it/s][A


News collection ended ! 
UA uk страйк
 data_.shape :(1, 6)
search ended !



0it [00:00, ?it/s][A

News collection ended ! 
UA uk пікет





search ended !



0it [00:00, ?it/s][A
 88%|████████▊ | 37/42 [03:49<00:05,  1.04s/it]

News collection ended ! 
UA uk протест працівника
{'country': 'VN', 'lang': 'vi', 'queries': ['đánh đập', 'hàng rào rào chắn', 'cuộc biểu tình của nhân viên']}
search ended !



0it [00:00, ?it/s][A


News collection ended ! 
VN vi đánh đập
search ended !



0it [00:00, ?it/s][A


News collection ended ! 
VN vi hàng rào rào chắn
search ended !



0it [00:00, ?it/s][A
 90%|█████████ | 38/42 [03:50<00:03,  1.10it/s]

News collection ended ! 
VN vi cuộc biểu tình của nhân viên
{'country': 'SE', 'lang': 'sv', 'queries': ['strejk', 'piketlinje', 'anställdas protest']}
search ended !



  0%|          | 0/21 [00:00<?, ?it/s][A
  5%|▍         | 1/21 [00:00<00:13,  1.50it/s][A
 10%|▉         | 2/21 [00:01<00:09,  2.04it/s][A
 14%|█▍        | 3/21 [00:01<00:08,  2.16it/s][A
 19%|█▉        | 4/21 [00:01<00:07,  2.36it/s][A
 24%|██▍       | 5/21 [00:02<00:06,  2.33it/s][A
 29%|██▊       | 6/21 [00:02<00:06,  2.16it/s][A
 33%|███▎      | 7/21 [00:03<00:06,  2.09it/s][A
 38%|███▊      | 8/21 [00:03<00:06,  2.06it/s][A
 43%|████▎     | 9/21 [00:04<00:06,  1.83it/s][A
 48%|████▊     | 10/21 [00:04<00:05,  2.02it/s][A
 52%|█████▏    | 11/21 [00:05<00:04,  2.15it/s][A
 57%|█████▋    | 12/21 [00:05<00:03,  2.43it/s][A
 62%|██████▏   | 13/21 [00:05<00:03,  2.63it/s][A
 67%|██████▋   | 14/21 [00:06<00:02,  2.61it/s][A
 71%|███████▏  | 15/21 [00:06<00:02,  2.66it/s][A
 76%|███████▌  | 16/21 [00:06<00:01,  2.86it/s][A
 81%|████████  | 17/21 [00:07<00:01,  2.84it/s][A
 86%|████████▌ | 18/21 [00:07<00:01,  2.91it/s][A
 90%|█████████ | 19/21 [00:08<00:00,  2.41it/s]

News collection ended ! 
SE sv strejk
 data_.shape :(21, 6)
search ended !



0it [00:00, ?it/s][A


News collection ended ! 
SE sv piketlinje
search ended !



0it [00:00, ?it/s][A
 93%|█████████▎| 39/42 [04:00<00:10,  3.61s/it]

News collection ended ! 
SE sv anställdas protest
{'country': 'SV', 'lang': 'es', 'queries': ['huelga', 'piquete', 'protesta de empleados']}
search ended !



0it [00:00, ?it/s][A

News collection ended ! 
SV es huelga





search ended !



0it [00:00, ?it/s][A

News collection ended ! 
SV es piquete





search ended !



0it [00:00, ?it/s][A
 95%|█████████▌| 40/42 [04:01<00:05,  2.98s/it]

News collection ended ! 
SV es protesta de empleados
{'country': 'CA', 'lang': 'fr', 'queries': ['grève', 'piquet de grève', 'protestation des employés']}
search ended !



0it [00:00, ?it/s][A


News collection ended ! 
CA fr grève
search ended !



0it [00:00, ?it/s][A


News collection ended ! 
CA fr piquet de grève
search ended !



0it [00:00, ?it/s][A
 98%|█████████▊| 41/42 [04:02<00:02,  2.32s/it]

News collection ended ! 
CA fr protestation des employés
{'country': 'LU', 'lang': 'fr', 'queries': ['grève', 'piquet de grève', 'protestation des employés']}
search ended !



0it [00:00, ?it/s][A


News collection ended ! 
LU fr grève
search ended !



0it [00:00, ?it/s][A

News collection ended ! 
LU fr piquet de grève





search ended !



0it [00:00, ?it/s][A
100%|██████████| 42/42 [04:03<00:00,  5.80s/it]


News collection ended ! 
LU fr protestation des employés


Unnamed: 0,dates,titles,links,texts,lang,cat
0,2024-08-23T07:33:30Z,Le Seigneur des Anneaux : La Guerre du Rohirim...,https://www.journaldugeek.com/2024/08/23/le-se...,La Terre du Milieu sera à l’honneur en cette f...,fr,grève
1,2024-08-23T08:00:10Z,Le viol et le meurtre d’une médecin illustrent...,https://www.lemonde.fr/international/article/2...,Manifestation de médecins et d’ambulanciers ap...,fr,grève
2,2024-08-22T14:07:30Z,"En Inde, après le viol et le meurtre d’une méd...",https://www.lemonde.fr/international/article/2...,Des avocats de la Haute Cour de Calcutta lors ...,fr,grève
3,2024-08-22T16:31:40Z,L’été où l’Espagne a dit non au surtourisme,https://www.courrierinternational.com/article/...,Certains étaient déjà descendus dans la rue po...,fr,grève
4,2024-08-22T14:30:10Z,"En Inde, les médecins du principal hôpital de ...",https://www.francetvinfo.fr/monde/inde/en-inde...,Les médecins ont souligné qu'ils mettaient fin...,fr,grève
...,...,...,...,...,...,...
16,2024-08-23T07:11:51Z,FN: 90 procent av Gazaborna har tvingats fly,https://www.aftonbladet.se/nyheter/a/Rr77qd/af...,Palestinier tvingades återigen fly vid en evak...,sv,strejk
17,2024-08-23T04:47:52Z,Brand i flera bilar,https://www.aftonbladet.se/nyheter/a/Rr77qd/af...,TRE NYHETER DU INTE FÅR MISSA: MP-topparnas be...,sv,strejk
18,2024-08-23T04:42:31Z,Brand i härbärge,https://www.aftonbladet.se/nyheter/a/Rr77qd/af...,TRE NYHETER DU INTE FÅR MISSA: MP-topparnas be...,sv,strejk
19,2024-08-23T05:20:05Z,Misstänkt föremål i Kalmar var skarpt,https://www.aftonbladet.se/nyheter/a/Rr77qd/af...,TRE NYHETER DU INTE FÅR MISSA: MP-topparnas be...,sv,strejk


In [11]:
# Dimensions (rows, columns) of the news table accumulated by the collector.
collector.data.shape

(327, 6)

In [12]:
# Column names of the collected news table.
collector.data.columns

Index(['dates', 'titles', 'links', 'texts', 'lang', 'cat'], dtype='object')

In [13]:
# Preview the first rows of the collected news.
collector.data.head()

Unnamed: 0,dates,titles,links,texts,lang,cat
0,2024-08-23T07:33:30Z,Le Seigneur des Anneaux : La Guerre du Rohirim...,https://www.journaldugeek.com/2024/08/23/le-se...,La Terre du Milieu sera à l’honneur en cette f...,fr,grève
1,2024-08-23T08:00:10Z,Le viol et le meurtre d’une médecin illustrent...,https://www.lemonde.fr/international/article/2...,Manifestation de médecins et d’ambulanciers ap...,fr,grève
2,2024-08-22T14:07:30Z,"En Inde, après le viol et le meurtre d’une méd...",https://www.lemonde.fr/international/article/2...,Des avocats de la Haute Cour de Calcutta lors ...,fr,grève
3,2024-08-22T16:31:40Z,L’été où l’Espagne a dit non au surtourisme,https://www.courrierinternational.com/article/...,Certains étaient déjà descendus dans la rue po...,fr,grève
4,2024-08-22T14:30:10Z,"En Inde, les médecins du principal hôpital de ...",https://www.francetvinfo.fr/monde/inde/en-inde...,Les médecins ont souligné qu'ils mettaient fin...,fr,grève


# Translation 

In [14]:
# Instantiate the project's GoogleTranslate wrapper with the project id taken from the configuration.
translation = GoogleTranslate(project_id=conf['project_id'])

In [15]:
# Display the configured project id as a sanity check.
# NOTE(review): the rendered output exposes the project id — clear it before sharing if that is sensitive.
conf['project_id']

'irn-67050-lab-65'

In [16]:
# `df` is a second name for the collector's table, not a copy: anything that
# mutates `df` in place also changes `collector.data`.
df = collector.data
df.shape

(327, 6)

In [17]:
# Remove the proxy settings from the environment for the following cells
# (a later cell restores them from the .env file).
# pop(..., None) is used instead of `del` so this cell can be re-run, or run on
# a machine where a proxy variable is not set, without raising KeyError.
for proxy_var in ('http_proxy', 'https_proxy', 'HTTP_PROXY', 'HTTPS_PROXY'):
    os.environ.pop(proxy_var, None)

In [18]:
## Translating the collected news with the translation object instantiated above

In [19]:
 # Translate the collected news; the returned frame carries two extra columns,
 # `translated_title` and `translated_text`.
 # NOTE(review): limit=30720 is presumably a size cap per translation request — confirm in googletranslation.py.
 trans_df = translation.translation(df, limit=30720)

In [20]:
# translation.fails_index

In [21]:
# Check the shape of the translated frame.
trans_df.shape

(327, 8)

In [22]:
# List the columns of the translated frame.
trans_df.columns

Index(['dates', 'titles', 'links', 'texts', 'lang', 'cat', 'translated_title',
       'translated_text'],
      dtype='object')

# Cleaning the translated dataframe
More data is lost when cleaning happens before translation. Some languages, such as Japanese or Chinese, do not separate words with spaces, so articles in those languages would be discarded by cleaning rules that count whitespace. Cleaning is therefore applied to the translated text.

In [23]:
print("cleaning data ...")
if len(trans_df) == 0:
    # Nothing to clean: "no data" is signalled with None.
    trans_df = None
else:
    # Drop rows whose translated title or body is an empty string.
    trans_df = trans_df[trans_df['translated_title'] != '']
    trans_df = trans_df[trans_df['translated_text'] != '']

    # Drop rows that are too short, measured in runs of whitespace.
    # The patterns are raw strings: '\s' inside a plain string literal is an
    # invalid escape sequence and triggers a warning on recent Python versions.
    trans_df = trans_df[trans_df['translated_title'].str.count(r'\s+').ge(3)]  # keep titles with at least 3 whitespace runs
    trans_df = trans_df[trans_df['translated_text'].str.count(r'\s+').ge(20)]  # keep bodies with at least 20 whitespace runs

    # Remove duplicates by translated title, then translated body, then link.
    # The index is re-sorted after each pass and finally renumbered from 0.
    trans_df = (trans_df.drop_duplicates(subset=['translated_title'])).sort_index()
    trans_df = (trans_df.drop_duplicates(subset=['translated_text'])).sort_index()
    trans_df = (trans_df.drop_duplicates(subset=['links'])).sort_index()
    trans_df = trans_df.reset_index(drop=True)

    # Same "no data" convention when every row was filtered out.
    if len(trans_df) == 0:
        trans_df = None

cleaning data ...


In [24]:
# Shape after cleaning.
# NOTE(review): the cleaning cell sets trans_df to None when no row survives; this line would then raise AttributeError.
trans_df.shape

(194, 8)

# Embeddings 

In [25]:
# Plain Python list of the translated article bodies, in row order.
texts = trans_df['translated_text'].tolist()

In [26]:
# Embed the translated article bodies with the project's GoogleEmbeddings wrapper.
# The resulting vectors are read back from `embedding.embedded_data` in a later cell.
embedding = GoogleEmbeddings()
embedding.fit_transform(texts)

194
194


# Clustering 

In [27]:
# Clustering input: the embedded articles held by the embedding object.
xtrain = embedding.embedded_data
xtrain.shape

(194, 256)

In [28]:
# Fit the project's Clustering model on the embeddings, then predict a label for each of the same rows.
# NOTE(review): percentile=10 and n=500 are not explained here — confirm their meaning in clustering.py
# and consider moving them to the configuration.
model = Clustering(percentile=10, linkage='average', metric='cosine')
model.fit(xtrain, n=500)
ypred = model.predict(xtrain)

100%|██████████| 500/500 [00:05<00:00, 99.70it/s] 


In [29]:
# Attach the predicted cluster label to each article (adds the `class` column to
# trans_df in place), then count the distinct labels.
trans_df['class'] = ypred
len(np.unique(ypred))

175

In [30]:
# Inspect the cleaned, translated articles together with their cluster label.
trans_df

Unnamed: 0,dates,titles,links,texts,lang,cat,translated_title,translated_text,class
0,2024-08-23T04:41:00Z,"Piloti Air Canada hrozí stávkou, chtějí stejné...",https://www.idnes.cz/ekonomika/doprava/air-can...,Piloti letecké společnosti Air Canada vstupují...,cs,stávka,"Air Canada pilots threaten to strike, want the...",Air Canada pilots go on strike alert. They are...,87
1,2024-08-23T01:13:17Z,Trabajadores del Poder Judicial protestan en C...,https://www.elfinanciero.com.mx/nacional/2024/...,Trabajadores del Poder Judicial se manifestaro...,es,protesta de empleados,Judicial Branch workers protest in the Chamber...,Workers of the Judicial Branch demonstrated ou...,144
2,2024-08-23T07:33:30Z,Le Seigneur des Anneaux : La Guerre du Rohirim...,https://www.journaldugeek.com/2024/08/23/le-se...,La Terre du Milieu sera à l’honneur en cette f...,fr,grève,The Lord of the Rings: War of the Rohirim Gets...,Middle-earth will be in the spotlight at the e...,168
3,2024-08-22T16:23:39Z,सुप्रीम कोर्ट की अपील के बाद काम पर लौटे डॉक्ट...,https://www.aajtak.in/india/news/story/doctors...,रेजिडेंट डॉक्टर्स एसोसिएशन एम्स (RDA AIIMS) दि...,hi,हड़ताल,Doctors returned to work after the Supreme Cou...,Resident Doctors Association AIIMS (RDA AIIMS)...,150
4,2024-08-23T03:08:21Z,न्याय की मांग को लेकर सीटू व जनवादी महिला समित...,https://www.dainiktribuneonline.com/news/harya...,"भिवानी, 22 अगस्त (हप्र) कामकाजी महिला समन्वय स...",hi,कर्मचारी विरोध प्रदर्शन,Demonstration by CITU and Janwadi Mahila Samit...,"Bhiwani, August 22 (HP) On the call of the Wor...",129
...,...,...,...,...,...,...,...,...,...
189,2024-08-23T10:30:01Z,HP Color Laser MFP 179fwg Multifunktions-Farbl...,https://www.amazon.de/dp/B07RMJV1LT?tag=winfud...,Freitag ab 12:30 Uhr: HP Color Laser MFP 179fw...,de,schlagen,HP Color Laser MFP 179fwg Multifunction color ...,Friday from 12:30 p.m.: HP Color Laser MFP 179...,17
190,2024-08-23T07:26:25Z,Bundesliga-Vorschau - Titelkandidaten im Meist...,https://www.focus.de/sport/fussball/bundesliga...,E-Mail\n\nTeilen\n\nMehr\n\nTwitter\n\nDrucken...,de,schlagen,Bundesliga preview - Title candidates in the c...,E-mail\n\nShare\n\nMore\n\nTwitter\n\nPrint\n\...,0
191,2024-08-23T11:42:00Z,Der besondere Ausblick auf die Liga - Kane-Kom...,https://www.focus.de/sport/fussball/bundesliga...,Der besondere Ausblick auf die Liga: Kane-Koma...,de,schlagen,The special outlook on the league - Kane coma ...,The special outlook on the league: Kane coma a...,18
192,2024-08-22T03:30:00Z,KOMMENTAR - Unsere Psyche wird immer kränker. ...,https://www.nzz.ch/meinung/unsere-psyche-wird-...,Kommentar Unsere Psyche wird immer kränker. Do...,de,schlagen,COMMENT - Our psyche is becoming increasingly ...,Comment Our psyche is becoming increasingly il...,12


# RAG

In [31]:
# Align column names with what the RAG step reads (`url`, `date`), keep only the
# columns it needs and replace missing values with empty strings.
trans_df = trans_df.rename(columns={'links': 'url', 'dates': 'date'})
rag_columns = ['date', 'cat', 'lang', 'url', 'translated_title', 'translated_text', 'class']
rag_data = trans_df[rag_columns].fillna('')

In [32]:
# Inspect the table that is handed to the RAG step.
rag_data

Unnamed: 0,date,cat,lang,url,translated_title,translated_text,class
0,2024-08-23T04:41:00Z,stávka,cs,https://www.idnes.cz/ekonomika/doprava/air-can...,"Air Canada pilots threaten to strike, want the...",Air Canada pilots go on strike alert. They are...,87
1,2024-08-23T01:13:17Z,protesta de empleados,es,https://www.elfinanciero.com.mx/nacional/2024/...,Judicial Branch workers protest in the Chamber...,Workers of the Judicial Branch demonstrated ou...,144
2,2024-08-23T07:33:30Z,grève,fr,https://www.journaldugeek.com/2024/08/23/le-se...,The Lord of the Rings: War of the Rohirim Gets...,Middle-earth will be in the spotlight at the e...,168
3,2024-08-22T16:23:39Z,हड़ताल,hi,https://www.aajtak.in/india/news/story/doctors...,Doctors returned to work after the Supreme Cou...,Resident Doctors Association AIIMS (RDA AIIMS)...,150
4,2024-08-23T03:08:21Z,कर्मचारी विरोध प्रदर्शन,hi,https://www.dainiktribuneonline.com/news/harya...,Demonstration by CITU and Janwadi Mahila Samit...,"Bhiwani, August 22 (HP) On the call of the Wor...",129
...,...,...,...,...,...,...,...
189,2024-08-23T10:30:01Z,schlagen,de,https://www.amazon.de/dp/B07RMJV1LT?tag=winfud...,HP Color Laser MFP 179fwg Multifunction color ...,Friday from 12:30 p.m.: HP Color Laser MFP 179...,17
190,2024-08-23T07:26:25Z,schlagen,de,https://www.focus.de/sport/fussball/bundesliga...,Bundesliga preview - Title candidates in the c...,E-mail\n\nShare\n\nMore\n\nTwitter\n\nPrint\n\...,0
191,2024-08-23T11:42:00Z,schlagen,de,https://www.focus.de/sport/fussball/bundesliga...,The special outlook on the league - Kane coma ...,The special outlook on the league: Kane coma a...,18
192,2024-08-22T03:30:00Z,schlagen,de,https://www.nzz.ch/meinung/unsere-psyche-wird-...,COMMENT - Our psyche is becoming increasingly ...,Comment Our psyche is becoming increasingly il...,12


### Get suppliers data 


In [33]:
# Load the tier-1 and tier-N supplier files, tag each row with its tier and
# stack the columns needed for matching into a single table.
supplier_cols = ['country', 'city', 'suggested_name', 'tier']

supplier1 = (
    pd.read_csv('/home/jupyter/news/data/suppliers/20231004_Fichier_Extraction_avec_usines_clientes.csv')
    .drop(columns=["Unnamed: 0"])
    .assign(tier='1')
)

supplier2 = (
    pd.read_csv('/home/jupyter/news/data/suppliers/tiern_N.csv')
    .drop(columns=["Unnamed: 0"])
    .assign(tier='N')
)

suppliers = pd.concat([supplier1[supplier_cols], supplier2[supplier_cols]])
print(suppliers.shape)
# Discard rows with a missing value in any of the kept columns.
suppliers = suppliers.dropna()
print(suppliers.shape)

(59581, 4)
(59580, 4)


### Running RAG

In [34]:
# Configure the strike RAG from the 'rag_cong' section of the configuration and
# run it over the prepared articles; the results are read from `strike.all_results`
# in a later cell.
rag_settings = conf['rag_cong']
rag_keys = (
    'vertexai_llm',
    'vertexai_embedding_name',
    'retry',
    'max_doc',
    'chunk_size',
    'chunk_overlap',
)
strike = StrikeRAG(**{key: rag_settings[key] for key in rag_keys})

strike.retrieve_infos_with_retry(dataframe=rag_data)

label : 0
 document creation : 0
first retrieval
company is None or belongs to self.liste
label : 1
 document creation : 1
first retrieval
company is None or belongs to self.liste
label : 2
 document creation : 2
first retrieval
second retrieval
3rd retrieval
results :
{'strike': {'labor_strike': 'no', 'justification': 'The text explicitly states that the shutdown is a lockout, where management prevents workers from working, rather than a strike where workers refuse to work.'}, 'impacted_company': 'Canadian National (CN)', 'locations': [{'city': 'Various', 'country': 'Canada'}], 'impacted_business_sectors': ['Transportation', 'Agriculture'], 'automotive_industry': {'concerned': 'yes', 'justification': "The article states that some U.S. auto plants could temporarily shut down if they can't manufacture engines, transmissions, or stampings in Canadian plants due to the disruption of supply chains caused by the rail shutdown. This indicates that the car-making industry is directly affected

In [35]:
# Deep-copy the RAG results so that whatever happens to `iterative_results`
# downstream leaves `strike.all_results` untouched.
iterative_results = deepcopy(strike.all_results)
len(iterative_results)

28

### Matching process 


In [36]:
# Match the extracted news items against the supplier table.
# NOTE(review): `r` is not read by any later cell shown here; the following steps
# use `matching.results` and `matching.index` instead.
matching =  Matching()
r = matching.match(set_news = iterative_results, dataframe=suppliers)

28it [00:00, 39.90it/s]


### Filtering process

In [37]:
# SECTORS_TO_DISCARD = sectors_to_discard.SECTORS_TO_DISCARD

In [38]:
# decision_function_args = {
#     'sectors_to_discard': dict_config["sectors_to_discard"],
#     'desirable_temporalities' : ['upcoming', 'ongoing', 'unknown']
# }

In [39]:
# Build the filter from the configured decision function and its arguments,
# then apply it to the matching results.
filter_kwargs = {
    'empty_companie_name_index': matching.index,
    'decision_function': conf['decision_function'],
    'decision_function_args': conf["decision_function_args"],
    'filename': None,
}
filtre = Filter(**filter_kwargs)
rr = filtre.filtering(matching.results)

In [40]:
# Inspect the filtered results.
rr

[{'strike': {'labor_strike': 'yes',
   'justification': 'The provided context explicitly states that the strike at Tesla in Sweden was initiated by IF Metall, a labor union, and involved other unions in sympathy measures. This indicates a labor strike driven by union demands for a collective agreement.'},
  'impacted_company': 'Tesla',
  'locations': [{'city': 'Norrköping', 'country': 'Sweden'}],
  'impacted_business_sectors': ['Automotive Manufacturing', 'Technology'],
  'automotive_industry': {'concerned': 'yes',
   'justification': "Tesla is a major player in the automotive manufacturing industry. A strike at Tesla, especially one that disrupts production, would likely have a ripple effect on the broader car-making industry. This is because:\n\n* **Supply Chain Disruptions:** Tesla's strike could disrupt the supply of components or materials used by other car manufacturers, especially those relying on similar parts or technologies.\n* **Competition:**  A strike at Tesla could give c

## Grouping processed articles based on their names

In [41]:
# Group the filtered articles by name (see utils.groupByName).
# NOTE(review): `rr` is rebound to the grouped result, so re-running this cell
# feeds already-grouped data back into groupByName.
rr = groupByName(rr)

In [42]:
# Build the descriptions for the grouped results; `rag_data` is passed in as the source dataframe.
results = generate_description(rr, dataframe=rag_data)

yes


1it [00:00,  1.30it/s]


## Adding geographical coordinates

In [43]:
# Restore the proxy configuration that was removed earlier in the notebook:
# load_dotenv() puts back the variables from the .env file that are currently
# missing from the environment.
load_dotenv()
NEWS_API_KEY = os.getenv('NEWS_API_KEY')
# os.environ only accepts strings, so assigning os.getenv(...) directly raises
# TypeError when a proxy variable is defined nowhere; unset ones are skipped.
for proxy_var in ('http_proxy', 'https_proxy', 'HTTP_PROXY', 'HTTPS_PROXY'):
    proxy_value = os.getenv(proxy_var)
    if proxy_value is not None:
        os.environ[proxy_var] = proxy_value

In [44]:
# Add latitude/longitude to the locations of each result (see utils.geoloc).
final_results = geoloc(results)

In [45]:
# Inspect the geolocated results before saving them.
final_results

[{'strike': {'labor_strike': 'yes',
   'justification': 'The provided context explicitly states that the strike at Tesla in Sweden was initiated by IF Metall, a labor union, and involved other unions in sympathy measures. This indicates a labor strike driven by union demands for a collective agreement.'},
  'impacted_company': 'Tesla',
  'locations': [{'city': 'Norrköping',
    'country': 'Sweden',
    'latitude': 58.5909124,
    'longitude': 16.1903511}],
  'impacted_business_sectors': ['Automotive Manufacturing', 'Technology'],
  'automotive_industry': {'concerned': 'yes',
   'justification': "Tesla is a major player in the automotive manufacturing industry. A strike at Tesla, especially one that disrupts production, would likely have a ripple effect on the broader car-making industry. This is because:\n\n* **Supply Chain Disruptions:** Tesla's strike could disrupt the supply of components or materials used by other car manufacturers, especially those relying on similar parts or tech

## Saving the final results

In [46]:
# Destination of the JSON export of the final results.
filename = "/home/jupyter/news/results/json/strike/complete_pipeline_test.json"
with open(filename, mode="w") as json_file:
    # Pretty-printed with a 4-space indent.
    json.dump(final_results, json_file, indent=4)
    print("saved !")

saved !
