In [1]:
import sys
import os


sys.path.append("../src/clustering")
sys.path.append("../src/configuration")
sys.path.append("../src/decision_functions")
sys.path.append("../src/embeddings")
sys.path.append("../src/filtering")
sys.path.append("../src/matching")
sys.path.append("../src/non_desirable_sectors")
sys.path.append("../src/rag")
sys.path.append("../src/scraping")
sys.path.append("../src/translation")
# sys.path.append("../src/output_parsers)
from newsapiscraper import NewsApiScraper
from newscollector import NewsCollector
from googletranslation import GoogleTranslate
from clustering import Clustering
from googleembeddings import GoogleEmbeddings
from strike_rag import StrikeRAG
from fire_rag import FireRAG
from filtering import Filter
from matching import Matching
from configuration import Configuration
import sectors_to_discard
from utils import split_liste, groupByName, generate_description, geoloc
import strike_relevancy


import pandas as pd
import numpy as np
import json
import requests
from copy import deepcopy
from geopy.geocoders import Nominatim
from dotenv import load_dotenv, find_dotenv

In [2]:
load_dotenv(find_dotenv())

NEWS_API_KEY = os.getenv('NEWS_API_KEY')
os.environ['http_proxy'] = os.getenv('http_proxy')
os.environ['https_proxy'] = os.getenv('https_proxy')
os.environ['HTTP_PROXY'] = os.getenv('HTTP_PROXY')
os.environ['HTTPS_PROXY'] = os.getenv('HTTPS_PROXY')

# Creating config file for the first time

## Generation initial config dict
We generate a dict that will be use to generate the finale config file. This final config should be generate once and be used at every moment if we do not add some new keys.

In [2]:
dict_config = {}
dict_config['keywords'] = ['strike', 'picket line', 'employee protest']
dict_config['country_lang'] = [
    {'country': 'BE','lang': ['fr']}, # Belgium
    {'country': 'CH','lang': ['fr', 'de', 'it']}, # Searching news with differentes languge in the same country where threre are several official language
    {'country': 'BG','lang': ['bg']},  # Bulgaria
    {'country': 'BR','lang': ['pt']},
#     # {'country' : 'CL',  'lang' : ['es']},
#     # {'country' : 'CN',  'lang' : ['zh']},
#     # {'country' : 'CO',  'lang' : ['es']},
#     # {'country' : 'CZ',  'lang' : ['cs']},
#     # {'country' : 'DE',  'lang' : ['de']},
#     # {'country' : 'DZ',  'lang' : ['ar']},
#     # {'country' : 'EE',  'lang' : ['et']},
#     # {'country' : 'ES',  'lang' : ['es']},
#     # {'country' : 'FR',  'lang' : ['fr']},
#     # {'country' : 'GB',  'lang' : ['en']},
#     # {'country' : 'HU',  'lang' : ['hu']},
#     # {'country' : 'ID',  'lang' : ['id']},
#     # {'country' : 'IN',  'lang' : ['hi']},
#     # {'country' : 'IT',  'lang' : ['it']},
#     # {'country' : 'JP',  'lang' : ['ja']},
#     # {'country' : 'KR',  'lang' : ['ko']},
#     # {'country' : 'LT',  'lang' : ['lt']}, 
#     # {'country' : 'MA',  'lang' : ['ar']},
#     # {'country' : 'MC',  'lang' : ['fr']}, 
#     # {'country' : 'MX',  'lang' : ['es']},
#     # {'country' : 'MY',  'lang' : ['ms']},
#     # {'country' : 'NL',  'lang' : ['nl']},
#     # {'country' : 'PL',  'lang' : ['pl']},
#     # {'country' : 'PT',  'lang' : ['pt']},
#     # {'country' : 'RO',  'lang' : ['ro']},
#     # {'country' : 'SI',  'lang' : ['sl']},
#     # {'country' : 'SK',  'lang' : ['sk']},
#     # {'country' : 'TH',  'lang' : ['th']},
#     # {'country' : 'TN',  'lang' : ['ar']},
#     # {'country' : 'TR',  'lang' : ['tr']},
#     # {'country' : 'UA',  'lang' : ['uk']},
#     # {'country' : 'VN',  'lang' : ['vi']},
#     # {'country' : 'SE',  'lang' : ['sv']},
#     # {'country' : 'SV',  'lang' : ['es']},
#     # {'country' : 'CA',  'lang' : ['fr']},
#     # {'country' : 'LU',  'lang' : ['fr']}
]

dict_config['NEWS_API_KEY'] = NEWS_API_KEY # if you use News API services

dict_config['rag_cong'] = {'vertexai_llm': 'gemini-1.5-flash',
                            'vertexai_embedding_name': 'text-embedding-004',
                            'chunk_size': 2000, 'chunk_overlap': 10, 'max_doc': 5, 'retry': 1}
# dict_config["sectors_to_discard"] = sectors_to_discard.SECTORS_TO_DISCARD # Or spécify a liste of sectors you want to discard
dict_config["decision_function_args"] = {
    'sectors_to_discard': sectors_to_discard.SECTORS_TO_DISCARD, # Or spécify a liste of sectors you want to discard
    'desirable_temporalities' : ['upcoming', 'ongoing', 'unknown']
}

dict_config['project_id']="irn-67050-lab-65"
                          

In [4]:
# dict_config

In [5]:
initial_config_file ="../config/initial_config_file.json"
with open(initial_config_file, 'w') as file :
    json.dump(dict_config, file, indent = 4)

## Generating final config file from initial config file
It it this final config file which will be used to get news

In [6]:
initial_config_file = "../config/initial_config_file.json"
final_config_file = "../config/final_config_file.json"
config = Configuration( initial_config_file = initial_config_file, final_config_file = final_config_file)

# Loading config file

In [7]:
config_file = "../config/final_config_file.json"
with open(config_file, 'r') as file :
    conf = json.load(file)
conf['decision_function'] = strike_relevancy.strike_relevancy
conf

{'keywords': ['strike', 'picket line', 'employee protest'],
 'country_lang': [{'country': 'BE',
   'lang': 'fr',
   'queries': ['grève', 'piquet de grève', 'protestation des employés']},
  {'country': 'CH',
   'lang': 'fr',
   'queries': ['grève', 'piquet de grève', 'protestation des employés']},
  {'country': 'CH',
   'lang': 'de',
   'queries': ['schlagen', 'Streikposten', 'Mitarbeiterprotest']},
  {'country': 'CH',
   'lang': 'it',
   'queries': ['sciopero',
    'cordone di scioperanti',
    'protesta dei dipendenti']},
  {'country': 'BG',
   'lang': 'bg',
   'queries': ['стачка', 'пикет линия', 'протест на служител']},
  {'country': 'BR',
   'lang': 'pt',
   'queries': ['batida', 'linha de piquete', 'protesto de funcionários']}],
 'NEWS_API_KEY': 'b1dcacfbc49d4ac1a48db5f5652b11a2',
 'rag_cong': {'vertexai_llm': 'gemini-1.5-flash',
  'vertexai_embedding_name': 'text-embedding-004',
  'chunk_size': 2000,
  'chunk_overlap': 10,
  'max_doc': 5,
  'retry': 1},
 'decision_function_args':

# Instanciating NewsApiScraper

In [8]:
scrapper = NewsApiScraper(api_key= conf['NEWS_API_KEY'], start_date='2024-08-25',end_date='2024-08-26')

# Collecting news via Newscolllector instance

In [9]:
# collector = NewsCollector(config=conf['country_lang'], scraper=scrapper,path_to_save='/home/jupyter/news/data/strike_newsapi_vendre_23_aout.csv')
collector = NewsCollector(config=conf['country_lang'], scraper=scrapper,path_to_save=None)
data = collector.collect_news()

In [10]:
collector.data.shape

In [11]:
collector.data.columns

In [12]:
collector.data.head()

# Translation 

In [13]:
df = pd.read_csv("/home/jupyter/data/data.csv")
df.shape

(52, 6)

In [14]:
df.columns

Index(['dates', 'titles', 'links', 'texts', 'lang', 'cat'], dtype='object')

In [15]:
translation = GoogleTranslate(project_id=conf['project_id'])

I0000 00:00:1724665910.519696  795598 config.cc:230] gRPC experiments enabled: call_status_override_on_cancellation, event_engine_dns, event_engine_listener, http2_stats_fix, monitoring_experiment, pick_first_new, trace_record_callops, work_serializer_clears_time_cache


In [16]:
# conf['project_id']

In [17]:
# df = collector.data
# df
# df.to_csv("/home/jupyter/data/data.csv", index = False)

In [18]:
del os.environ['http_proxy']
del os.environ['https_proxy']
del os.environ['HTTP_PROXY']
del os.environ['HTTPS_PROXY']

In [19]:
## Instanciating Translation object in order to translate the collected news

In [20]:
df.dropna(inplace=True)
df.shape

(51, 6)

In [21]:
 trans_df = translation.translation(df, limit=30720)

In [22]:
# translation.fails_index

In [23]:
trans_df.shape

(51, 8)

In [24]:
trans_df.columns

Index(['dates', 'titles', 'links', 'texts', 'lang', 'cat', 'translated_title',
       'translated_text'],
      dtype='object')

# Cleaning the translated dataframe
On loses more data when the cleaning happens before translation. In fact, some languages like japanes or chinese do not follow some common rule like have space between words. So one loses those data if cleaning happens base one blank space rules.

In [25]:
print("cleaning data ...")
if len(trans_df) == 0 :
    trans_df = None
else :
    trans_df = trans_df[trans_df['translated_title'] != '']
    trans_df = trans_df[trans_df['translated_text'] != '']

    trans_df = trans_df[trans_df['translated_title'].str.count('\s+').ge(3)] #keep only titles having more than 4 spaces in the title
    trans_df = trans_df[trans_df['translated_text'].str.count('\s+').ge(20)] #keep only titles having more than 20 spaces in the body

    trans_df = (trans_df.drop_duplicates(subset=['translated_title'])).sort_index()
    trans_df = (trans_df.drop_duplicates(subset=['translated_text'])).sort_index()
    trans_df = (trans_df.drop_duplicates(subset=['links'])).sort_index()
    trans_df = trans_df.reset_index(drop=True)

    if len(trans_df) == 0 :
        trans_df = None

cleaning data ...


In [26]:
trans_df.shape

(48, 8)

# Embeddings 

In [27]:
texts = list(trans_df['translated_text'])

In [28]:
embedding = GoogleEmbeddings()
embedding.fit_transform(texts)

48
48


# Clustering 

In [29]:
## clustering train data 
xtrain = embedding.embedded_data
xtrain.shape

(48, 256)

In [30]:
model = Clustering(percentile=10, linkage='average', metric='cosine')
model.fit(xtrain, n=500)
ypred = model.predict(xtrain)

100%|██████████| 500/500 [00:01<00:00, 313.59it/s]


In [31]:
trans_df['class'] = ypred
len(np.unique(ypred))

44

In [32]:
trans_df

Unnamed: 0,dates,titles,links,texts,lang,cat,translated_title,translated_text,class
0,2024-08-26T02:45:12Z,"A Cuba, la répression continue de tétaniser la...",https://www.lemonde.fr/international/article/2...,L’artiste cubain Luis Manuel Otero Alcantara c...,fr,grève,"In Cuba, repression continues to paralyze civi...",Cuban artist Luis Manuel Otero Alcantara is ta...,35
1,2024-08-25T14:34:00Z,La CGT prépare une rentrée sociale offensive,https://www.latribune.fr/economie/france/la-cg...,Alors qu'Emmanuel Macron tarde encore à nommer...,fr,grève,The CGT is preparing an offensive social return,While Emmanuel Macron is still slow to appoint...,37
2,2024-08-26T05:54:00Z,Ces 12 séries qu’on attend avec impatience pou...,https://www.numerama.com/pop-culture/1795084-c...,Lecture Zen Résumer l'article\n\nEntre les pro...,fr,grève,These 12 series that we are impatiently waitin...,Zen Reading Summarize the article\n\nBetween t...,34
3,2024-08-25T16:00:00Z,Doit-on s'attendre à des grèves dans l'enseign...,https://www.lavenir.net/actu/societe/2024/08/2...,"Deux mois plus tard, les inquiétudes des syndi...",fr,grève,Should we expect strikes in education in the W...,"Two months later, the unions' concerns have no...",38
4,2024-08-25T18:00:08Z,Les brèves critiques de la rentrée littéraire ...,https://www.lemonde.fr/critique-litteraire/art...,"Huit romans, un essai d’histoire, une antholog...",fr,grève,Brief reviews of the new literary season: Eric...,"Eight novels, a history essay, an anthology… H...",33
5,2024-08-25T04:30:00Z,"Il y a 80 ans, les Parisiens boutaient les All...",https://www.lepoint.fr/culture/il-y-a-77-ans-l...,Paris est sur des charbons ardents. Le départ ...,fr,grève,"80 years ago, Parisians kicked the Germans out...",Paris is on tenterhooks. The Germans' departur...,41
6,2024-08-26T04:00:00Z,La gauche doit (aussi) se renforcer par le bas,https://www.alternatives-economiques.fr/gauche...,"Parvenu en tête des élections législatives, le...",fr,grève,The left must (also) strengthen itself from below,Having come out on top in the legislative elec...,36
7,2024-08-25T03:38:00Z,COMMENT NETFLIX A CHANGÉ NOS VIES (3/3) - L'he...,https://www.latribune.fr/technos-medias/commen...,Ni la pluie ni le froid n'ont dissuadé les hab...,fr,grève,HOW NETFLIX CHANGED OUR LIVES (3/3) - Time for...,Neither the rain nor the cold deterred the red...,28
8,2024-08-25T06:05:28Z,"""Tout d'un coup, les gens ont pris une arme et...",https://www.francetvinfo.fr/france/ile-de-fran...,"Le 25 août 1944, la capitale est libérée de l'...",fr,grève,"""All of a sudden, people took up weapons and p...","On August 25, 1944, the capital was liberated ...",39
9,2024-08-25T08:53:45Z,80 ans de la libération de Paris : le carnet d...,https://www.francetvinfo.fr/france/ile-de-fran...,"Du 15 au 25 août 1944, Roger Trentesaux, 27 an...",fr,grève,80 years since the liberation of Paris: a resi...,"From August 15 to 25, 1944, Roger Trentesaux, ...",24


# RAG

In [33]:
trans_df = trans_df.rename(columns= {'links': 'url', 'dates': 'date'})
rag_data = trans_df[['date', 'cat', 'lang', 'url','translated_title', 'translated_text', 'class']]
# rag_data = dataframe[['date', 'cat', 'lang', 'title', 'text', 'url','translated_title', 'translated_text', 'class', 'description']]

rag_data = rag_data.fillna('')

In [34]:
rag_data

Unnamed: 0,date,cat,lang,url,translated_title,translated_text,class
0,2024-08-26T02:45:12Z,grève,fr,https://www.lemonde.fr/international/article/2...,"In Cuba, repression continues to paralyze civi...",Cuban artist Luis Manuel Otero Alcantara is ta...,35
1,2024-08-25T14:34:00Z,grève,fr,https://www.latribune.fr/economie/france/la-cg...,The CGT is preparing an offensive social return,While Emmanuel Macron is still slow to appoint...,37
2,2024-08-26T05:54:00Z,grève,fr,https://www.numerama.com/pop-culture/1795084-c...,These 12 series that we are impatiently waitin...,Zen Reading Summarize the article\n\nBetween t...,34
3,2024-08-25T16:00:00Z,grève,fr,https://www.lavenir.net/actu/societe/2024/08/2...,Should we expect strikes in education in the W...,"Two months later, the unions' concerns have no...",38
4,2024-08-25T18:00:08Z,grève,fr,https://www.lemonde.fr/critique-litteraire/art...,Brief reviews of the new literary season: Eric...,"Eight novels, a history essay, an anthology… H...",33
5,2024-08-25T04:30:00Z,grève,fr,https://www.lepoint.fr/culture/il-y-a-77-ans-l...,"80 years ago, Parisians kicked the Germans out...",Paris is on tenterhooks. The Germans' departur...,41
6,2024-08-26T04:00:00Z,grève,fr,https://www.alternatives-economiques.fr/gauche...,The left must (also) strengthen itself from below,Having come out on top in the legislative elec...,36
7,2024-08-25T03:38:00Z,grève,fr,https://www.latribune.fr/technos-medias/commen...,HOW NETFLIX CHANGED OUR LIVES (3/3) - Time for...,Neither the rain nor the cold deterred the red...,28
8,2024-08-25T06:05:28Z,grève,fr,https://www.francetvinfo.fr/france/ile-de-fran...,"""All of a sudden, people took up weapons and p...","On August 25, 1944, the capital was liberated ...",39
9,2024-08-25T08:53:45Z,grève,fr,https://www.francetvinfo.fr/france/ile-de-fran...,80 years since the liberation of Paris: a resi...,"From August 15 to 25, 1944, Roger Trentesaux, ...",24


### Get suppliers data 


In [35]:
# load suppliers
supplier1 = pd.read_csv('/home/jupyter/data/suppliers/20231004_Fichier_Extraction_avec_usines_clientes.csv')
supplier1.drop(columns=["Unnamed: 0"], inplace=True)
supplier1['tier'] ='1'
# supplier1

supplier2 = pd.read_csv('/home/jupyter/data/suppliers/tiern_N.csv')
supplier2.drop(columns=["Unnamed: 0"], inplace=True)
supplier2['tier'] ='N'
# supplier2
suppliers = pd.concat([supplier1[['country', 'city', 'suggested_name', 'tier']], supplier2[['country', 'city', 'suggested_name', 'tier']]])
print(suppliers.shape)
suppliers.dropna(inplace=True)
print(suppliers.shape)

(59581, 4)
(59580, 4)


### Running RAG

In [36]:
strike = StrikeRAG(vertexai_llm= conf['rag_cong']['vertexai_llm'],
                   vertexai_embedding_name = conf['rag_cong']['vertexai_embedding_name'], 
                   retry = conf['rag_cong']['retry'], 
                   max_doc = conf['rag_cong']['max_doc'], 
                   chunk_size = conf['rag_cong']['chunk_size'], 
                   chunk_overlap = conf['rag_cong']['chunk_overlap']
                  )

strike.retrieve_infos_with_retry(dataframe=rag_data)

label : 0
 document creation : 0
first retrieval
company is None or belongs to self.liste
label : 1
 document creation : 1
first retrieval
no second retrieval
Gapic client context issue detected.This can occur due to parallelization.
Gapic client context issue detected.This can occur due to parallelization.
results :
{'strike': {'labor_strike': 'yes', 'justification': 'The text explicitly mentions a potential "national strike" by the UAW (United Auto Workers) against Stellantis, indicating a labor dispute.'}, 'impacted_company': 'Stellantis', 'locations': [{'city': 'Sterling Heights', 'country': 'United States'}], 'impacted_business_sectors': ['Automotive Manufacturing', 'Labor Relations'], 'automotive_industry': {'concerned': 'yes', 'justification': 'Stellantis is a major car manufacturer. A strike by its US workers would directly impact the production of Stellantis vehicles, which are part of the car-making industry. This would disrupt the supply chain and potentially affect other ca

In [37]:
strike.results

[]

In [38]:
iterative_results = deepcopy(strike.all_results)
len(iterative_results)

2

### Matching process 


In [39]:
matching =  Matching()
r = matching.match(set_news = iterative_results, dataframe=suppliers)

2it [00:00, 35.87it/s]


### Filtering process

In [40]:
# SECTORS_TO_DISCARD = sectors_to_discard.SECTORS_TO_DISCARD

In [41]:
# decision_function_args = {
#     'sectors_to_discard': dict_config["sectors_to_discard"],
#     'desirable_temporalities' : ['upcoming', 'ongoing', 'unknown']
# }

In [42]:
filtre = Filter(empty_companie_name_index=matching.index, decision_function=conf['decision_function'], decision_function_args=conf["decision_function_args"], filename=None)
rr = filtre.filtering(matching.results)

In [43]:
rr

[{'strike': {'labor_strike': 'yes',
   'justification': 'The text explicitly mentions a potential "national strike" by the UAW (United Auto Workers) against Stellantis, indicating a labor dispute.'},
  'impacted_company': 'Stellantis',
  'locations': [{'city': 'Sterling Heights', 'country': 'United States'}],
  'impacted_business_sectors': ['Automotive Manufacturing', 'Labor Relations'],
  'automotive_industry': {'concerned': 'yes',
   'justification': 'Stellantis is a major car manufacturer. A strike by its US workers would directly impact the production of Stellantis vehicles, which are part of the car-making industry. This would disrupt the supply chain and potentially affect other car manufacturers that rely on Stellantis components or services.'},
  'temporality': {'strike_status': 'unknown', 'justification': ''},
  'sources': ['https://www.ilgiornale.it/news/economia/stellantis-operai-usa-piazza-contro-tavares-deve-rispettare-2361474.html'],
  'core_company': 'Stellantis',
  'sup

## Grouping processed articles based on their names

In [44]:
rr = groupByName(rr)

In [45]:
results = generate_description(rr, dataframe=rag_data)

## Adding geographical coordonate

In [46]:
load_dotenv()
NEWS_API_KEY = os.getenv('NEWS_API_KEY')
os.environ['http_proxy'] = os.getenv('http_proxy')
os.environ['https_proxy'] = os.getenv('https_proxy')
os.environ['HTTP_PROXY'] = os.getenv('HTTP_PROXY')
os.environ['HTTPS_PROXY'] = os.getenv('HTTPS_PROXY')

In [47]:
final_results = geoloc(results)

In [48]:
final_results

[{'strike': {'labor_strike': 'yes',
   'justification': 'The text explicitly mentions a potential "national strike" by the UAW (United Auto Workers) against Stellantis, indicating a labor dispute.'},
  'impacted_company': 'Stellantis',
  'locations': [{'city': 'Sterling Heights', 'country': 'United States'}],
  'impacted_business_sectors': ['Automotive Manufacturing', 'Labor Relations'],
  'automotive_industry': {'concerned': 'yes',
   'justification': 'Stellantis is a major car manufacturer. A strike by its US workers would directly impact the production of Stellantis vehicles, which are part of the car-making industry. This would disrupt the supply chain and potentially affect other car manufacturers that rely on Stellantis components or services.'},
  'temporality': {'strike_status': 'unknown', 'justification': ''},
  'sources': ['https://www.ilgiornale.it/news/economia/stellantis-operai-usa-piazza-contro-tavares-deve-rispettare-2361474.html'],
  'core_company': 'Stellantis',
  'sup

## Saving the final results

In [49]:
# file path where the results will be saved
filename = "/home/jupyter/news/results/json/strike/complete_pipeline_test.json"
with open(filename, "w") as final:
    json.dump(final_results, final, indent=4)
    print("saved !")

FileNotFoundError: [Errno 2] No such file or directory: '/home/jupyter/news/results/json/strike/complete_pipeline_test.json'