In [1]:

#%%capture
#!pip install transformers wikipedia sentencepiece

In [2]:

from transformers import pipeline
import torch
import wikipedia

wikipedia.set_lang("fr")

In [3]:
torch.cuda.is_available() 

True

In [4]:
device = 0 if torch.cuda.is_available() else -1
nlp = pipeline('question-answering', model='etalab-ia/camembert-base-squadFR-fquad-piaf', tokenizer='etalab-ia/camembert-base-squadFR-fquad-piaf', device=device)

nlp({
    'question': "Qui est Claude Monet?",
    'context': "Claude Monet, né le 14 novembre 1840 à Paris et mort le 5 décembre 1926 à Giverny, est un peintre français et l’un des fondateurs de l'impressionnisme."
})



{'score': 0.5439727306365967,
 'start': 86,
 'end': 106,
 'answer': ' un peintre français'}

In [5]:
searches = wikipedia.search('France 2')
page = wikipedia.page(searches[0], auto_suggest=False)

In [6]:
nlp({
    'question': "France 2 est un chaine generaliste ou thematique?",
    'context': page.content.replace("\n", "")
})

{'score': 0.9918084144592285,
 'start': 12153,
 'end': 12165,
 'answer': ' généraliste'}

In [7]:
nlp({
    'question': "Qui est le proprietaire de la chaine?",
    'context': page.content.replace("\n", "")
})

{'score': 0.9921084642410278,
 'start': 152,
 'end': 172,
 'answer': ' France Télévisions.'}

In [8]:
nlp({
    'question': "Aujour'hui, France 2 est un chaine privéé ou publique?",
    'context': page.content.replace("\n", "")
})

{'score': 0.9748696684837341,
 'start': 11595,
 'end': 11604,
 'answer': ' publique'}

In [9]:
nlp({
    'question': "France 2 est un chaine de information ou divertissement?",
    'context': page.content.replace("\n", "")
})

{'score': 0.9516981244087219, 'start': 269, 'end': 278, 'answer': ' divertir'}

In [10]:
import sys
sys.path.insert(0, '../quotaclimat/data_analytics/')

In [11]:
from scraper.wiki import WikiChannelDataManager
import pandas as pd
from tqdm import tqdm

In [12]:
manager = WikiChannelDataManager('../data/channels.xlsx')

In [13]:
generator = manager.generate_page_content()

In [14]:
next(generator)[1][:100]

'Radio France internationale, généralement désignée par son sigle RFI, est une station de radio publi'

In [15]:
questions = {
    'status': "France 2 est un chaine generaliste ou thematique?",
    'group': "Qui est le proprietaire de la chaine?",
    'privatization': "Aujour'hui, France 2 est un chaine privéé ou publique?"
}

# TODO
# Speed up using a Huggingface Dataset or Pytorch Dataset (whatever is available).
# Need to make this more reliable by limiting the accepted answers to ones with a high confidence level.
# Need to remove extra characters from the answers such as commas and = signs.
# Need to unite all answers in the same format (using a stemmer)

def parse_wiki_data_nlp(pline, wiki_generator, questions):
    results = {k:[] for k in questions.keys()}
    names = []
    for name, context in tqdm(wiki_generator, total=683): # need to calculate this based on n_fetch and/or generator length
        names.append(name)
        for key in questions:
            if context is None:
                results[key].append(None)
            else:
                answer = nlp({
                    'question': questions[key],
                    'context': context
                })['answer'].strip()
                answer = '' if answer is None else answer
                results[key].append(answer)
    return pd.DataFrame(data=results, index=names)


In [16]:
results = parse_wiki_data_nlp(nlp, manager.generate_page_content(n_fetch=100), questions)



  lis = BeautifulSoup(html).find_all('li')
  1%|          | 7/683 [00:26<39:49,  3.54s/it]

Nothing found for Nostalgie


  2%|▏         | 16/683 [00:53<31:18,  2.82s/it]

Nothing found for Sud Radio


  3%|▎         | 21/683 [01:07<30:27,  2.76s/it]

Nothing found for Latina


  3%|▎         | 23/683 [01:13<31:10,  2.83s/it]

Nothing found for Guadeloupe 1ère


 15%|█▍        | 100/683 [05:39<32:59,  3.40s/it] 


In [17]:
results.rename(columns={'question': 'privatization'}})

Unnamed: 0,status,group,question
rfi,"spécifique à l'Afrique,","France Médias Monde,",publique
france_inter,généraliste,Radio France,publiques
rmc,généraliste,"Max Brusset,",monopole d'État
france_info,radio publique d'information française,Maison de la Radio.===,public
rtl,généraliste,RTL Group.,privée


In [21]:
if not os.path.isdir(os.path.join('../data/channels/')):
        os.mkdir(os.path.join('../data/channels/'))
results.to_csv('../data/channels/scraped_channels.csv')