### Installation

In [None]:
!pip install transformers
!pip install beautifulsoup4

### Import libraries

In [1]:
import requests
import pandas as pd
from urllib.request import urlopen
from bs4 import BeautifulSoup
from urllib3.exceptions import InsecureRequestWarning
from urllib3 import disable_warnings
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

  from .autonotebook import tqdm as notebook_tqdm


### Load data

In [2]:
import opportunities_db.data.load as load

data_raw = load.data_raw
data_processed = load.data_processed
new_version = load.new_version

### Parse data

In [3]:
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup as soup
url = 'https://gijn.org/2022/12/05/fellowships-to-attend-the-2023-global-investigative-journalism-conference/'
req = Request(url , headers={'User-Agent': 'Mozilla/5.0'})

webpage = urlopen(req).read()
page_soup = soup(webpage, "html.parser")
page_soup

<!DOCTYPE html>

<!--[if lt IE 7]> <html lang="en-US" class="no-js ie6"> <![endif]-->
<!--[if IE 7]>    <html lang="en-US" class="no-js ie7"> <![endif]-->
<!--[if IE 8]>    <html lang="en-US" class="no-js ie8"> <![endif]-->
<!--[if IE 9]>    <html lang="en-US" class="no-js ie9"> <![endif]-->
<!--[if (gt IE 9)|!(IE)]><!--> <html class="no-js" lang="en-US"> <!--<![endif]-->
<head>
<meta charset="utf-8"/>
<meta content="IE=edge,chrome=1" http-equiv="X-UA-Compatible"/>
<meta content="width=device-width, initial-scale=1.0, maximum-scale=1.0" name="viewport"/>
<link href="https://gmpg.org/xfn/11" rel="profile"/>
<link href="https://gijn.org/xmlrpc.php" rel="pingback"/>
<!-- Fundraise Up: the new standard for online giving -->
<script>(function(w,d,s,n,a){if(!w[n]){var l='call,catch,on,once,set,then,track'
	.split(','),i,o=function(n){return'function'==typeof n?o.l.push([arguments])&&o
	:function(){return o.l.push([n,arguments])&&o}},t=d.getElementsByTagName(s)[0],
	j=d.createElement(s);j.asy

In [5]:
# Define url parameter
url = 'https://gijn.org/2022/12/05/fellowships-to-attend-the-2023-global-investigative-journalism-conference/'

# Disable warning
disable_warnings(InsecureRequestWarning)

# Make requests to the specified urls
response = requests.get(url, verify=False)

# Return content of the response
html = response.text

# Function to remove tags
def remove_tags(html):
    # parse html content
    soup = BeautifulSoup(html, "html.parser")
    for data in soup(['style', 'script']):
        # Remove tags
        data.decompose()
    # return data by retrieving the tag content
    return ' '.join(soup.stripped_strings)


# Print the extracted data
url_text = remove_tags(html)
print(url_text)

403 Forbidden 403 Forbidden nginx


In [15]:
# Apply BeautifulSoup module
soup = BeautifulSoup(urlopen(url))

# Extract title
title = soup.title.get_text()

# Display title
print(title)

2023 Summer School – Social ComQuant


### Load [model](https://huggingface.co/MaRiOrOsSi/t5-base-finetuned-question-answering) for text generation

In [16]:
tokenizer = AutoTokenizer.from_pretrained("MaRiOrOsSi/t5-base-finetuned-question-answering")

model = AutoModelForSeq2SeqLM.from_pretrained("MaRiOrOsSi/t5-base-finetuned-question-answering")

In [17]:
# Create instance for text2text-generation task
get_answer = pipeline("text2text-generation", model=model, tokenizer=tokenizer)

# Define question to answer
question = 'When is the deadline?'

# Apply question answering function to text
deadline = get_answer(f'question: {question}  context: {url_text}', truncation=True, max_length=512)

deadline_text = deadline[0].get('generated_text')

print(deadline_text)


February 19, 2023


### Load [model](https://huggingface.co/knkarthick/MEETING_SUMMARY) for summarizing

In [18]:
summarizer = pipeline("summarization", model="knkarthick/MEETING_SUMMARY")
summary = summarizer(url_text, truncation=True, max_length=512)

summary_text = summary[0].get('summary_text')

# Display the summary
print(summary_text)

The Social Comquant Project invites applications for the 3rd Summer School on Computational Social Science. It will take place July 24-28, 2023 at the ISI Foundation, in Turin, Italy. The school will gather a number of diverse and outstanding speakers who will teach different methods and approaches to the computational social sciences, providing guidance through practical examples and coding exercises. Students will conduct small projects in which they will apply the newly learned methods and the lecturers will supervise them. The project will also provide a bursary for selected students. 


### Compare dataframes

In [3]:
df_old = pd.read_csv('../data/processed/opportunities_db.csv')
df_new = pd.read_csv('../assets/new_urls.csv')

In [4]:
# Filter dataframe
df_old = df_old['url'].to_frame()

In [5]:
diff = (pd.concat([df_old,df_new]).drop_duplicates(keep=False))

diff

Unnamed: 0,url
4,https://sicss.io/2023/berlin/
5,https://www.rightscon.org/2023-participant-sup...
6,https://www.datajconf.com/
7,https://www.goethe.de/prj/aia/en/aus.html
8,https://gijn.org/2023/02/24/cyber-training/
9,https://abraji.org.br/noticias/abraji-recebe-s...


In [6]:
urls = diff.values.tolist()
urls

[['https://sicss.io/2023/berlin/'],
 ['https://www.rightscon.org/2023-participant-support-initiatives/#review-timeline'],
 ['https://www.datajconf.com/'],
 ['https://www.goethe.de/prj/aia/en/aus.html'],
 ['https://gijn.org/2023/02/24/cyber-training/'],
 ['https://abraji.org.br/noticias/abraji-recebe-sugestoes-para-a-programacao-do-18o-congresso']]

### Update dataframes

In [5]:
df1 = pd.read_csv(data_raw)
df2 = pd.read_csv(new_version)



In [11]:
df_join = df1.merge(df2, how='outer', on='url')
df_join

Unnamed: 0,url
0,https://rjionline.org/about-rji-fellowships/
1,https://socialcomquant.ku.edu.tr/2023-summer-s...
2,https://reutersinstitute.politics.ox.ac.uk/our...
3,https://gijn.org/2022/12/05/fellowships-to-att...
4,https://sicss.io/2023/berlin/
5,https://www.rightscon.org/2023-participant-sup...
6,https://www.datajconf.com/
7,https://www.goethe.de/prj/aia/en/aus.html
8,https://gijn.org/2023/02/24/cyber-training/
9,https://abraji.org.br/noticias/abraji-recebe-s...
