In [1]:
import requests
import requests
import feedparser
from datetime import datetime, timezone
import re
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
import pyperclip
import time
import cv2
import numpy as np
import requests
import json
from io import BytesIO
from skimage.feature import match_template
from skimage.color import rgb2gray
from transformers import pipeline
import os
import torch

# Enable device-side assertions
torch.backends.cuda.matmul.allow_tf32 = False
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
torch.backends.cuda.use_deterministic_algorithms = True
torch.backends.cuda.cufft_plan_cache.clear()
torch.backends.cudnn.cache_enabled = False
torch.backends.cudnn.allow_tf32 = False
torch.backends.cudnn.enabled = True
torch.backends.cuda.deterministic = True
torch.backends.cuda.flags = {
    'assert': str(int(True)),
    'allow_tf32': str(int(False)),
    'cudnn_deterministic': str(int(True)),
    'cudnn_benchmark': str(int(False)),
    'use_deterministic_algorithms': str(int(True)),
    'cufft_plan_cache_clear': str(int(True)),
    'cudnn_cache_enabled': str(int(False)),
    'cudnn_allow_tf32': str(int(False)),
    'cudnn_enabled': str(int(True)),
    'cudnn_deterministic': str(int(True)),
}
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

endpoint = "http://hasura.192.168.0.100.nip.io/v1/graphql"
admin_key = "arrive@AD123"

def query_hasura_graphql(endpoint, admin_key, query, variables):
    headers = {
        'Content-Type': 'application/json',
        'x-hasura-admin-secret': f'{admin_key}'
    }

    data = {
        'query': query,
        'variables': variables
    }
    response = requests.post(endpoint, json=data, headers=headers)
    if response.status_code == 200:
        return response.json()
    else:
        print(f"Request failed with status code {response.status_code}")
        return None

def is_valid_timezone_format(published):
    try:
        # Attempt to parse the string
        date_format = "%a, %d %b %Y %H:%M:%S %z"
        date_object = datetime.strptime(published, date_format)
        
        hasura_timestamp = date_object.astimezone(timezone.utc).isoformat()
        return True, hasura_timestamp
    except ValueError:
        # If parsing fails, the string is not in the correct format
        return False, None

def check_date_format(date_string):
    try:
        datetime.strptime(date_string, '%Y-%m-%dT%H:%M:%S%z')
        return True
    except ValueError:
        return False
        
def mutation_hasura_graphql(endpoint, admin_key, mutation_query, mutation_variables):
    headers = {
        'Content-Type': 'application/json',
        'x-hasura-admin-secret': f'{admin_key}'
    }
    response = requests.post(endpoint, json={'query': mutation_query, 'variables': mutation_variables}, headers=headers)
    if response.ok:
        data = response.json()
        print(data)
        return True, data
    else:
        print(f"Mutation failed with status code {response.status_code}: {response.text}")
        return False, None

def update_articles_toi():
    graphql_query = '''
    query MyQuery($outlet: String!, $link_type: Int!) {
    rss1_links(where: {rss1_link_type: {_eq: $link_type}, outlet: {_eq: $outlet}}) {
        rss1_link
    }
    }
    '''
    # Define the variables dictionary
    variables = {
        "link_type": 11,
        "outlet": "timesofindia"
    }
    rss1_links_array = []
    response_data = query_hasura_graphql(endpoint, admin_key, graphql_query, variables)
    if response_data:
        rss1_links_array = [item["rss1_link"] for item in response_data["data"]["rss1_links"]]
    #print(rss1_links_array)
    graphql_query = """
    query MyQuery($outlet: String!) {
        rss1_outlets(where: {outlet: {_eq: $outlet}}) {
            logo_url
        }
    }
    """
    # Define the variables dictionary
    variables = {
        "outlet": "timesofindia"
    }
    mutation_query = """
    mutation MyMutation($objects: [rss1_articals_insert_input!] = {}) {
    insert_rss1_articals(objects: $objects, on_conflict: {constraint: rss1_articals_post_link_key}) {
        returning {
        id
        }
    }
    }
    """

    response_data = query_hasura_graphql(endpoint, admin_key, graphql_query, variables)
    for feed_link in rss1_links_array:
        NewsFeed = feedparser.parse(feed_link)
        is_default_image = 0
        logo_url = response_data['data']['rss1_outlets'][0]['logo_url']
        print("############################################################")
        print(feed_link)
        articles = []
        for entry in NewsFeed.entries:
            # print(entry.link)
            is_default_image = 0
            title = entry.title
            summary_nofil = entry.summary
            summary = re.sub('<[^<]+?>', '', summary_nofil)
            image_url = logo_url
            for link in entry.links:
                if link.type == "image/jpeg":
                    image_url= link.href
                    is_default_image = 1
                    break
            post_link = entry.link
            published = entry.published
            datevalidation = is_valid_timezone_format(published)
            if datevalidation[0]:
                hasura_timestamp = datevalidation[1]
            if check_date_format(published):
                hasura_timestamp = published
            else:
                hasura_timestamp = datetime.now().astimezone(timezone.utc).isoformat()
            if "author" in entry:
                author = entry.author
            else:
                author = "na"
            articles.append({
                    "rss1_link": feed_link,
                    "post_link": post_link,
                    "title": title,
                    "summary": summary,
                    "author": author,
                    "image_link" : image_url,
                    "post_published": hasura_timestamp,
                    "is_default_image": is_default_image,
                }
            )
            #print(feed_link, post_link, title, summary, author, image_url, hasura_timestamp, is_default_image)
        mutation_variables = {
            "objects": articles
        }
        #print({'query': mutation_query, 'variables': mutation_variables})
        out1 = mutation_hasura_graphql(endpoint = endpoint, admin_key = admin_key, mutation_query = mutation_query, mutation_variables = mutation_variables)

def update_articles_thehindu():
    graphql_query = '''
    query MyQuery($outlet: String!, $link_type: Int!) {
    rss1_links(where: {rss1_link_type: {_eq: $link_type}, outlet: {_eq: $outlet}}) {
        rss1_link
    }
    }
    '''
    # Define the variables dictionary
    variables = {
        "link_type": 11,
        "outlet": "thehindu"
    }
    rss1_links_array = []
    response_data = query_hasura_graphql(endpoint, admin_key, graphql_query, variables)
    if response_data:
        rss1_links_array = [item["rss1_link"] for item in response_data["data"]["rss1_links"]]
    #print(rss1_links_array)
    graphql_query = """
    query MyQuery($outlet: String!) {
        rss1_outlets(where: {outlet: {_eq: $outlet}}) {
            logo_url
        }
    }
    """
    # Define the variables dictionary
    variables = {
        "outlet": "thehindu"
    }
    mutation_query = """
    mutation MyMutation($objects: [rss1_articals_insert_input!] = {}) {
    insert_rss1_articals(objects: $objects, on_conflict: {constraint: rss1_articals_post_link_key}) {
        returning {
        id
        }
    }
    }
    """

    response_data = query_hasura_graphql(endpoint, admin_key, graphql_query, variables)
    for feed_link in rss1_links_array:
        NewsFeed = feedparser.parse(feed_link)
        is_default_image = 0
        logo_url = response_data['data']['rss1_outlets'][0]['logo_url']
        print("############################################################")
        print(feed_link)
        articles = []
        for entry in NewsFeed.entries:
            # print(entry.link)
            is_default_image = 0
            title = entry.title
            summary_nofil = entry.summary
            summary = re.sub('<[^<]+?>', '', summary_nofil)
            image_url = logo_url
            for link in entry.links:
                if link.type == "image/jpeg":
                    image_url= link.href
                    is_default_image = 1
                    break
            post_link = entry.link
            published = entry.published
            datevalidation = is_valid_timezone_format(published)
            if datevalidation[0]:
                hasura_timestamp = datevalidation[1]
            if check_date_format(published):
                hasura_timestamp = published
            else:
                hasura_timestamp = datetime.now().astimezone(timezone.utc).isoformat()
            if "author" in entry:
                author = entry.author
            else:
                author = "na"
            articles.append({
                    "rss1_link": feed_link,
                    "post_link": post_link,
                    "title": title,
                    "summary": summary,
                    "author": author,
                    "image_link" : image_url,
                    "post_published": hasura_timestamp,
                    "is_default_image": is_default_image,
                }
            )
            #print(feed_link, post_link, title, summary, author, image_url, hasura_timestamp, is_default_image)
        mutation_variables = {
            "objects": articles
        }
        #print({'query': mutation_query, 'variables': mutation_variables})
        out1 = mutation_hasura_graphql(endpoint = endpoint, admin_key = admin_key, mutation_query = mutation_query, mutation_variables = mutation_variables)

def update_articles_cnn():
    graphql_query = '''
    query MyQuery($outlet: String!, $link_type: Int!) {
    rss1_links(where: {rss1_link_type: {_eq: $link_type}, outlet: {_eq: $outlet}}) {
        rss1_link
    }
    }
    '''
    # Define the variables dictionary
    variables = {
        "link_type": 11,
        "outlet": "cnn"
    }
    rss1_links_array = []
    response_data = query_hasura_graphql(endpoint, admin_key, graphql_query, variables)
    if response_data:
        rss1_links_array = [item["rss1_link"] for item in response_data["data"]["rss1_links"]]
    #print(rss1_links_array)
    graphql_query = """
    query MyQuery($outlet: String!) {
        rss1_outlets(where: {outlet: {_eq: $outlet}}) {
            logo_url
        }
    }
    """
    # Define the variables dictionary
    variables = {
        "outlet": "cnn"
    }
    mutation_query = """
    mutation MyMutation($objects: [rss1_articals_insert_input!] = {}) {
    insert_rss1_articals(objects: $objects, on_conflict: {constraint: rss1_articals_post_link_key}) {
        returning {
        id
        }
    }
    }
    """

    response_data = query_hasura_graphql(endpoint, admin_key, graphql_query, variables)
    for feed_link in rss1_links_array:
        NewsFeed = feedparser.parse(feed_link)
        is_default_image = 0
        logo_url = response_data['data']['rss1_outlets'][0]['logo_url']
        print("############################################################")
        print(feed_link)
        articles = []
        for entry in NewsFeed.entries:
            # print(entry.link)
            is_default_image = 0
            title = entry.title
            summary = ''
            if 'summary' in entry:
                summary_nofil = entry.summary
                summary = re.sub('<[^<]+?>', '', summary_nofil)
            image_url = logo_url
            if 'media_content' in entry:
                image_url = entry['media_content'][0]['url']
                is_default_image = 1
            for link in entry.links:
                if link.type == "image/jpeg":
                    image_url= link.href
                    is_default_image = 1
                    break
            post_link = entry.link
            published = datetime.now(timezone.utc).isoformat()
            if 'published' in entry:
                published = entry.published
            datevalidation = is_valid_timezone_format(published)
            if datevalidation[0]:
                hasura_timestamp = datevalidation[1]
            if check_date_format(published):
                hasura_timestamp = published
            else:
                hasura_timestamp = datetime.now().astimezone(timezone.utc).isoformat()
            if "author" in entry:
                author = entry.author
            else:
                author = "na"
            articles.append({
                    "rss1_link": feed_link,
                    "post_link": post_link,
                    "title": title,
                    "summary": summary,
                    "author": author,
                    "image_link" : image_url,
                    "post_published": hasura_timestamp,
                    "is_default_image": is_default_image,
                }
            )
            #print(feed_link, post_link, title, summary, author, image_url, hasura_timestamp, is_default_image)
        mutation_variables = {
            "objects": articles
        }
        #print({'query': mutation_query, 'variables': mutation_variables})
        out1 = mutation_hasura_graphql(endpoint = endpoint, admin_key = admin_key, mutation_query = mutation_query, mutation_variables = mutation_variables)

def update_articles_foxnews():
    graphql_query = '''
    query MyQuery($outlet: String!, $link_type: Int!) {
    rss1_links(where: {rss1_link_type: {_eq: $link_type}, outlet: {_eq: $outlet}}) {
        rss1_link
    }
    }
    '''
    # Define the variables dictionary
    variables = {
        "link_type": 11,
        "outlet": "foxnews"
    }
    rss1_links_array = []
    response_data = query_hasura_graphql(endpoint, admin_key, graphql_query, variables)
    if response_data:
        rss1_links_array = [item["rss1_link"] for item in response_data["data"]["rss1_links"]]
    #print(rss1_links_array)
    graphql_query = """
    query MyQuery($outlet: String!) {
        rss1_outlets(where: {outlet: {_eq: $outlet}}) {
            logo_url
        }
    }
    """
    # Define the variables dictionary
    variables = {
        "outlet": "foxnews"
    }
    mutation_query = """
    mutation MyMutation($objects: [rss1_articals_insert_input!] = {}) {
    insert_rss1_articals(objects: $objects, on_conflict: {constraint: rss1_articals_post_link_key}) {
        returning {
        id
        }
    }
    }
    """

    response_data = query_hasura_graphql(endpoint, admin_key, graphql_query, variables)
    for feed_link in rss1_links_array:
        NewsFeed = feedparser.parse(feed_link)
        is_default_image = 0
        logo_url = response_data['data']['rss1_outlets'][0]['logo_url']
        print("############################################################")
        print(feed_link)
        articles = []
        for entry in NewsFeed.entries:
            # print(entry.link)
            is_default_image = 0
            title = entry.title
            summary = ''
            if 'summary' in entry:
                summary_nofil = entry.summary
                summary = re.sub('<[^<]+?>', '', summary_nofil)
            image_url = logo_url
            if 'media_content' in entry:
                image_url = entry['media_content'][0]['url']
                is_default_image = 1
            if 'links' in entry:
                for link in entry.links:
                    if link.type == "image/jpeg":
                        image_url= link.href
                        is_default_image = 1
                        break
            post_link = entry.link
            published = datetime.now(timezone.utc).isoformat()
            if 'published' in entry:
                published = entry.published
            datevalidation = is_valid_timezone_format(published)
            if datevalidation[0]:
                hasura_timestamp = datevalidation[1]
            if check_date_format(published):
                hasura_timestamp = published
            else:
                hasura_timestamp = datetime.now().astimezone(timezone.utc).isoformat()
            if "author" in entry:
                author = entry.author
            else:
                author = "na"
            articles.append({
                    "rss1_link": feed_link,
                    "post_link": post_link,
                    "title": title,
                    "summary": summary,
                    "author": author,
                    "image_link" : image_url,
                    "post_published": hasura_timestamp,
                    "is_default_image": is_default_image,
                }
            )
            #print(feed_link, post_link, title, summary, author, image_url, hasura_timestamp, is_default_image)
        mutation_variables = {
            "objects": articles
        }
        #print({'query': mutation_query, 'variables': mutation_variables})
        out1 = mutation_hasura_graphql(endpoint = endpoint, admin_key = admin_key, mutation_query = mutation_query, mutation_variables = mutation_variables)

def load_image_from_url(url):
    response = requests.get(url)
    if response.status_code == 200:
        return cv2.imdecode(np.frombuffer(response.content, np.uint8), -1)
    else:
        return None

def update_article_detail_toi_with_images():
    graphql_query = '''
    query MyQuery($limit: Int!, $offset: Int!) {
        rss1_articals(offset: $offset, limit: $limit, where: {is_in_detail: {_eq: 0}, rss1LinkByRss1Link: {outlet: {_eq: "timesofindia"}}}, order_by: {post_published: desc}) {
            post_link
            is_default_image
            image_link
            id
        }
        }
    '''
    offset = 0
    mutation_query = """
    mutation MyMutation($objects: [rss1_articles_detail_insert_input!] = {}, $updates: [rss1_articals_updates!] = {where: {}}) {
    insert_rss1_articles_detail(objects: $objects, on_conflict: {constraint: rss1_articles_detail_post_link_key}) {
        affected_rows
        returning {
        id
        }
    }
    update_rss1_articals_many(updates: $updates) {
        affected_rows
        returning {
        id
        }
    }
    }
    """    
    options = webdriver.EdgeOptions()
    options.use_chromium = True
    options.add_argument('--enable-immersive-reader')
    driver = webdriver.Edge(options=options)
    while True:
        variables = {
        "limit": 2,
        "offset": offset
        }
        response_data = query_hasura_graphql(endpoint, admin_key, graphql_query, variables)
        #print(variables, response_data)
        #print(response_data)
        post_links_array = []
        ids=[]
        if response_data:
            post_links_array = [item["post_link"] for item in response_data["data"]["rss1_articals"]]
            is_default_image_array = [item["is_default_image"] for item in response_data["data"]["rss1_articals"]]
            image_link_array = [item["image_link"] for item in response_data["data"]["rss1_articals"]]
            ids=[item["id"] for item in response_data["data"]["rss1_articals"]]
        articles_detail = []
        articles_update = []
        if len(post_links_array) == 0:
            break
        try:
            for a in range(len(post_links_array)):
                main_link = post_links_array[a]
                print(main_link)
                read_link= "read://"+main_link
                driver.get(read_link)
                time.sleep(5)
                ActionChains(driver).key_down(Keys.CONTROL).send_keys('a').key_up(Keys.CONTROL).perform()
                ActionChains(driver).key_down(Keys.CONTROL).send_keys('c').key_up(Keys.CONTROL).perform()
                text = pyperclip.paste()
                text2 = text
                text3 = text2.split('\n')
                text3 = [s.replace('\r', '') for s in text3]
                special_chars = set("!@#$%^&*()_+[]{}|;:'\",<>?")
                text4 = [s for s in text3 if len(s) > 0 and (s[0] not in special_chars or s[-1] not in special_chars)]
                my_list = text4
                my_set = set()
                desription = []
                for item in my_list:
                    if item not in my_set:
                        desription.append(item)
                        my_set.add(item)
                #print(desription)
                driver.get(main_link)
                time.sleep(5)
                try: 
                    xpath = f"""//*/img[@alt="{desription[0]}"]"""
                    elements = driver.find_elements(By.XPATH, xpath)
                    #print(elements)
                    images = [element.get_attribute("src") for element in elements]
                    #print(images)
                except:
                    images = []
                if is_default_image_array[a] == 1:
                    images_1 = [image_link_array[a]] + images
                else:
                    images_1 = images
                images_final = list(set(images_1))
                images_to_remove = []
                if len(images_final) > 1:
                    image_urls = images_final
                    for i in range(len(image_urls)):
                        for j in range(i+1, len(image_urls)):
                            image1 = load_image_from_url(image_urls[i])
                            image2 = load_image_from_url(image_urls[j])

                            if image1 is None or image2 is None:
                                print(f"Failed to load one or both images for comparison between image {i+1} and image {j+1}.")
                                continue

                            # Resize the images to the same dimensions for comparison
                            height, width, _ = image1.shape
                            image3 = cv2.resize(image2, (width, height))

                            # Convert the images to grayscale
                            gray_image1 = cv2.cvtColor(image1, cv2.COLOR_BGR2GRAY)
                            gray_image2 = cv2.cvtColor(image3, cv2.COLOR_BGR2GRAY)

                            # Calculate the structural similarity
                            result = match_template(gray_image1, gray_image2)
                            ssim = np.max(result)

                            # Display the SSIM value (a higher value indicates more similarity)
                            #print(f"SSIM between image {i+1} and image {j+1}: {ssim}")
                            height1, width1, _ = image1.shape
                            height2, width2, _ = image2.shape
                            height1, width1, _ = image1.shape
                            height2, width2, _ = image2.shape
                            if height1 * width1 >= height2 * width2 and ssim > 0.8:
                                images_to_remove.append(image_urls[i+1])
                            elif height1 * width1 < height2 * width2 and ssim > 0.8:
                                images_to_remove.append(image_urls[i])

                    # Remove the images that were marked for removal
                    for image_url in images_to_remove:
                        if image_url in images_final:
                            images_final.remove(image_url)
                #print(images_final)
                articles_detail.append({
                    "article_id": ids[a],
                    "title": desription[0],
                    "discription": desription[1:],
                    "image_link": images_final,
                }
                )
                if (is_default_image_array[a] == 0 and len(images_final) > 0):
                    articles_update.append({
                        "where": {"post_link" : { "_eq": main_link }},
                        "_set": {"is_in_detail": 1 , "image_link": images_final[0], "is_default_image": 1}
                    })
                else:
                    articles_update.append({
                        "where": {"post_link" : { "_eq": main_link }},
                        "_set": {"is_in_detail": 1}
                    })
                
                #print(main_link, desription[0], desription[1:], images_final)
            #print(articles_update)
            mutation_variables = {
            "objects": articles_detail,
            "updates": articles_update,
            }
            out1 = mutation_hasura_graphql(endpoint=endpoint, admin_key=admin_key, mutation_query=mutation_query, mutation_variables=mutation_variables)
        except:
            offset = offset + 1
            mutation_variables = {
            "objects": articles_detail,
            "updates": articles_update,
            }
            out1 = mutation_hasura_graphql(endpoint=endpoint, admin_key=admin_key, mutation_query=mutation_query, mutation_variables=mutation_variables)
        
    driver.quit() 

def update_article_detail_toi():
    graphql_query = '''
    query MyQuery($limit: Int!, $offset: Int!) {
        rss1_articals(offset: $offset, limit: $limit, where: {is_in_detail: {_eq: 0}, rss1LinkByRss1Link: {outlet: {_eq: "timesofindia"}}}, order_by: {post_published: desc}) {
            post_link
            is_default_image
            image_link
            id
        }
        }
    '''
    offset = 0
    mutation_query = """
    mutation MyMutation($objects: [rss1_articles_detail_insert_input!] = {}, $updates: [rss1_articals_updates!] = {where: {}}) {
    insert_rss1_articles_detail(objects: $objects, on_conflict: {constraint: rss1_articles_detail_article_id_key}) {
        affected_rows
        returning {
        id
        }
    }
    update_rss1_articals_many(updates: $updates) {
        affected_rows
        returning {
        id
        }
    }
    }
    """    
    options = webdriver.EdgeOptions()
    options.use_chromium = True
    options.add_argument('--enable-immersive-reader')
    driver = webdriver.Edge(options=options)
    while True:
        variables = {
        "limit": 20,
        "offset": offset
        }
        response_data = query_hasura_graphql(endpoint, admin_key, graphql_query, variables)
        #print(variables, response_data)
        #print(response_data)
        post_links_array = []
        ids=[]
        if response_data:
            post_links_array = [item["post_link"] for item in response_data["data"]["rss1_articals"]]
            is_default_image_array = [item["is_default_image"] for item in response_data["data"]["rss1_articals"]]
            image_link_array = [item["image_link"] for item in response_data["data"]["rss1_articals"]]
            ids=[item["id"] for item in response_data["data"]["rss1_articals"]]
        articles_detail = []
        articles_update = []
        if len(post_links_array) == 0:
            break
        try:
            for a in range(len(post_links_array)):
                main_link = post_links_array[a]
                print(main_link)
                read_link= "read://"+main_link
                driver.get(read_link)
                time.sleep(5)
                ActionChains(driver).key_down(Keys.CONTROL).send_keys('a').key_up(Keys.CONTROL).perform()
                ActionChains(driver).key_down(Keys.CONTROL).send_keys('c').key_up(Keys.CONTROL).perform()
                text = pyperclip.paste()
                text2 = text
                text3 = text2.split('\n')
                text3 = [s.replace('\r', '') for s in text3]
                special_chars = set("!@#$%^&*()_+[]{}|;:'\",<>?")
                text4 = [s for s in text3 if len(s) > 0 and (s[0] not in special_chars or s[-1] not in special_chars)]
                my_list = text4
                my_set = set()
                desription = []
                for item in my_list:
                    if item not in my_set:
                        desription.append(item)
                        my_set.add(item)
                #print(desription)
                images_final = []
                articles_detail.append({
                    "article_id": ids[a],
                    "title": desription[0],
                    "description": desription[1:],
                    "image_link": images_final,
                }
                )
                if (is_default_image_array[a] == 0 and len(images_final) > 0):
                    articles_update.append({
                        "where": {"post_link" : { "_eq": main_link }},
                        "_set": {"is_in_detail": 1 , "image_link": images_final[0], "is_default_image": 1}
                    })
                else:
                    articles_update.append({
                        "where": {"post_link" : { "_eq": main_link }},
                        "_set": {"is_in_detail": 1}
                    })
                
                #print(main_link, desription[0], desription[1:], images_final)
            #print(articles_update)
            mutation_variables = {
            "objects": articles_detail,
            "updates": articles_update,
            }
            out1 = mutation_hasura_graphql(endpoint=endpoint, admin_key=admin_key, mutation_query=mutation_query, mutation_variables=mutation_variables)
        except:
            offset = offset + 1
            mutation_variables = {
            "objects": articles_detail,
            "updates": articles_update,
            }
            out1 = mutation_hasura_graphql(endpoint=endpoint, admin_key=admin_key, mutation_query=mutation_query, mutation_variables=mutation_variables)
        
    driver.quit() 

def update_article_detail_cnn(offset1):
    graphql_query = '''
    query MyQuery($limit: Int!, $offset: Int!) {
        rss1_articals(offset: $offset, limit: $limit, where: {is_in_detail: {_eq: 0}, rss1LinkByRss1Link: {outlet: {_eq: "cnn"}}}, order_by: {post_published: desc}) {
            post_link
            is_default_image
            image_link
            id
        }
        }
    '''
    offset = offset1
    mutation_query = """
    mutation MyMutation($objects: [rss1_articles_detail_insert_input!] = {}, $updates: [rss1_articals_updates!] = {where: {}}) {
    insert_rss1_articles_detail(objects: $objects, on_conflict: {constraint: rss1_articles_detail_article_id_key}) {
        affected_rows
        returning {
        id
        }
    }
    update_rss1_articals_many(updates: $updates) {
        affected_rows
        returning {
        id
        }
    }
    }
    """    
    options = webdriver.EdgeOptions()
    options.use_chromium = True
    options.add_argument('--enable-immersive-reader')
    driver = webdriver.Edge(options=options)
    while True:
        variables = {
        "limit": 2,
        "offset": offset
        }
        response_data = query_hasura_graphql(endpoint, admin_key, graphql_query, variables)
        #print(variables, response_data)
        #print(response_data)
        post_links_array = []
        ids=[]
        if response_data:
            post_links_array = [item["post_link"] for item in response_data["data"]["rss1_articals"]]
            is_default_image_array = [item["is_default_image"] for item in response_data["data"]["rss1_articals"]]
            image_link_array = [item["image_link"] for item in response_data["data"]["rss1_articals"]]
            ids=[item["id"] for item in response_data["data"]["rss1_articals"]]
        articles_detail = []
        articles_update = []
        if len(post_links_array) == 0:
            break
        try:
            for a in range(len(post_links_array)):
                main_link = post_links_array[a]
                print(main_link)
                driver.get(main_link)
                get_url = driver.current_url
                read_link= "read://"+get_url
                driver.get(read_link)
                time.sleep(5)
                ActionChains(driver).key_down(Keys.CONTROL).send_keys('a').key_up(Keys.CONTROL).perform()
                ActionChains(driver).key_down(Keys.CONTROL).send_keys('c').key_up(Keys.CONTROL).perform()
                text = pyperclip.paste()
                text2 = text
                text3 = text2.split('\n')
                text3 = [s.replace('\r', '') for s in text3]
                special_chars = set("!@#$%^&*()_+[]{}|;:'\",<>?")
                text4 = [s for s in text3 if len(s) > 0 and (s[0] not in special_chars or s[-1] not in special_chars)]
                my_list = text4
                my_set = set()
                desription = []
                for item in my_list:
                    if item not in my_set:
                        desription.append(item)
                        my_set.add(item)
                #print(desription)
                images_final = []
                articles_detail.append({
                    "article_id": ids[a],
                    "title": desription[0],
                    "description": desription[1:],
                    "image_link": images_final,
                }
                )
                if (is_default_image_array[a] == 0 and len(images_final) > 0):
                    articles_update.append({
                        "where": {"post_link" : { "_eq": main_link }},
                        "_set": {"is_in_detail": 1 , "image_link": images_final[0], "is_default_image": 1}
                    })
                else:
                    articles_update.append({
                        "where": {"post_link" : { "_eq": main_link }},
                        "_set": {"is_in_detail": 1}
                    })
                
                #print(main_link, desription[0], desription[1:], images_final)
            #print(articles_update)
            mutation_variables = {
            "objects": articles_detail,
            "updates": articles_update,
            }
            out1 = mutation_hasura_graphql(endpoint=endpoint, admin_key=admin_key, mutation_query=mutation_query, mutation_variables=mutation_variables)
        except:
            offset = offset + 1
            mutation_variables = {
            "objects": articles_detail,
            "updates": articles_update,
            }
            out1 = mutation_hasura_graphql(endpoint=endpoint, admin_key=admin_key, mutation_query=mutation_query, mutation_variables=mutation_variables)
        
    driver.quit() 

def update_article_detail_foxnews(offset1):
    graphql_query = '''
    query MyQuery($limit: Int!, $offset: Int!) {
        rss1_articals(offset: $offset, limit: $limit, where: {is_in_detail: {_eq: 0}, rss1LinkByRss1Link: {outlet: {_eq: "thehindu"}}}, order_by: {post_published: desc}) {
            post_link
            is_default_image
            image_link
            id
        }
        }
    '''
    offset = offset1
    mutation_query = """
    mutation MyMutation($objects: [rss1_articles_detail_insert_input!] = {}, $updates: [rss1_articals_updates!] = {where: {}}) {
    insert_rss1_articles_detail(objects: $objects, on_conflict: {constraint: rss1_articles_detail_article_id_key}) {
        affected_rows
        returning {
        id
        }
    }
    update_rss1_articals_many(updates: $updates) {
        affected_rows
        returning {
        id
        }
    }
    }
    """    
    options = webdriver.EdgeOptions()
    options.use_chromium = True
    options.add_argument('--enable-immersive-reader')
    driver = webdriver.Edge(options=options)
    while True:
        variables = {
        "limit": 2,
        "offset": offset
        }
        response_data = query_hasura_graphql(endpoint, admin_key, graphql_query, variables)
        #print(variables, response_data)
        #print(response_data)
        post_links_array = []
        ids=[]
        if response_data:
            post_links_array = [item["post_link"] for item in response_data["data"]["rss1_articals"]]
            is_default_image_array = [item["is_default_image"] for item in response_data["data"]["rss1_articals"]]
            image_link_array = [item["image_link"] for item in response_data["data"]["rss1_articals"]]
            ids=[item["id"] for item in response_data["data"]["rss1_articals"]]
        articles_detail = []
        articles_update = []
        if len(post_links_array) == 0:
            break
        try:
            for a in range(len(post_links_array)):
                main_link = post_links_array[a]
                print(main_link)
                driver.get(main_link)
                get_url = driver.current_url
                read_link= "read://"+get_url
                driver.get(read_link)
                time.sleep(5)
                ActionChains(driver).key_down(Keys.CONTROL).send_keys('a').key_up(Keys.CONTROL).perform()
                ActionChains(driver).key_down(Keys.CONTROL).send_keys('c').key_up(Keys.CONTROL).perform()
                text = pyperclip.paste()
                text2 = text
                text3 = text2.split('\n')
                text3 = [s.replace('\r', '') for s in text3]
                special_chars = set("!@#$%^&*()_+[]{}|;:'\",<>?")
                text4 = [s for s in text3 if len(s) > 0 and (s[0] not in special_chars or s[-1] not in special_chars)]
                my_list = text4
                my_set = set()
                desription = []
                for item in my_list:
                    if item not in my_set:
                        desription.append(item)
                        my_set.add(item)
                #print(desription)
                images_final = []
                articles_detail.append({
                    "article_id": ids[a],
                    "title": desription[0],
                    "description": desription[1:],
                    "image_link": images_final,
                }
                )
                if (is_default_image_array[a] == 0 and len(images_final) > 0):
                    articles_update.append({
                        "where": {"post_link" : { "_eq": main_link }},
                        "_set": {"is_in_detail": 1 , "image_link": images_final[0], "is_default_image": 1}
                    })
                else:
                    articles_update.append({
                        "where": {"post_link" : { "_eq": main_link }},
                        "_set": {"is_in_detail": 1}
                    })
                
                #print(main_link, desription[0], desription[1:], images_final)
            #print(articles_update)
            mutation_variables = {
            "objects": articles_detail,
            "updates": articles_update,
            }
            out1 = mutation_hasura_graphql(endpoint=endpoint, admin_key=admin_key, mutation_query=mutation_query, mutation_variables=mutation_variables)
        except:
            offset = offset + 1
            mutation_variables = {
            "objects": articles_detail,
            "updates": articles_update,
            }
            out1 = mutation_hasura_graphql(endpoint=endpoint, admin_key=admin_key, mutation_query=mutation_query, mutation_variables=mutation_variables)
        
    driver.quit() 

def summerizer(offset1): 
    summarizer = pipeline("summarization", model="facebook/bart-large-cnn", device=0)
    graphql_query = '''
    query MyQuery($limit: Int!, $offset: Int!) {
    rss1_articles_detail(limit: $limit, offset: $offset, where: {summary: {_is_null: true}}) {
        title
        description
        rss1_artical {
        title
        summary
        }
        article_id
    }
    }
    '''
    offset = offset1
    mutation_query = """
    mutation MyMutation($updates: [rss1_articles_detail_updates!] = {where: {}}) {
        update_rss1_articles_detail_many(updates: $updates) {
            affected_rows
            returning {
            id
            }
        }
        }
    """
    while True:
        variables = {
        "limit": 2,
        "offset": offset
        }
        rss1_articles_detail_updates = []
        response_data = query_hasura_graphql(endpoint, admin_key, graphql_query, variables)
        if len(response_data['data']['rss1_articles_detail']) == 0:
            break
        for response in response_data['data']['rss1_articles_detail']:
            print(response['title'])
            article=""
            article = article + response['title'] + " " +  response['rss1_artical']['title'] + " " +  response['rss1_artical']['summary'] + ', '.join(response['description'])
            chunks=[]
            max_length = 0
            min_length = 0
            if len(article) < 1000:
                max_length = 150
                min_length = 100
                chunks.append(article)
            elif len(article) < 3000:
                max_length = 300
                min_length = 200
                chunks.append(article)
            elif len(article) < 4000:
                max_length = 400
                min_length = 250
                chunks.append(article)
            elif len(article) < 8000:
                max_length = 200
                min_length = 150
                midpoint = len(article) // 2
                chunks.append(article[:midpoint])
                chunks.append(article[midpoint:])
            else:
                article=article[:8000]
                max_length = 200
                min_length = 150
                midpoint = len(article) // 2
                chunks.append(article[:midpoint])
                chunks.append(article[midpoint:])

            summerize=""
            for chunk in chunks:
                summerize=summerize + summarizer(chunk, max_length=max_length, min_length=min_length, do_sample=False)[0]['summary_text']+ " "
            if len(summerize) > 0:
                rss1_articles_detail_updates.append({
                    "where": {"article_id" : { "_eq": response['article_id'] }},
                    "_set": {"summary": summerize }
                })
        mutation_variables = {
            "updates": rss1_articles_detail_updates,
            }
        out1 = mutation_hasura_graphql(endpoint=endpoint, admin_key=admin_key, mutation_query=mutation_query, mutation_variables=mutation_variables)
        

In [2]:
#update_articles_toi()
#update_articles_thehindu()
#update_articles_cnn()
#update_articles_foxnews()

#update_article_detail_toi()
#update_article_detail_cnn(0)
#update_article_detail_foxnews(0)
summerizer(10)

US pours billions into Cold War submarine program as China bolsters navy


In [8]:
graphql_query = '''
query MyQuery($title: String!) {
  rss1_articals(where: {title: {_iregex: $title}}) {
    title
    summary
    id
  }
}

'''
# Define the variables dictionary
variables = {
    "title": "trump"
}
response_data = query_hasura_graphql(endpoint, admin_key, graphql_query, variables)
articles = []
ids = []
for article  in response_data['data']['rss1_articals']:
  articles.append([article['title'], article['summary']])
  ids.append(article['id'])
graphql_query = '''
query MyQuery($title: String!) {
  rss1_articals(where: {title: {_iregex: $title}}) {
    title
    summary
    id
  }
}

'''
# Define the variables dictionary
variables = {
    "title": "babu"
}
response_data = query_hasura_graphql(endpoint, admin_key, graphql_query, variables)
for article  in response_data['data']['rss1_articals']:
  articles.append([article['title'], article['summary']])
  ids.append(article['id'])

In [9]:

s1= []
for i in range(0, len(articles) - 1):
    s1.append([['Represent the news article sentence for custering and retrieval:  ', articles[i][0] +" " + articles[i][1]]])
    


In [6]:
from InstructorEmbedding import INSTRUCTOR
import numpy
model = INSTRUCTOR('hkunlp/instructor-xl', device=0)


load INSTRUCTOR_Transformer
max_seq_length  512


In [10]:
embeddings = []
for s in s1:
    list_embeddings = numpy.ravel(model.encode(s)).tolist()
    embeddings.append(list_embeddings)
print(len(embeddings))
print(len(embeddings[0]))

In [95]:

mutation_query = """
mutation MyMutation($objects: [articles_vector1_insert_input!] = {}) {
  insert_articles_vector1(objects: $objects, on_conflict: {constraint: articles_vector1_article_id_key}) {
    affected_rows
    returning {
      article_id
    }
  }
}
"""
embeds=[]
for i in range(0,len(ids)-1):
  embeds.append({
    "article_id": ids[i],
    "embedding": str(embeddings[i]),
    }
    )
mutation_variables = {
    "objects": embeds
}
#print({'query': mutation_query, 'variables': mutation_variables})
out1 = mutation_hasura_graphql(endpoint = endpoint, admin_key = admin_key, mutation_query = mutation_query, mutation_variables = mutation_variables)

{'data': {'insert_articles_vector1': {'affected_rows': 11, 'returning': [{'article_id': 12}, {'article_id': 98}, {'article_id': 106}, {'article_id': 666}, {'article_id': 1042}, {'article_id': 1043}, {'article_id': 1044}, {'article_id': 1045}, {'article_id': 1272}, {'article_id': 127}, {'article_id': 608}]}}}


In [16]:
options = webdriver.EdgeOptions()
options.use_chromium = True
options.add_argument('--enable-immersive-reader')
driver = webdriver.Edge(options=options)


In [21]:
read_link="https://www.cnn.com/2023/09/02/americas/guatemalan-president-elect-arevalo-alleged-coup-inlt-hnk/index.html"
driver.get(read_link)
get_url = driver.current_url
read_link= "read://"+get_url
driver.get(read_link)

In [1]:


ARTICLE = ''' New York (CNN)When Liana Barrientos was 23 years old, she got married in Westchester County, New York.
A year later, she got married again in Westchester County, but to a different man and without divorcing her first husband.
Only 18 days after that marriage, she got hitched yet again. Then, Barrientos declared "I do" five more times, sometimes only within two weeks of each other.
In 2010, she married once more, this time in the Bronx. In an application for a marriage license, she stated it was her "first and only" marriage.
Barrientos, now 39, is facing two criminal counts of "offering a false instrument for filing in the first degree," referring to her false statements on the
2010 marriage license application, according to court documents.
Prosecutors said the marriages were part of an immigration scam.
On Friday, she pleaded not guilty at State Supreme Court in the Bronx, according to her attorney, Christopher Wright, who declined to comment further.
After leaving court, Barrientos was arrested and charged with theft of service and criminal trespass for allegedly sneaking into the New York subway through an emergency exit, said Detective
Annette Markowski, a police spokeswoman. In total, Barrientos has been married 10 times, with nine of her marriages occurring between 1999 and 2002.
All occurred either in Westchester County, Long Island, New Jersey or the Bronx. She is believed to still be married to four men, and at one time, she was married to eight men at once, prosecutors say.
Prosecutors said the immigration scam involved some of her husbands, who filed for permanent residence status shortly after the marriages.
Any divorces happened only after such filings were approved. It was unclear whether any of the men will be prosecuted.
The case was referred to the Bronx District Attorney\'s Office by Immigration and Customs Enforcement and the Department of Homeland Security\'s
Investigation Division. Seven of the men are from so-called "red-flagged" countries, including Egypt, Turkey, Georgia, Pakistan and Mali.
Her eighth husband, Rashid Rajput, was deported in 2006 to his native Pakistan after an investigation by the Joint Terrorism Task Force.
If convicted, Barrientos faces up to four years in prison.  Her next court appearance is scheduled for May 18.New York (CNN)When Liana Barrientos was 23 years old, she got married in Westchester County, New York.
A year later, she got married again in Westchester County, but to a different man and without divorcing her first husband.
Only 18 days after that marriage, she got hitched yet again. Then, Barrientos declared "I do" five more times, sometimes only within two weeks of each other.
In 2010, she married once more, this time in the Bronx. In an application for a marriage license, she stated it was her "first and only" marriage.
Barrientos, now 39, is facing two criminal counts of "offering a false instrument for filing in the first degree," referring to her false statements on the
2010 marriage license application, according to court documents.
Prosecutors said the marriages were part of an immigration scam.
On Friday, she pleaded not guilty at State Supreme Court in the Bronx, according to her attorney, Christopher Wright, who declined to comment further.
After leaving court, Barrientos was arrested and charged with theft of service and criminal trespass for allegedly sneaking into the New York subway through an emergency exit, said Detective
Annette Markowski, a police spokeswoman. In total, Barrientos has been married 10 times, with nine of her marriages occurring between 1999 and 2002.
All occurred either in Westchester County, Long Island, New Jersey or the Bronx. She is believed to still be married to four men, and at one time, she was married to eight men at once, prosecutors say.
Prosecutors said the immigration scam involved some of her husbands, who filed for permanent residence status shortly after the marriages.
Any divorces happened only after such filings were approved. It was unclear whether any of the men will be prosecuted.
The case was referred to the Bronx District Attorney\'s Office by Immigration and Customs Enforcement and the Department of Homeland Security\'s
Investigation Division. Seven of the men are from so-called "red-flagged" countries, including Egypt, Turkey, Georgia, Pakistan and Mali.
Her eighth husband, Rashid Rajput, was deported in 2006 to his native Pakistan after an investigation by the Joint Terrorism Task Force.
If convicted, Barrientos faces up to four years in prison.  Her next court appearance is scheduled for May 18.New York (CNN)When Liana Barrientos was 23 years old, she got married in Westchester County, New York.
A year later, she got married again in Westchester County, but to a different man and without divorcing her first husband.
Only 18 days after that marriage, she got hitched yet again. Then, Barrientos declared "I do" five more times, sometimes only within two weeks of each other.
In 2010, she married once more, this time in the Bronx. In an application for a marriage license, she stated it was her "first and only" marriage.
Barrientos, now 39, is facing two criminal counts of "offering a false instrument for filing in the first degree," referring to her false statements on the
2010 marriage license application, according to court documents.
Prosecutors said the marriages were part of an immigration scam.
On Friday, she pleaded not guilty at State Supreme Court in the Bronx, according to her attorney, Christopher Wright, who declined to comment further.
After leaving court, Barrientos was arrested and charged with theft of service and criminal trespass for allegedly sneaking into the New York subway through an emergency exit, said Detective
Annette Markowski, a police spokeswoman. In total, Barrientos has been married 10 times, with nine of her marriages occurring between 1999 and 2002.
All occurred either in Westchester County, Long Island, New Jersey or the Bronx. She is believed to still be married to four men, and at one time, she was married to eight men at once, prosecutors say.
Prosecutors said the immigration scam involved some of her husbands, who filed for permanent residence status shortly after the marriages.
Any divorces happened only after such filings were approved. It was unclear whether any of the men will be prosecuted.
The case was referred to the Bronx District Attorney\'s Office by Immigration and Customs Enforcement and the Department of Homeland Security\'s
Investigation Division. Seven of the men are from so-called "red-flagged" countries, including Egypt, Turkey, Georgia, Pakistan and Mali.
Her eighth husband, Rashid Rajput, was deported in 2006 to his native Pakistan after an investigation by the Joint Terrorism Task Force.
If convicted, Barrientos faces up to four years in prison.  Her next court appearance is scheduled for May 18.New York (CNN)When Liana Barrientos was 23 years old, she got married in Westchester County, New York.
A year later, she got married again in Westchester County, but to a different man and without divorcing her first husband.
Only 18 days after that marriage, she got hitched yet again. Then, Barrientos declared "I do" five more times, sometimes only within two weeks of each other.
In 2010, she married once more, this time in the Bronx. In an application for a marriage license, she stated it was her "first and only" marriage.
Barrientos, now 39, is facing two criminal counts of "offering a false instrument for filing in the first degree," referring to her false statements on the
2010 marriage license application, according to court documents.
'''


In [1]:
import torch

# create a tensor on the CPU
x = torch.randn(10, 10)


# move the tensor to the GPU
x = x.to("cuda")

# check if the tensor is on the GPU
print(x.device)  # should print "cuda:0"

cuda:0


In [13]:
chunks=[]
max_length = 0
min_length = 0
if len(ARTICLE) < 1000:
    max_length = 150
    min_length = 100
    chunks.append(ARTICLE)
elif len(ARTICLE) < 3000:
    max_length = 300
    min_length = 200
    chunks.append(ARTICLE)
elif len(ARTICLE) < 4000:
    max_length = 400
    min_length = 250
    chunks.append(ARTICLE)
elif len(ARTICLE) < 8000:
    max_length = 200
    min_length = 150
    midpoint = len(ARTICLE) // 2
    chunks.append(ARTICLE[:midpoint])
    chunks.append(ARTICLE[midpoint:])
else:
    ARTICLE=ARTICLE[:8000]
    max_length = 200
    min_length = 150
    midpoint = len(ARTICLE) // 2
    chunks.append(ARTICLE[:midpoint])
    chunks.append(ARTICLE[midpoint:])

summerize=""
for chunk in chunks:
    summerize=summerize + summarizer(chunk, max_length=max_length, min_length=min_length, do_sample=False)[0]['summary_text']+ " "

print(summerize)

['Liana Barrientos has been married 10 times, with nine of her marriages occurring between 1999 and 2002. In 2010, she married once more, this time in the Bronx. In an application for a marriage license, she stated it was her "first and only" marriage. She is facing two criminal counts of "offering a false instrument for filing in the first degree," referring to her false statements on the 2010 marriage license application. If convicted, she faces up to four years in prison. Her next court appearance is scheduled for May 18, according to her attorney, Christopher Wright, who declined to comment further. The case was referred to the Bronx District Attorney\'s Office by Immigration and Customs Enforcement and the Department of Homeland Security\'sInvestigation Division.', "Liana Barrientos has been married 10 times, with nine of her marriages occurring between 1999 and 2002. She is believed to still be married to four men, and at one time, she was married to eight men at once. Her eighth

In [18]:
from transformers import pipeline
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli", device=0)


In [20]:
sequence_to_classify = """
Scientists identify secret ingredient in Leonardo da Vinci paintings
01b old masters egg yolk painting mona lisa RESTRICTED
Editor’s Note: Sign up for CNN’s Wonder Theory science newsletter. Explore the universe with news on fascinating discoveries, scientific advancements and more.

CNN  — 
“Old Masters” such as Leonardo da Vinci, Sandro Botticelli and Rembrandt may have used proteins, especially egg yolk, in their oil paintings, according to a new study.

Trace quantities of protein residue have long been detected in classic oil paintings, though they were often ascribed to contamination. A new study published Tuesday in the journal Nature Communications found the inclusion was likely intentional — and sheds light on the technical knowledge of the Old Masters, the most skilled European painters of the 16th, 17th, or early 18th century, and the way they prepared their paints.

“There are very few written sources about this and no scientific work has been done before to investigate the subject in such depth,” said study author Ophélie Ranquet of the Institute of Mechanical Process Engineering and Mechanics at the Karlsruhe Institute of Technology in Germany, in a phone interview. “Our results show that even with a very small amount of egg yolk, you can achieve an amazing change of properties in the oil paint, demonstrating how it might have been beneficial for the artists.”

Simply adding some egg yolk to their works, it turns out, could have long-lasting effects that went beyond just aesthetics.

Eggs vs. oil
Compared with the medium formulated by ancient Egyptians called tempera — which combines egg yolk with powdered pigments and water — oil paint creates more intense colors, allows for very smooth color transitions and dries far less quickly, so it can be used for several days after its preparation. However, oil paint, which uses linseed or safflower oil instead of water, also has drawbacks, including being more susceptible to color darkening and damage caused by exposure to light.

Because making paint was an artisanal and experimental process, it is possible that the Old Masters might have added egg yolk, a familiar ingredient, to the newer type of paint, which first showed up in the seventh century in Central Asia before spreading to Northern Europe in the Middle Ages and Italy during the Renaissance. In the study, the researchers recreated the process of paint-making by using four ingredients — egg yolk, distilled water, linseed oil and pigment — to mix two historically popular and significant colors, lead white and ultramarine blue.

“The addition of egg yolk is beneficial because it can tune the properties of these paints in a drastic way,” Ranquet said, “For example by showing aging differently: It takes a longer time for the paint to oxidize, because of the antioxidants contained in the yolk.”

The chemical reactions between the oil, the pigment and the proteins in the yolk directly affect the paint’s behavior and viscosity. “For example, the lead white pigment is quite sensitive to humidity, but if you coat it with a protein layer, it makes it a lot more resistant to it, making the paint quite easy to apply,” Ranquet said.

“On the other hand, if you wanted something stiffer without having to add a lot of pigment, with a bit of egg yolk you can create a high impasto paint,” she added, referring to a painting technique where the paint is laid out in a stroke thick enough that the brushstrokes are still visible. Using less pigment would have been desirable centuries ago, when certain pigments — such as lapis lazuli, which was used to make ultramarine blue — were more expensive than gold, according to Ranquet.

"The Madonna of the Carnation," on display at the Alte Pinakothek in Munich, Germany, is one of Leonardo da Vinci's earliest paintings. Wrinkling of the oil paint is evident on the faces. 
"The Madonna of the Carnation," on display at the Alte Pinakothek in Munich, Germany, is one of Leonardo da Vinci's earliest paintings. Wrinkling of the oil paint is evident on the faces.
DeAgostini/Getty Images
A direct evidence of the effect of egg yolk in oil paint, or lack thereof, can be seen in Leonardo da Vinci’s “Madonna of the Carnation,” one of the paintings observed during the study. Currently on display at the Alte Pinakothek in Munich, Germany, the work shows evident wrinkling on the face of Mary and the child.

“Oil paint starts to dry from the surface down, which is why it wrinkles,” Ranquet said.

One reason for wrinkling may be an insufficient quantity of pigments in the paint, and the study has shown that this effect could be avoided with the addition of egg yolk: “That’s quite amazing because you have the same quantity of pigment in your paint, but the presence of the egg yolk changes everything.”

Because wrinkling occurs within days, it’s likely that Leonardo and other Old Masters might have caught onto this particular effect, as well as additional beneficial properties of egg yolk in oil paint, including resistance to humidity. The “Madonna of Carnation” is one of Leonardo’s earliest paintings, created at a time when he might have been still trying to master the then newly popular medium of oil paint.

New understanding of the classics
Another painting observed during the study was “The Lamentation Over the Dead Christ,” by Botticelli, also on display at the Alte Pinakothek. The work is mostly made with tempera, but oil paint has been used for the background and some secondary elements.

“We knew that some parts of the paintings show brushstrokes that are typical for what we call an oil painting, and yet we detected the presence of proteins,” Ranquet said. “Because it’s a very small quantity and they are difficult to detect, this might be dismissed as contamination: In workshops, artists used many different things, and maybe the eggs were just from the tempera.”

However, because adding egg yolk had such desirable effects on oil paint, the presence of proteins in the work might be an indication of deliberate use instead, the study suggested. Ranquet hopes that these preliminary findings might attract more curiosity toward this understudied topic.

Maria Perla Colombini, a professor of analytical chemistry at the University of Pisa in Italy, who was not involved in the study, agreed. “This exciting paper provides a new scenario for the understanding of old painting techniques,” she said in an email.

“The research group, reporting results from molecular level up to a macroscopic scale, contributes to a new knowledge in the use of egg yolk and oil binders. They are not more looking at simply identifying the materials used by Old Masters but explain how they could produce wonderful and glittering effects by employing and mixing the few available natural materials. They try to discover the secrets of old recipes of which little or nothing is written,” she added.

“This new knowledge contributes not only to a better conservation and preservation of artworks but also to a better comprehension of art history.”


Top image: The “Mona Lisa” by Leonardo Da Vinci
"""
candidate_labels = [ 'Politics' , 'World News' , 'Local News' , 'Business' , 'Technology' , 'Science' , 'Health' , 'Education' , 'Environment' , 'Sports' , 'Entertainment' , 'Culture' , 'Economy' , 'Finance' , 'Crime' , 'Law' , 'Immigration' , 'Climate Change' , 'Weather' , 'International Relations' , 'Government' , 'Elections' , 'Terrorism' , 'Religion' , 'Human Rights' , 'Social Issues' , 'Immigration' , 'Education Reform' , 'Healthcare' , 'COVID-19' , 'Vaccination' , 'Cybersecurity' , 'Artificial Intelligence' , 'Space Exploration' , 'Biotechnology' , 'Mental Health' , 'Travel' , 'Food' , 'Fashion' , 'Lifestyle' , 'Celebrity News' , 'Arts' , 'Music' , 'Film' , 'Television' , 'Books' , 'Gaming' , 'Sports Highlights' , 'Olympic Games' , 'Auto Industry' , 'Real Estate' , 'Stock Market' , 'Trade' , 'Infrastructure' , 'Energy' , 'Sustainability' , 'Wildlife' , 'Conservation' , 'Natural Disasters' , 'Education Policy' , 'Higher Education' , 'Student Life' , 'Parenting' , 'Aging' , 'Disability' , 'LGBTQ+ Rights' , 'Gender Equality' , 'Race Relations' , 'Immigration Policy' , 'Criminal Justice' , 'Courts' , 'Police' , 'Drug Policy' , 'Climate Policy' , 'Renewable Energy' , 'Air Pollution' , 'Oceans' , 'Space Policy' , 'Artificial Intelligence Ethics' , 'Robotics' , 'Social Media' , 'Data Privacy' , 'Hacking' , 'Astronomy' , 'Biology' , 'Psychology' , 'Fitness' , 'Nutrition' , 'TravelDestinations' , 'Culinary Trends' , 'DIY' , 'Celebrity Interviews' , 'Theater' , 'Visual Arts' , 'Pop Culture' , 'Music Festivals' , 'Gaming Industry' , 'Electric Vehicles' , 'Housing Market' , 'Small Business']
classifier(sequence_to_classify, candidate_labels)

{'sequence': '\nScientists identify secret ingredient in Leonardo da Vinci paintings\n01b old masters egg yolk painting mona lisa RESTRICTED\nEditor’s Note: Sign up for CNN’s Wonder Theory science newsletter. Explore the universe with news on fascinating discoveries, scientific advancements and more.\n\nCNN  — \n“Old Masters” such as Leonardo da Vinci, Sandro Botticelli and Rembrandt may have used proteins, especially egg yolk, in their oil paintings, according to a new study.\n\nTrace quantities of protein residue have long been detected in classic oil paintings, though they were often ascribed to contamination. A new study published Tuesday in the journal Nature Communications found the inclusion was likely intentional — and sheds light on the technical knowledge of the Old Masters, the most skilled European painters of the 16th, 17th, or early 18th century, and the way they prepared their paints.\n\n“There are very few written sources about this and no scientific work has been done 

In [16]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import nltk
#nltk.download('punkt')

tokenizer = AutoTokenizer.from_pretrained("fabiochiu/t5-base-tag-generation")
model = AutoModelForSeq2SeqLM.from_pretrained("fabiochiu/t5-base-tag-generation")

text = """
Scientists identify secret ingredient in Leonardo da Vinci paintings
01b old masters egg yolk painting mona lisa RESTRICTED
Editor’s Note: Sign up for CNN’s Wonder Theory science newsletter. Explore the universe with news on fascinating discoveries, scientific advancements and more.

CNN  — 
“Old Masters” such as Leonardo da Vinci, Sandro Botticelli and Rembrandt may have used proteins, especially egg yolk, in their oil paintings, according to a new study.

Trace quantities of protein residue have long been detected in classic oil paintings, though they were often ascribed to contamination. A new study published Tuesday in the journal Nature Communications found the inclusion was likely intentional — and sheds light on the technical knowledge of the Old Masters, the most skilled European painters of the 16th, 17th, or early 18th century, and the way they prepared their paints.

“There are very few written sources about this and no scientific work has been done before to investigate the subject in such depth,” said study author Ophélie Ranquet of the Institute of Mechanical Process Engineering and Mechanics at the Karlsruhe Institute of Technology in Germany, in a phone interview. “Our results show that even with a very small amount of egg yolk, you can achieve an amazing change of properties in the oil paint, demonstrating how it might have been beneficial for the artists.”

Simply adding some egg yolk to their works, it turns out, could have long-lasting effects that went beyond just aesthetics.

Eggs vs. oil
Compared with the medium formulated by ancient Egyptians called tempera — which combines egg yolk with powdered pigments and water — oil paint creates more intense colors, allows for very smooth color transitions and dries far less quickly, so it can be used for several days after its preparation. However, oil paint, which uses linseed or safflower oil instead of water, also has drawbacks, including being more susceptible to color darkening and damage caused by exposure to light.

Because making paint was an artisanal and experimental process, it is possible that the Old Masters might have added egg yolk, a familiar ingredient, to the newer type of paint, which first showed up in the seventh century in Central Asia before spreading to Northern Europe in the Middle Ages and Italy during the Renaissance. In the study, the researchers recreated the process of paint-making by using four ingredients — egg yolk, distilled water, linseed oil and pigment — to mix two historically popular and significant colors, lead white and ultramarine blue.

“The addition of egg yolk is beneficial because it can tune the properties of these paints in a drastic way,” Ranquet said, “For example by showing aging differently: It takes a longer time for the paint to oxidize, because of the antioxidants contained in the yolk.”

The chemical reactions between the oil, the pigment and the proteins in the yolk directly affect the paint’s behavior and viscosity. “For example, the lead white pigment is quite sensitive to humidity, but if you coat it with a protein layer, it makes it a lot more resistant to it, making the paint quite easy to apply,” Ranquet said.

“On the other hand, if you wanted something stiffer without having to add a lot of pigment, with a bit of egg yolk you can create a high impasto paint,” she added, referring to a painting technique where the paint is laid out in a stroke thick enough that the brushstrokes are still visible. Using less pigment would have been desirable centuries ago, when certain pigments — such as lapis lazuli, which was used to make ultramarine blue — were more expensive than gold, according to Ranquet.

"The Madonna of the Carnation," on display at the Alte Pinakothek in Munich, Germany, is one of Leonardo da Vinci's earliest paintings. Wrinkling of the oil paint is evident on the faces. 
"The Madonna of the Carnation," on display at the Alte Pinakothek in Munich, Germany, is one of Leonardo da Vinci's earliest paintings. Wrinkling of the oil paint is evident on the faces.
DeAgostini/Getty Images
A direct evidence of the effect of egg yolk in oil paint, or lack thereof, can be seen in Leonardo da Vinci’s “Madonna of the Carnation,” one of the paintings observed during the study. Currently on display at the Alte Pinakothek in Munich, Germany, the work shows evident wrinkling on the face of Mary and the child.

“Oil paint starts to dry from the surface down, which is why it wrinkles,” Ranquet said.

One reason for wrinkling may be an insufficient quantity of pigments in the paint, and the study has shown that this effect could be avoided with the addition of egg yolk: “That’s quite amazing because you have the same quantity of pigment in your paint, but the presence of the egg yolk changes everything.”

Because wrinkling occurs within days, it’s likely that Leonardo and other Old Masters might have caught onto this particular effect, as well as additional beneficial properties of egg yolk in oil paint, including resistance to humidity. The “Madonna of Carnation” is one of Leonardo’s earliest paintings, created at a time when he might have been still trying to master the then newly popular medium of oil paint.

New understanding of the classics
Another painting observed during the study was “The Lamentation Over the Dead Christ,” by Botticelli, also on display at the Alte Pinakothek. The work is mostly made with tempera, but oil paint has been used for the background and some secondary elements.

“We knew that some parts of the paintings show brushstrokes that are typical for what we call an oil painting, and yet we detected the presence of proteins,” Ranquet said. “Because it’s a very small quantity and they are difficult to detect, this might be dismissed as contamination: In workshops, artists used many different things, and maybe the eggs were just from the tempera.”

However, because adding egg yolk had such desirable effects on oil paint, the presence of proteins in the work might be an indication of deliberate use instead, the study suggested. Ranquet hopes that these preliminary findings might attract more curiosity toward this understudied topic.

Maria Perla Colombini, a professor of analytical chemistry at the University of Pisa in Italy, who was not involved in the study, agreed. “This exciting paper provides a new scenario for the understanding of old painting techniques,” she said in an email.

“The research group, reporting results from molecular level up to a macroscopic scale, contributes to a new knowledge in the use of egg yolk and oil binders. They are not more looking at simply identifying the materials used by Old Masters but explain how they could produce wonderful and glittering effects by employing and mixing the few available natural materials. They try to discover the secrets of old recipes of which little or nothing is written,” she added.

“This new knowledge contributes not only to a better conservation and preservation of artworks but also to a better comprehension of art history.”


Top image: The “Mona Lisa” by Leonardo Da Vinci


"""

inputs = tokenizer([text], max_length=512, truncation=True, return_tensors="pt")
output = model.generate(**inputs, num_beams=8, do_sample=True, min_length=30,
                        max_length=64)
decoded_output = tokenizer.batch_decode(output, skip_special_tokens=True)[0]
tags = list(set(decoded_output.strip().split(", ")))

print(tags)
# ['Programming', 'Code', 'Software Development', 'Programming Languages',
#  'Software', 'Developer', 'Python', 'Software Engineering', 'Science',
#  'Engineering', 'Technology', 'Computer Science', 'Coding', 'Digital', 'Tech',
#  'Python Programming']


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\gskch\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


['Advice', 'Art', 'Painting Tips', 'Creativity', 'Tips And Tricks', 'Tips', 'Painting', 'Artist', 'Creative']
