# In [1]:
import os
import requests
import feedparser
import re
from datetime import datetime, timezone
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
import pyperclip
import time
from transformers import pipeline
import numpy
import torch
from InstructorEmbedding import INSTRUCTOR
from transformers import T5Tokenizer, T5ForConditionalGeneration
import json 
from langchain.docstore.document import Document
from langchain.text_splitter import CharacterTextSplitter
from langchain.chat_models import ChatOpenAI

from langchain.prompts import PromptTemplate
import tiktoken

from langchain.chains.summarize import load_summarize_chain
import textwrap
from time import monotonic

#git add . && git commit -m "initial commit" && git push origin main

# Hasura connection settings. Both values can be overridden through the
# environment so that credentials do not have to live in source control.
# NOTE(review): the literal fallbacks are kept only for backward
# compatibility. The admin secret below has been committed in plain text and
# should be rotated, after which the fallback should be removed.
endpoint = os.environ.get(
    "HASURA_GRAPHQL_ENDPOINT",
    "https://enabling-elk-81.hasura.app/v1/graphql",
)
admin_key = os.environ.get(
    "HASURA_ADMIN_SECRET",
    "bdAHRgu0lLGgF38TkQ0eL3ynNGLC23jxB4tnFzMiiSFh94YVMMHiIIouK4YfnEoB",
)

def query_hasura_graphql(endpoint, admin_key, query, variables, timeout=60):
    """Run a GraphQL query against a Hasura endpoint.

    Args:
        endpoint: URL of the GraphQL API.
        admin_key: Value sent in the ``x-hasura-admin-secret`` header.
        query: GraphQL query document.
        variables: Mapping of GraphQL variables used by ``query``.
        timeout: Seconds to wait for the server. ``requests`` waits forever
            when no timeout is given, which would hang the whole pipeline on
            a stalled connection.

    Returns:
        The decoded JSON body when the server answers with HTTP 200,
        otherwise ``None`` (including when the request cannot be sent or
        times out).
    """
    headers = {
        'Content-Type': 'application/json',
        'x-hasura-admin-secret': f'{admin_key}'
    }
    payload = {
        'query': query,
        'variables': variables
    }
    try:
        response = requests.post(endpoint, json=payload, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        # Transport problems follow the same "None means failure" contract as
        # a non-200 status code.
        print(f"Request failed: {exc}")
        return None
    if response.status_code == 200:
        return response.json()
    print(f"Request failed with status code {response.status_code}")
    return None

def is_valid_timezone_format(published):
    """Check for an RFC 822 style timestamp and convert it to UTC.

    The expected layout is ``"Mon, 01 Jan 2024 12:00:00 +0000"``.

    Returns:
        ``(True, iso_string)`` where ``iso_string`` is the timestamp expressed
        in UTC in ISO 8601 form, or ``(False, None)`` when ``published`` does
        not match the layout.
    """
    rfc822_layout = "%a, %d %b %Y %H:%M:%S %z"
    try:
        as_utc = datetime.strptime(published, rfc822_layout).astimezone(timezone.utc)
        outcome = (True, as_utc.isoformat())
    except ValueError:
        # strptime rejects anything that does not match the layout.
        outcome = (False, None)
    return outcome

def check_date_format(date_string):
    """Return True when ``date_string`` looks like ``YYYY-MM-DDTHH:MM:SS<offset>``.

    The check is a plain ``strptime`` parse; fractional seconds or a missing
    UTC offset make it fail.
    """
    iso_layout = '%Y-%m-%dT%H:%M:%S%z'
    try:
        datetime.strptime(date_string, iso_layout)
    except ValueError:
        return False
    else:
        return True
        
def mutation_hasura_graphql(endpoint, admin_key, mutation_query, mutation_variables, timeout=60):
    """Send a GraphQL mutation to a Hasura endpoint.

    Args:
        endpoint: URL of the GraphQL API.
        admin_key: Value sent in the ``x-hasura-admin-secret`` header.
        mutation_query: GraphQL mutation document.
        mutation_variables: Mapping of GraphQL variables for the mutation.
        timeout: Seconds to wait for the server. Without it ``requests`` can
            block indefinitely on a stalled connection.

    Returns:
        ``(True, data)`` with the decoded JSON body when the HTTP status is
        successful, otherwise ``(False, None)`` (including when the request
        cannot be sent or times out). The body is printed on success.
    """
    headers = {
        'Content-Type': 'application/json',
        'x-hasura-admin-secret': f'{admin_key}'
    }
    payload = {'query': mutation_query, 'variables': mutation_variables}
    try:
        response = requests.post(endpoint, json=payload, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        # Keep the (False, None) failure contract for transport errors too.
        print(f"Mutation failed: {exc}")
        return False, None
    if response.ok:
        data = response.json()
        print(data)
        return True, data
    print(f"Mutation failed with status code {response.status_code}: {response.text}")
    return False, None

def update_articles():
    """Read every configured RSS feed and upsert its entries into Hasura.

    The feed list is loaded from ``articles_t_v1_rss1_feed_links`` (link type
    11). Each feed is parsed with ``feedparser``; entries whose link contains
    the feed's ``outlet`` value are collected and inserted in one mutation per
    feed. Returns ``None``; progress is printed.
    """
    graphql_query = '''
    query MyQuery($link_type: Int!) {
        articles_t_v1_rss1_feed_links(where: {rss1_link_type: {_eq: $link_type}}) {
            rss1_link
            outlet
        }
    }
    '''
    variables = {
        "link_type": 11
    }
    response_data = query_hasura_graphql(endpoint, admin_key, graphql_query, variables)
    # A GraphQL-level error comes back without a usable "data" member, so
    # guard both the missing response and the missing key.
    feed_rows = ((response_data or {}).get("data") or {}).get("articles_t_v1_rss1_feed_links") or []
    mutation_query = """
    mutation MyMutation($objects: [articles_T_v1_rss1_articals_insert_input!] = {}) {
        insert_articles_T_v1_rss1_articals(objects: $objects, on_conflict: {constraint: T_v1_rss1_articals_post_link_key}) {
            affected_rows
            returning {
            id
            }
        }
    }

    """

    # Iterate over every feed; the earlier range(len - 1) loop silently
    # skipped the last one.
    for feed_row in feed_rows:
        feed_link = feed_row["rss1_link"]
        feed_outlet = feed_row["outlet"]
        news_feed = feedparser.parse(feed_link)
        print("############################################################")
        print(feed_link)
        articles = []
        for entry in news_feed.entries:
            title = entry.title
            summary = ''
            if 'summary' in entry:
                # Strip HTML tags from the feed-provided summary.
                summary = re.sub('<[^<]+?>', '', entry.summary)

            image_url = ""
            is_default_image = 0
            if 'media_content' in entry:
                image_url = entry['media_content'][0]['url']
                is_default_image = 1
            if 'links' in entry:
                for link in entry.links:
                    # Not every link carries a type/href; .get avoids an
                    # AttributeError on such links.
                    if link.get("type") == "image/jpeg" and link.get("href"):
                        image_url = link.get("href")
                        is_default_image = 1
                        break

            post_link = entry.link

            now_utc = datetime.now(timezone.utc).isoformat()
            published = entry.published if 'published' in entry else now_utc
            is_rfc822, parsed_timestamp = is_valid_timezone_format(published)
            if is_rfc822:
                hasura_timestamp = parsed_timestamp
            elif check_date_format(published):
                # Must be elif: otherwise a successfully parsed RFC 822 date
                # is overwritten by the fallback below.
                hasura_timestamp = published
            else:
                hasura_timestamp = now_utc

            author = entry.author if "author" in entry else "na"

            if feed_outlet in post_link:
                articles.append({
                    "rss1_link": feed_link,
                    "post_link": post_link,
                    "title": title,
                    "summary": summary,
                    "author": author,
                    "image_link": image_url,
                    "post_published": hasura_timestamp,
                    "is_default_image": is_default_image,
                })

        if not articles:
            # Nothing to insert for this feed; skip the round trip.
            continue
        mutation_variables = {
            "objects": articles
        }
        mutation_hasura_graphql(endpoint=endpoint, admin_key=admin_key, mutation_query=mutation_query, mutation_variables=mutation_variables)

def update_article_details(offset1):
    """Scrape article text through Edge's reader view and store it in Hasura.

    Articles with ``is_in_detail == 0`` are fetched two at a time. Each one is
    opened in Edge, re-opened through the ``read://`` scheme, and the page
    text is copied via the clipboard (select-all / copy). The de-duplicated
    lines are stored as title + description, and the article is flagged
    ``is_in_detail = 1``.

    Args:
        offset1: Initial query offset. The offset is advanced by one whenever
            a batch hits an unreachable page or raises, presumably to step
            past the article that cannot be processed.

    The WebDriver is always shut down, even when an error or Ctrl-C escapes
    the loop.
    """
    graphql_query = '''
    query MyQuery($limit: Int!, $offset: Int!) {
    articles_T_v1_rss1_articals(limit: $limit, offset: $offset, where: {is_in_detail: {_eq: 0}}, order_by: {post_published: desc}) {
        post_link
        is_default_image
        image_link
        id
        }
    }
    '''
    offset = offset1
    mutation_query = """
    mutation MyMutation($objects: [articles_T_v1_rss1_articles_detail_insert_input!] = {}, $updates: [articles_T_v1_rss1_articals_updates!] = {where: {}}) {
        insert_articles_T_v1_rss1_articles_detail(objects: $objects, on_conflict: {constraint: T_v1_rss1_articles_detail_article_id_key}) {
            affected_rows
        }
        update_articles_T_v1_rss1_articals_many(updates: $updates) {
            affected_rows
        }
        }

    """
    options = webdriver.EdgeOptions()
    options.use_chromium = True
    options.page_load_strategy = 'eager'
    options.add_argument('--enable-immersive-reader')
    driver = webdriver.Edge(options=options)
    special_chars = set("!@#$%^&*()_+[]{}|;:'\",<>?")
    try:
        while True:
            variables = {
                "limit": 2,
                "offset": offset
            }
            response_data = query_hasura_graphql(endpoint, admin_key, graphql_query, variables)
            rows = []
            if response_data:
                rows = response_data["data"]["articles_T_v1_rss1_articals"]
            if len(rows) == 0:
                break
            articles_detail = []
            articles_update = []
            try:
                for row in rows:
                    main_link = row["post_link"]
                    print(main_link)
                    driver.get(main_link)
                    # Follow redirects first, then ask Edge for the reader view.
                    read_link = "read://" + driver.current_url
                    driver.get(read_link)
                    time.sleep(5)
                    ActionChains(driver).key_down(Keys.CONTROL).send_keys('a').key_up(Keys.CONTROL).perform()
                    ActionChains(driver).key_down(Keys.CONTROL).send_keys('c').key_up(Keys.CONTROL).perform()
                    raw_lines = [s.replace('\r', '') for s in pyperclip.paste().split('\n')]
                    # Drop empty lines and lines that both start and end with
                    # a special character.
                    my_list = [
                        s for s in raw_lines
                        if len(s) > 0 and (s[0] not in special_chars or s[-1] not in special_chars)
                    ]
                    if my_list[0] == "Hmmm… can't reach this page":
                        offset = offset + 1
                        break
                    # Order-preserving de-duplication of the copied lines.
                    desription = list(dict.fromkeys(my_list))
                    # No image extraction is implemented, so this stays empty
                    # and the first branch below never fires.
                    images_final = []
                    articles_detail.append({
                        "article_id": row["id"],
                        "title": desription[0],
                        "description": desription[1:],
                        "image_link": images_final,
                    })
                    if (row["is_default_image"] == 0 and len(images_final) > 0):
                        articles_update.append({
                            "where": {"post_link": {"_eq": main_link}},
                            "_set": {"is_in_detail": 1, "image_link": images_final[0], "is_default_image": 1}
                        })
                    else:
                        articles_update.append({
                            "where": {"post_link": {"_eq": main_link}},
                            "_set": {"is_in_detail": 1}
                        })
                mutation_variables = {
                    "objects": articles_detail,
                    "updates": articles_update,
                }
                mutation_hasura_graphql(endpoint=endpoint, admin_key=admin_key, mutation_query=mutation_query, mutation_variables=mutation_variables)
            except Exception as exc:
                # Best effort: report the problem, step past the failing
                # article and still persist whatever was scraped before it.
                # Unlike a bare except, Ctrl-C / SystemExit are not swallowed.
                print(f"Article detail batch failed: {exc!r}")
                offset = offset + 1
                mutation_variables = {
                    "objects": articles_detail,
                    "updates": articles_update,
                }
                mutation_hasura_graphql(endpoint=endpoint, admin_key=admin_key, mutation_query=mutation_query, mutation_variables=mutation_variables)
    finally:
        # Always release the browser, even if the loop is interrupted.
        driver.quit()

def summerizer(offset1):
    """Summarise scraped articles with BART and write the summaries to Hasura.

    Detail rows whose ``summary`` is null are fetched two at a time. For each
    row the detail title, the parent article's title and summary, and the
    scraped description lines are concatenated, split into one or two chunks
    depending on length (input is capped at 8000 characters), and summarised
    with ``facebook/bart-large-cnn``.

    Args:
        offset1: Initial query offset. It is advanced only past rows that
            could not be updated, so a failed batch is not fetched again and
            again.
    """
    summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
    graphql_query = '''
    query MyQuery($limit: Int!, $offset: Int!) {
    articles_T_v1_rss1_articles_detail(limit: $limit, offset: $offset, where: {summary: {_is_null: true}}) {
        title
        description
        article_id
        T_v1_rss1_artical {
        title
        summary
        }
    }
    }
    '''
    offset = offset1
    mutation_query = """
    mutation MyMutation($updates: [articles_T_v1_rss1_articles_detail_updates!] = {where: {}}) {
    update_articles_T_v1_rss1_articles_detail_many(updates: $updates) {
        affected_rows
        returning {
        id
        }
    }
    }
    """
    while True:
        variables = {
            "limit": 2,
            "offset": offset
        }
        response_data = query_hasura_graphql(endpoint, admin_key, graphql_query, variables)
        # A failed request (None) or a GraphQL error body has no usable data.
        rows = ((response_data or {}).get('data') or {}).get('articles_T_v1_rss1_articles_detail') or []
        if not rows:
            break
        rss1_articles_detail_updates = []
        for response in rows:
            print(response['title'])
            parent = response.get('T_v1_rss1_artical') or {}
            # Null columns are treated as empty text instead of raising.
            article = (
                (response['title'] or "") + " "
                + (parent.get('title') or "") + " "
                + (parent.get('summary') or "")
                + ', '.join(response['description'] or [])
            )
            if len(article) < 4000:
                if len(article) < 1000:
                    max_length, min_length = 150, 100
                elif len(article) < 3000:
                    max_length, min_length = 300, 200
                else:
                    max_length, min_length = 400, 250
                chunks = [article]
            else:
                # Long texts are capped at 8000 characters and summarised in
                # two halves.
                article = article[:8000]
                max_length, min_length = 200, 150
                midpoint = len(article) // 2
                chunks = [article[:midpoint], article[midpoint:]]

            summerize = "".join(
                summarizer(chunk, max_length=max_length, min_length=min_length, do_sample=False)[0]['summary_text'] + " "
                for chunk in chunks
            )
            if len(summerize) > 0:
                rss1_articles_detail_updates.append({
                    "where": {"article_id": {"_eq": response['article_id']}},
                    "_set": {"summary": summerize}
                })
        mutation_variables = {
            "updates": rss1_articles_detail_updates,
        }
        ok, result = mutation_hasura_graphql(endpoint=endpoint, admin_key=admin_key, mutation_query=mutation_query, mutation_variables=mutation_variables)
        updated = len(rss1_articles_detail_updates)
        if not ok or "errors" in (result or {}):
            updated = 0
        # Rows that were not updated still match the query filter; step past
        # them so the loop cannot spin forever on the same batch.
        offset += len(rows) - updated

def vectorize(offset1):
    """Embed articles with INSTRUCTOR and store the vectors in Hasura.

    Articles with ``is_vectorized == 0`` and ``is_in_detail == 1`` are fetched
    one at a time. Title, feed summary, generated summary and tags are
    concatenated and encoded with ``hkunlp/instructor-large``; the flattened
    embedding is stored as a string and the article is flagged
    ``is_vectorized = 1``.

    Args:
        offset1: Initial query offset. It is advanced only when a batch could
            not be written, so failing rows are skipped instead of retried
            forever.
    """
    model = INSTRUCTOR('hkunlp/instructor-large')
    graphql_query = '''
    query MyQuery($limit: Int!, $offset: Int!) {
    articles_T_v1_rss1_articals(limit: $limit, offset: $offset, where: {is_vectorized: {_eq: 0}, is_in_detail: {_eq: 1}}) {
        id
        title
        summary
        T_v1_rss1_articles_detail {
        summary
        tags
        }
    }
    }
    '''
    offset = offset1
    mutation_query = """
    mutation MyMutation($objects: [articles_t_v1_rss1_article_vectors_insert_input!] = {}, $updates: [articles_T_v1_rss1_articals_updates!] = {where: {}}) {
    insert_articles_t_v1_rss1_article_vectors(objects: $objects, on_conflict: {constraint: t_v1_rss1_article_vectors_article_id_key}) {
        affected_rows
        returning {
        article_id
        }
    }
    update_articles_T_v1_rss1_articals_many(updates: $updates) {
        affected_rows
        returning {
        id
        }
    }
    }
    """
    while True:
        variables = {
            "limit": 1,
            "offset": offset
        }
        response_data = query_hasura_graphql(endpoint, admin_key, graphql_query, variables)
        # A failed request (None) or a GraphQL error body has no usable data.
        rows = ((response_data or {}).get('data') or {}).get('articles_T_v1_rss1_articals') or []
        if not rows:
            break
        articles_vector1_insert_input_loc = []
        rss1_articals_updates_loc = []
        for response in rows:
            detail = response.get('T_v1_rss1_articles_detail') or {}
            tags = " " if detail.get('tags') is None else ", ".join(detail['tags'])
            # The text layout (including the instruction wording) is kept
            # exactly as before so new vectors stay comparable with the
            # ones already stored.
            article = (
                (response['title'] or "") + " "
                + (response['summary'] or "") + " "
                + (detail.get('summary') or "") + tags
            )
            encoded = model.encode([['Represent the news article for custering and retrieval:  ', article]])
            embedding = numpy.ravel(encoded).tolist()
            articles_vector1_insert_input_loc.append({
                "article_id": response['id'],
                "vector1": str(embedding),
            })
            rss1_articals_updates_loc.append({
                "where": {"id": {"_eq": response['id']}},
                "_set": {"is_vectorized": 1}
            })

        mutation_variables = {
            "objects": articles_vector1_insert_input_loc,
            "updates": rss1_articals_updates_loc,
        }
        ok, result = mutation_hasura_graphql(endpoint=endpoint, admin_key=admin_key, mutation_query=mutation_query, mutation_variables=mutation_variables)
        if not ok or "errors" in (result or {}):
            # The rows still match the filter; skip them to avoid re-encoding
            # the same article endlessly.
            offset += len(rows)

def grouping(offset1):
    """Record, for every vectorised article, the ids of its similar articles.

    Articles that are vectorised, detailed and not yet grouped are fetched 20
    at a time. For each one the ``articles_get_similar_articles_group``
    function is queried and the returned ids are stored as ``initial_group``;
    the article is then flagged ``is_grouped = 1``.

    Args:
        offset1: Initial query offset. It is advanced only when a batch could
            not be written, so failing rows are skipped instead of retried
            forever.
    """
    graphql_query = '''
    query MyQuery($limit: Int!, $offset: Int!) {
        articles_T_v1_rss1_articals(limit: $limit, offset: $offset, where: {is_vectorized: {_eq: 1}, is_in_detail: {_eq: 1}, is_grouped: {_eq: 0}}) {
            id
            t_v1_rss1_article_vector {
            vector1
            }
        }
        }
    '''
    offset = offset1
    mutation_query = """
    mutation MyMutation($objects: [articles_t_v1_articles_groups_insert_input!] = {}, $updates: [articles_T_v1_rss1_articals_updates!] = {where: {}}) {
        insert_articles_t_v1_articles_groups(objects: $objects, on_conflict: {constraint: t_v1_articles_groups_article_id_key}) {
            affected_rows
            returning {
            article_id
            }
        }
        update_articles_T_v1_rss1_articals_many(updates: $updates) {
            affected_rows
            returning {
            id
            }
        }
        }
    """
    func_query = '''
    query MyQuery($p_article_id: bigint!) {
        articles_get_similar_articles_group(args: {p_article_id: $p_article_id}) {
            article_id
        }
        }
    '''
    while True:
        variables = {
            "limit": 20,
            "offset": offset
        }
        response_data = query_hasura_graphql(endpoint, admin_key, graphql_query, variables)
        # A failed request (None) or a GraphQL error body has no usable data.
        rows = ((response_data or {}).get('data') or {}).get('articles_T_v1_rss1_articals') or []
        if not rows:
            break
        articles_groups_insert_input_loc = []
        rss1_articals_updates_loc = []
        for response in rows:
            func_variables = {
                "p_article_id": response['id']
            }
            func_response_data = query_hasura_graphql(endpoint, admin_key, func_query, func_variables)
            similar = ((func_response_data or {}).get('data') or {}).get('articles_get_similar_articles_group') or []
            article_group = [match['article_id'] for match in similar]

            articles_groups_insert_input_loc.append({
                "article_id": response['id'],
                "initial_group": article_group,
            })
            rss1_articals_updates_loc.append({
                "where": {"id": {"_eq": response['id']}},
                "_set": {"is_grouped": 1}
            })

        mutation_variables = {
            "objects": articles_groups_insert_input_loc,
            "updates": rss1_articals_updates_loc,
        }
        ok, result = mutation_hasura_graphql(endpoint=endpoint, admin_key=admin_key, mutation_query=mutation_query, mutation_variables=mutation_variables)
        if not ok or "errors" in (result or {}):
            # The rows still match the filter; skip them so the loop ends.
            offset += len(rows)

def grouping_l1(offset1):
    """Merge per-article similarity groups into level-1 groups.

    Articles with ``is_grouped == 1`` are read one at a time. For each one,
    every ``initial_group`` that contains the article is collected and
    flattened into a set of ids. If no level-1 group contains the article yet
    a new one is inserted; otherwise the first matching level-1 group is
    extended with the collected ids. In both cases the article is flagged
    ``is_grouped = 2``.

    Args:
        offset1: Initial query offset. It is never advanced inside this
            function; processed rows drop out of the query because their
            ``is_grouped`` flag changes.
            NOTE(review): if the mutation fails, the same row is presumably
            fetched again on every iteration — confirm this is acceptable.
    """
    # Next article that has an initial group but no level-1 assignment yet.
    graphql_query = '''
    query MyQuery($limit: Int!, $offset: Int!) {
          articles_T_v1_rss1_articals(where: {is_grouped: {_eq: 1}}, limit: $limit, offset: $offset) {
            id
            t_v1_articles_group {
            initial_group
            }
        }
        }
    '''
    offset = offset1
    # One round trip: insert new level-1 groups, flag the articles, and
    # update existing level-1 groups.
    mutation_query = """
        mutation MyMutation($objects: [articles_t_v1_articals_groups_l1_insert_input!] = {}, $updates: [articles_T_v1_rss1_articals_updates!] = {where: {}}, $updates1: [articles_t_v1_articals_groups_l1_updates!] = {where: {}}) {
        insert_articles_t_v1_articals_groups_l1(objects: $objects) {
            affected_rows
        }
        update_articles_T_v1_rss1_articals_many(updates: $updates) {
            affected_rows
        }
        update_articles_t_v1_articals_groups_l1_many(updates: $updates1) {
            affected_rows
        }
        }
    """
    # Every initial group that contains the given article id.
    query2 = '''
    query MyQuery($articleid: [bigint!] = [20]) {
        articles_t_v1_articles_groups(where: {initial_group: {_contains: $articleid}}) {
            article_id
            initial_group
        }
        }
    '''
    # Every existing level-1 group that contains the given article id.
    query3 = '''
    query MyQuery($articleid: [bigint!] = [20]) {
        articles_t_v1_articals_groups_l1(where: {articles_group: {_contains: $articleid}}) {
            articles_group
            id
        }
        }
    '''
    while True:
        variables = {
        "limit": 1,
        "offset": offset
        }
        articles_grouped_l1_insert_input_loc=[]
        rss1_articals_updates_loc=[]
        articles_grouped_l1_updates=[]
        response_data = query_hasura_graphql(endpoint, admin_key, graphql_query, variables)
        # Stop once no article is left in the is_grouped == 1 state.
        if len(response_data['data']['articles_T_v1_rss1_articals']) == 0:
            break
        #print(json.dumps(response_data, indent=4))
        for response in response_data['data']['articles_T_v1_rss1_articals']:

            variables2 = {
                "articleid": [response['id']]
                }
            func_response_data = query_hasura_graphql(endpoint, admin_key, query2, variables2)
            # List of id lists: one initial_group per matching row.
            articles_ids = []
            print(response['id'])
            if len(func_response_data['data']['articles_t_v1_articles_groups']) > 0:
                for func_response in func_response_data['data']['articles_t_v1_articles_groups']:
                    articles_ids.append(func_response['initial_group'])
            
            func_response_data1 = query_hasura_graphql(endpoint, admin_key, query3, variables2)
            
            
            if (len(func_response_data1['data']['articles_t_v1_articals_groups_l1']) == 0):
                # No level-1 group holds this article yet: create one from the
                # flattened, de-duplicated ids.
                new_lst = []
                for sublist in articles_ids:
                    for element in sublist:
                        new_lst.append(element)
                my_list = list(set(new_lst))
                print(my_list)
                articles_grouped_l1_insert_input_loc.append({
                    "articles_group": my_list,
                    'articles_in_group': len(my_list)
                    }
                    )
                rss1_articals_updates_loc.append({
                    "where": {"id" : { "_eq": response['id'] }},
                    "_set": {"is_grouped": 2}
                    })
            else:
                # Extend the first matching level-1 group; any further
                # matching groups are left untouched.
                articles_ids.append(func_response_data1['data']['articles_t_v1_articals_groups_l1'][0]['articles_group'])
                new_lst = []
                for sublist in articles_ids:
                    for element in sublist:
                        new_lst.append(element)
                my_list = list(set(new_lst))
                articles_grouped_l1_updates.append({
                    "where": {"id" : { "_eq": func_response_data1['data']['articles_t_v1_articals_groups_l1'][0]['id'] }},
                    "_set": {"articles_group": my_list, 'articles_in_group': len(my_list)}
                    })
                rss1_articals_updates_loc.append({
                    "where": {"id" : { "_eq": response['id'] }},
                    "_set": {"is_grouped": 2}
                    })
                print(my_list)     
        
        mutation_variables = {
        "objects": articles_grouped_l1_insert_input_loc,
        "updates": rss1_articals_updates_loc,
        "updates1": articles_grouped_l1_updates,
        }
        out1 = mutation_hasura_graphql(endpoint=endpoint, admin_key=admin_key, mutation_query=mutation_query, mutation_variables=mutation_variables)

def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Count how many tokens ``string`` encodes to.

    Despite its name, ``encoding_name`` is handed to
    ``tiktoken.encoding_for_model`` and is therefore looked up as a model
    name (for example ``"gpt-3.5-turbo"``).
    """
    tokens = tiktoken.encoding_for_model(encoding_name).encode(string)
    return len(tokens)

def gen_article():
    """Generate one combined news article per level-1 group using OpenAI.

    Groups with ``requires_update == 0`` and more than one article are read
    one at a time. The title, feed summary and generated summary of every
    member article are concatenated (capped at 3000 whitespace-separated
    words), sent through a LangChain summarize chain, and the result is
    stored in the group detail table; the group is then flagged
    ``requires_update = 1``.

    The OpenAI key is read from the ``OPENAI_API_KEY`` environment variable
    when it is set.
    """
    gpt_35_turbo_max_tokens = 4097
    verbose = True
    prompt_template = """Write a unbiased professional news article for:


        {text}


        CONSCISE UNBIASED detailed news article with at least 500 words:"""
    # NOTE(review): the literal fallback is kept only for backward
    # compatibility. This key has been committed in plain text and should be
    # revoked, after which the fallback should be removed.
    OPENAI_API_KEY = os.environ.get(
        "OPENAI_API_KEY",
        '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',
    )
    model_name = "gpt-3.5-turbo"

    llm = ChatOpenAI(temperature=0, openai_api_key=OPENAI_API_KEY, model_name=model_name)
    prompt = PromptTemplate(template=prompt_template, input_variables=["text"])
    text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
        model_name=model_name
    )
    graphql_query = '''
    query MyQuery($limit: Int!, $offset: Int!) {
        articles_t_v1_articals_groups_l1(where: {requires_update: {_eq: 0}, articles_in_group: {_gt: 1}}, limit: $limit, offset: $offset) {
            articles_group
            id
        }
        }
    '''
    graphql_grquery_article = '''
    query MyQuery($article_id: bigint!) {
    articles_T_v1_rss1_articals(where: {id: {_eq: $article_id}}) {
        title
        summary
        T_v1_rss1_articles_detail {
        summary
        }
    }
    }
    '''
    offset = 0
    mutation_query = """
    mutation MyMutation($objects: [articles_t_v1_articals_groups_l1_detail_insert_input!] = {}, $updates: [articles_t_v1_articals_groups_l1_updates!] = {where: {}}) {
        insert_articles_t_v1_articals_groups_l1_detail(objects: $objects, on_conflict: {constraint: t_v1_articals_groups_l1_detail_article_group_id_key}) {
            affected_rows
        }
        update_articles_t_v1_articals_groups_l1_many(updates: $updates) {
            affected_rows
        }
    }
    """
    max_words = 3000
    while True:
        variables = {
            "limit": 1,
            "offset": offset
        }
        response_data = query_hasura_graphql(endpoint, admin_key, graphql_query, variables)
        # A failed request (None) or a GraphQL error body has no usable data.
        groups = ((response_data or {}).get('data') or {}).get('articles_t_v1_articals_groups_l1') or []
        if not groups:
            break
        for response in groups:
            parts = []
            for article in response['articles_group']:
                article_variables = {
                    "article_id": article
                }
                article_response_data = query_hasura_graphql(endpoint, admin_key, graphql_grquery_article, article_variables)
                found = ((article_response_data or {}).get('data') or {}).get('articles_T_v1_rss1_articals') or []
                if not found:
                    # The id no longer resolves to an article; skip it rather
                    # than fail on found[0].
                    continue
                record = found[0]
                detail = record.get('T_v1_rss1_articles_detail') or {}
                parts.append(
                    "\n" + (record['title'] or "")
                    + "\n" + (record['summary'] or "")
                    + "\n" + (detail.get('summary') or "")
                )
            llm_text = "".join(parts)

            words = llm_text.split()
            if len(words) > max_words:
                llm_text = ' '.join(words[:max_words])

            num_tokens = num_tokens_from_string(llm_text, model_name)
            print(num_tokens)

            docs = [Document(page_content=t) for t in text_splitter.split_text(llm_text)]
            if num_tokens < gpt_35_turbo_max_tokens:
                chain = load_summarize_chain(llm, chain_type="stuff", prompt=prompt)
            else:
                print("map reduce")
                chain = load_summarize_chain(llm, chain_type="map_reduce", map_prompt=prompt, combine_prompt=prompt, verbose=verbose)

            summary = chain.run(docs)

            mutation_variables = {
                "objects": [{
                    "article_group_id": response['id'],
                    'summary': summary
                }],
                "updates": [{
                    "where": {"id": {"_eq": response['id']}},
                    "_set": {"requires_update": 1}
                }],
            }
            ok, result = mutation_hasura_graphql(endpoint=endpoint, admin_key=admin_key, mutation_query=mutation_query, mutation_variables=mutation_variables)
            if not ok or "errors" in (result or {}):
                # The group still matches the filter; skip it so the same
                # (paid) generation is not repeated indefinitely.
                offset += 1

def gen_title():
    """Generate a headline for every group article that has no title yet.

    Group detail rows whose ``title`` is null are read one at a time. The
    prompt plus summary is cut to 512 characters, passed to
    ``google/flan-t5-base``, and the decoded output (with ``<...>`` tokens
    stripped) is stored as the title.
    """
    tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base")
    model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base")
    graphql_query = '''
    query MyQuery($limit: Int!, $offset: Int!) {
            articles_t_v1_articals_groups_l1_detail(where: {title: {_is_null: true}}, limit: $limit, offset: $offset) {
            article_group_id
            summary
        t_v1_articals_groups_l1 {
        articles_group
        }
        }
        }
    '''
    offset = 0
    mutation_query= """
    mutation MyMutation($updates: [articles_t_v1_articals_groups_l1_detail_updates!] = {where: {}}) {
  update_articles_t_v1_articals_groups_l1_detail_many(updates: $updates) {
    affected_rows
  }
}

    """
    max_length = 512
    while True:
        # Built inside the loop so that an advanced offset takes effect.
        variables = {
            "limit": 1,
            "offset": offset
        }
        response_data = query_hasura_graphql(endpoint, admin_key, graphql_query, variables)
        # A failed request (None) or a GraphQL error body has no usable data.
        rows = ((response_data or {}).get('data') or {}).get('articles_t_v1_articals_groups_l1_detail') or []
        if not rows:
            break
        update_articles_t_v1_articals_groups_l1_detail_many_loc = []
        for response in rows:
            input_text = "generate a intresting and viral news title for:  " + "\n" + (response['summary'] or "")
            input_text = input_text[:max_length]
            input_ids = tokenizer.encode(input_text, return_tensors="pt")
            outputs = model.generate(input_ids, max_new_tokens=50)
            generated_text = tokenizer.decode(outputs[0])
            # Strip special tokens such as <pad> and </s> from the output.
            clean_text = re.sub('<.*?>', '', generated_text)
            print(clean_text)
            update_articles_t_v1_articals_groups_l1_detail_many_loc.append({
                "where": {"article_group_id": {"_eq": response['article_group_id']}},
                "_set": {"title": clean_text}
            })
        mutation_variables = {
            "updates": update_articles_t_v1_articals_groups_l1_detail_many_loc,
        }
        ok, result = mutation_hasura_graphql(endpoint=endpoint, admin_key=admin_key, mutation_query=mutation_query, mutation_variables=mutation_variables)
        if not ok or "errors" in (result or {}):
            # The rows still have a null title; skip them so the loop ends.
            offset += len(rows)

def gen_images():
    """Fill in ``image_urls`` and ``logo_urls`` for groups that have neither.

    Repeatedly fetches one detail row whose ``image_urls`` and ``logo_urls``
    are both null, looks up every article id listed in that row's group, and
    writes the de-duplicated article image links and outlet logo URLs back to
    the row. A row that has been written no longer matches the null filter,
    so the query is always issued at offset 0 and the loop ends once the
    filter returns nothing.

    Raises:
        RuntimeError: if a Hasura query fails or returns no ``data``, or if a
            group that was already updated is returned again by the null
            filter (the write did not persist), which would otherwise make
            the loop run forever.
    """
    graphql_query = '''
    query MyQuery($limit: Int!, $offset: Int!) {
        articles_t_v1_articals_groups_l1_detail(where: {image_urls: {_is_null: true}, _and: {logo_urls: {_is_null: true}}}, limit: $limit, offset: $offset) {
            article_group_id
            id
            t_v1_articals_groups_l1 {
            articles_group
            }
        }
        }
    '''
    graphql_query2 = '''
    query MyQuery($article_id: bigint! ) {
    articles_T_v1_rss1_articals(where: {id: {_eq: $article_id}}) {
        image_link
        t_v1_rss1_feed_link {
        t_v1_outlet {
            logo_url
        }
        }
    }
    }
    '''
    mutation_query= """
    mutation MyMutation($updates: [articles_t_v1_articals_groups_l1_detail_updates!] = {where: {}}) {
    update_articles_t_v1_articals_groups_l1_detail_many(updates: $updates) {
    affected_rows
    }
    }

    """
    variables = {
        "limit": 1,
        "offset": 0
        }
    detail_key = 'articles_t_v1_articals_groups_l1_detail'
    article_key = 'articles_T_v1_rss1_articals'
    # Groups written in earlier iterations; seeing one again means the update
    # did not take effect and the loop would never make progress.
    updated_group_ids = set()
    while True:
        response_data = query_hasura_graphql(endpoint, admin_key, graphql_query, variables)
        if not response_data or not response_data.get('data'):
            raise RuntimeError(f"gen_images: group query failed: {response_data}")
        rows = response_data['data'][detail_key]
        if not rows:
            break
        updates = []
        batch_group_ids = set()
        for response in rows:
            group_id = response['article_group_id']
            if group_id in updated_group_ids:
                raise RuntimeError(
                    f"gen_images: group {group_id} still has null image/logo urls after update"
                )
            batch_group_ids.add(group_id)
            group = response.get('t_v1_articals_groups_l1') or {}
            image_links = []
            logo_links = []
            for article_id in group.get('articles_group') or []:
                variables2 = {"article_id": article_id}
                response_data_article = query_hasura_graphql(endpoint, admin_key, graphql_query2, variables2)
                if not response_data_article or not response_data_article.get('data'):
                    raise RuntimeError(
                        f"gen_images: article query failed for id {article_id}: {response_data_article}"
                    )
                articles = response_data_article['data'][article_key]
                if not articles:
                    # The group references an article id that no longer exists.
                    continue
                article = articles[0]
                # Truthiness rejects both '' and None, so nulls never reach the arrays.
                image_link = article.get('image_link')
                if image_link:
                    image_links.append(image_link)
                feed_link = article.get('t_v1_rss1_feed_link') or {}
                outlet = feed_link.get('t_v1_outlet') or {}
                logo_url = outlet.get('logo_url')
                if logo_url:
                    logo_links.append(logo_url)
            updates.append({
                "where": {"article_group_id": {"_eq": group_id}},
                "_set": {
                    # dict.fromkeys de-duplicates while keeping first-seen order,
                    # so the stored arrays are stable from run to run.
                    "image_urls": list(dict.fromkeys(image_links)),
                    "logo_urls": list(dict.fromkeys(logo_links)),
                },
            })
        mutation_variables = {
            "updates": updates,
            }
        mutation_hasura_graphql(endpoint=endpoint, admin_key=admin_key, mutation_query=mutation_query, mutation_variables=mutation_variables)
        updated_group_ids.update(batch_group_ids)

def summerizer_60_words(offset1):
    """Write a short summary into ``summary_60_words`` for rows lacking one.

    Fetches detail rows whose ``summary_60_words`` is null two at a time,
    condenses each row's ``summary`` with the ``facebook/bart-large-cnn``
    summarisation pipeline, and stores the result (with a trailing space)
    through a batched update. Updated rows drop out of the null filter, so
    the fetch offset only moves forward for rows that had to be skipped.

    Args:
        offset1: Number of matching rows to skip at the start of each fetch.

    Raises:
        RuntimeError: if the Hasura query fails or returns no ``data``, or if
            a group that was already updated is returned again by the null
            filter (the write did not persist), which would otherwise make
            the loop run forever.
    """
    summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
    graphql_query = '''
    query MyQuery($limit: Int!, $offset: Int!) {
    articles_t_v1_articals_groups_l1_detail(limit: $limit, offset: $offset, where: {summary_60_words: {_is_null: true}}) {
        article_group_id
        summary
    }
    }
    '''
    mutation_query = """
    mutation MyMutation($updates: [articles_t_v1_articals_groups_l1_detail_updates!] = {where: {}}) {
    update_articles_t_v1_articals_groups_l1_detail_many (updates: $updates) {
        affected_rows
        returning {
        id
        }
    }
    }
    """
    # Generation length bounds handed to the summarisation pipeline.
    max_length = 65
    min_length = 45
    detail_key = 'articles_t_v1_articals_groups_l1_detail'
    # Groups written in earlier batches; seeing one again means the update
    # did not take effect.
    submitted_ids = set()
    # Groups with nothing to summarise. They stay null, so the offset must
    # step past them or they would be fetched again on every iteration.
    skipped_ids = set()
    while True:
        variables = {
            "limit": 2,
            "offset": offset1 + len(skipped_ids)
            }
        response_data = query_hasura_graphql(endpoint, admin_key, graphql_query, variables)
        if not response_data or not response_data.get('data'):
            raise RuntimeError(f"summerizer_60_words: query failed: {response_data}")
        rows = response_data['data'][detail_key]
        if not rows:
            break
        skipped_before = len(skipped_ids)
        updates = []
        batch_ids = set()
        for response in rows:
            group_id = response['article_group_id']
            print(group_id)
            if group_id in submitted_ids:
                raise RuntimeError(
                    f"summerizer_60_words: group {group_id} still has no summary_60_words after update"
                )
            article = response['summary']
            if not article or not article.strip():
                skipped_ids.add(group_id)
                continue
            summary_text = summarizer(article, max_length=max_length, min_length=min_length, do_sample=False)[0]['summary_text']
            # Test the model output itself; the trailing space is added only
            # afterwards so an empty result is not stored as " ".
            if not summary_text:
                skipped_ids.add(group_id)
                continue
            batch_ids.add(group_id)
            updates.append({
                "where": {"article_group_id": {"_eq": group_id}},
                "_set": {"summary_60_words": summary_text + " "}
            })
        if not updates:
            if len(skipped_ids) == skipped_before:
                # Nothing written and nothing newly skipped: no way to advance.
                break
            continue
        mutation_variables = {
            "updates": updates,
            }
        mutation_hasura_graphql(endpoint=endpoint, admin_key=admin_key, mutation_query=mutation_query, mutation_variables=mutation_variables)
        submitted_ids.update(batch_ids)



In [9]:
# Driver cell. The commented-out calls are disabled for this run; only
# gen_images() and summerizer(0) execute.
# NOTE(review): the calls appear to be listed in pipeline order — confirm
# before re-enabling any of them.
#update_articles()
#update_article_details(0)
#summerizer(0)
#vectorize(0)
#grouping(0)
#grouping_l1(0)
#gen_article()
#gen_title()
gen_images()
summerizer(0)

{'data': {'update_articles_t_v1_articals_groups_l1_detail_many': [{'affected_rows': 1}]}}
{'data': {'update_articles_t_v1_articals_groups_l1_detail_many': [{'affected_rows': 1}]}}
{'data': {'update_articles_t_v1_articals_groups_l1_detail_many': [{'affected_rows': 1}]}}
{'data': {'update_articles_t_v1_articals_groups_l1_detail_many': [{'affected_rows': 1}]}}
{'data': {'update_articles_t_v1_articals_groups_l1_detail_many': [{'affected_rows': 1}]}}
{'data': {'update_articles_t_v1_articals_groups_l1_detail_many': [{'affected_rows': 1}]}}
{'data': {'update_articles_t_v1_articals_groups_l1_detail_many': [{'affected_rows': 1}]}}
{'data': {'update_articles_t_v1_articals_groups_l1_detail_many': [{'affected_rows': 1}]}}
{'data': {'update_articles_t_v1_articals_groups_l1_detail_many': [{'affected_rows': 1}]}}
{'data': {'update_articles_t_v1_articals_groups_l1_detail_many': [{'affected_rows': 1}]}}
{'data': {'update_articles_t_v1_articals_groups_l1_detail_many': [{'affected_rows': 1}]}}
{'data': {

12
15
{'data': {'update_articles_t_v1_articals_groups_l1_detail_many': [{'affected_rows': 1, 'returning': [{'id': 8}]}, {'affected_rows': 1, 'returning': [{'id': 6}]}]}}
40
54
{'data': {'update_articles_t_v1_articals_groups_l1_detail_many': [{'affected_rows': 1, 'returning': [{'id': 11}]}, {'affected_rows': 1, 'returning': [{'id': 4}]}]}}
27
44
{'data': {'update_articles_t_v1_articals_groups_l1_detail_many': [{'affected_rows': 1, 'returning': [{'id': 5}]}, {'affected_rows': 1, 'returning': [{'id': 9}]}]}}
37
14
{'data': {'update_articles_t_v1_articals_groups_l1_detail_many': [{'affected_rows': 1, 'returning': [{'id': 1}]}, {'affected_rows': 1, 'returning': [{'id': 13}]}]}}
17
25
{'data': {'update_articles_t_v1_articals_groups_l1_detail_many': [{'affected_rows': 1, 'returning': [{'id': 2}]}, {'affected_rows': 1, 'returning': [{'id': 3}]}]}}
53
39
{'data': {'update_articles_t_v1_articals_groups_l1_detail_many': [{'affected_rows': 1, 'returning': [{'id': 7}]}, {'affected_rows': 1, 'return