In [1]:
from flask import Flask, render_template, request
import csv, openai, os, requests, time
import pandas as pd

In [2]:
# Scopus API key
# API_KEY = ''
API_KEY = ''

# OpenAI API key
openai.api_key = "sk-"

In [3]:
def download_scopus_extracts(query):
    # API endpoint
    base_url = 'https://api.elsevier.com/content/search/scopus'

    # API request parameters
    params = {
        'apiKey': API_KEY,
        'query': query,
        #'view': 'COMPLETE',
        'field': 'dc:identifier,dc:title,prism:coverDate,prism:aggregationType,subtypeDescription,dc:description,authkeywords',
        # 'field': 'dc:identifier,dc:title,prism:coverDate,prism:aggregationType,subtypeDescription',
        'count': 10
    }

    # API request
    response = requests.get(base_url, params=params)

    # Check if request was successful
    if response.status_code == 200:
        # Extract data from API response
        data = response.json()['search-results']['entry']
        # for i in data:
        #     print(i,"\n\n")

        for i in range(len(data)):
            data[i].pop('@_fa')
            data[i].pop('prism:url')
            data[i]['Scopus ID'] = data[i].pop('dc:identifier')[10:]
            data[i]['Title'] = data[i].pop('dc:title')
            data[i]['Cover Date'] = data[i].pop('prism:coverDate')
            data[i]['Aggregation Type'] = data[i].pop('prism:aggregationType')
            data[i].pop('subtype')
            data[i]['Subtype'] = data[i].pop('subtypeDescription')
            # data[i]['Abstract'] = data[i].pop('dc:description')

        # Convert data to a Pandas DataFrame
        df = pd.DataFrame(data)
        # print(df)

        if not os.path.exists("scopus_extracts"):
            os.mkdir("scopus_extracts")
        df.to_json('./scopus_extracts/data.json', orient='records', indent=4)
        df.to_csv('./scopus_extracts/data.csv', index=False)

        # print("Write successful")
    else:
        print('Error:', response.status_code)
    return True

In [4]:
def extract_keywords():
    system_intel = "You are a keyword extraction tool. You will be given the title, abstract, and author's keywords for a research/journal paper. Your job is to find out the most appropriate keywords by analyzing the given information. The selected keywords MUST only be 'technological'. Your response may contain 0 to 3 keywords but DO NOT INCLUDE ANY NON-TECHNOLOGICAL KEYWORDS. Your response to me should only contain the keywords in lowercase separated by ',' without any extra words, spaces, or dots."

    keywords = []

    file_name = "./scopus_extracts/sample3.csv"

    with open(file_name, encoding="utf8", errors='ignore') as file:
        reader = csv.DictReader(file)
        for row in reader:
            title = row['Titles']
            abstract = row['Abstract']
            author_keywords = row["Author Keywords"]
            prompt = "\nTitle:" + title + "\nAbstract: " + abstract + "\nAuthor's Keywords: " + author_keywords

            result = openai.ChatCompletion.create(
                model="gpt-3.5-turbo",
                messages=[
                    {"role": "system", "content": system_intel},
                    {"role": "user", "content": prompt},
                ],
            )
            response = result["choices"][0]["message"]["content"]
            print(response)
            keywords_ = response.split(', ')
            for keyword in keywords_:
                keywords.append(keyword)
            # print('Title:', title, '\nAbstract:', abstract, '\nAuthor\'s Keywords:', author_keywords, '\nKeywords:', keywords, '\n')
    print("All keywords:", keywords)

    return keywords

In [5]:
def refine_keywords(keywords):
    system_intel = "You are an NLP model. You will be given a list of keywords. The list may contain some unwanted sentences and keywords. Your job is to DISCARD ANY NON-TECHNOLOGICAL KEYWORDS,  KEYWORDS WHICH DO NOT CONTAIN A NOUN AND UNWANTED SENTENCES (IF ANY) FROM THE LIST. Your response should STRICTLY FOLLOW the below conditions: 1. Return the processed list with only TECHNOLOGY RELATED KEYWORDS separated by comma. 2. Your response MUST NOT CONTAIN NOTES, REASONS OR ANY OTHER WORDS OTHER THAN THE KEYWORDS."

    # keywords = ['Artificial Intelligence', 'Machine Learning', 'Wildlife Tourism', 'Donut', 'Headphones', 'Cloud Computing']

    messages=[{"role": "system", "content": system_intel},{"role": "user", "content": "Keywords : "+str(keywords)}]

    result = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages= messages,
    )
    response = result["choices"][0]["message"]["content"]

    if(", " in response):
        refined_keywords = response.strip().split(", ")
    else:
        refined_keywords = response.strip().split(",")

    print("Refined keywords:", refined_keywords)

    return refined_keywords

In [6]:
def cluster(keywords):
    system_intel = "You are an NLP model. Your will be given a list of keywords. Your job is to analyze the keywords given to you with the help of your NLP abilities, and generate a number (for each pair) in the range of [0,1] which defines how closely those keywords are related to each other. YOUR RESPONSE SHOULD BE IN THE FOLLOWING FORMAT. 1. DISPLAY NO REASONS OR ANY OTHER EXTRA WORDS. 2. FOR EACH PAIR, Keyword1, Keyword2, Relatedness Score SEPERATED BY COMMA. 3. SEPARATE EACH PAIR DATA WITH A NEWLINE."
    c = 1
    while(c < 4):
        result = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[{"role": "system", "content": system_intel},{"role": "user", "content": "Keywords : "+str(keywords)}],
        )
        response = result["choices"][0]["message"]["content"]
        print("Response:\n", response, sep="")

        output = response.split("\n")

        n = len(output)
        i = 0
        while(i < n):
            if(len(output[i])==0):
                output.pop(i)
                n -= 1
                i -= 1
            i += 1
        for i in range(len(output)):
            output[i] = output[i].split(",")
            output[i][0] = output[i][0].strip()
            output[i][1] = output[i][1].strip()
            output[i][2] = output[i][2].strip()
            if output[i][2][-1] == "\\":
                output[i][2] = output[i][2][:-1]
            output[i][2] = float(output[i][2])
        # print(output)
        def get_index_of_cluster_containing_k(k):
            for i in range(len(clusters)):
                if k in clusters[i]:
                    return i

        clusters = []
        inserted_keywords = []

        for i in range(len(output)):
            k1 = output[i][0]
            k2 = output[i][1]
            score = output[i][2]
            if score >= 0.6:
                if k1 in inserted_keywords and k2 in inserted_keywords:
                    # search for k1
                    j1 = get_index_of_cluster_containing_k(k1)
                    # print("j1:",j1,end=" ")
                    j2 = get_index_of_cluster_containing_k(k2)
                    if(len(clusters[j1])==1):
                        clusters[j1].remove(k1)
                        clusters[j2].append(k1)
                    elif(len(clusters[j2])==1):
                        clusters[j2].remove(k2)
                        clusters[j1].append(k2)
                    # print('0',end="")
                elif k1 in inserted_keywords:
                    # search for k1 and insert k2 in that cluster
                    j = get_index_of_cluster_containing_k(k1)
                    clusters[j].append(k2)
                    inserted_keywords.append(k2)
                    # print('1',end="")
                elif k2 in inserted_keywords:
                    # search for k2 and insert k1 in that cluster
                    j = get_index_of_cluster_containing_k(k2)
                    clusters[j].append(k1)
                    inserted_keywords.append(k1)
                    # print('2',end="")
                else:
                    clusters.append([k1, k2])
                    inserted_keywords.append(k1)
                    inserted_keywords.append(k2)
                    # print('3',end="")
            else:
                if k1 not in inserted_keywords:
                    clusters.append([k1])
                    inserted_keywords.append(k1)
                    # print('4',end="")
                if k2 not in inserted_keywords:
                    clusters.append([k2])
                    inserted_keywords.append(k2)
                    # print('5',end="")
            # print(clusters,";",inserted_keywords)

        print("Clusters:", clusters)

        keywords_x = []
        clusters_y = []
        for i in range(len(clusters)):
            for keyword in clusters[i]:
                keywords_x.append(keyword)
                clusters_y.append(i+1)
        # print(keywords_x,"\n",clusters_y,sep="")

        unique_clusters = []
        for i in range(len(clusters_y)):
            clusters_y[i] = str(clusters_y[i])
            if str(clusters_y[i]) not in unique_clusters:
                unique_clusters.append(str(clusters_y[i]))

        import plotly.express as px

        data = dict(
            character=["Clusters"] + unique_clusters + keywords_x,
            parent=[""] + ["Clusters"]*len(unique_clusters) + clusters_y,
            value=[1]*(len(unique_clusters)+len(keywords_x)+1)
        )

        fig = px.sunburst(
            data,
            names='character',
            parents='parent',
            values='value',
        )

        fig.update_traces(
            insidetextorientation='radial'
        )

        fig.update_layout(
            autosize=False,
            width=900,
            height=900,
            font=dict(
                family="Comic Sans MS",
                size=18
            )
        )

        fig.show()

        if not os.path.exists("static/images"):
            os.mkdir("static/images")
        fig.write_image("static/images/clusters"+str(c)+".png")
        c += 1

In [7]:
app = Flask(__name__)
@app.route('/', methods=['GET', 'POST'])
def home():
    bc = False
    if request.method == 'POST':
        query = request.form['keyword']

        download_scopus_extracts(query)
        print("Scopus data extraction: successful.\n")
        
        time.sleep(70)
        keywords = extract_keywords()
        print("Keyword extraction: successful.\n")
        
        time.sleep(70)
        keywords = refine_keywords(keywords)
        print("Keyword refinement: successful.\n")
        
        time.sleep(70)
        cluster(keywords)
        print("Clustering: successful.\n")
        bc = True
        return render_template('index.html',bc=bc)
    
    return render_template('index.html')

if __name__ == '__main__':
    app.run(debug=True, use_reloader=False)

 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit


Scopus data extraction: successful.

charging infrastructure, electric vehicles, machine learning
brain waves, memory, learning
deep neural network, diagnosis test, feature importance
All keywords: ['charging infrastructure', 'electric vehicles', 'machine learning', 'brain waves', 'memory', 'learning', 'deep neural network', 'diagnosis test', 'feature importance']
Keyword extraction: successful.

Refined keywords: ['charging infrastructure', 'electric vehicles', 'machine learning', 'deep neural network', 'diagnosis test', 'feature importance']
Keyword refinement: successful.

Response:
charging infrastructure, electric vehicles, 0.2
charging infrastructure, machine learning, 0.1
charging infrastructure, deep neural network, 0.1
charging infrastructure, diagnosis test, 0.1
charging infrastructure, feature importance, 0

electric vehicles, machine learning, 0.2
electric vehicles, deep neural network, 0.2
electric vehicles, diagnosis test, 0.1
electric vehicles, feature importance, 0

mac

Response:
charging infrastructure, electric vehicles, 0.8
charging infrastructure, machine learning, 0.2
charging infrastructure, deep neural network, 0.1
charging infrastructure, diagnosis test, 0.3
charging infrastructure, feature importance, 0.4
electric vehicles, machine learning, 0.3
electric vehicles, deep neural network, 0.1
electric vehicles, diagnosis test, 0.2
electric vehicles, feature importance, 0.1
machine learning, deep neural network, 0.9
machine learning, diagnosis test, 0.7
machine learning, feature importance, 0.6
deep neural network, diagnosis test, 0.5
deep neural network, feature importance, 0.3
diagnosis test, feature importance, 0.7
Clusters: [['charging infrastructure', 'electric vehicles'], [], ['deep neural network', 'machine learning', 'diagnosis test', 'feature importance'], [], []]


Response:
charging infrastructure, electric vehicles, 0.8
machine learning, deep neural network, 0.9
diagnosis test, feature importance, 0.2
Clusters: [['charging infrastructure', 'electric vehicles'], ['machine learning', 'deep neural network'], ['diagnosis test'], ['feature importance']]


127.0.0.1 - - [27/Jul/2023 22:27:15] "POST / HTTP/1.1" 200 -


Clustering: successful.

