## GUI for Data Science

In [1]:
import sys
# Run pip through the same Python interpreter that is executing this notebook
# (sys.executable), so the packages land in the environment the kernel uses.
!{sys.executable} -m pip install PyPDF2 pandas spacy nltk elasticsearch elasticsearch_dsl fuzzywuzzy




In [2]:
import PyPDF2
import pandas as pd
import spacy
import nltk
import elasticsearch
from collections import Counter
from string import punctuation
from tqdm.auto import tqdm
from tkinter import font, ttk

In [None]:
import tkinter as tk
from tkinter import ttk, font
import pandas as pd
from fuzzywuzzy import fuzz, process
import random

def perform_search(query, max_results=6, fuzzy_threshold=80):
    """Return article titles from the CSV that match ``query``.

    Matching happens in three stages:

    1. Case-insensitive literal substring match against the 'Keyphrases'
       and 'Keywords' columns.
    2. If nothing matched and the query has at least 3 characters, the
       closest 'Keywords' entry is found with fuzzy matching and, when its
       score reaches ``fuzzy_threshold``, a "Did you mean" hint is written
       into ``results_text``.
    3. If there are still no results, a case-insensitive literal substring
       match against the 'Title' column.

    Args:
        query: Text typed by the user. It is matched literally, never as a
            regular expression.
        max_results: Maximum number of titles returned. When more titles
            match, a random sample of this size is returned.
        fuzzy_threshold: Minimum fuzzy score (0-100) required before a
            "Did you mean" suggestion is shown.

    Returns:
        A list of unique titles (missing titles are dropped), in CSV order
        unless random sampling was needed to honour ``max_results``.
    """
    results_text.delete('1.0', tk.END)

    # Read the CSV file
    df = pd.read_csv('../csv/gdit_articles.csv')

    def column_contains(column):
        # fillna/astype keep missing or non-text cells from producing NaN in
        # the mask (which boolean indexing rejects); regex=False keeps
        # characters such as "+" or "(" in the query from being parsed as a
        # pattern.
        text = df[column].fillna('').astype(str)
        return text.str.contains(query, case=False, regex=False)

    def matching_titles(mask):
        return df.loc[mask, 'Title'].dropna().tolist()

    # Exact (substring) matching on the keyword-like columns
    search_results = matching_titles(
        column_contains('Keyphrases') | column_contains('Keywords')
    )

    # Use fuzzy matching only if no exact matches are found and the query is
    # long enough for a fuzzy score to be meaningful
    if not search_results and len(query) >= 3:
        keyword_choices = df['Keywords'].dropna().astype(str).tolist()
        # extractOne returns None when there is nothing to choose from
        match_result = process.extractOne(query, keyword_choices, scorer=fuzz.partial_ratio)

        if match_result is not None and match_result[1] >= fuzzy_threshold:
            words = match_result[0].split()
            # Suggest only the first word, without its trailing comma
            first_word = words[0].rstrip(',') if words else ''
            if first_word:
                # A disabled Text widget ignores insert(), so enable it first.
                # It is left enabled, as before, so text inserted by the
                # caller after this function returns is also shown.
                results_text.config(state="normal")
                results_text.tag_configure("big", font=("Arial", 14))
                results_text.insert(tk.END, "Did you mean: ", "big")
                results_text.insert(tk.END, f"{first_word}?", "big")
                results_text.insert(tk.END, "\n\n")

    # Fall back to matching on the titles themselves
    if not search_results:
        search_results = matching_titles(column_contains('Title'))

    # Remove duplicates while keeping the order in which titles appear
    search_results = list(dict.fromkeys(search_results))

    # Limit the results to a random selection of max_results articles
    if len(search_results) > max_results:
        search_results = random.sample(search_results, max_results)

    return search_results



def populate_summary(selected_title):
    """Show the summary and keywords of the article titled ``selected_title``.

    The CSV is re-read and the first row whose 'Title' equals
    ``selected_title`` is used. Its 'Abstract Summary' goes into
    ``summary_text``; its 'Keywords' and 'Keyphrases' go into
    ``keywords_text`` on separate lines.

    Both panes are always cleared first. If no row matches, they are left
    empty instead of raising.

    Args:
        selected_title: Exact title of the article to display.
    """
    df = pd.read_csv('../csv/gdit_articles.csv')
    matches = df.loc[df['Title'] == selected_title]

    summary_text.delete('1.0', tk.END)
    keywords_text.delete('1.0', tk.END)

    if matches.empty:
        return

    # Titles are not guaranteed to be unique, so take the first matching row
    # rather than requiring exactly one.
    article = matches.iloc[0]

    summary_text.insert(tk.END, f"{article['Abstract Summary']}")
    keywords_text.insert(tk.END, f"{article['Keywords']}\n{article['Keyphrases']}")


def clear_results_text():
    """Remove all text from the results pane.

    ``results_text`` is created with ``state="disabled"``, and a disabled Text
    widget silently ignores ``delete``. The widget is therefore enabled for
    the duration of the call and then put back in whatever state it was in.
    """
    previous_state = results_text.cget("state")
    results_text.config(state="normal")
    results_text.delete('1.0', tk.END)
    results_text.config(state=previous_state)


# Main Window
# Root Tk window for the application; every other widget is created under it.
window = tk.Tk()
window.title("DISCOVER - Data Intelligence and Search Capability")
window.geometry("900x600")
window.configure(background="light grey")

# Define a custom font with the desired size
# (shared by the section labels and the search button style)
font_size = 18
custom_font = font.Font(size=font_size)

# Search Bar
# Frame at the top of the window holding the entry and the search button
search_frame = ttk.Frame(window)
search_frame.pack(pady=15)

# Text entry for the query, placed in the first grid column of search_frame
search_box = ttk.Entry(search_frame, width=48, font=("Arial", 18))
search_box.configure(background="white")
search_box.grid(row=0, column=0, padx=0)
# Placeholder text shown until the entry is used
search_box.insert(tk.END, "Search Here")

def clear_search_box(event):
    """Remove the "Search Here" placeholder when the entry gains focus.

    Only the placeholder is cleared. Text the user has already typed is kept
    when focus leaves the entry and later returns to it.

    Args:
        event: The Tk ``<FocusIn>`` event (unused).
    """
    if search_box.get() == "Search Here":
        search_box.delete(0, tk.END)

search_box.bind("<FocusIn>", clear_search_box)

# search button
def search_button_clicked():
    """Run a search for the text in ``search_box`` and display the results.

    Clears the results, summary and keywords panes, runs ``perform_search``
    on the current query, empties the search box, then writes a
    "Showing N results" header followed by one button per matching title.
    Clicking a button calls ``populate_summary`` with that title.
    """
    # Retrieve query from search_box widget
    query = search_box.get()

    # results_text is normally read-only, and a disabled Text widget silently
    # ignores insert/delete, so enable it while the pane is rebuilt.
    results_text.config(state="normal")
    try:
        # Remove previous result buttons
        for child in results_text.winfo_children():
            child.destroy()

        # Clear previous search results
        results_text.delete('1.0', tk.END)
        summary_text.delete('1.0', tk.END)
        keywords_text.delete('1.0', tk.END)

        # Perform new search
        search_results = perform_search(query)

        # Clear search_box widget
        search_box.delete(0, tk.END)

        # Update display with new search results
        results_text.insert(tk.END, f"Showing {len(search_results)} results for '{query}':\n\n")
        for title in search_results:
            # Bind the title as a default argument so each button keeps its
            # own title rather than the loop variable's final value.
            result_button = ttk.Button(
                results_text,
                text=title,
                command=lambda selected_title=title: populate_summary(selected_title),
            )
            # Embed the button in the text flow (one per line) so it sits
            # below the header and scrolls with the pane, instead of being
            # packed on top of the text.
            results_text.window_create(tk.END, window=result_button)
            results_text.insert(tk.END, "\n")
    finally:
        # Return the pane to read-only even if the search raised
        results_text.config(state="disabled")

# Style for the search button: custom font, small padding, light-blue background
button_style = ttk.Style()
button_style.configure("Custom.TButton", font=custom_font, padding=3, background="light blue")
# Search button sits in the second grid column, next to the search entry
search_button = ttk.Button(search_frame, text="Search", command=search_button_clicked, style="Custom.TButton")
search_button.grid(row=0, column=1, padx=3)

# Article Title Display
# Left-hand pane; expands to take the space not used by the summary pane
results_frame = ttk.Frame(window)
results_frame.pack(pady=10, side='left', fill='both', expand=True)  # Use pack for results_frame

results_label = ttk.Label(results_frame, text="Article Results:", font=custom_font)
results_label.pack(pady=10)
results_label.configure(background="white", foreground="black")

# Create a scrollbar for the results_text widget (vertical)
results_scrollbar = ttk.Scrollbar(results_frame, orient=tk.VERTICAL)
results_scrollbar.pack(side='right', fill='y')

# Set fixed width and height for results_text
# yscrollcommand lets the text widget report its view position to the scrollbar
results_text = tk.Text(results_frame, height=35, width=70, wrap='none', yscrollcommand=results_scrollbar.set)
results_text.pack(side='left', fill='y')  # Use pack with side='left' for results_text

results_text.configure(background="white", foreground="black")
# NOTE: while the widget is disabled, Tk ignores insert()/delete() calls on it,
# so code that writes to results_text has to set state="normal" first.
results_text.config(state="disabled")  # Disable editing of the text widget

# Configure the scrollbar to scroll the results_text widget
results_scrollbar.config(command=results_text.yview)

# Summary Display
# Right-hand pane holding both the summary and the keywords text boxes
summary_frame = ttk.Frame(window)
summary_frame.pack(pady=0, padx=10, side='right')

summary_label = ttk.Label(summary_frame, text="Summary:", font=custom_font)
summary_label.pack(pady=10)
summary_label.configure(background="white")

# Word-wrapped text box that populate_summary fills with the article summary
summary_text = tk.Text(summary_frame, height=15, width=50, wrap=tk.WORD)
summary_text.pack()
summary_text.configure(background="white", foreground="black")

# Keywords Display
keywords_label = ttk.Label(summary_frame, text="Keywords:", font=custom_font)
keywords_label.pack(pady=10)
keywords_label.configure(background="white")

# Word-wrapped text box that populate_summary fills with keywords and keyphrases
keywords_text = tk.Text(summary_frame, height=8, width=50, wrap=tk.WORD)
keywords_text.pack()
keywords_text.configure(background="white", foreground="black")

# Start GUI
# Blocks here, processing events, until the window is closed
window.mainloop()


In [4]:
df = pd.read_csv('../csv/gdit_articles.csv')  # Run this only if the code that generates df has not already been run