**Keyword Identification Squad: Group Members**


1.   ANAS KP (MS21224)
2.   INDRAJITH (MS21092)
3.   FAYIZ M (MS21078)




**Procedure**

1. Inspect the data source
2. Scrape HTML content from the IISER Pune webpage (accessing the IISER Mohali site was difficult due to security issues)
3. Parse the HTML code with Beautiful Soup
4. Store the data in an SQLite database
5. Perform analysis on words and digits
6. Plot graphs
7. Visualization
8. Error check and cross-check using another small website (IISER Mohali Moodle website)

In [None]:
# Scraping HTML Contents from IISER Pune website (Main websites) and parsing the data

import requests
from bs4 import BeautifulSoup


# Pages of the IISER Pune main website whose visible text is collected.
collected_urls = ["https://www.iiserpune.ac.in/",
                  "https://www.iiserpune.ac.in/institute/about",
                  "https://www.iiserpune.ac.in/institute/people",
                  "https://www.iiserpune.ac.in/institute/campus",
                  "https://www.iiserpune.ac.in/admissionapplication",
                  "https://www.iiserpune.ac.in/research",
                  "https://www.iiserpune.ac.in/research/department/biology",
                  "https://www.iiserpune.ac.in/research/department/chemistry",
                  "https://www.iiserpune.ac.in/research/department/data-science",
                  "https://www.iiserpune.ac.in/research/department/earth-and-climate-science",
                  "https://www.iiserpune.ac.in/research/department/humanities-and-social-sciences",
                  "https://www.iiserpune.ac.in/research/department/mathematics",
                  "https://www.iiserpune.ac.in/research/department/physics",
                  "https://www.iiserpune.ac.in/research/department/science-education",
                  "https://www.iiserpune.ac.in/research/research-centres-and-initiatives",
                  "https://www.iiserpune.ac.in/research/research-facilities",
                  "https://www.iiserpune.ac.in/research/publications",
                  "https://www.iiserpune.ac.in/news",
                  "https://www.iiserpune.ac.in/events/",
                  "https://www.iiserpune.ac.in/education",
                  "https://www.iiserpune.ac.in/engage/outreach-and-training",
                  "https://www.iiserpune.ac.in/engage/partnerships",
                  "https://www.iiserpune.ac.in/library",
                  "https://www.iiserpune.ac.in/opportunities",
                  ]

# Text of every page that was fetched successfully, in URL order.
page_texts = []

for url in collected_urls:
    try:
        # NOTE(review): verify=False turns off TLS certificate verification.
        # It is kept because the original relied on it, but it should be
        # removed once the site can be reached with verification enabled.
        # The timeout stops one unresponsive page from hanging the whole run.
        page = requests.get(url, verify=False, timeout=30)
        # Reject 4xx/5xx responses so error pages are not counted as content.
        page.raise_for_status()
    except requests.exceptions.RequestException as err:
        # Best effort: report the failure and carry on with the other pages.
        print("Skipping", url, "-", err)
        continue
    soup = BeautifulSoup(page.text, 'html.parser')
    data = soup.get_text()
    page_texts.append(data)
    print(data)

# Join with a newline so the last word of one page cannot fuse with the
# first word of the next page.
whole_data = "\n".join(page_texts)

In [None]:
# Extract the alphabetic words and filter the data (remove stop words and single-letter tokens)
import spacy
from collections import Counter
from spacy.lang.en.stop_words import STOP_WORDS


# Load the small English pipeline and set its max_length to 2,000,000.
nlp = spacy.load("en_core_web_sm")
nlp.max_length = 2000000

# Swap left-to-right marks (U+200E) for spaces, then run the pipeline once
# over the combined text of all pages.
new_data = whole_data.replace("\u200e", " ")
doc = nlp(new_data)

# Every purely alphabetic token, in document order.
alphabet_words = [token.text for token in doc if token.is_alpha]

# Keywords: alphabetic words whose vocabulary entry is not flagged as a stop
# word and that are longer than one character.
filtered_data = [
    candidate
    for candidate in alphabet_words
    if not nlp.vocab[candidate].is_stop and len(candidate) > 1
]

print("Unfiltered_data: ", alphabet_words)
print("Filtered_data: ", filtered_data)

In [42]:
# Store the data in Sqlite database

import sqlite3

# Create the Words table in DATA_BANK.db if it does not exist yet.
conn = sqlite3.connect('DATA_BANK.db')
try:
    cursor = conn.cursor()
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS Words (
            id INTEGER PRIMARY KEY NOT NULL,
            word TEXT NOT NULL
        )
    ''')
    conn.commit()
finally:
    # Close the connection even if creating the table raised.
    conn.close()

In [43]:

# Store the filtered keywords in the Words table, one row per occurrence.
conn = sqlite3.connect('DATA_BANK.db')
try:
    cursor = conn.cursor()

    # The table persists between runs (it is created with IF NOT EXISTS), so
    # clear rows from earlier runs first; otherwise every re-run of this cell
    # would append the whole word list again.
    cursor.execute("DELETE FROM Words")

    # Parameterised bulk insert; each word is bound as a one-element tuple.
    cursor.executemany(
        "INSERT INTO Words (word) VALUES (?)",
        ((word,) for word in filtered_data),
    )

    conn.commit()
finally:
    # Close the connection even if an insert raised.
    conn.close()

In [None]:

# Read every stored word back from the Words table and print it.
conn = sqlite3.connect('DATA_BANK.db')
try:
    cursor = conn.cursor()
    cursor.execute("SELECT word FROM Words")
    # Each fetched row is a one-element tuple: (word,).
    keywords = cursor.fetchall()
finally:
    # Close the connection even if the query raised.
    conn.close()

for word in keywords:
    print(word[0])

In [None]:
# Extract the tokens that consist only of digits.
# `doc` is the result of running the pipeline over `new_data` in the
# word-extraction cell, so it is reused here instead of reloading the model
# and processing the same text a second time.
numericals_and_digits = [token.text for token in doc if token.is_digit]

print("Numericals and Digits:", numericals_and_digits)


In [None]:
# Print each distinct keyword once, in ascending sorted order.
unique = list(set(filtered_data))
unique.sort()
for entry in unique:
    print(entry)




In [None]:
# Print the distinct keywords from shortest to longest.
# Words of equal length are ordered alphabetically: sorting by length alone
# would leave them in set iteration order, which is not guaranteed to be the
# same from one interpreter run to the next.
length = sorted(set(filtered_data), key=lambda word: (len(word), word))
for words_by_length in length:
    print(words_by_length)

In [None]:
# Count the number of occurrences (repetitions) of each word.
# No spaCy pipeline is loaded here: this cell only works on the word lists
# and the raw text that earlier cells already produced.
word_freq = Counter(alphabet_words)      # every alphabetic word
keyword_freq = Counter(filtered_data)    # keywords after filtering
print(word_freq)

total_word = len(alphabet_words)
total_keywords = len(filtered_data)

# Whitespace-separated chunks of the raw scraped text.
Total_data = whole_data.split()
totalwordcount = len(Total_data)



In [None]:
# Report the three totals computed in the frequency-count cell.
summary_rows = (
    ("Total Number of texts extracted: ", totalwordcount),
    ("Total Number of words:", total_word),
    ("Total Number of Keywords:", total_keywords),
)
for label, count in summary_rows:
    print(label, count)


In [None]:
#Barchart
import matplotlib.pyplot as plt


# Bar chart of the ten most frequent keywords.
common_words = keyword_freq.most_common(10)

# Split the (word, count) pairs into two parallel tuples for plotting.
words, frequencies = zip(*common_words)

# One colour per bar.
colors = ['darkred', 'violet', 'black', 'magenta', 'lightgreen',
          'lightsalmon', 'lightblue', 'lightgray', 'blue', 'yellow']

tick_font = 12
axis_font = 14

plt.figure(figsize=(12, 8))
plt.bar(words, frequencies, color=colors)

plt.title('Top 10 Most Common Words', fontsize=16)
plt.xlabel('Different Words', fontsize=axis_font)
plt.ylabel('Frequency', fontsize=axis_font)
plt.xticks(rotation=45, fontsize=tick_font)
plt.yticks(fontsize=tick_font)

# Dashed horizontal guide lines only.
plt.grid(axis='y', linestyle='--', alpha=0.7)

# Write each count just above its bar.
for position, count in enumerate(frequencies):
    plt.text(position, count, str(count),
             ha='center', va='bottom', fontsize=tick_font)

plt.tight_layout()
plt.show()

In [None]:
# Display the word cloud for the words
from wordcloud import WordCloud

# Build the cloud from the keyword counts so that word size reflects how
# often each keyword occurs. Joining the Counter's keys into a string would
# pass every keyword exactly once and discard the counts.
wordcloud = WordCloud(width=800, height=400).generate_from_frequencies(keyword_freq)

plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

In [None]:
# Display the word cloud for the most common keywords.
# `common_words` is the list of (word, count) pairs from the bar-chart cell.
# Turning it into a dict keeps the counts, so the cloud sizes each word by
# its frequency instead of drawing all of them with equal weight.
top_keyword_freq = dict(common_words)

wordcloud = WordCloud(width=800, height=400).generate_from_frequencies(top_keyword_freq)


plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

**Cross checking**


*   Cross-checking was done using another, smaller website containing less data (the IISER Mohali Moodle website).


In [None]:
# Fetch the cross-check page and extract its visible text.
check_url = "https://web.iisermohali.ac.in/"

# The timeout keeps an unresponsive server from hanging the cell.
response = requests.get(check_url, timeout=30)
# Fail loudly on 4xx/5xx so an error page is not analysed as real content.
response.raise_for_status()

check_soup = BeautifulSoup(response.text, 'html.parser')
check_data = check_soup.get_text()

print(check_data)

In [None]:
# Number of whitespace-separated chunks in the cross-check page text.
check_words = check_data.split()
wordcount = len(check_words)
print(f"{wordcount}")

In [None]:
# Tokenise the cross-check page and count how often each alphabetic word occurs.
nlp = spacy.load("en_core_web_sm")
nlp.max_length = 2000000  # Set a higher limit (adjust as needed)

# Swap left-to-right marks (U+200E) for spaces before running the pipeline.
new_check_data = check_data.replace("\u200e", " ")
doc1 = nlp(new_check_data)

# Collect the purely alphabetic tokens in document order.
words = []
for token in doc1:
    if token.is_alpha:
        words.append(token.text)

# Frequency of each word and the overall number of words.
word_freq = Counter(words)
total_words = len(words)

print(words)
print(word_freq)
print("Total Number of Words:", total_words)