# Read data from text files, scrape data from URLs, and store everything in a JSON file.

## 1. Read and clean text data

In [46]:
# Load the whole raw text file into memory as one string
filename = "StudentLife.txt"

with open(filename) as text_file:
    data = text_file.read()

In [47]:
# Cut the raw text into chunks at every occurrence of the delimiter
CHUNK_DELIMITER = '====='
data_chunks = data.split(CHUNK_DELIMITER)

In [None]:
def clean_data(text: str) -> str:
    '''Clean raw text by replacing apostrophes and collapsing whitespace.

    Parameters
    ----------
    text (str):
        Raw text to be cleaned

    Returns
    -------
    clean_text (str): Text in which every apostrophe is replaced by a space
        and every run of whitespace (spaces, tabs, newlines) is collapsed
        into a single space. Leading/trailing whitespace is reduced to one
        space, not stripped.
    '''

    # A plain substring replacement is enough here; no regex required.
    clean_text = text.replace("'", " ")
    # Raw string so that "\s" reaches the regex engine instead of being read
    # as an (invalid) Python string escape. "\s+" matches the same text as
    # the former "\s\s*".
    clean_text = re.sub(r"\s+", " ", clean_text)

    return clean_text

In [49]:
# Remove some characters from the text
import re

# Pass every raw chunk through clean_data, preserving the original order
clean_text_data = [clean_data(chunk) for chunk in data_chunks]

In [50]:
# Build one record (dict) per cleaned chunk of the text file:
#   f_no    -> sequential id "tf_1", "tf_2", ...
#   f_order -> always 1 for text-file chunks
#   text    -> the cleaned chunk

data_list = [
    {
        'f_no': f'tf_{position}',
        'f_order': 1,
        'text': chunk_text
    }
    for position, chunk_text in enumerate(clean_text_data, start=1)
]

In [51]:
# Preview the first five records
data_list[:5]

[{'f_no': 'tf_1',
  'f_order': 1,
  'text': 'Student to Student Support Peer support recognizes that students naturally turn to each other for support and connection. Our Student Ambassadors utilize their lived experience as international students to provide friendly, respectful support to help students build a healthy and successful college experience. Click here(https://tbcollege0.sharepoint.com/:b:/s/ExternShare/EWPHCe-qNINHgqTW486g3-YBD-0vkdNy2Ke0sKixC01BLQ?e=cAGXGg) to know more. '},
 {'f_no': 'tf_2',
  'f_order': 1,
  'text': ' STUDENT SUCCESS: Student to Student Support Peer support recognizes that students naturally turn to each other for support and connection. Our Student Ambassadors utilize their lived experience as international students to provide friendly, respectful support to help students build a healthy and successful college experience. Click here(https://tbcollege0.sharepoint.com/:b:/s/ExternShare/EWPHCe-qNINHgqTW486g3-YBD-0vkdNy2Ke0sKixC01BLQ?e=cAGXGg) to know more

## 2. Read and clean URL data

In [53]:
# Read url list from a JSON file
# json must be imported here: this is the first cell that uses it, so a
# fresh kernel (Restart & Run All) would otherwise raise NameError.
import json

# JSON files are UTF-8 by specification, so do not rely on the platform default.
with open('url_list.json', 'r', encoding='utf-8') as fh:
    url_list = json.load(fh)

In [54]:
import bs4
from langchain_community.document_loaders import WebBaseLoader

# Pages to scrape
web_paths = url_list

# Only paragraph and heading elements are handed to the parser
content_tags = ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']
content_filter = bs4.SoupStrainer(content_tags)

loader = WebBaseLoader(
    web_paths=web_paths,
    bs_kwargs={'parse_only': content_filter},
)

# Load the documents
blog_docs = loader.load()

In [None]:
# Split the text into chunks

from langchain.text_splitter import RecursiveCharacterTextSplitter
# Splitter settings, named so they are easy to find and tune
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 0

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
data_splits = text_splitter.split_documents(blog_docs)

In [57]:
# Display the chunks produced by the splitter (shows the full list)
data_splits

[Document(metadata={'source': 'https://loyalistcollege.com/about-loyalist/policies/student-code-of-conduct/'}, page_content='TestimonialsFind Us OnlineStudent Code of ConductLoyalist College is a community of learners where respectful living and learning is a cherished principle. The College uses operational policies and the Student Code of Conduct to inform its students of its expectations for acceptable student conduct in academic and non-academic matters. The primary purpose of the Loyalist College Student Code of Conduct is to be a guideline for appropriate non-academic behaviour across Loyalist College’s diversified community.View the Student Code of ConductStudent Code of Conduct ComplaintsLoyalist College believes all members of the College community have the right to study or work in an environment that is safe, secure and supportive. The College expects students to be responsible for conducting themselves in a manner that respects the rights of employees, other students and Co

In [58]:
# Append one record (dict) per URL chunk to data_list:
#   f_no    -> source URL of the chunk
#   f_order -> 1-based position of the chunk within its source page
#   text    -> the cleaned chunk
#
# NOTE: this cell appends to data_list, so re-running it on its own adds the
# URL records a second time; re-run the cell that creates data_list first.

# None (rather than "") so the first chunk is always numbered 1, even if its
# source happens to be an empty string.
previous_source = None
order_no = 0

for split in data_splits:
    source = split.metadata['source']

    # Relies on chunks of one page being adjacent in data_splits: keep
    # counting while the source is unchanged, restart at 1 when it changes.
    order_no = order_no + 1 if source == previous_source else 1

    data_list.append({
        'f_no': source,
        'f_order': order_no,
        # Reuse clean_data so URL text is cleaned exactly like the text-file chunks
        'text': clean_data(split.page_content)
    })

    previous_source = source

In [59]:
# Preview the last six records
data_list[-6:]

[{'f_no': 'https://loyalistcollege.com/about-loyalist/policies/research-data-management-strategy/',
  'f_order': 4,
  'text': 'the principles of Ownership, Control, Access and Possession (OCAP) and CARE to guide RDM plans.Guiding Principles for Research Data ManagementResearch Excellence'},
 {'f_no': 'https://loyalistcollege.com/about-loyalist/policies/research-data-management-strategy/',
  'f_order': 5,
  'text': 'Research Excellence is achieved when data is gathered, examined, and reported ethically and equitably. Implementing RDM strategies, with clearly defined and accessible guidelines, will contribute to the ongoing culture of Research Excellence at Loyalist.Institutional Support for Researchers Loyalist will deliver guidance and support to our internal researchers with respect to best practices around the management of data. This may include supporting researchers as they build their own tools and storage, or integrating work into already existing data storage repositories, whil

In [60]:
# List the order number and source id of every record, one per line
for record in data_list:
    print(f"{record['f_order']} {record['f_no']}")

1 tf_1
1 tf_2
1 tf_3
1 tf_4
1 tf_5
1 tf_6
1 tf_7
1 tf_8
1 tf_9
1 tf_10
1 tf_11
1 tf_12
1 tf_13
1 tf_14
1 tf_15
1 tf_16
1 tf_17
1 tf_18
1 tf_19
1 tf_20
1 tf_21
1 tf_22
1 tf_23
1 tf_24
1 tf_25
1 tf_26
1 tf_27
1 tf_28
1 tf_29
1 tf_30
1 tf_31
1 tf_32
1 tf_33
1 tf_34
1 tf_35
1 tf_36
1 tf_37
1 tf_38
1 tf_39
1 tf_40
1 tf_41
1 https://loyalistcollege.com/about-loyalist/policies/student-code-of-conduct/
2 https://loyalistcollege.com/about-loyalist/policies/student-code-of-conduct/
3 https://loyalistcollege.com/about-loyalist/policies/student-code-of-conduct/
4 https://loyalistcollege.com/about-loyalist/policies/student-code-of-conduct/
5 https://loyalistcollege.com/about-loyalist/policies/student-code-of-conduct/
6 https://loyalistcollege.com/about-loyalist/policies/student-code-of-conduct/
1 https://loyalistcollege.com/about-loyalist/policies/acad-105-advisory-committees/
2 https://loyalistcollege.com/about-loyalist/policies/acad-105-advisory-committees/
3 https://loyalistcollege.com/about-loy

In [61]:
# Save the data list (text-file and URL data) to a JSON file

import json

# Serialize the records straight into the output file, indented by 4 spaces
with open('text_data_ordered.json', 'w') as output_file:
    json.dump(data_list, output_file, indent=4)