###  Loading

In [1]:
import pandas as pd

# Read the CSV file into a DataFrame
df = pd.read_csv("1.1 Files/to_visit_links.csv", header=None)
df = df.rename(columns={df.columns[0]: "Link"})
df

Unnamed: 0,Link
0,https://usjr.edu.ph/
1,https://usjr.edu.ph/about/
2,https://usjr.edu.ph/admissions/
3,https://usjr.edu.ph/academics/
4,https://usjr.edu.ph/sas/
...,...
1303284,https://usjr.edu.ph/events/event/coe-ambassado...
1303285,https://usjr.edu.ph/events/event/coe-camping-b...
1303286,https://usjr.edu.ph/portfolio/sample1/
1303287,https://usjr.edu.ph/portfolio/high-school/


###  Remove Duplicates

In [2]:
# Remove duplicates
df = df.drop_duplicates()
df.to_csv('1.2 Files/RemoveDuplicates.csv', index=False)
df

Unnamed: 0,Link
0,https://usjr.edu.ph/
1,https://usjr.edu.ph/about/
2,https://usjr.edu.ph/admissions/
3,https://usjr.edu.ph/academics/
4,https://usjr.edu.ph/sas/
...,...
1286522,https://usjr.edu.ph/events/event/coe-ambassado...
1286523,https://usjr.edu.ph/events/event/coe-camping-b...
1291216,https://usjr.edu.ph/portfolio/sample1/
1292382,https://usjr.edu.ph/portfolio/high-school/


###  Sorting

#### By base

In [3]:
import urllib.parse

# Define a function to extract the base URL from a link
def get_base(url):
    parsed = urllib.parse.urlparse(url)
    return '/'.join(parsed.path.split('/')[:2])

# Ensure that all entries in the 'Link' column are strings
df.loc[:, "Link"] = df["Link"].astype(str)

# Create a new column 'base' with the extracted base URLs
df.loc[:, 'base'] = df["Link"].apply(get_base)

# Sort the DataFrame by 'base' and 'Link'
df = df.sort_values(['base', "Link"])

# Drop the temporary 'base' column if desired
df = df.drop(columns=['base'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'base'] = df["Link"].apply(get_base)


#### By length

In [4]:
def get_base(url):
    parsed = urllib.parse.urlparse(url)
    return '/'.join(parsed.path.split('/')[:2])

def get_route_length(url):
    parsed = urllib.parse.urlparse(url)
    return len(parsed.path.split('/')) - 1  # subtract 1 because an initial '/' results in an extra empty string

df["Link"] = df["Link"].astype(str)  # ensuring that all entries are strings
df['base'] = df["Link"].apply(get_base)
df['route_length'] = df["Link"].apply(get_route_length)
df = df.sort_values(['route_length', 'base', "Link"])
df = df.drop(columns=['base'])  # if you want to drop the temporary 'base' column
df.reset_index(drop=True, inplace=True)

print(df)

                                                   Link  route_length
0                                http://edp.usjr.edu.ph             0
1                        http://external@usjr.edu.ph%20             0
2                               http://icto.usjr.edu.ph             0
3                  http://recoletoscongress.usjr.edu.ph             0
4                             http://webdev@usjr.edu.ph             0
...                                                 ...           ...
2554  https://rmrj.usjr.edu.ph/rmrj/index.php/RMRJ/a...             8
2555  https://rmrj.usjr.edu.ph/rmrj/index.php/RMRJ/a...             8
2556  https://rmrj.usjr.edu.ph/rmrj/index.php/RMRJ/a...             8
2557  https://rmrj.usjr.edu.ph/rmrj/index.php/RMRJ/a...             8
2558  https://rmrj.usjr.edu.ph/https%3A//rmrj.usjr.e...             9

[2559 rows x 2 columns]


### Removing tags and betas

In [5]:
df = df[~df["Link"].str.contains('beta.usjr.edu.ph')]
df = df[~df["Link"].str.contains('usjr.edu.ph/tag')]
df = df[~df["Link"].str.contains('rmrj.usjr.edu.ph/index.php/tag')]
df = df[~df["Link"].str.contains('eclass.')]
df = df[~df["Link"].str.contains('edp.usjr')]
df = df[~df["Link"].str.contains('webdev')]
df = df[~df["Link"].str.contains("external@usjr")]
df = df[~df["Link"].str.contains("ezproxy")]
df = df[~df["Link"].str.contains("recoletoscongress.usjr.edu.ph")]
df = df[~df["Link"].str.contains("icto.usdjr")]
df

Unnamed: 0,Link,route_length
2,http://icto.usjr.edu.ph,0
14,https://rmrj.usjr.edu.ph,0
15,https://usjr.edu.ph,0
16,https://www.usjr.edu.ph,0
18,http://opac.usjr.edu.ph:8080/,1
...,...,...
2554,https://rmrj.usjr.edu.ph/rmrj/index.php/RMRJ/a...,8
2555,https://rmrj.usjr.edu.ph/rmrj/index.php/RMRJ/a...,8
2556,https://rmrj.usjr.edu.ph/rmrj/index.php/RMRJ/a...,8
2557,https://rmrj.usjr.edu.ph/rmrj/index.php/RMRJ/a...,8


In [6]:
import pandas as pd
from urllib.parse import urlparse

# Assuming your dataframe is named df
df['base_route'] = df['Link'].apply(lambda x: '{uri.scheme}://{uri.netloc}/'.format(uri=urlparse(x)))

# Group by base_route and filter rows where route_length > 0
new_df = df[df['route_length'] > -1].groupby('base_route').apply(lambda group: group)

# Optionally, reset the index for the new dataframe
new_df = new_df.reset_index(drop=True)
new_df

new_df.to_csv("grouped.csv")

df_zeroes = df[df['route_length'] == 0].copy()

# Reset the index of df_zeroes
df_zeroes.reset_index(drop=True, inplace=True)
df = new_df


In [7]:
df

Unnamed: 0,Link,route_length,base_route
0,http://icto.usjr.edu.ph,0,http://icto.usjr.edu.ph/
1,http://opac.usjr.edu.ph:8080/,1,http://opac.usjr.edu.ph:8080/
2,http://rmrj.usjr.edu.ph/index.php/RMRJ,2,http://rmrj.usjr.edu.ph/
3,https://rmrj.usjr.edu.ph,0,https://rmrj.usjr.edu.ph/
4,https://rmrj.usjr.edu.ph/,1,https://rmrj.usjr.edu.ph/
...,...,...,...
2031,https://usjr.edu.ph/events/category/sdpc/page/2/,6,https://usjr.edu.ph/
2032,https://usjr.edu.ph/events/category/sdpc/page/3/,6,https://usjr.edu.ph/
2033,https://usjr.edu.ph/events/category/senior-hig...,6,https://usjr.edu.ph/
2034,https://www.facebook.com/sao.usjr.edu.ph/,2,https://www.facebook.com/


In [8]:
dfs = {}
unique_routes = df['base_route'].unique()
print(unique_routes)
for i, route in enumerate(unique_routes):
    dfs[f'df_{i}'] = df[df['base_route'] == route]

['http://icto.usjr.edu.ph/' 'http://opac.usjr.edu.ph:8080/'
 'http://rmrj.usjr.edu.ph/' 'https://rmrj.usjr.edu.ph/'
 'https://usjr.edu.ph/' 'https://www.facebook.com/'
 'https://www.usjr.edu.ph/']


In [9]:
for key, value in dfs.items():
    if isinstance(value, pd.DataFrame):
        value = value.drop('base_route', axis=1)
        dfs[key] = value.copy()
    else:
        print(f"Element at key {key} is not a DataFrame.")


In [10]:
from urllib.parse import urlparse

# loop through each dataframe in dfs dictionary
for key, df in dfs.items():
    # Exclude rows where route_length is 0
    df = df[df['route_length'] > 0]
    # create new column for each first route
    df['first_route'] = df['Link'].apply(lambda x: urlparse(x).path.split('/')[1] if len(urlparse(x).path.split('/')) > 1 else '')
    # create new dataframe with first route as columns
    df_grouped = df.groupby('first_route')['Link'].apply(list).apply(pd.Series).T
    # save the transformed dataframe back into the dictionary
    dfs[key] = df_grouped


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['first_route'] = df['Link'].apply(lambda x: urlparse(x).path.split('/')[1] if len(urlparse(x).path.split('/')) > 1 else '')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['first_route'] = df['Link'].apply(lambda x: urlparse(x).path.split('/')[1] if len(urlparse(x).path.split('/')) > 1 else '')


In [11]:
# loop through each dataframe in dfs dictionary
for key, value in dfs.items():
    # save the dataframe to a csv file
    value.to_csv(f"1.2 Files/dfs/{key}.csv", index=False)
