# Creating [CESNET-L](https://www.cesnet-l.net/) final dataset

Emilio Lehoucq - 8/20/24

## Import libraries

In [1]:
import pandas as pd
from glob import glob

## Append data for all weekly compilations

In [2]:
# Get all csv files for each weekly compilation
csv_files = glob("weekly_data/*.csv")

# Read first file
data_all_compilations = pd.read_csv(csv_files[0])

# Variable to count number of files with errors in appending
counter = 0

# Iterate over the rest of the files appending them to the first
for i in range(1, len(csv_files)):
    try:
        data_all_compilations = pd.concat([data_all_compilations, pd.read_csv(csv_files[i])], ignore_index = True)
    except Exception as e:
        counter += 1
        print(f"Exception with file '{csv_files[i]}'.")
        print(f'Exception: {e}.\n')
        pass

# Number of weekly compilations
print(f'\n\n\nNumber of weekly compilations: {len(csv_files)}')

# Number of possibly empty files
print(f'Number of possibly empty files: {counter}')

# Number of total rows
count_rows = data_all_compilations.shape[0]
print(f'Number of total rows: {count_rows}')

# Number of rows without data (all NA)
count_no_postings = data_all_compilations[data_all_compilations.iloc[:, 2:].isna().all(axis=1)].shape[0]
print(f'Number of rows without data: {count_no_postings}')

# Number of postings
print(f'Number of rows with data: {count_rows - count_no_postings}')

Exception with file 'weekly_data/july_2019_week_5_1907E.csv'.
Exception: No columns to parse from file.

Exception with file 'weekly_data/december_2019_week_5_1912E.csv'.
Exception: No columns to parse from file.

Exception with file 'weekly_data/may_2020_week_2_2005B.csv'.
Exception: No columns to parse from file.

Exception with file 'weekly_data/july_2024_week_5_2407E.csv'.
Exception: No columns to parse from file.

Exception with file 'weekly_data/july_2023_week_5_2307E.csv'.
Exception: No columns to parse from file.

Exception with file 'weekly_data/october_2014_week_5_1410E.csv'.
Exception: No columns to parse from file.

Exception with file 'weekly_data/april_2023_week_5_2304E.csv'.
Exception: No columns to parse from file.

Exception with file 'weekly_data/may_2017_week_5_1705E.csv'.
Exception: No columns to parse from file.

Exception with file 'weekly_data/december_2020_week_5_2012E.csv'.
Exception: No columns to parse from file.

Exception with file 'weekly_data/august_2018_

These exceptions happen because the CSVs are empty. The number of exceptions is lower than when I searched for less keywords, which is reassuring. 

This is what the CSV looks like:

In [3]:
data_all_compilations

Unnamed: 0,week,url_compilation,url_posting,timestamp,source_code_message,text_message
0,"April 2023, Week 2",https://listserv.kent.edu/cgi-bin/wa.exe?A1=in...,https://listserv.kent.edu/cgi-bin/wa.exe?A2=in...,2024-08-16 17:08:09,<html><head>\n<title>LISTSERV 16.5 - CESNET-L ...,LISTSERV 16.5 - CESNET-L Archives\n Print\n Re...
1,"April 2023, Week 2",https://listserv.kent.edu/cgi-bin/wa.exe?A1=in...,https://listserv.kent.edu/cgi-bin/wa.exe?A2=in...,2024-08-16 17:09:29,<html><head>\n<title>LISTSERV 16.5 - CESNET-L ...,LISTSERV 16.5 - CESNET-L Archives\n Print\n Ps...
2,"April 2023, Week 2",https://listserv.kent.edu/cgi-bin/wa.exe?A1=in...,https://listserv.kent.edu/cgi-bin/wa.exe?A2=in...,2024-08-16 17:10:44,<html><head>\n<title>LISTSERV 16.5 - CESNET-L ...,LISTSERV 16.5 - CESNET-L Archives\n Print\n As...
3,"April 2023, Week 2",https://listserv.kent.edu/cgi-bin/wa.exe?A1=in...,https://listserv.kent.edu/cgi-bin/wa.exe?A2=in...,2024-08-16 17:12:01,<html><head>\n<title>LISTSERV 16.5 - CESNET-L ...,LISTSERV 16.5 - CESNET-L Archives\n Print\n De...
4,"April 2023, Week 2",https://listserv.kent.edu/cgi-bin/wa.exe?A1=in...,https://listserv.kent.edu/cgi-bin/wa.exe?A2=in...,2024-08-16 17:13:06,<html><head>\n<title>LISTSERV 16.5 - CESNET-L ...,LISTSERV 16.5 - CESNET-L Archives\n Print\n Jo...
...,...,...,...,...,...,...
4197,"September 2015, Week 3",https://listserv.kent.edu/cgi-bin/wa.exe?A1=in...,https://listserv.kent.edu/cgi-bin/wa.exe?A2=in...,2024-08-20 10:01:58,<html><head>\n<title>LISTSERV 16.5 - CESNET-L ...,LISTSERV 16.5 - CESNET-L Archives\n Print\n St...
4198,"September 2015, Week 3",https://listserv.kent.edu/cgi-bin/wa.exe?A1=in...,https://listserv.kent.edu/cgi-bin/wa.exe?A2=in...,2024-08-20 10:02:34,<html><head>\n<title>LISTSERV 16.5 - CESNET-L ...,LISTSERV 16.5 - CESNET-L Archives\n Print\n Yo...
4199,"July 2015, Week 5",https://listserv.kent.edu/cgi-bin/wa.exe?A1=in...,https://listserv.kent.edu/cgi-bin/wa.exe?A2=in...,2024-08-20 09:54:27,<html><head>\n<title>LISTSERV 16.5 - CESNET-L ...,LISTSERV 16.5 - CESNET-L Archives\n Print\n Ar...
4200,"July 2015, Week 5",https://listserv.kent.edu/cgi-bin/wa.exe?A1=in...,https://listserv.kent.edu/cgi-bin/wa.exe?A2=in...,2024-08-20 09:55:20,<html><head>\n<title>LISTSERV 16.5 - CESNET-L ...,LISTSERV 16.5 - CESNET-L Archives\n Print\n DE...


## Checking the quality of the data

In [4]:
print(f"Dimensions of the dataset: {data_all_compilations.shape}")
print("Number of missing values per column:")
data_all_compilations.isnull().sum()


Dimensions of the dataset: (4202, 6)
Number of missing values per column:


week                    0
url_compilation         0
url_posting             0
timestamp               0
source_code_message     0
text_message           61
dtype: int64

In [5]:
# Exploring the rows where text_message is missing
missing_text_message_rows = data_all_compilations[data_all_compilations['text_message'].isnull()]

for i, row in missing_text_message_rows.iterrows():
    print("=============================================================================")
    print("=============================================================================")
    print(f"Row {i}")
    print(row['week'])
    print(row['url_compilation'])
    print(row['url_posting'])
    print(row['timestamp'])
    print(row['source_code_message'])

Row 129
February 2016, Week 1
https://listserv.kent.edu/cgi-bin/wa.exe?A1=ind1602A&L=CESNET-L&X=F428852318C1A1DEB9&Y=emilio%40northwestern.edu
https://listserv.kent.edu/cgi-bin/wa.exe?A2=ind1602A&L=CESNET-L&X=C7CD61500D1E2B661E&Y=emilio%40northwestern.edu&P=7633
2024-08-20 09:28:11
<html><head>













<meta http-equiv="Refresh" content="0;URL=/cgi-bin/wa.exe?A3=ind1602A&amp;L=CESNET%2dL&amp;E=0&amp;P=478740&amp;B=%2d%2d001a113cc0420310ac052af6d1bd&amp;T=text%2Fplain%3b%20charset=UTF%2d8&amp;header=1&amp;X=953B59669F1690B800&amp;Y=emilio%40northwestern.edu">
</head>
<body>


</body></html>
Row 130
February 2016, Week 1
https://listserv.kent.edu/cgi-bin/wa.exe?A1=ind1602A&L=CESNET-L&X=F428852318C1A1DEB9&Y=emilio%40northwestern.edu
https://listserv.kent.edu/cgi-bin/wa.exe?A2=ind1602A&L=CESNET-L&X=C7CD61500D1E2B661E&Y=emilio%40northwestern.edu&P=5470
2024-08-20 09:30:50
<html><head>













<meta http-equiv="Refresh" content="0;URL=/cgi-bin/wa.exe?A3=ind1602A&amp;L=CESNET%2dL&am

Looking at these urls, they do have text. Something went wrong during the process of scraping. Fortunately it's a very small proportion of the total number of rows.

In [6]:
# Check for 'FAILURE' in source_code_message and text_message
failures = data_all_compilations[(data_all_compilations['source_code_message'] == 'FAILURE') | (data_all_compilations['text_message'] == 'FAILURE')]
print(f"Number of rows with 'FAILURE' in source_code_message or text_message: {failures.shape[0]}")

Number of rows with 'FAILURE' in source_code_message or text_message: 0


I checked for this because `scrape.py` was designed to store 'FAILURE' if it failed. It seems that it went well.

In [7]:
# Check for rows where source_code_message or text_message includes login_text
login_text = 'Please enter your email address and your LISTSERV password and click on the "Log In" button.'

login_rows = data_all_compilations[
    (data_all_compilations['source_code_message'].str.contains(login_text, na=False)) |
    (data_all_compilations['text_message'].str.contains(login_text, na=False))
]

print(f"Number of rows with login text in source_code_message or text_message: {login_rows.shape[0]}. These are the rows:")
login_rows

Number of rows with login text in source_code_message or text_message: 49. These are the rows:


Unnamed: 0,week,url_compilation,url_posting,timestamp,source_code_message,text_message
174,"March 2024, Week 1",https://listserv.kent.edu/cgi-bin/wa.exe?A1=in...,https://listserv.kent.edu/cgi-bin/wa.exe?A2=in...,2024-08-16 16:05:52,<html><head>\n<title>Kent State University - L...,Kent State University - LISTSERV 16.5 - Login ...
245,"February 2024, Week 3",https://listserv.kent.edu/cgi-bin/wa.exe?A1=in...,https://listserv.kent.edu/cgi-bin/wa.exe?A2=in...,2024-08-16 16:05:52,<html><head>\n<title>Kent State University - L...,Kent State University - LISTSERV 16.5 - Login ...
367,"December 2016, Week 1",https://listserv.kent.edu/cgi-bin/wa.exe?A1=in...,https://listserv.kent.edu/cgi-bin/wa.exe?A2=in...,2024-08-20 08:52:55,<html><head>\n<title>Kent State University - L...,Kent State University - LISTSERV 16.5 - Login ...
428,"March 2018, Week 4",https://listserv.kent.edu/cgi-bin/wa.exe?A1=in...,https://listserv.kent.edu/cgi-bin/wa.exe?A2=in...,2024-08-19 16:02:40,<html><head>\n<title>Kent State University - L...,Kent State University - LISTSERV 16.5 - Login ...
562,"January 2022, Week 3",https://listserv.kent.edu/cgi-bin/wa.exe?A1=in...,https://listserv.kent.edu/cgi-bin/wa.exe?A2=in...,2024-08-19 10:33:19,<html><head>\n<title>Kent State University - L...,Kent State University - LISTSERV 16.5 - Login ...
617,"November 2023, Week 1",https://listserv.kent.edu/cgi-bin/wa.exe?A1=in...,https://listserv.kent.edu/cgi-bin/wa.exe?A2=in...,2024-08-16 16:05:56,<html><head>\n<title>Kent State University - L...,Kent State University - LISTSERV 16.5 - Login ...
659,"March 2023, Week 1",https://listserv.kent.edu/cgi-bin/wa.exe?A1=in...,https://listserv.kent.edu/cgi-bin/wa.exe?A2=in...,2024-08-16 17:04:06,<html><head>\n<title>Kent State University - L...,Kent State University - LISTSERV 16.5 - Login ...
672,"March 2022, Week 4",https://listserv.kent.edu/cgi-bin/wa.exe?A1=in...,https://listserv.kent.edu/cgi-bin/wa.exe?A2=in...,2024-08-19 10:03:22,<html><head>\n<title>Kent State University - L...,Kent State University - LISTSERV 16.5 - Login ...
673,"March 2022, Week 4",https://listserv.kent.edu/cgi-bin/wa.exe?A1=in...,https://listserv.kent.edu/cgi-bin/wa.exe?A2=in...,2024-08-19 10:04:26,<html><head>\n<title>Kent State University - L...,Kent State University - LISTSERV 16.5 - Login ...
701,"February 2023, Week 3",https://listserv.kent.edu/cgi-bin/wa.exe?A1=in...,https://listserv.kent.edu/cgi-bin/wa.exe?A2=in...,2024-08-16 17:03:52,<html><head>\n<title>Kent State University - L...,Kent State University - LISTSERV 16.5 - Login ...


It seems that `scraper.py` failed to login a few times. Fortunately it's a very small proportion of the total number of rows and there doesn't seem to be a strong pattern.

In [8]:
# Looking at a random subset of rows to spot any other potential issues
random_rows = data_all_compilations.sample(n=100, random_state=82024)

# Iterate over the random rows
for i, row in random_rows.iterrows():
    print("\n\n\n\n=============================================================================")
    print("=============================================================================")
    print("=============================================================================")
    print("=============================================================================")
    print("=============================================================================")
    print("=============================================================================")
    print(f"Row {i}")
    print(row['week'])
    print(row['url_compilation'])
    print(row['url_posting'])
    print(row['timestamp'])
    print(row['text_message'])





Row 3838
September 2022, Week 3
https://listserv.kent.edu/cgi-bin/wa.exe?A1=ind2209C&L=CESNET-L&X=F428852318C1A1DEB9&Y=emilio%40northwestern.edu
https://listserv.kent.edu/cgi-bin/wa.exe?A2=ind2209C&L=CESNET-L&X=BEDE87B736309E92B7&Y=emilio%40northwestern.edu&P=8588
2024-08-19 09:45:55
LISTSERV 16.5 - CESNET-L Archives
 Print
FACULTY POSITION DESCRIPTION
Counseling and Psychology Department
Clinical Mental Health Counseling Program
Position: Open Rank, Clinical Mental Health Counseling Program
Starting Date: January 8, 2023
Summary:
The Department of Counseling and Human Development at Malone University, in Canton, Ohio, is seeking qualified applicants for a position as an Open Rank professor in the department of Counselor Education. This is a full time, in-residence or fully remote position, 10-month Counselor Education position, with a start date of January 8, 2023. Please complete all application materials by December 1, 2022.
The successful candidate will have an earned doctorate

There are no apparent issues.

In [9]:
# Drop rows with missing text_message or login text in source_code_message or text_message
data_all_compilations = data_all_compilations.dropna(subset=['text_message'])
data_all_compilations = data_all_compilations[
    ~(data_all_compilations['source_code_message'].str.contains(login_text, na=False)) &
    ~(data_all_compilations['text_message'].str.contains(login_text, na=False))
]
print(f"Dimensions of the dataset after dropping rows: {data_all_compilations.shape}")

Dimensions of the dataset after dropping rows: (4092, 6)


## Saving final data

In [10]:
data_all_compilations.to_csv('cesnet_data_august_20_2024.csv', index = False)