# Creating [CESNET-L](https://www.cesnet-l.net/) final dataset

Emilio Lehoucq - 3/6/24

## Import libraries

In [1]:
import pandas as pd
from glob import glob

## Append data for all weekly compilations

In [2]:
# Get all csv files in current working directory
csv_files = glob("*.csv")

# Read first file
data_all_compilations = pd.read_csv(csv_files[0])

# Variable to count number of files with errors in appending
counter = 0

# Iterate over the rest of the files appending them to the first
for i in range(1, len(csv_files)):
    try:
        data_all_compilations = pd.concat([data_all_compilations, pd.read_csv(csv_files[i])], ignore_index = True)
    except Exception as e:
        counter += 1
        print(f"Exception with file '{csv_files[i]}'.")
        print(f'Exception: {e}.\n')
        pass

# Number of weekly compilations
print(f'\n\n\nNumber of weekly compilations: {len(csv_files)}')

# Number of possibly empty files
print(f'Number of possibly empty files: {counter}')

# Number of total rows
count_rows = data_all_compilations.shape[0]
print(f'Number of total rows: {count_rows}')

# Number of rows without data (all NA)
count_no_postings = data_all_compilations[data_all_compilations.iloc[:, 2:].isna().all(axis=1)].shape[0]
print(f'Number of rows without data: {count_no_postings}')

# Number of postings
print(f'Number of rows with data: {count_rows - count_no_postings}')

Exception with file 'data_compilation_june_2019_week_4.csv'.
Exception: No columns to parse from file.

Exception with file 'data_compilation_july_2020_week_3.csv'.
Exception: No columns to parse from file.

Exception with file 'data_compilation_june_2019_week_5.csv'.
Exception: No columns to parse from file.

Exception with file 'data_compilation_august_2015_week_5.csv'.
Exception: No columns to parse from file.

Exception with file 'data_compilation_may_2020_week_2.csv'.
Exception: No columns to parse from file.

Exception with file 'data_compilation_june_2023_week_5.csv'.
Exception: No columns to parse from file.

Exception with file 'data_compilation_november_2017_week_5.csv'.
Exception: No columns to parse from file.

Exception with file 'data_compilation_november_2020_week_5.csv'.
Exception: No columns to parse from file.

Exception with file 'data_compilation_may_2020_week_5.csv'.
Exception: No columns to parse from file.

Exception with file 'data_compilation_may_2017_week_5.cs

These exceptions happen because the CSVs are empty. I checked all the weekly compilations for those weeks and they didn't contain the search terms.

## Exploring data

In [3]:
# Columns with data of interest
relevant_columns = data_all_compilations.columns[data_all_compilations.columns.str.startswith(('plain_', 'html_'))]

# Dataframe only with rows without missing data on columns of interest
df_no_na = data_all_compilations.dropna(subset = relevant_columns, how = 'any')

print(f"Number of rows where source code is different between plain text and html: {df_no_na[df_no_na['plain_text_message_source_code'] != df_no_na['html_message_source_code']].shape[0]}")
print(f"Number of rows where soup object is different between plain text and html: {df_no_na[df_no_na['plain_text_message_soup'] != df_no_na['html_message_soup']].shape[0]}")
print(f"Number of rows where extracted text is different between plain text and html: {df_no_na[df_no_na['plain_text_message_text'] != df_no_na['html_message_text']].shape[0]}")

Number of rows where source code is different between plain text and html: 0
Number of rows where soup object is different between plain text and html: 0
Number of rows where extracted text is different between plain text and html: 0


Plain text and html data are the same.

## Reshaping data

In [4]:
# Creating new columns based on either html or plain text data
data_all_compilations['source_code'] = data_all_compilations.apply(
    lambda row: row['html_message_source_code'] if pd.isnull(row['plain_text_message_source_code']) else row['plain_text_message_source_code'],
    axis=1
)

data_all_compilations['soup'] = data_all_compilations.apply(
    lambda row: row['html_message_soup'] if pd.isnull(row['plain_text_message_soup']) else row['plain_text_message_soup'],
    axis=1
)

data_all_compilations['text'] = data_all_compilations.apply(
    lambda row: row['html_message_text'] if pd.isnull(row['plain_text_message_text']) else row['plain_text_message_text'],
    axis=1
)

# Deleting old columns
data_all_compilations.drop(columns = relevant_columns, inplace = True)

# Number of rows with missing data on resulting dataframe
print(f"Number of rows with missing data on resulting dataframe: {data_all_compilations[data_all_compilations.isnull().any(axis = 1)].shape[0]}")

# Dimensions of resulting dataframe
print(f'Dimensions of resulting dataframe: {data_all_compilations.shape}')

# What the resulting data looks like
data_all_compilations.head()

Number of rows with missing data on resulting dataframe: 0
Dimensions of resulting dataframe: (2308, 5)


Unnamed: 0,timestamp_collection,week_compilation,source_code,soup,text
0,2024-03-05 16:39:46,"December 2018, Week 5",<html><head>\n<title>LISTSERV 16.5 - CESNET-L ...,<html><head>\n<title>LISTSERV 16.5 - CESNET-L ...,\nLISTSERV 16.5 - CESNET-L Archives\n\n\n\n\n\...
1,2024-03-06 04:08:28,"January 2015, Week 1",<html><head>\n<title>LISTSERV 16.5 - CESNET-L ...,<html><head>\n<title>LISTSERV 16.5 - CESNET-L ...,\nLISTSERV 16.5 - CESNET-L Archives\n\n\n\n\n\...
2,2024-03-06 04:09:02,"January 2015, Week 1",<html><head>\n<title>LISTSERV 16.5 - CESNET-L ...,<html><head>\n<title>LISTSERV 16.5 - CESNET-L ...,\nLISTSERV 16.5 - CESNET-L Archives\n\n\n\n\n\...
3,2024-03-06 04:09:36,"January 2015, Week 1",<html><head>\n<title>LISTSERV 16.5 - CESNET-L ...,<html><head>\n<title>LISTSERV 16.5 - CESNET-L ...,\nLISTSERV 16.5 - CESNET-L Archives\n\n\n\n\n\...
4,2024-03-05 04:50:14,"January 2022, Week 1",<html><head>\n<title>LISTSERV 16.5 - CESNET-L ...,<html><head>\n<title>LISTSERV 16.5 - CESNET-L ...,\nLISTSERV 16.5 - CESNET-L Archives\n\n\n\n\n\...


## Saving final data

In [5]:
data_all_compilations.to_csv('cesnet_data_march_6_2024.csv', index = False)