# Neccessary Import Statements

In [1]:
# data manipulation and storage
import numpy as np
import pandas as pd
# file and directory navigation
import os
# text parsing, we do a lot of it here.
import re

# See What Structure The Data Is In

### Get a List of All of the Files

Note that this data can be found at [this link](https://www.kaggle.com/crawford/20-newsgroups/data?select=list.csv).

In [2]:
# navigate to directory in which the files are stored
data_dir = '/Users/sebgo/Documents/Hindsight/hierarchical_text_classification/20_newsgroup/data/raw_data/'
os.chdir(data_dir)

In [3]:
all_files = os.listdir()
file_names = [file for file in all_files if file[-4::] == '.txt']
print(file_names)

['sci.crypt.txt', 'comp.sys.mac.hardware.txt', 'misc.forsale.txt', 'soc.religion.christian.txt', 'rec.sport.baseball.txt', 'rec.sport.hockey.txt', 'comp.sys.ibm.pc.hardware.txt', 'talk.politics.guns.txt', 'rec.autos.txt', 'alt.atheism.txt', 'comp.os.ms-windows.misc.txt', 'sci.electronics.txt', 'comp.windows.x.txt', 'talk.religion.misc.txt', 'talk.politics.mideast.txt', 'sci.med.txt', 'rec.motorcycles.txt', 'comp.graphics.txt', 'sci.space.txt', 'talk.politics.misc.txt']


### See What Information We Can Get From the "list" CSV File

In [4]:
master_df = pd.read_csv('talk.religion.misc.list.csv')
master_df.head(10)

Unnamed: 0,newsgroup,document_id
0,talk.religion.misc,82757
1,talk.religion.misc,82758
2,talk.religion.misc,82759
3,talk.religion.misc,82760
4,talk.religion.misc,82763
5,talk.religion.misc,82766
6,talk.religion.misc,82767
7,talk.religion.misc,82771
8,talk.religion.misc,82772
9,talk.religion.misc,82774


In [5]:
grouped_by_group_df = master_df.groupby(by = "newsgroup")
grouped_by_group_df.count()

Unnamed: 0_level_0,document_id
newsgroup,Unnamed: 1_level_1
talk.religion.misc,628


Evidently, for whatever reason, this CSV file only contains entries for the `talk.religion.misc` category. Thus, if we want a similar master document for all of the categories, we're going to have to create it ourselves...this should NOT be hard to do once we figure out a good way to parse through the txt files. **Thus**, we have re-named the file to `talk.religion.misc.list.csv`

In [6]:
grouped_by_id_df = master_df.groupby(by = 'document_id')
print(grouped_by_id_df.count().max() == grouped_by_id_df.count().min(), end = '\n\n\n')
print(grouped_by_id_df.count().min() == grouped_by_id_df.count().mean(), end = '\n\n\n')
print(grouped_by_id_df.count().max() == grouped_by_id_df.count().mean())

newsgroup    True
dtype: bool


newsgroup    True
dtype: bool


newsgroup    True
dtype: bool


We can tell that each row in this df is **unique** as should be the case.

### Write A Function That Opens and Parses Each Txt File

In [7]:
def txt_to_df(file_name: str, *args, **kwargs):
    """
    Purpose
    -------
    The purpose of this function is to take the content that exists in one of the .txt files given
    by the data dump on Kaggle (see link above) and parse it so that the result is a DataFrame whose
    rows correspond to each document that is referenced in the file. This is not an immediately 
    straight-forward task due to the fact that each file is simply a blob of text. For that reason,
    the parsing is done using regular expressions that take advantage of the minimal structure that
    is present. What exactly is taken advantage of is spelled out in the comments of this function's
    source code.
    
    Parameters
    ----------
    file_name - (str) This string represents the file name that contains information about the 
                      hierachical class that you are interested in wrangling. You may include a path
                      to the file if this notebook is not in the directory in which it is stored on 
                      your machine.
    
    Returns
    -------
    to_return - (Pandas DataFrame) this table has columns ["News_Group", "document_id", "From",
                                   "Subject", "raw_text"] and each of its rows corresponds to different
                                   documents referenced in the .txt file.
    
    References
    ----------
    1. https://medium.com/factory-mind/regex-tutorial-a-simple-cheatsheet-by-examples-649dc1c3f285
    2. https://stackoverflow.com/questions/6109882/regex-match-all-characters-between-two-strings
    3. https://docs.python.org/3/library/re.html
    """
    ### first we have to open up the file and save it to a string
    my_file = open(file_name, 'rb')
        # reading the file as bytes seemed to be the only way to get this to work
    file_str = str(my_file.read())
        # reading the files as bytes means that we have to manually make it into a string
    my_file.close()
        # always have to close any file that you open.
    assert( type(file_str) == str )
        # you can never be too sure.
    
    ### Now parse the file
    # First, begin by spliting the string based off of the occurances of "Newsgroup: file (without 
    # .txt extension)" since that seperates each of the different documents present in this massive file.
    # Another reason why this is a nice string to split in is that each reference to a new document has
    # the exact same news group line.
    newsgroup_name = re.sub(pattern = r".txt",
                            repl = "",
                            string = file_name)
        # just have to remove the file extension from the file name that the user has specified.
    documents_list = re.split(pattern = "Newsgroup: {}".format(newsgroup_name),
                              string = file_str)
    
    # For each document, extact all of the desired information
    extracted_info_dict = {}
    to_dock_index = 0
        # see below for what this is used for.
    for doc_index, doc_str in enumerate(documents_list):
        if len(doc_str) < 10:
            # first, check to see if it's a legit document. Cases in which it won't be are ones such 
            # as whenthe string technically starts with "b" that is then followed by "\\nNewsgroup: " 
            # meaning that spliting on the news group string will mean that the first string in the 
            # resulting list will simply be something like "b\\n" which is not at all useful. The threshold 
            # of 10 can be changed if one that performs better is determined.
            to_dock_index += 1
        else:
            doc_index -= to_dock_index
                # simply to reset the indices to make sure everything is consistent.
            # then, do your searches
            doc_id_match = re.search(r"(?<=document_id: )[0-9]+(?=\\n)",
                                     doc_str,
                                     re.IGNORECASE)
                # NOTE that this is a Match object.
            
                # the ?<= part is a look-behind meaning that the search will only return results that are
                # a series of numbers preceded by the string specified in the look-behind. Another nice 
                # thing about this is that the part that is in the look-behind will NOT get returned.
                # The ?= is a look-ahead that operates in the exact same way. The `re.IGNORECASE` is used
                # because although each new instance of a document includes an assignment of a document ID,
                # the casing to do so varies.
            doc_from_match = re.search(r"(?<=From: ).*?(?=\\n)",
                                       doc_str,
                                       re.IGNORECASE)
                # the `.*?` signifies a "lazy" search which means that search will return whatever it 
                # finds first as opposed to `.*` which would return the last instance it finds of what 
                # the search specifies which is not useful here because that would be the next document!
            doc_subject_match = re.search(r"(?<=Subject: ).*?(?=\\n)",
                                          doc_str,
                                          re.IGNORECASE)
            try:
                # the reason for the try and except block is that if any of the searchs comes up empty, then
                # the use of the `.end()` method will simply raise an exception that will interupt the entire
                # function.
                index_to_start_text = max([ doc_id_match.end(), 
                                            doc_from_match.end(),
                                            doc_subject_match.end() ])
                    # the `.end()` method returns an integer that represents the index of the last character
                    # of the string that it found in the search. The reason why we want the max final index
                    # of all of these searches is that it represents our best guess as to when the text starts.
            except:
                index_to_start_text = 0
            extracted_info_dict[doc_index] = {"News_Group" : newsgroup_name,
                                              "document_id" : int(doc_id_match.group(0)) if doc_id_match else None,
                                              "From" : doc_from_match.group(0) if doc_from_match else None,
                                              "Subject" : doc_subject_match.group(0) if doc_subject_match else None,
                                              "raw_text" : doc_str[index_to_start_text::]}
    
    ### Wrangle All of The Extracted Information Into a DataFrame
    extracted_info_df = pd.DataFrame(extracted_info_dict).T
    to_return = extracted_info_df
    
    return to_return

### Run it on Each File

In [8]:
parsed_dfs_list = [ txt_to_df(file) for file in file_names ]
assert( len(parsed_dfs_list) == 20 )

In [9]:
parsed_dfs_list[np.random.randint(0, 20)].head(20)

Unnamed: 0,News_Group,document_id,From,Subject,raw_text
0,rec.motorcycles,101725,jeff@mri.com (Jonathan Jefferies),Re: Lexan Polish?,\n\nIn article <C41soE.M62@ns1.nodak.edu> wilk...
1,rec.motorcycles,102616,blgardne@javelin.sim.es.com (Dances With Bikers),FAQ - What is the DoD?,\n\nThis is a periodic posting intended to ans...
2,rec.motorcycles,103117,manes@magpie.linknet.com (Steve Manes),Re: Oops! Oh no!,\n\nWm. L. Ranck (ranck@joesbar.cc.vt.edu) wro...
3,rec.motorcycles,103118,randy@megatek.com (Randy Davis),Re: A Miracle in California,\n\nIn article <1ppvof$92a@seven-up.East.Sun.C...
4,rec.motorcycles,103119,Stafford@Vax2.Winona.MSUS.Edu (John Stafford),Re: more DoD paraphernalia,\n\nIn article <1pppnrINNitg@cronkite.Central....
5,rec.motorcycles,103120,maven@eskimo.com (Norman Hamer),Re: A Miracle in California,"\n\nRe: Waving...\n\nI must say, that the cour..."
6,rec.motorcycles,103121,MJMUISE@1302.watstar.uwaterloo.ca (Mike Muise),Re: Drinking and Riding,"\n\nIn article <C4wKBp.B9w@eskimo.com>, maven@..."
7,rec.motorcycles,103122,egreen@East.Sun.COM (Ed Green - Pixel Cruncher),Re: insect impacts,"\n\nIn article 7290@rd.hydro.on.ca, jlevine@rd..."
8,rec.motorcycles,103123,egreen@East.Sun.COM (Ed Green - Pixel Cruncher),Re: A Miracle in California,\n\nIn article 602CV3dTx01@JUTS.ccc.amdahl.com...
9,rec.motorcycles,103124,jrwaters@eos.ncsu.edu (JACK ROGERS WATERS),Re: Cultural Enquiries,\n\nIn article <C50uGG.9As@cmptrc.lonestar.org...


### Put the Returned DataFrames Together

In [10]:
total_df_0 = pd.concat(parsed_dfs_list).reset_index(drop = True)
assert( total_df_0.shape[0] == sum( [df.shape[0] for df in parsed_dfs_list] ) )
total_df_0.tail(20)

Unnamed: 0,News_Group,document_id,From,Subject,raw_text
37642,talk.politics.misc,179093,mst4298@rigel.tamu.edu (Mitchell S Todd),Re: Waco Burnout,\n\nrja@mahogany126.cray.com (Russ Anderson) w...
37643,talk.politics.misc,179094,demon@desire.wright.edu (Not a Boomer),"Re: Not talking to soldiers, part II","\n\nIn article <C5trFx.B38@csulb.edu>, sicherm..."
37644,talk.politics.misc,179095,V2110A@VM.TEMPLE.EDU (Richard Hoenes),Waco,\n\nDo all those who are saying the government...
37645,talk.politics.misc,179097,ck347@cleveland.Freenet.Edu (Richard A. Mulac),Re: Waco Burnout,\n\n\nDoesn\'t seem like those responsible for...
37646,talk.politics.misc,179098,asper@calvin.uucp (Alan E. Asper),Re: Janet Reno killed the Waco children,\n\nIn article <1r1pn6$nap@lll-winken.llnl.gov...
37647,talk.politics.misc,179099,goykhman@apollo.hp.com (Red Herring),Re: Janet Reno killed the Waco children,\n\nIn article <fern.735342004@camelot> fern@c...
37648,talk.politics.misc,179102,hallam@dscomsa.desy.de (Phill Hallam-Baker),Re: WACO burning,"\n\n\nIn article <1r7bh0$cc2@nwfocus.wa.com>, ..."
37649,talk.politics.misc,179103,thyat@sdf.lonestar.org (Tom Hyatt),Re: Waco survivors 1715 19 April,\n\nIn article <1993Apr20.135819.14473@e2big.m...
37650,talk.politics.misc,179105,visser@convex.com (Lance Visser),Re: Waco Investigation Paranoia,\n\nIn <16BB98B5A.V2110A@VM.TEMPLE.EDU> V2110A...
37651,talk.politics.misc,179106,cramer@optilink.COM (Clayton Cramer),The Government Is LYING,"\n\n\nYesterday, the FBI was saying that at le..."


### Check For Duplicates

In [11]:
total_df_0.document_id.value_counts()

54185     8
54180     8
54250     8
54251     8
54253     8
         ..
51818     2
51817     2
51816     2
51815     2
124146    2
Name: document_id, Length: 15404, dtype: int64

In [12]:
total_df_0.groupby(by = "document_id").get_group(54185)
    # Note that the 54185 ID was used because it was one of the IDs that occured 
    # 8 (!!) times.

Unnamed: 0,News_Group,document_id,From,Subject,raw_text
10594,rec.sport.hockey,54185,gld@cunixb.cc.columbia.edu (Gary L Dare),Re: TV Schedule for Next Week,\n\njpc@philabs.philips.com (John P. Curcio) w...
11593,rec.sport.hockey,54185,gld@cunixb.cc.columbia.edu (Gary L Dare),Re: TV Schedule for Next Week,\n\njpc@philabs.philips.com (John P. Curcio) w...
13930,talk.politics.guns,54185,lvc@cbnews.cb.att.com (Larry Cipriani),Re: My Gun is like my....,\n\nIn article <1993Apr16.194708.13273@vax.oxf...
14840,talk.politics.guns,54185,lvc@cbnews.cb.att.com (Larry Cipriani),Re: My Gun is like my....,\n\nIn article <1993Apr16.194708.13273@vax.oxf...
18321,alt.atheism,54185,mathew <mathew@mantis.co.uk>,Re: Cults Vs. Religions?,\n\nmangoe@cs.umd.edu (Charley Wingate) writes...
19120,alt.atheism,54185,mathew <mathew@mantis.co.uk>,Re: Cults Vs. Religions?,\n\nmangoe@cs.umd.edu (Charley Wingate) writes...
21946,sci.electronics,54185,gsulliva@enuxha.eas.asu.edu (Glenn A Sullivan),Re: How do you build neural networks?,\n\nmmoss@ic.sunysb.edu (Matthew D Moss) write...
22929,sci.electronics,54185,gsulliva@enuxha.eas.asu.edu (Glenn A Sullivan),Re: How do you build neural networks?,\n\nmmoss@ic.sunysb.edu (Matthew D Moss) write...


**Evidently**, we have instances in which there are **duplicates** between the new group where the rows are exactly the same **as well as** when the **same document ID** is used accross the different News Groups. To remedy this, we will: 
1. First, drop all of the duplicates which will address the issue of when the exact same row appears more than once.
2. Drop the `document_id` column since it is not its entries are NOT useful unique IDs since a give ID is used multiple times across the different News Groups.
3. Reset the index while setting the `drop` argument of that method to `True` so as to not have that `index` column in the result. 
4. Lastly, perform adjusted-steps of above to ensure these changes had the desired effect.

In [13]:
total_df_1 = total_df_0.drop_duplicates(subset = "Subject").reset_index(drop = True).drop(columns = "document_id")
    # the "Subject" column was the column that made the most sense to check for Duplicates since one could be the
    # author of multiple documents, making the `From` column useless, and there can be so many nuanced differences
    # between otherwise identical texts in the `raw_text` column.
total_df_1

Unnamed: 0,News_Group,From,Subject,raw_text
0,sci.crypt,Marc VanHeyningen <mvanheyn@cs.indiana.edu>,RIPEM Frequently Asked Questions,"\n\nArchive-name: ripem/faq\nLast-update: Sun,..."
1,sci.crypt,mvanheyn@cs.indiana.edu (Marc VanHeyningen),RIPEM Frequently Noted Vulnerabilities,\n\nArchive-name: ripem/attacks\nLast-update: ...
2,sci.crypt,Jim-Miller@suite.com,Certifying Authority question answered.,"\n\n>>If you have access to FTP, try FTPing to..."
3,sci.crypt,C445585@mizzou1.missouri.edu (John Kelsey),"""Rubber-hose cryptanalysis""",\n\n Some sick part of me really liked that ...
4,sci.crypt,grady@netcom.com (1016/2EF221),Declassifying media,\n\nThere are many Urban Legends (maybe this o...
...,...,...,...,...
10382,talk.politics.misc,garrod@dynamo.ecn.purdue.edu (David Garrod),WACO burning,"\n\n\n\nIt is interesting, sometimes, to liste..."
10383,talk.politics.misc,pyotr@halcyon.com (Peter D. Hampe),"Phill says Koresh == Hitler, was Welcome to Po...",\n\nhallam@dscomsa.desy.de (Phill Hallam-Baker...
10384,talk.politics.misc,Clinton-HQ@Campaign92.Org (The White House),CLINTON: Fact Sheet on Russian Statement 4.23.93,\n\n\n\n\n The White H...
10385,talk.politics.misc,cramer@optilink.COM (Clayton Cramer),The Government Is LYING,"\n\n\nYesterday, the FBI was saying that at le..."


In [14]:
total_df_1.groupby(by = "Subject").count().describe()

Unnamed: 0,News_Group,From,raw_text
count,10387.0,10387.0,10387.0
mean,1.0,1.0,1.0
std,0.0,0.0,0.0
min,1.0,1.0,1.0
25%,1.0,1.0,1.0
50%,1.0,1.0,1.0
75%,1.0,1.0,1.0
max,1.0,1.0,1.0


We can now confidently conclude that each row in this DataFrame is in fact **unique**. The result is 10387 documents to work with.

### Now Check For Null Values

In [15]:
total_df_1.isnull().sum()

News_Group    0
From          0
Subject       0
raw_text      0
dtype: int64

**Nothing** to worry about here. Cool!

# Re-structure The Storage of The News Groups in a More Hierarchical Way

In [16]:
total_df_1[ ["Tier_{}".format(i) for i in range(1, 6, 1)] ] = total_df_1.News_Group.str.split(pat = '.', expand = True)
total_df_1

Unnamed: 0,News_Group,From,Subject,raw_text,Tier_1,Tier_2,Tier_3,Tier_4,Tier_5
0,sci.crypt,Marc VanHeyningen <mvanheyn@cs.indiana.edu>,RIPEM Frequently Asked Questions,"\n\nArchive-name: ripem/faq\nLast-update: Sun,...",sci,crypt,,,
1,sci.crypt,mvanheyn@cs.indiana.edu (Marc VanHeyningen),RIPEM Frequently Noted Vulnerabilities,\n\nArchive-name: ripem/attacks\nLast-update: ...,sci,crypt,,,
2,sci.crypt,Jim-Miller@suite.com,Certifying Authority question answered.,"\n\n>>If you have access to FTP, try FTPing to...",sci,crypt,,,
3,sci.crypt,C445585@mizzou1.missouri.edu (John Kelsey),"""Rubber-hose cryptanalysis""",\n\n Some sick part of me really liked that ...,sci,crypt,,,
4,sci.crypt,grady@netcom.com (1016/2EF221),Declassifying media,\n\nThere are many Urban Legends (maybe this o...,sci,crypt,,,
...,...,...,...,...,...,...,...,...,...
10382,talk.politics.misc,garrod@dynamo.ecn.purdue.edu (David Garrod),WACO burning,"\n\n\n\nIt is interesting, sometimes, to liste...",talk,politics,misc,,
10383,talk.politics.misc,pyotr@halcyon.com (Peter D. Hampe),"Phill says Koresh == Hitler, was Welcome to Po...",\n\nhallam@dscomsa.desy.de (Phill Hallam-Baker...,talk,politics,misc,,
10384,talk.politics.misc,Clinton-HQ@Campaign92.Org (The White House),CLINTON: Fact Sheet on Russian Statement 4.23.93,\n\n\n\n\n The White H...,talk,politics,misc,,
10385,talk.politics.misc,cramer@optilink.COM (Clayton Cramer),The Government Is LYING,"\n\n\nYesterday, the FBI was saying that at le...",talk,politics,misc,,


In [18]:
total_df_2 = total_df_1.drop(columns = "News_Group")
total_df_2.head(15)

Unnamed: 0,From,Subject,raw_text,Tier_1,Tier_2,Tier_3,Tier_4,Tier_5
0,Marc VanHeyningen <mvanheyn@cs.indiana.edu>,RIPEM Frequently Asked Questions,"\n\nArchive-name: ripem/faq\nLast-update: Sun,...",sci,crypt,,,
1,mvanheyn@cs.indiana.edu (Marc VanHeyningen),RIPEM Frequently Noted Vulnerabilities,\n\nArchive-name: ripem/attacks\nLast-update: ...,sci,crypt,,,
2,Jim-Miller@suite.com,Certifying Authority question answered.,"\n\n>>If you have access to FTP, try FTPing to...",sci,crypt,,,
3,C445585@mizzou1.missouri.edu (John Kelsey),"""Rubber-hose cryptanalysis""",\n\n Some sick part of me really liked that ...,sci,crypt,,,
4,grady@netcom.com (1016/2EF221),Declassifying media,\n\nThere are many Urban Legends (maybe this o...,sci,crypt,,,
5,gtoal@gtoal.com (Graham Toal),Re: Hard drive security for FBI targets,\n\n\tFrom: res@colnet.cmhnet.org (Rob Stampfl...,sci,crypt,,,
6,warlord@MIT.EDU (Derek Atkins),Re: disk safety measure?,\n\n-----BEGIN PGP SIGNED MESSAGE-----\n\nIn a...,sci,crypt,,,
7,neuhaus@bloch.informatik.uni-kl.de (Stephan Ne...,Re: PGP 2.2: general comments,\n\nneuhaus@vier.informatik.uni-kl.de (Stephan...,sci,crypt,,,
8,WHMurray@DOCKMASTER.NCSC.MIL,Licensing.....,\n\n\n>This thread brings up the more general ...,sci,crypt,,,
9,ho@cs.arizona.edu (Hilarie Orman),Re: Licensing of public key implementations,\n\nWith regard to your speculations on NSA in...,sci,crypt,,,


# Clean the Text Data That We Have Extracted

In [75]:
# let's get an example text so that we can figure out what we need to clean up.
example_text = total_df_2.raw_text[69]

In [76]:
example_text[:3000:]

"\\n\\nArchive-name: cryptography-faq/part02\\nLast-modified: 1993/4/15\\n\\n\\nFAQ for sci.crypt, part 2: Net Etiquette\\n\\nThis is the second of ten parts of the sci.crypt FAQ. The parts are\\nmostly independent, but you should read the first part before the rest.\\nWe don\\'t have the time to send out missing parts by mail, so don\\'t ask.\\nNotes such as ``[KAH67]\\'\\' refer to the reference list in the last part.\\n\\nThe sections of this FAQ are available via anonymous FTP to rtfm.mit.edu \\nas /pub/usenet/news.answers/cryptography-faq/part[xx].  The Cryptography \\nFAQ is posted to the newsgroups sci.crypt, sci.answers, and news.answers \\nevery 21 days.\\n\\n\\nContents:\\n\\n* What groups are around? What\\'s a FAQ? Who am I? Why am I here?\\n* Do political discussions belong in sci.crypt?\\n* How do I present a new encryption scheme in sci.crypt?\\n\\n\\n* What groups are around? What\\'s a FAQ? Who am I? Why am I here?\\n\\n  Read news.announce.newusers and news.answers fo

### Write a Function To Do This

In [78]:
def my_str_cleaner(row, *args, **kwargs):
    """
    Purpose
    -------
    
    Parameters
    ----------
    
    Returns
    -------
    
    References
    ----------
    1.
    """
    ###
    white_space_normalizer = lambda x: x if x[0] != ' ' else x[re.match(r" +", x).end()::]
    raw_str = row["raw_text"]
    
    split_on_new_lines_list = raw_str.split("\\n")
    almost_str = " ".join([ white_space_normalizer(text) for text in split_on_new_lines_list if len(text) and text[0:3] != '---' ])
    
    split_on_white_space_list = almost_str.split()
    final_str = " ".join([ text for text in split_on_white_space_list ])
    
    to_return = final_str
    return to_return

In [85]:
total_df_2["cleaned_text"] = total_df_2.apply(my_str_cleaner, axis = 1)

In [86]:
total_df_2

Unnamed: 0,From,Subject,raw_text,Tier_1,Tier_2,Tier_3,Tier_4,Tier_5,cleaned_text
0,Marc VanHeyningen <mvanheyn@cs.indiana.edu>,RIPEM Frequently Asked Questions,"\n\nArchive-name: ripem/faq\nLast-update: Sun,...",sci,crypt,,,,"Archive-name: ripem/faq Last-update: Sun, 7 Ma..."
1,mvanheyn@cs.indiana.edu (Marc VanHeyningen),RIPEM Frequently Noted Vulnerabilities,\n\nArchive-name: ripem/attacks\nLast-update: ...,sci,crypt,,,,Archive-name: ripem/attacks Last-update: 31 Ma...
2,Jim-Miller@suite.com,Certifying Authority question answered.,"\n\n>>If you have access to FTP, try FTPing to...",sci,crypt,,,,">>If you have access to FTP, try FTPing to rsa..."
3,C445585@mizzou1.missouri.edu (John Kelsey),"""Rubber-hose cryptanalysis""",\n\n Some sick part of me really liked that ...,sci,crypt,,,,Some sick part of me really liked that phrase....
4,grady@netcom.com (1016/2EF221),Declassifying media,\n\nThere are many Urban Legends (maybe this o...,sci,crypt,,,,There are many Urban Legends (maybe this ought...
...,...,...,...,...,...,...,...,...,...
10382,garrod@dynamo.ecn.purdue.edu (David Garrod),WACO burning,"\n\n\n\nIt is interesting, sometimes, to liste...",talk,politics,misc,,,"It is interesting, sometimes, to listen to U.S..."
10383,pyotr@halcyon.com (Peter D. Hampe),"Phill says Koresh == Hitler, was Welcome to Po...",\n\nhallam@dscomsa.desy.de (Phill Hallam-Baker...,talk,politics,misc,,,hallam@dscomsa.desy.de (Phill Hallam-Baker) wr...
10384,Clinton-HQ@Campaign92.Org (The White House),CLINTON: Fact Sheet on Russian Statement 4.23.93,\n\n\n\n\n The White H...,talk,politics,misc,,,The White House Office of the Press Secretary ...
10385,cramer@optilink.COM (Clayton Cramer),The Government Is LYING,"\n\n\nYesterday, the FBI was saying that at le...",talk,politics,misc,,,"Yesterday, the FBI was saying that at least th..."
