In [1]:
#importing neccesary files

import nltk
import io
import re
import zipfile
from tqdm import tqdm
import pickle as pkl
import nltk
from nltk.corpus import stopwords
import string
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
import warnings
warnings.filterwarnings("ignore")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/frostrot/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Filtering
(i) Convert the text to lower case \
(ii) Perform word tokenization \
(iii) Remove stopwords from tokens \
(iv) Remove punctuation marks from tokens \
(v) Remove blank space tokens

In [2]:
#Remove Punctuations from the text

def remove_punc(tokens):
  table = string.punctuation
  ptokens = []
  for w in tokens:
    if w not in table:
      ptokens.append(w)
  ptokens = [s for s in ptokens if s]
  ptokens = [re.sub(r"[\n\t]+"," ",s) for s in ptokens]
  return ptokens

In [3]:
#Remove stopwords, and convert shorted words into there extended forms

def stopword(x):
  EXTENDED_FORMS = {"aren't": 'are not', "can't": 'cannot', "couldn't": 'could not', "didn't": 'did not', "doesn't": 'does not', "don't": 'do not', "hadn't": 'had not', "hasn't": 'has not', "haven't": 'have not', "he'd": 'he would', "he'll": 'he will', "he's": 'he is', "i'd": 'i would', "i'll": 'i will', "i'm": 'i am', "isn't": 'is not', "it's": 'it is', "it'll": 'it will', "i've": 'i have', "let's": 'let us', "mightn't": 'might not', "mustn't": 'must not',"n't": 'not', "shan't": 'shall not', "she'd": 'she would', "she'll": 'she will', "she's": 'she is', "shouldn't": 'should not', "that's": 'that is', "there's": 'there is', "they'd": 'they would', "they'll": 'they will', "they're": 'they are', "they've": 'they have', "we'd": 'we would', "we're": 'we are', "weren't": 'were not', "we've": 'we have', "what'll": 'what will', "what're": 'what are', "what's": 'what is', "what've": 'what have', "where's": 'where is', "who'd": 'who would', "who'll": 'who will', "who're": 'who are', "who's": 'who is', "who've": 'who have', "won't": 'will not', "wouldn't": 'would not', "you'd": 'you would', "you'll": 'you will', "you're": 'you are', "you've": 'you have', "'re": ' are', "wasn't": 'was not', "we'll": 'we will', "'cause": 'because', "could've": 'could have', "how'd": 'how did', "how'd'y": 'how do you', "how'll": 'how will', "how's": 'how is', "I'd": 'I would', "I'd've": 'I would have', "I'll": 'I will', "I'll've": 'I will have', "I'm": 'I am', "I've": 'I have', "i'd've": 'i would have', "i'll've": 'i will have', "it'd": 'it would', "it'd've": 'it would have', "it'll've": 'it will have', "ma'am": 'madam', "mayn't": 'may not', "might've": 'might have', "mightn't've": 'might not have', "must've": 'must have', "mustn't've": 'must not have', "needn't": 'need not', "needn't've": 'need not have', "o'clock": 'of the clock', "oughtn't": 'ought not', "oughtn't've": 'ought not have', "sha'n't": 'shall not', "shan't've": 'shall not have', "she'd've": 'she would have', "she'll've": 'she will have', "should've": 'should have', "shouldn't've": 'should not have', "so've": 'so have', "so's": 'so as', "this's": 'this is', "that'd": 'that would', "that'd've": 'that would have', "there'd": 'there would', "there'd've": 'there would have', "here's": 'here is', "they'd've": 'they would have', "they'll've": 'they will have', "to've": 'to have', "we'd've": 'we would have', "we'll've": 'we will have', "what'll've": 'what will have', "when's": 'when is', "when've": 'when have', "where'd": 'where did', "where've": 'where have', "who'll've": 'who will have', "why's": 'why is', "why've": 'why have', "will've": 'will have', "won't've": 'will not have', "would've": 'would have', "wouldn't've": 'would not have', "y'all": 'you all', "y'all'd": 'you all would', "y'all'd've": 'you all would have', "y'all're": 'you all are', "y'all've": 'you all have', "you'd've": 'you would have', "you'll've": 'you will have'}
  
  for i in range(len(x)):
    if x[i] in EXTENDED_FORMS:
      x[i] = EXTENDED_FORMS[x[i]]
    if x[i] in stopwords.words('english'):
      x[i]=''
  x=remove_punc(x)
  return " ".join(x)

In [4]:
#Filter the parsed text, by, converting them into lowercase, removing any tags, extra spaces.

def filter(item):
  if type(item)==str:
    item=item.lower()
    words = word_tokenize(item)
    item=stopword(words)
    item=re.sub(r'\\N','',item)
  return item

### Loading files

In [5]:
archive = zipfile.ZipFile('../Humor,Hist,Media,Food.zip', 'r')
data = []
error_files = []
file_list = archive.namelist()[1:]
for filename in file_list:
    try:
        with archive.open(filename,'r') as f:
            name = str(filename).split("/")[-1]
            textlist = []
            for line in io.TextIOWrapper(f,'latin-1'):
                textlist.append(line)
            content = " ".join(textlist)
            data.append({'file':name,'content':content})
    except:
        error_files.append(str(filename))

print(f"Data collected from {len(data)} files")
print(f"{len(error_files)} files had error")

Data collected from 1133 files
0 files had error


### Filtering and Storing

In [6]:
for files in tqdm(data):
    files['filtered_content'] = filter(files['content'])

100%|██████████| 1133/1133 [05:15<00:00,  3.59it/s]


In [7]:
#Sorting and storing files, wrt to the alphabetical file names

data.sort(key =lambda x: x['file'])

In [8]:
# Dumping in Pickle File

pkl.dump(data,open('./pickle_files/data.pkl','wb'))