# Text Preprocessing

In [1]:
%%capture

import pandas as pd
from sklearn.datasets import fetch_20newsgroups

from data_describe.text.text_preprocessing import *
from data_describe.utilities.load_data import load_data

## Load Data

In [2]:
# Limit the fetch to the 'alt.atheism' group of the training split.
# (fetch_20newsgroups is imported in the notebook's top import cell.)
categories = ['alt.atheism']
# Keep only the 'data' entry of the returned bunch: a list with one raw
# post (headers + body as a single string) per document.
newsgroups = fetch_20newsgroups(subset='train', categories=categories)['data']

In [3]:
# Preview the first 100 characters of the first raw post.
newsgroups[0][:100]

'From: darice@yoyo.cc.monash.edu.au (Fred Rice)\nSubject: Re: Islam & Dress Code for women\nOrganizatio'

## Tokenize

In [4]:
# Split every raw post into tokens; the result holds one token list per document.
newsgroups_tokens = tokenize(newsgroups)

In [5]:
# Show the first ten tokens of the first document (punctuation is tokenized separately).
newsgroups_tokens[0][:10]

['From',
 ':',
 'darice',
 '@',
 'yoyo.cc.monash.edu.au',
 '(',
 'Fred',
 'Rice',
 ')',
 'Subject']

## Change to all lowercase

In [6]:
# Lowercase every token, keeping the per-document list structure.
newsgroups_lower = to_lower(newsgroups_tokens)
# First ten tokens of the first document, for comparison with the tokenized output.
newsgroups_lower[0][:10]

['from',
 ':',
 'darice',
 '@',
 'yoyo.cc.monash.edu.au',
 '(',
 'fred',
 'rice',
 ')',
 'subject']

## Remove punctuation

In [7]:
# Drop punctuation from the lowercased tokens. In the sample shown, standalone
# punctuation tokens (':', '@', '(', ')') disappear while dots inside a token
# such as the hostname are kept.
newsgroups_no_punct = remove_punct(newsgroups_lower)
newsgroups_no_punct[0][:10]

['from',
 'darice',
 'yoyo.cc.monash.edu.au',
 'fred',
 'rice',
 'subject',
 're',
 'islam',
 'dress',
 'code']

## Remove digits

In [8]:
# Small hand-made example: in the output, the tokens containing a digit
# ('3' and '2c') come back as empty strings, so the list length is unchanged.
remove_digits([['this', 'is', '3', 'a', 'test', '2c', 'if', 'it', 'works']])

[['this', 'is', '', 'a', 'test', '', 'if', 'it', 'works']]

## Remove single characters and spaces

In [9]:
# Small hand-made example: whitespace-only tokens and one-character tokens
# ('a', 'b') are removed from the list entirely.
remove_single_char_and_spaces([['this', 'is', '   ', 'a', 'test', '   ', 'b']])

[['this', 'is', 'test']]

## Remove stopwords

In [10]:
# Remove stopwords from the punctuation-free tokens; in the sample shown,
# 'from' and 're' no longer appear.
newsgroups_no_stop = remove_stopwords(newsgroups_no_punct)
newsgroups_no_stop[0][:10]

['darice',
 'yoyo.cc.monash.edu.au',
 'fred',
 'rice',
 'subject',
 'islam',
 'dress',
 'code',
 'women',
 'organization']

## Stem words

In [11]:
# Stem the stopword-free tokens. The sample output shows how aggressive the
# stemmer is: 'organization' -> 'org', 'women' -> 'wom', 'code' -> 'cod'.
newsgroups_stemmed = stem(newsgroups_no_stop)
newsgroups_stemmed[0][:10]

['dar',
 'yoyo.cc.monash.edu.au',
 'fred',
 'ric',
 'subject',
 'islam',
 'dress',
 'cod',
 'wom',
 'org']

## Lemmatize words

In [12]:
# Lemmatize the stopword-free tokens (not the stemmed ones), as an alternative
# to stemming. The sample output keeps whole words, e.g. 'women' -> 'woman'.
newsgroups_lemmatized = lemmatize(newsgroups_no_stop)
newsgroups_lemmatized[0][:10]

['darice',
 'yoyo.cc.monash.edu.au',
 'fred',
 'rice',
 'subject',
 'islam',
 'dress',
 'code',
 'woman',
 'organization']

## Convert each document back to a single string

In [13]:
# Join each document's lemmatized tokens back into one space-separated string.
newsgroups_docs = bag_of_words_to_docs(newsgroups_lemmatized)
# Same conversion after remove_digits; this variant feeds the TF-IDF matrix later on.
newsgroups_docs_no_digits = bag_of_words_to_docs(remove_digits(newsgroups_lemmatized))
# Preview the first 1000 characters of the first rebuilt document.
newsgroups_docs[0][:1000]

"darice yoyo.cc.monash.edu.au fred rice subject islam dress code woman organization monash university melb australia line 120 16ba7103c3.i3150101 dbstu1.rz.tu-bs.de i3150101 dbstu1.rz.tu-bs.de benedikt rosenau writes article 1993apr5.091258.11830 monu6.cc.monash.edu.au darice yoyo.cc.monash.edu.au fred rice writes deletion course people say think religion exactly coming different people within religion nothing existing different perspective within religion perhaps one say tend converge truth point lot harm way meantime converge counterfactual religion appear split diverge even might true religion core layer determine happens practise quite inhumane usually post supposed answer n't see got say repeat religion harm people religion converge split giving disagree upon lot disagreement one tolerant one tolerant ideology also split giving disagree upon may also lead intolerance also oppose ideology n't think argument argument religion point weakness human nature big deletion 2 woman soul isl

## Create a document-word frequency matrix

In [14]:
# Build the document-term matrix from the rebuilt documents and show its first
# five rows. Numeric tokens ('00', '000', ...) are still present as columns
# because newsgroups_docs was not digit-stripped.
create_doc_term_matrix(newsgroups_docs).head()

Unnamed: 0,00,000,000406,001125,01,0100,010116,011255,012536,013034,...,zlumber,zombie,zoo,zues,zumder,zur,zurlo,zus,zvonko,zyklon
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Create a TF-IDF matrix

In [15]:
# Build the TF-IDF matrix from the digit-stripped documents and show the first
# five rows, skipping the first ten columns.
# NOTE(review): the reason for dropping the first ten columns is not stated;
# presumably it skips leading non-word terms -- confirm.
create_tfidf_matrix(newsgroups_docs_no_digits).head().iloc[:,10:]

Unnamed: 0,_o,aa,aaa,aah,aap,aario,aaron,abandoned,abberation,abc,...,zlumber,zombie,zoo,zues,zumder,zur,zurlo,zus,zvonko,zyklon
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Run a full preprocessing pipeline in one line

In [16]:
# Feed the raw posts to preprocess_texts, join each document's result back into
# a string, and preview the first 1000 characters of the first document.
# lem=True presumably switches on lemmatization -- TODO confirm against the
# library. Note that in the output digits are stripped from inside tokens.
bag_of_words_to_docs(preprocess_texts(newsgroups, lem=True))[0][:1000]

"darice yoyo.cc.monash.edu.au fred rice subject islam dress code woman organization monash university melb australia line .rz.tu-bs.de .rz.tu-bs.de benedikt rosenau writes article .. .cc.monash.edu.au darice yoyo.cc.monash.edu.au fred rice writes deletion course people say think religion exactly coming different people within religion nothing existing different perspective within religion perhaps one say tend converge truth point lot harm way meantime converge counterfactual religion appear split diverge even might true religion core layer determine happens practise quite inhumane usually post supposed answer n't see got say repeat religion harm people religion converge split giving disagree upon lot disagreement one tolerant one tolerant ideology also split giving disagree upon may also lead intolerance also oppose ideology n't think argument argument religion point weakness human nature big deletion woman soul islam people said muslim say woman soul must admit never heard view held m

## N-gram frequency

In [17]:
# Count n-grams over the raw posts. The displayed result is a FreqDist keyed by
# token tuples; the sample shows both 2-token and 3-token entries.
n_grams = ngram_freq(text_docs=newsgroups)
n_grams

FreqDist({('Subject', 'Re'): 455, ('In', 'article'): 372, ('Lines', 'In'): 201, ('Lines', 'In', 'article'): 182, ('article', '..'): 128, ('In', 'article', '..'): 128, ('writes', 'In'): 101, ('Lines', 'NNTP-Posting-Host'): 99, ('ca', "n't"): 90, ('keith', 'cco.caltech.edu'): 90, ...})