# Week2 - Exercise 2.2

### Import required packages

In [40]:
import numpy as np
import pandas as pd
import unicodedata
import sys
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk import pos_tag
from sklearn.preprocessing import LabelBinarizer, MultiLabelBinarizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

### Load dataset into dataframe

In [2]:
filepath = "data/controversial-comments.jsonl"

contrv_comt = pd.read_json(filepath, lines=True)

### selecting 50K records from the contrv_comt dataset for data cleaning

In [3]:
comments = contrv_comt.sample(50000)

### top 5 records of data and shape

In [4]:
comments.shape
comments.head()

(50000, 2)

Unnamed: 0,con,txt
170236,0,"To paraphrase Bill Burr, do you think anyone g..."
941856,0,"didn't this guy start the hunt for WMDs, that ..."
211126,0,Plan A is to get his Russian mafia friends to ...
733437,0,Just like the Clinton's have been wanting sinc...
474093,0,The amount of salt in this post could melt the...


## 2.2.1. Preprocessing TEXT:

In [5]:
# created new text column where all text preprocessing actions will be performed

comments['new_txt'] = comments['txt']

A. Convert all text to lowercase letters.

In [6]:
# changing text column to lower case
comments['new_txt'] = comments.new_txt.apply(lambda x : x.lower())

# displaying top 5 texts of text columns as list
comments.new_txt.head().tolist()

['to paraphrase bill burr, do you think anyone gives $200,000 worth of a fuck to hear what she has to say? he believes that "speaking fees" are essentially thinly-veiled laundering of bribe money.\n\ni\'d wager that if you looked into the speaking fees paid to someone and then see how they voted once they became a politician you\'d probably see a very clear pattern in *something*.',
 "didn't this guy start the hunt for wmds, that we never found, but we know were there, because we put them there.",
 "plan a is to get his russian mafia friends to steal the great wall in china and ship it here.\n\nfailing that, plan b is to get china to build a new one and ship it here.  and if we have to pay for it, we want a new one!  don't go making a new wall for yourselves and sending us your ratty old one.",
 "just like the clinton's have been wanting since the start of the campaign. he was on their list of 2-3 candidates they want to run against.",
 'the amount of salt in this post could melt the a

B. Remove all punctuation from the text.

In [7]:
# dictionary of punctuations characters with keys and none as values

punctuations = dict.fromkeys(i for i in range(sys.maxunicode) if unicodedata.category(chr(i)).startswith('P'))

In [8]:
# removing punctuations from the text column
comments['new_txt'] = comments.new_txt.apply(lambda x : x.translate(punctuations))

# displaying top 5 texts of text columns as list
comments.new_txt.head().tolist()

['to paraphrase bill burr do you think anyone gives $200000 worth of a fuck to hear what she has to say he believes that speaking fees are essentially thinlyveiled laundering of bribe money\n\nid wager that if you looked into the speaking fees paid to someone and then see how they voted once they became a politician youd probably see a very clear pattern in something',
 'didnt this guy start the hunt for wmds that we never found but we know were there because we put them there',
 'plan a is to get his russian mafia friends to steal the great wall in china and ship it here\n\nfailing that plan b is to get china to build a new one and ship it here  and if we have to pay for it we want a new one  dont go making a new wall for yourselves and sending us your ratty old one',
 'just like the clintons have been wanting since the start of the campaign he was on their list of 23 candidates they want to run against',
 'the amount of salt in this post could melt the arctic']

C. Remove stop words.

In [9]:
stop_words = stopwords.words('english')

In [10]:
# removing stop words from the text column
comments['new_txt'] = comments.new_txt.apply(lambda x : ' '.join([word for word in word_tokenize(x.lower()) if word not in stop_words]))

# displaying top 5 texts of text columns as list
comments.new_txt.head().tolist()

['paraphrase bill burr think anyone gives $ 200000 worth fuck hear say believes speaking fees essentially thinlyveiled laundering bribe money id wager looked speaking fees paid someone see voted became politician youd probably see clear pattern something',
 'didnt guy start hunt wmds never found know put',
 'plan get russian mafia friends steal great wall china ship failing plan b get china build new one ship pay want new one dont go making new wall sending us ratty old one',
 'like clintons wanting since start campaign list 23 candidates want run',
 'amount salt post could melt arctic']

D. Apply NLTK’s PorterStemmer.

In [11]:
porter = PorterStemmer()

In [12]:
# stemming the text columns text
comments['new_txt'] = comments.new_txt.apply(lambda x : ' '.join([porter.stem(word) for word in word_tokenize(x)]))

# displaying top 5 texts of text columns as list
comments.new_txt.head().tolist()

['paraphras bill burr think anyon give $ 200000 worth fuck hear say believ speak fee essenti thinlyveil launder bribe money id wager look speak fee paid someon see vote becam politician youd probabl see clear pattern someth',
 'didnt guy start hunt wmd never found know put',
 'plan get russian mafia friend steal great wall china ship fail plan b get china build new one ship pay want new one dont go make new wall send us ratti old one',
 'like clinton want sinc start campaign list 23 candid want run',
 'amount salt post could melt arctic']

## 2.2.2. Now that the data is pre-processed, you will apply three different techniques to get it into a usable form for model-building. Apply each of the following steps (individually) to the pre-processed data.

In [36]:
# create list of all texts from new_text column

texts = comments.new_txt.tolist()

A. Convert each text entry into a word-count vector (see sections 5.3 & 6.8 in the Machine Learning with Python Cookbook).

In [35]:
# creating instance of Count vectorizer

vectorizer = CountVectorizer()

In [37]:
# tramsforming the texts data into bag of words

word_count_vector =  vectorizer.fit_transform(texts)
word_count_vector

<50000x32261 sparse matrix of type '<class 'numpy.int64'>'
	with 810482 stored elements in Compressed Sparse Row format>

In [39]:
# display top 4 items of word vector

word_count_vector.toarray()[:4]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

B. Convert each text entry into a part-of-speech tag vector (see section 6.7 in the Machine Learning with Python Cookbook).

In [30]:
# add pos tagged text into list for each text of comments new_text column

pos_tagged_texts = []
post_text_tags = []
for text in texts:
    tag_text = pos_tag(word_tokenize(text))
    pos_tagged_texts.append(tag_text)
    post_text_tags.append([tag for word, tag in tag_text])

In [31]:
# pos tagged text list  and tags lists few samples - 

pos_tagged_texts[1]
post_text_tags[1]

[('didnt', 'NN'),
 ('guy', 'JJ'),
 ('start', 'NN'),
 ('hunt', 'NN'),
 ('wmd', 'NN'),
 ('never', 'RB'),
 ('found', 'VBN'),
 ('know', 'VBP'),
 ('put', 'VB')]

['NN', 'JJ', 'NN', 'NN', 'NN', 'RB', 'VBN', 'VBP', 'VB']

In [32]:
# using one-hot encoding to convert tags into pos tag vectors


one_hot_multi = MultiLabelBinarizer()
pos_vector = one_hot_multi.fit_transform(post_text_tags)
pos_vector[1]

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0])

C. Convert each entry into a term frequency-inverse document frequency (tfidf) vector (see section 6.9 in the Machine Learning with Python Cookbook).

In [41]:
# create instance of tf-idf

tfidf = TfidfVectorizer()

In [42]:
tfidef_vector = tfidf.fit_transform(texts)
tfidef_vector

<50000x32261 sparse matrix of type '<class 'numpy.float64'>'
	with 810482 stored elements in Compressed Sparse Row format>

In [43]:
# display top 4 records of tf-idf vector

tfidef_vector.toarray()[:4]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

For the three techniques in problem (2) above, give an example where each would be useful.

1. Bag of words - It is orderless document representation, and count of words only matters. This method is often used for document classification and sentiment analysis etc.

2. Part of Speech - In this method we tag each word to its part of speech and this used in NLP applications such as Named Entity Recognition, Question Answering and sentiment analysis etc.

3. TFIDF -  it is often used to see weightage of words in documents to learn how important that word is and used for information retrieval and text mining. The main use of this is in search engines where it can be used to determine the importance of searched word.

# END