# Data Cleaning
### Muf Tayebaly

In [35]:
#imports
import pandas as pd
import numpy as np
import pickle as pk
import string
import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk import word_tokenize, pos_tag
from nltk.stem import WordNetLemmatizer
import string
import re
from random import randint
from collections import defaultdict

### Download nltk reference data if not already available

In [15]:
#nltk.download('wordnet')
#nltk.download('punkt')
#nltk.download('averaged_perceptron_tagger')
#nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\XBBNWBL\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [3]:
#read data
raw_data = pd.read_csv("Data/knight_data_articles.csv")
raw_data.head()

Unnamed: 0,content_id,content_body_clean,content_title_clean,label,economy,topic2,science,mention_Trump,mention_Clinton,mention_Democrat,...,mention_terrorist,mention_attack,mention_NRA,mention_sanctuary,mention_socialist,mention_fascist,mention_communist,mention_nationalist,mention_2A,content_source_desc
0,60,Republicans in Congress are at a make or break...,Tax reform now -- failure to act could mean mi...,9,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,Fox News
1,61,The Constitution authorizes Congress to tax Am...,Tax reform -- America needs a tax code that's ...,9,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,Fox News
2,66,"A long-mothballed, unfinished casino-hotel on ...",Long-mothballed Las Vegas Strip casino-hotel s...,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fox News
3,80,If taxing foreign earnings that have already a...,"Trump Tax Plan May Free Up Corporate Dollars, ...",3,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,The New York Times
4,82,Hurricane Harvey has already taken an enormous...,Hurricane Harvey is a humanitarian disaster. I...,5,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Vox


In [4]:
#get shape of articles
raw_data.shape

(1914, 32)

In [6]:
#article sample
random_article = randint(0,1914)
print("Random row:",random_article)
print(raw_data["content_title_clean"].iloc[random_article])
print(raw_data["content_body_clean"].iloc[random_article])

Random row: 1544
‘Unbelievable’: Heart Stents Fail to Ease Chest Pain
A procedure used to relieve chest pain in hundreds of thousands of heart patients each year is useless for many of them, researchers reported on Wednesday.  Their study focused on the insertion of stents, tiny wire cages, to open blocked arteries. The devices are lifesaving when used to open arteries in patients in the throes of a heart attack.  But they are most often used in patients who have a blocked artery and chest pain that occurs, for example, walking up a hill or going up stairs. Sometimes patients get stents when they have no pain at all, just blockages.  Heart disease is still the leading killer of Americans — 790,000 people have heart attacks each year — and stenting is a mainstay treatment in virtually every hospital. More than 500,000 heart patients worldwide have stents inserted each year to relieve chest pain, according to the researchers. Other estimates are far higher.


In [9]:
#clean data
def clean_data(text):
    #remove punctuation, digits, extra stuff. make lowercase
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\w*\d\w*', '', text)
    text = re.sub('[‘’“”…]', '', text)
    #lemma it - include POS tag in order to lemma it better
    tag_map = defaultdict(lambda : wordnet.NOUN)
    tag_map['J'] = wordnet.ADJ
    tag_map['V'] = wordnet.VERB
    tag_map['R'] = wordnet.ADV
    lemmatizer = WordNetLemmatizer()
    textTokens = word_tokenize(text)
    text = [lemmatizer.lemmatize(tok, tag_map[tag[0]]) for tok, tag in pos_tag(textTokens)]
    return " ".join(text)

In [8]:
clean_data(raw_data["content_body_clean"].iloc[random_article])

'a procedure use to relieve chest pain in hundred of thousand of heart patient each year be useless for many of them researcher report on wednesday their study focus on the insertion of stent tiny wire cage to open blocked artery the device be lifesaving when use to open artery in patient in the throe of a heart attack but they be most often use in patient who have a block artery and chest pain that occur for example walk up a hill or go up stair sometimes patient get stent when they have no pain at all just blockage heart disease be still the lead killer of american — people have heart attack each year — and stenting be a mainstay treatment in virtually every hospital more than heart patient worldwide have stent insert each year to relieve chest pain accord to the researcher other estimate be far high'

In [10]:
#create corpus
corpus_data = raw_data
corpus_data["content_body_clean"] = corpus_data["content_body_clean"].apply(clean_data)
corpus_data["content_body_clean"].iloc[random_article]

'a procedure use to relieve chest pain in hundred of thousand of heart patient each year be useless for many of them researcher report on wednesday their study focus on the insertion of stent tiny wire cage to open blocked artery the device be lifesaving when use to open artery in patient in the throe of a heart attack but they be most often use in patient who have a block artery and chest pain that occur for example walk up a hill or go up stair sometimes patient get stent when they have no pain at all just blockage heart disease be still the lead killer of american — people have heart attack each year — and stenting be a mainstay treatment in virtually every hospital more than heart patient worldwide have stent insert each year to relieve chest pain accord to the researcher other estimate be far high'

### Save corpus as pickle

In [12]:
corpus_data.to_pickle("Data/muf_articles_corpus.pkl")

## Article Term Matrix

In [36]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(stop_words=stopwords.words('english'))

In [37]:
data_cv = cv.fit_transform(corpus_data["content_body_clean"])
data_atm = pd.DataFrame(data_cv.toarray(), columns=cv.get_feature_names())
data_atm.shape

(1914, 34393)

In [41]:
data_atm.to_pickle("Data/muf_articles_atm.pkl")