In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('seaborn')

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import unicodedata
import string
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer

In [2]:
df = pd.read_csv('../data/train.csv')
df.head()

Unnamed: 0,Labels,Text,Text_Tag
0,1,Says the Annies List political group supports ...,abortion
1,2,When did the decline of coal start? It started...,"energy,history,job-accomplishments"
2,3,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy
3,1,Health care reform legislation is likely to ma...,health-care
4,2,The economic turnaround started at the end of ...,"economy,jobs"


In [3]:
y = df.pop('Labels')
np.unique(y)

array([0, 1, 2, 3, 4, 5])

## Building a pipeline

I remember from the nlp assignment that it was confusing to use the [CountVectorizer](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html)... You can specify stopwords to remove, and a tokenizer, but need to remove punctuation and accents prior.

To get a better understanding of how it all works, I created a vocabulary to feed the CountVectorizer after sending the corpus through the cleaning_pipeline.

1. Lowercase
2. Remove punctuation
3. Remove stopwords
4. Remove accents
5. Lemmatize

In [4]:
def punctuation_removal(text):
    all_list = [char for char in text if char not in string.punctuation]
    clean_str = ''.join(all_list)
    return clean_str

def remove_accents(input_str):
    nfkd_form = unicodedata.normalize('NFKD', input_str)
    only_ascii = nfkd_form.encode('ASCII', 'ignore')
    return only_ascii.decode()

def cleaning_pipeline(corpus, cleaner=WordNetLemmatizer):
    
    corpus = [row.lower() for row in corpus]
    print('Lowercase:')
    print(f'{corpus[:1]}')
    
    corpus = [punctuation_removal(row) for row in corpus]
    print('\nPunctuation Removed:')
    print(f'{corpus[:1]}')
    
    stop = stopwords.words('english')
    corpus = [' '.join([word for word in row.split() if word not in (stop)]) for row in corpus]
    print('\nStopwords Removed:')
    print(f'{corpus[:1]}')
    
    corpus = [remove_accents(row) for row in corpus]
    print('\nAccents Removed:')
    print(f'{corpus[:1]}')
    
    word_list = [word_tokenize(row) for row in corpus]
    print('\nTokenized:')
    print(f'{word_list[:1]}')
    
    stem_lemm = cleaner()
    stem_lemm_output = [' '.join([stem_lemm.lemmatize(word) for word in words]) for words in word_list]
    print('\nLemmatized:')
    print(f'{stem_lemm_output[:1]}')
    
    return stem_lemm_output

In [5]:
corpus = df['Text'].tolist()

docs_lem = cleaning_pipeline(corpus)

Lowercase:
['says the annies list political group supports third-trimester abortions on demand.']

Punctuation Removed:
['says the annies list political group supports thirdtrimester abortions on demand']

Stopwords Removed:
['says annies list political group supports thirdtrimester abortions demand']

Accents Removed:
['says annies list political group supports thirdtrimester abortions demand']

Tokenized:
[['says', 'annies', 'list', 'political', 'group', 'supports', 'thirdtrimester', 'abortions', 'demand']]

Lemmatized:
['say annies list political group support thirdtrimester abortion demand']


In [6]:
docs_lem

['say annies list political group support thirdtrimester abortion demand',
 'decline coal start started natural gas took started begin president george w bush administration',
 'hillary clinton agrees john mccain voting give george bush benefit doubt iran',
 'health care reform legislation likely mandate free sex change surgery',
 'economic turnaround started end term',
 'chicago bear starting quarterback last 10 year total number tenured uw faculty fired last two decade',
 'jim dunnam lived district represents year',
 'im person stage worked actively last year passing along rus feingold toughest ethic reform since watergate',
 'however took 195 million oregon lottery fund port newport eventually land new noaa marine operation centerpacific',
 'say gop primary opponent glenn grothman joe leibham cast compromise vote cost 788 million higher electricity cost',
 'first time history share national popular vote margin smaller latino vote margin',
 'since 2000 nearly 12 million american slip

In [8]:
vectorizer = CountVectorizer()
word_counts = vectorizer.fit_transform(docs_lem)
words = vectorizer.get_feature_names()
vect_df = pd.DataFrame(word_counts.toarray(), columns=words)
vect_df.head()

Unnamed: 0,005,01,02,025,03,04,047,05,06,07,...,zimmerman,zinn,zip,zippo,zombie,zone,zoning,zoo,zuckerberg,zuckerbergs
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
vect_df['y'] = y
vect_df.head()

Unnamed: 0,005,01,02,025,03,04,047,05,06,07,...,zinn,zip,zippo,zombie,zone,zoning,zoo,zuckerberg,zuckerbergs,y
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2


In [10]:
tfidfvect = TfidfVectorizer()
tfidf_vectorized = tfidfvect.fit_transform(docs_lem)
words_tfidf = tfidfvect.get_feature_names()
tfidf_df = pd.DataFrame(tfidf_vectorized.toarray(), columns=words_tfidf)
tfidf_df.head()

Unnamed: 0,005,01,02,025,03,04,047,05,06,07,...,zimmerman,zinn,zip,zippo,zombie,zone,zoning,zoo,zuckerberg,zuckerbergs
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
tfidf_df['y'] = y
tfidf_df.head()

Unnamed: 0,005,01,02,025,03,04,047,05,06,07,...,zinn,zip,zippo,zombie,zone,zoning,zoo,zuckerberg,zuckerbergs,y
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
