# Data cleaning pipeline

## Dependencies

In [19]:
import nltk
import pandas as pd
import numpy as np

# Fetch the NLTK data packages this notebook relies on: 'stopwords' (read via
# stopwords.words in the tokenizing section) and 'punkt' (presumably the data
# nltk.word_tokenize needs — confirm). Already-current packages are only
# reported as up-to-date, not downloaded again.
# NOTE(review): newer NLTK releases may look for 'punkt_tab' instead of
# 'punkt' — confirm against the installed version.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/codevars/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/codevars/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Importing CSV

In [20]:
# Load the sample and discard the two columns this pipeline does not use;
# the last expression displays the remaining column names.
df = (
    pd.read_csv('./eluniversal_sample.csv')
      .drop(columns=['location', 'Unnamed: 8'])
)
df.columns

Index(['url', 'title', 'subtitle', 'section', 'author', 'date_time',
       'article_text'],
      dtype='object')

## Removing nulls in article text and section

In [21]:
# Keep only rows that have both an article body and a section label.
df = df.dropna(subset=['article_text','section'])

In [22]:
# (rows, columns) remaining after the rows with missing values were dropped.
df.shape

(488, 7)

## Tokenizing function

In [23]:
import nltk
from nltk.corpus import stopwords
import re
from unicodedata import normalize

# Spanish stop words held in a set; tokenize_column tests every token for
# membership in it.
stop_words = set(stopwords.words('spanish'))


def tokenize_column(tokenized_df, column_name):
    """Turn a text column into comma-separated, lowercase, accent-free tokens.

    Each value is tokenized with ``nltk.word_tokenize``. Tokens that are not
    purely alphabetic are dropped, the rest are lowercased, filtered against
    the module-level ``stop_words`` set and joined with ``,``. Diacritics are
    then stripped from the joined string, except for the tilde of "ñ".

    Parameters
    ----------
    tokenized_df : pandas.DataFrame
        Frame that holds the text column. It is not modified.
    column_name : str
        Name of the column to tokenize. Its values must be strings, so rows
        with nulls have to be dropped beforehand.

    Returns
    -------
    pandas.Series
        One comma-joined token string per input row, indexed like
        ``tokenized_df``, unnamed, and in NFC form.
    """
    # A base character followed by one or more combining marks
    # (U+0300-U+036F) is replaced by the base character alone. The
    # alternation refuses "n" + U+0303 (combining tilde) as a match, which is
    # what lets "ñ" survive; re.I extends that to "N".
    combining_marks = re.compile(
        r"([^n\u0300-\u036f]|n(?!\u0303(?![\u0300-\u036f])))[\u0300-\u036f]+",
        re.I,
    )

    def clean_text(text):
        # Compose first: in decomposed input the combining marks make
        # str.isalpha() return False, so accented words would be discarded
        # before the accent stripping ever ran.
        # NOTE(review): word_tokenize runs with its default language even
        # though the stop words are Spanish — confirm that is intended.
        tokens = nltk.word_tokenize(normalize("NFC", text))
        lowered = (token.lower() for token in tokens if token.isalpha())
        # Stop words are matched while accents are still present.
        kept = [word for word in lowered if word not in stop_words]
        stripped = combining_marks.sub(r"\1", normalize("NFD", ','.join(kept)))
        # Recompose so the preserved "ñ" is a single code point again instead
        # of "n" + U+0303, which would not compare equal to ordinary text.
        return normalize("NFC", stripped)

    # One pass over the column replaces the row-wise DataFrame.apply and the
    # five follow-up passes.
    cleaned = tokenized_df[column_name].apply(clean_text)
    # The row-wise apply used previously produced an unnamed Series; keep that.
    cleaned.name = None
    return cleaned

In [24]:
# Tokenize the article bodies and overwrite the raw text with the result.
# The column is replaced in place, so re-running this cell feeds the
# already-cleaned text back through the tokenizer.
tokenized_news = tokenize_column(df,'article_text')
df['article_text'] = tokenized_news

In [25]:
# Preview the cleaned column.
df['article_text']

0      oaxaca,defensoria,derechos,humanos,pueblo,oaxa...
1      presidente,andres,manuel,lopez,obrador,informo...
2      pese,presidente,andres,manuel,lopez,obrador,di...
3      senado,aprobo,ley,organica,fiscalia,general,re...
4      nuevo,video,excandidato,presidencial,ricardo,a...
                             ...                        
494    culiacan,sesion,publica,extraordinaria,congres...
495    gobernador,martin,orozco,sandoval,propuso,dar,...
496    apagon,comenzo,lunes,febrero,norte,noreste,mex...
498    coordinadores,legislativos,pri,prd,camara,dipu...
499    municipios,ratificaron,adhesion,mando,coordina...
Name: article_text, Length: 488, dtype: object

## Saving to CSV

In [26]:
# Persist the cleaned frame to the working directory.
# NOTE(review): to_csv also writes the index as a leading unnamed column by
# default; pass index=False if downstream readers do not need the row labels.
df.to_csv('clean_news_data.csv');