In [1]:
import re
import unicodedata
import pandas as pd
import nltk
import acquire
import prepare

# Visualization
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

# Extra tokens to discard on top of NLTK's English stopword list.
ADDITIONAL_STOPWORDS = ['r', 'u', '2', 'ltgt', 'was']

def clean(text):
    """Normalize ``text`` and return its lemmatized, stopword-free words.

    The text is NFKD-normalized, reduced to ASCII (characters that cannot be
    represented are dropped), lower-cased and stripped of every character
    that is neither a word character nor whitespace. It is then split on
    whitespace.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    list of str
        The remaining words in their original order, each passed through
        NLTK's ``WordNetLemmatizer``. Words found in NLTK's English stopword
        list or in ``ADDITIONAL_STOPWORDS`` are removed *before*
        lemmatization.
    """
    wnl = nltk.stem.WordNetLemmatizer()
    # A set gives constant-time membership tests; with a list every token
    # triggered a linear scan, which is slow on long joined documents.
    stopwords = set(nltk.corpus.stopwords.words('english')).union(ADDITIONAL_STOPWORDS)
    text = (unicodedata.normalize('NFKD', text)
             .encode('ascii', 'ignore')
             .decode('utf-8', 'ignore')
             .lower())
    # Drop punctuation, then tokenize on whitespace.
    words = re.sub(r'[^\w\s]', '', text).split()
    return [wnl.lemmatize(word) for word in words if word not in stopwords]

# Read spam.csv, discard the three unnamed columns and give the two
# remaining columns readable names (v1 -> label, v2 -> text).
df = (
    pd.read_csv('spam.csv', encoding='latin_1')
    .drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'])
    # index=str also converts the row labels to strings.
    .rename(index=str, columns={"v1": "label", "v2": "text"})
)
df.head()

from wordcloud import WordCloud

## Explore the blog articles using the techniques discussed in the exploration lesson.

In [2]:
# NOTE: this cell re-declares ADDITIONAL_STOPWORDS and clean() from the first
# cell; when cells run top to bottom, these later definitions are the ones in
# effect for everything below.
ADDITIONAL_STOPWORDS = ['r', 'u', '2', 'ltgt']

def clean(text):
    """Normalize ``text`` and return its lemmatized, stopword-free words.

    The text is NFKD-normalized, reduced to ASCII (characters that cannot be
    represented are dropped), lower-cased and stripped of every character
    that is neither a word character nor whitespace. It is then split on
    whitespace.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    list of str
        The remaining words in their original order, each passed through
        NLTK's ``WordNetLemmatizer``. Words found in NLTK's English stopword
        list or in ``ADDITIONAL_STOPWORDS`` are removed *before*
        lemmatization.
    """
    wnl = nltk.stem.WordNetLemmatizer()
    # A set gives constant-time membership tests; with a list every token
    # triggered a linear scan, which is slow on long joined documents.
    stopwords = set(nltk.corpus.stopwords.words('english')).union(ADDITIONAL_STOPWORDS)
    text = (unicodedata.normalize('NFKD', text)
             .encode('ascii', 'ignore')
             .decode('utf-8', 'ignore')
             .lower())
    # Drop punctuation, then tokenize on whitespace.
    words = re.sub(r'[^\w\s]', '', text).split()
    return [wnl.lemmatize(word) for word in words if word not in stopwords]

In [5]:
# Collect the articles for the 'business' and 'sports' categories through the
# project's acquire module (what it fetches and how is defined there).
corpus = acquire.get_news_texts('business', 'sports')

/en/news/volkswagen-unit-porsche-fined-₹4100-crore-over-diesel-scandal-1557251200422
/en/news/guj-trader-who-gifted-cars-to-staff-to-remove-illegal-road-he-built-1557214245553
/en/news/spacexs-billionaire-moon-tourist-says-has-no-money-selling-art-1557239146967
/en/news/britannia-broke-rules-by-not-reporting-wadias-arrest-ingovern-1557234445948
/en/news/resigned-pledged-shares-provided-₹250-cr-to-banks-naresh-goyal-1557250273105
/en/news/ai-sent-₹2cr-to-nigeria-instead-of-us-firm-airline-says-probe-on-1557243035697
/en/news/us-warns-india-against-tariffs-over-scrapping-of-trade-benefits-1557245457108
/en/news/infosys-makes-hyd-staff-pay-for-parking-activists-call-it-illegal-1557230142855
/en/news/we-cant-ensure-cheaper-oil-sales-to-india-after-iran-sanctions-us-1557222796840
/en/news/indigo-talking-to-airbus-to-buy-yettobereleased-a321-xlr-jets-1557237922203
/en/news/unsure-on-galaxy-fold-shipping-will-cancel-us-preorders-samsung-1557244991830
/en/news/apple-features-warren-buffett-in-

In [6]:
# Preview only the first few articles; echoing the whole list floods the
# notebook with output.
corpus[:5]

[{'title': 'MI defeat CSK to enter IPL final for the fifth time | Sports News | Inshorts',
  'category': 'sports',
  'content': '\nMI defeat CSK to enter IPL final for the fifth time\n'},
 {'title': 'Preity Zinta jokingly warns MS Dhoni of kidnapping his daughter Ziva | Sports News | Inshorts',
  'category': 'sports',
  'content': '\nPreity Zinta jokingly warns MS Dhoni of kidnapping his daughter Ziva\n'},
 {'title': 'Rayudu, Shankar end IPL 2019 league stage with identical numbers | Sports News | Inshorts',
  'category': 'sports',
  'content': '\nRayudu, Shankar end IPL 2019 league stage with identical numbers\n'},
 {'title': 'Sorry to let fans down, I will come back stronger: Jaydev Unadkat | Sports News | Inshorts',
  'category': 'sports',
  'content': '\nSorry to let fans down, I will come back stronger: Jaydev Unadkat\n'},
 {'title': "Mandhana's Trailblazers beat Harmanpreet's Supernovas on last ball | Sports News | Inshorts",
  'category': 'sports',
  'content': "\nMandhana's Tra

In [None]:
# The return value is discarded, so this presumably enriches the article
# dicts in `corpus` in place (a later cell selects a 'clean' column that the
# raw articles do not have) -- TODO confirm against prepare.py.
prepare.prepare_article_data(corpus)
# Preview only the first few articles instead of echoing the whole list.
corpus[:5]

In [None]:
# Tabulate the article dicts: one row per article, one column per key.
corpus_df = pd.DataFrame(data=corpus)
corpus_df.head(5)

In [None]:
# Keep just the three columns used for exploration, in this order.
corpus_df = corpus_df.loc[:, ['category', 'title', 'clean']]
corpus_df

In [None]:
# Build one cleaned word list per message label, plus one for every message.
# The ham list must be called `ham_words`: the word-cloud cell below reads
# that name and raised NameError when it was only stored as `category_words`.
ham_words = clean(' '.join(df[df.label == 'ham'].text))
spam_words = clean(' '.join(df[df.label == 'spam'].text))
all_words = clean(' '.join(df.text))

# Backward-compatible alias for any cell still using the previous name.
category_words = ham_words

In [None]:
# Render one cloud per word list. The "all words" cloud is tall and narrow
# because it occupies the full-height left half of the figure; the ham and
# spam clouds share the right half.
all_cloud = WordCloud(background_color='white', height=1000, width=400)
all_cloud = all_cloud.generate(' '.join(all_words))
ham_cloud = WordCloud(background_color='white', height=600, width=800)
ham_cloud = ham_cloud.generate(' '.join(ham_words))
spam_cloud = WordCloud(background_color='white', height=600, width=800)
spam_cloud = spam_cloud.generate(' '.join(spam_words))

plt.figure(figsize=(10, 8))
# Each rectangle is [left, bottom, width, height] in figure fractions.
axs = [
    plt.axes([0, 0, .5, 1]),      # left half, full height
    plt.axes([.5, .5, .5, .5]),   # top-right quadrant
    plt.axes([.5, 0, .5, .5]),    # bottom-right quadrant
]

panels = zip(axs, (all_cloud, ham_cloud, spam_cloud), ('All Words', 'Ham', 'Spam'))
for ax, cloud, title in panels:
    ax.imshow(cloud)
    ax.set_title(title)
    ax.axis('off')