## Multimodal News Analyzer

### Part 1
#### Data Collection and Preparation
- Text Cleaning
- Tokenization
- Stop Word Removal
- Lemmatization

In [4]:
import pandas as pd

In [13]:
# Load the news dataset; lines=True reads the file as one JSON object per line.
df=pd.read_json('data/News_Category_Dataset_v3.json', lines=True)

In [6]:
# Preview the first rows to sanity-check the load.
print("\n First 5 rows of the data:")
print(df.head())

# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
print("\n DataFrame info")
df.info()


 First 5 rows of the data:
                                                link  \
0  https://www.huffpost.com/entry/covid-boosters-...   
1  https://www.huffpost.com/entry/american-airlin...   
2  https://www.huffpost.com/entry/funniest-tweets...   
3  https://www.huffpost.com/entry/funniest-parent...   
4  https://www.huffpost.com/entry/amy-cooper-lose...   

                                            headline   category  \
0  Over 4 Million Americans Roll Up Sleeves For O...  U.S. NEWS   
1  American Airlines Flyer Charged, Banned For Li...  U.S. NEWS   
2  23 Of The Funniest Tweets About Cats And Dogs ...     COMEDY   
3  The Funniest Tweets From Parents This Week (Se...  PARENTING   
4  Woman Who Called Cops On Black Bird-Watcher Lo...  U.S. NEWS   

                                   short_description               authors  \
0  Health experts said it is too early to predict...  Carla K. Johnson, AP   
1  He was subdued by passengers and crew when he ...        Mary Papenfuss  

In [4]:
#Check for missing values

print("\n Missing values per column:")
print(df.isnull().sum())

#Check distribution of categories

print(df['category'].value_counts())

# A fixed random_state makes the sampled examples identical on every run,
# so the displayed rows stay reproducible after Restart & Run All.
print("\n Example headlines and descriptions:")
print(df[['headline', 'short_description', 'category']].sample(5, random_state=42))


 Missing values per column:
link                 0
headline             0
category             0
short_description    0
authors              0
date                 0
dtype: int64
category
POLITICS          35602
WELLNESS          17945
ENTERTAINMENT     17362
TRAVEL             9900
STYLE & BEAUTY     9814
PARENTING          8791
HEALTHY LIVING     6694
QUEER VOICES       6347
FOOD & DRINK       6340
BUSINESS           5992
COMEDY             5400
SPORTS             5077
BLACK VOICES       4583
HOME & LIVING      4320
PARENTS            3955
THE WORLDPOST      3664
WEDDINGS           3653
WOMEN              3572
CRIME              3562
IMPACT             3484
DIVORCE            3426
WORLD NEWS         3299
MEDIA              2944
WEIRD NEWS         2777
GREEN              2622
WORLDPOST          2579
RELIGION           2577
STYLE              2254
SCIENCE            2206
TECH               2104
TASTE              2096
MONEY              1756
ARTS               1509
ENVIRONMENT        

In [36]:
#Function to clean the text
import re
from nltk.corpus import stopwords
import string
# Build the English stop-word set once; clean_text tests membership against it
# for every word, so a set keeps those lookups cheap.
# On a fresh environment the corpus may not be installed yet. NLTK signals a
# missing resource with LookupError, in which case it is downloaded and the
# load is retried.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    import nltk
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))
def clean_text(text):
    """Normalize a piece of text before tokenization.

    Steps, in order: lowercase, strip URLs, strip ASCII punctuation
    (``string.punctuation``), drop words found in the module-level
    ``stop_words`` set, and re-join the remaining words with single spaces.

    Args:
        text: The raw string to clean. Non-string values (e.g. ``None`` or a
            float NaN from a missing cell) are treated as empty text.

    Returns:
        The cleaned, lowercase, space-separated string; ``""`` when nothing
        remains or the input was not a string.
    """
    # Missing cells arrive as None/NaN, which have no .lower(); treat as empty.
    if not isinstance(text, str):
        return ""

    #convert to lowercase
    text = text.lower()

    #remove URLs
    # `\S*` (a run of non-whitespace) is needed after `www\.`; a bare `S*`
    # only matches literal "S" characters, which strips just the "www."
    # prefix and leaves the rest of the address behind as a junk token.
    text = re.sub(r'https?:\S*|www\.\S*', '', text)

    #remove punctuation and special characters
    text = text.translate(str.maketrans('', '', string.punctuation))

    #remove stop words
    words = text.split()
    filtered_words = [word for word in words if word not in stop_words]

    # join the words back into a string
    return " ".join(filtered_words)
    

In [16]:
# Merge headline and description into one text field, separated by a single space.
df['combined_text'] = df['headline'] + " " + df['short_description']

In [17]:
# Run clean_text on every row. This overwrites 'combined_text' in place, so the
# uncleaned concatenation is no longer available in this column afterwards.
df['combined_text']=df['combined_text'].apply(clean_text)

In [24]:
#function for lemmatising text with nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Single module-level lemmatizer instance; tokenize_and_lemmatize reuses it for every call.
lemmatizer = WordNetLemmatizer()

def tokenize_and_lemmatize(text):
    """Split ``text`` into tokens and return the lemma of each one.

    Tokenization is done with ``word_tokenize``; every token is then passed
    to the module-level ``lemmatizer``. ``lemmatize`` is called with the
    token only (no part-of-speech argument).

    Args:
        text: The string to process.

    Returns:
        A list with one lemmatized string per token, in token order.
    """
    lemmas = []
    for token in word_tokenize(text):
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas
    


In [28]:
import nltk

# NLTK data used by this notebook. 'stopwords' backs stopwords.words('english')
# in the cleaning cell and was previously missing from this list, so a fresh
# environment had no way to obtain it from the notebook itself.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')
# NOTE(review): no cell visible here uses the POS tagger; kept as in the original.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\tnevi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\tnevi\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\tnevi\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


True

In [31]:
# Tokenize and lemmatize each cleaned row; each cell of the new column holds a list of tokens.
df['lemmatized_combined_text']=df['combined_text'].apply(tokenize_and_lemmatize)

In [32]:
# Spot-check the token lists for the first few rows.
print(df['lemmatized_combined_text'].head())

0    [4, million, american, roll, sleeve, omicronta...
1    [american, airline, flyer, charged, banned, li...
2    [23, funniest, tweet, cat, dog, week, sept, 17...
3    [funniest, tweet, parent, week, sept, 1723, ac...
4    [woman, called, cop, black, birdwatcher, loses...
Name: lemmatized_combined_text, dtype: object


In [35]:
# DataFrame.info() prints its own report and returns None; calling it bare
# avoids the extra "None" line that print() would append to the output.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209527 entries, 0 to 209526
Data columns (total 8 columns):
 #   Column                    Non-Null Count   Dtype         
---  ------                    --------------   -----         
 0   link                      209527 non-null  object        
 1   headline                  209527 non-null  object        
 2   category                  209527 non-null  object        
 3   short_description         209527 non-null  object        
 4   authors                   209527 non-null  object        
 5   date                      209527 non-null  datetime64[ns]
 6   combined_text             209527 non-null  object        
 7   lemmatized_combined_text  209527 non-null  object        
dtypes: datetime64[ns](1), object(7)
memory usage: 12.8+ MB
None
