# Preprocessing

In [15]:
# Imports

import json
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
import pandas as pd

In [16]:
# Load the data: the file holds one JSON object per line

# Read explicitly as UTF-8 so the parsed text does not depend on the
# platform's default encoding, and skip blank lines, which json.loads
# would reject.
with open('News_Category_Dataset_IS_course.json', 'r', encoding='utf-8') as file:
    records = [json.loads(line) for line in file if line.strip()]

# One row per parsed JSON object
data = pd.DataFrame(records)

In [17]:
# Clean the data

# Print the number of datapoints
print("Number of datapoints:", len(data))

# Store each row's original position as a per-row value. (Assigning a scalar
# inside a loop would overwrite the whole column on every iteration.)
data['idx'] = range(len(data))


def _missing_row_labels(column):
    """Return the set of index labels whose value in `column` is missing.

    A value counts as missing when it is None/NaN or the empty string.
    """
    is_missing = column.isna() | (column == "")
    # tolist() converts the labels to plain Python objects before building the set
    return set(column.index[is_missing].tolist())


# Sets of row labels with missing data
missing_headline_indices = _missing_row_labels(data['headline'])
missing_description_indices = _missing_row_labels(data['short_description'])
missing_both_indices = missing_headline_indices & missing_description_indices

# Print the number of missing data
print("Number of missing headlines:", len(missing_headline_indices))
print("Number of missing short descriptions:", len(missing_description_indices))
print("Number of missing both:", len(missing_both_indices))

# Print the indices of missing data
print("Indices of missing headlines:", missing_headline_indices)
print("Indices of missing short descriptions:", missing_description_indices)
print("Indices of missing both:", missing_both_indices)

# Remove only the datapoints where both fields are missing; rows missing a
# single field are kept.
data = data.drop(index=sorted(missing_both_indices))

# Print the number of datapoints
print("Number of datapoints:", len(data))

Number of datapoints: 148122
Number of missing headlines: 734
Number of missing short descriptions: 12184
Number of missing both: 61
Indices of missing headlines: {38918, 116750, 77846, 61464, 77855, 47145, 129066, 67629, 36910, 34863, 51247, 14395, 73795, 63559, 79943, 18505, 28746, 69709, 10318, 122957, 71762, 145492, 67669, 106581, 75867, 100443, 116828, 4197, 34920, 14446, 135285, 133238, 98423, 114808, 118904, 123001, 137336, 139381, 147573, 39039, 143494, 26762, 143499, 84110, 143, 80016, 114830, 22681, 49306, 59547, 53408, 145569, 2211, 53411, 145583, 123056, 108722, 69815, 61626, 135354, 73917, 80065, 100546, 49347, 43206, 8392, 67788, 14541, 209, 59602, 86229, 147671, 41181, 116958, 10463, 137437, 73954, 100582, 65767, 57576, 67815, 121064, 69869, 39152, 78068, 4341, 92405, 119031, 73976, 112890, 82178, 139524, 16645, 51461, 67845, 24844, 63759, 18705, 78101, 22809, 39196, 145693, 49442, 86306, 133415, 55592, 127277, 119087, 127279, 71987, 106807, 69945, 78137, 49472, 49474, 6

In [18]:
# Function to preprocess text

# Fetch the NLTK resources needed below (tokenizer model, stopword list, WordNet)
for resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource)

# Create the stemmer and lemmatizer once so they can be reused for every text
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

_STOP_WORDS = None  # English stopword set, filled in on first use


def _english_stop_words():
    """Return the NLTK English stopwords as a set, reading the corpus only once."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
    return _STOP_WORDS


def preprocess_text(text):
    """Normalize a piece of text into a space-separated string of stemmed tokens.

    Steps: lowercase and tokenize, keep only purely alphabetic tokens (this
    drops punctuation, numbers and mixed tokens), remove English stopwords,
    lemmatize, then stem.

    Parameters
    ----------
    text : str or None
        The raw text. Empty strings, None and any other non-string value
        (e.g. NaN for a missing field) are returned unchanged.

    Returns
    -------
    str
        The processed tokens joined by single spaces (possibly an empty
        string when no token survives), or the input itself when it is
        empty or not a string.
    """
    # Nothing to process: pass missing / non-text values straight through
    if not isinstance(text, str) or text == "":
        return text

    # Tokenize
    tokens = word_tokenize(text.lower())

    # Keep alphabetic tokens only; this already excludes every punctuation
    # character, so no separate punctuation-stripping pass is needed
    words = [w for w in tokens if w.isalpha()]

    # Remove stopwords (the set is loaded once and reused across calls)
    stop_words = _english_stop_words()
    words = [w for w in words if w not in stop_words]

    # Lemmatize
    lemmatized_words = [lemmatizer.lemmatize(w) for w in words]

    # Stemming
    stemmed_words = [stemmer.stem(w) for w in lemmatized_words]

    # Join the words back
    return " ".join(stemmed_words)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Domen\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Domen\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Domen\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [22]:
# Preprocess the data

# Preprocess the headline and short description
data['clean_headline'] = data['headline'].apply(preprocess_text)
data['clean_short_description'] = data['short_description'].apply(preprocess_text)

# Show the first row before and after preprocessing. Positional access is
# used because rows may have been dropped, so label 0 is not guaranteed to exist.
first_row = data.iloc[0]
print(first_row['headline'])
print(first_row['clean_headline'])
print(first_row['short_description'])
print(first_row['clean_short_description'])

# Save the preprocessed data
data.to_csv('News_Category_Dataset_IS_course_preprocessed.csv', index=False)

# Number of rows per category, computed once and reused below
category_counts = data.groupby(['category']).size()

# Print the majority classifier accuracy: the share of rows belonging to the
# most frequent category
print("Majority classifier accuracy:")
print(category_counts.max() / len(data))

# Print the category distribution
print("Category distribution:")
print(category_counts)

23 Of The Funniest Tweets About Cats And Dogs This Week (Sept. 17-23)
funniest tweet cat dog week
"Until you have a dog you don't understand what could be eaten."
dog understand could eaten
Majority classifier accuracy:
0.24036714597361897
Category distribution:
category
BLACK VOICES       4583
BUSINESS           5989
COMEDY             5395
ENTERTAINMENT     17344
FOOD & DRINK       6340
HEALTHY LIVING     6686
HOME & LIVING      4320
PARENTING          8791
PARENTS            3953
POLITICS          35589
QUEER VOICES       6337
SPORTS             5075
STYLE & BEAUTY     9814
TRAVEL             9900
WELLNESS          17945
dtype: int64
