In [3]:
# Imports required for data cleaning
import re
import unicodedata

import pandas as pd

In [5]:
# Load the two scraped sources: The Print first, then the US Defense website
articles1, articles2 = (
    pd.read_csv(csv_name)
    for csv_name in ('articles_the_print.csv', 'articles_us_defense.csv')
)

# applying full month format to us defense articles
def full_month(text: str) -> str:
    """Lowercase a date string and spell out an abbreviated month name.

    Parameters
    ----------
    text : str
        A date such as ``"Jan. 4, 2019"``. Non-string values (for example
        the float NaN pandas uses for an empty cell, or ``None``) are
        returned unchanged instead of raising ``AttributeError``.

    Returns
    -------
    str
        The lowercased text in which the first abbreviation found (in the
        order listed below) is replaced by the full month name, e.g.
        ``"january 4, 2019"``. Text without a known abbreviation is only
        lowercased.
    """
    # Missing cells reach Series.apply as non-strings; pass them through
    # untouched so one empty date does not abort the whole column.
    if not isinstance(text, str):
        return text

    text = text.lower()
    replacements = {
        'jan.': 'january',
        'feb.': 'february',
        'mar.': 'march',
        'apr.': 'april',
        'jun.': 'june',
        'jul.': 'july',
        'aug.': 'august',
        'sept.': 'september',
        'sep.': 'september',
        'oct.': 'october',
        'nov.': 'november',
        'dec.': 'december'
    }
    for short, full in replacements.items():
        if short in text:
            # Assumes a date names a single month, so stop at the first hit
            # rather than scanning the remaining abbreviations.
            return text.replace(short, full)
    return text

# Spell out abbreviated month names in the US Defense dates
articles2['Date'] = articles2['Date'].map(full_month)

# Stack The Print rows on top of the US Defense rows under a fresh 0..n-1 index
merged_articles = pd.concat((articles1, articles2), ignore_index=True)

# Persist the combined dataset; the index is not written to the file
merged_articles.to_csv('all_articles.csv', index=False)

In [6]:
# Reload the merged file written to 'all_articles.csv' so the cleaning steps
# below start from the on-disk copy
articles = pd.read_csv('all_articles.csv')
# (rows, columns) of the merged data
articles.shape

(13600, 3)

In [34]:
# Preview the first rows of the merged data
articles.head()

Unnamed: 0,Title,Date,Description
0,Why Mission Divyastra is significant & what it...,"11 March, 2024",New Delhi: The successful launch of Mission Di...
1,Agni-V test firing puts India in elite list of...,"11 March, 2024",New Delhi: Prime Minister Narendra Modi on Mon...
2,Maharashtra ATS arrests Mazagon Dockyard emplo...,"11 March, 2024",New Delhi: The Anti-Terrorism Squad (ATS) of t...
3,"India remains world’s largest arms importer, F...","11 March, 2024",New Delhi: India continues to be the world’s t...
4,Chinese research vessel enters Bay of Bengal d...,"11 March, 2024",New Delhi: Days before planned missile tests b...


In [35]:
# Preview the last rows of the merged data
articles.tail()

Unnamed: 0,Title,Date,Description
13595,"Contracts for Jan. 4, 2019","4 january, 2019","ARMY Avon Protection Systems, Cadillac, Michig..."
13596,One Mean Airplane!,"4 january, 2019",The F-35C Lightning II is the Navy's aircraft ...
13597,"Contracts for Jan. 3, 2019","3 january, 2019",DEFENSE INTELLIGENCE AGENCY Fulcrum IT Service...
13598,Shanahan Stresses National Defense Strategy in...,"3 january, 2019",Acting Defense Secretary Patrick M. Shanahan w...
13599,10 Things You Probably Didn’t Know About the P...,"3 january, 2019","1 Why is the Pentagon, you know, a pentagon? T..."


In [37]:
def clean_text(text: str) -> str:
    """Normalise free text to lowercase ASCII without punctuation.

    Steps, in order:

    1. NFKD-normalise so accented letters split into a base letter plus a
       combining mark and compatibility characters (e.g. a non-breaking
       space) become their plain equivalents.
    2. Lowercase.
    3. Remove every character that is neither a word character nor
       whitespace (punctuation, symbols, combining marks). Underscores
       count as word characters and are kept.
    4. Drop whatever is still non-ASCII (letters with no decomposition,
       such as 'ø', are removed rather than transliterated).
    5. Strip leading/trailing whitespace.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The cleaned text. Inner whitespace is left as is, so removing a
        standalone symbol can leave two consecutive spaces.
    """
    # Decompose first: without this the ASCII step below deletes accented
    # letters outright ("café" -> "caf") and removes non-breaking spaces,
    # gluing the neighbouring words together.
    text = unicodedata.normalize('NFKD', text)
    # Convert text to lowercase
    text = text.lower()
    # Remove punctuation and special characters
    cleaned_text = re.sub(r'[^\w\s]', '', text)
    # Drop any remaining non-ASCII characters
    cleaned_text = cleaned_text.encode('ascii', 'ignore').decode('ascii')
    # Strip last: removing leading/trailing punctuation can expose whitespace
    # that an up-front strip would have missed ("- hello" -> " hello").
    return cleaned_text.strip()

In [38]:
# Run clean_text over every text column, then preview the result
for column_name in ('Title', 'Date', 'Description'):
    articles[column_name] = articles[column_name].apply(clean_text)
articles.head()

Unnamed: 0,Title,Date,Description
0,why mission divyastra is significant what it ...,11 march 2024,new delhi the successful launch of mission div...
1,agniv test firing puts india in elite list of ...,11 march 2024,new delhi prime minister narendra modi on mond...
2,maharashtra ats arrests mazagon dockyard emplo...,11 march 2024,new delhi the antiterrorism squad ats of the m...
3,india remains worlds largest arms importer fra...,11 march 2024,new delhi india continues to be the worlds top...
4,chinese research vessel enters bay of bengal d...,11 march 2024,new delhi days before planned missile tests by...


In [39]:
# Preview the last rows of the articles frame
articles.tail()

Unnamed: 0,Title,Date,Description
13595,contracts for jan 4 2019,4 january 2019,army avon protection systems cadillac michigan...
13596,one mean airplane,4 january 2019,the f35c lightning ii is the navys aircraft ca...
13597,contracts for jan 3 2019,3 january 2019,defense intelligence agency fulcrum it service...
13598,shanahan stresses national defense strategy in...,3 january 2019,acting defense secretary patrick m shanahan wi...
13599,10 things you probably didnt know about the pe...,3 january 2019,1 why is the pentagon you know a pentagon the ...


In [40]:
# Summary statistics per column (count, unique, top, freq for text columns)
articles.describe()

Unnamed: 0,Title,Date,Description
count,13600,13600,13600
unique,12284,1744,13471
top,pentagon press secretary john f kirby holds a ...,26 february 2019,pentagon deputy press secretary sabrina singh ...
freq,83,33,4


In [41]:
# Keep only the first row of every (Title, Description) pair; rows must match
# on both columns to count as duplicates
articles = articles[~articles.duplicated(subset=['Title', 'Description'], keep='first')]
articles.describe()

Unnamed: 0,Title,Date,Description
count,13471,13471,13471
unique,12284,1744,13471
top,pentagon press secretary john f kirby holds a ...,26 february 2019,new delhi the successful launch of mission div...
freq,83,33,1


In [42]:
# Save the cleaned articles to a new CSV file; index=False leaves the
# DataFrame index out of the file
articles.to_csv("cleaned_articles.csv", index=False)