In [1]:
import pandas as pd

# Load the raw articles; the preview shows a text column and a category label.
dataset = pd.read_csv(filepath_or_buffer='./dataset.csv')
dataset.head(n=5)

Unnamed: 0,article_text,category
0,Nitish Kumar’s exit will not affect INDIA bloc...,india
1,Ex-BharatPe MD Ashneer Grover and wife move De...,india
2,Tripurasundari Express suffers technical malfu...,india
3,Hemant Soren Live Updates: JMM-Congress coalit...,india
4,Puja in cellar: Gyanvapi mosque committee knoc...,india


In [2]:
# Summary statistics, flipped so that each column of the frame becomes a row.
dataset.describe().T

Unnamed: 0,count,unique,top,freq
article_text,7500,7462,Expect high competitive intensity between Indi...,2
category,7500,5,india,1500


In [3]:
# Drop rows whose article text repeats an earlier row (pandas keeps the first
# occurrence by default). The name is rebound instead of using inplace=True,
# which pandas discourages and which prevents method chaining.
dataset = dataset.drop_duplicates(subset='article_text')

In [4]:
# (rows, columns) of the frame; run after the de-duplication cell to confirm
# how many rows were removed.
dataset.shape

(7462, 2)

In [5]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Fetch the NLTK resources this notebook relies on. 'punkt_tab' is requested in
# addition to 'punkt' because newer NLTK releases load the word_tokenize models
# from that package; packages that are already up to date are not re-downloaded.
for resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(resource)

# A set makes the per-token stop-word membership test cheap.
stop_words = set(stopwords.words('english'))

def preprocess_text(line: str):
    """Tokenize ``line`` and return its lower-cased content words as one string.

    A token is kept only when it is purely alphabetic (``str.isalpha``) and its
    lower-cased form is not in ``stop_words``. Numbers, punctuation and tokens
    containing non-letters (e.g. hyphenated words) are therefore dropped.
    The surviving tokens are joined with single spaces.
    """
    kept = []
    for token in word_tokenize(line):
        if not token.isalpha():
            continue
        lowered = token.lower()
        if lowered in stop_words:
            continue
        kept.append(lowered)
    return ' '.join(kept)


[nltk_data] Downloading package stopwords to C:\Users\Darain
[nltk_data]     Quadri\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Darain
[nltk_data]     Quadri\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:
# Run every article through preprocess_text and preview the cleaned text.
articles = dataset['article_text'].apply(preprocess_text)
articles.head()

0    nitish kumar exit affect india bloc poll prosp...
1    md ashneer grover wife move delhi hc challengi...
2    tripurasundari express suffers technical malfu...
3    hemant soren live updates coalition mlas set f...
4    puja cellar gyanvapi mosque committee knocks a...
Name: article_text, dtype: object

In [7]:
from nltk.sentiment import SentimentIntensityAnalyzer

nltk.download('vader_lexicon')

sid = SentimentIntensityAnalyzer()

# Cut-offs on VADER's compound score that separate the three labels.
POSITIVE_THRESHOLD = 0.05
NEGATIVE_THRESHOLD = -0.05


def label_sentiment(score: float) -> str:
    """Map a compound score to 'positive', 'negative' or 'neutral'."""
    if score >= POSITIVE_THRESHOLD:
        return 'positive'
    if score <= NEGATIVE_THRESHOLD:
        return 'negative'
    return 'neutral'


# Score the original article text rather than the cleaned `articles`: the
# cleaning step strips stop words (including negations such as "not"),
# punctuation and letter case, all of which VADER takes into account.
# `articles` was derived from `dataset`, so both share the same index and the
# columns below line up row by row.
compound_score = dataset['article_text'].apply(
    lambda text: sid.polarity_scores(text)['compound']
)
sentiment = compound_score.apply(label_sentiment)

result_df = pd.DataFrame({
    'processed_text': articles,
    'compound_score': compound_score,
    'sentiment': sentiment
})

# Persist the cleaned text with its score and label; the index is not written.
result_df.to_csv('processed_dataset.csv', index=False)

[nltk_data] Downloading package vader_lexicon to C:\Users\Darain
[nltk_data]     Quadri\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
