In [8]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


#Importing necessary Libraries

In [22]:
import numpy as np
import pandas as pd

import os
import math
import time
import random
import string

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.figure_factory as ff
import plotly.graph_objects as go
import plotly.express as px

#for storing the model
import pickle

# Below libraries are for text processing using NLTK
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Below libraries are for feature representation using sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Below libraries are for similarity matrices using sklearn
from sklearn.metrics.pairwise import cosine_similarity  
from sklearn.metrics import pairwise_distances

from sklearn.decomposition import LatentDirichletAllocation

from sklearn.metrics.pairwise import cosine_similarity

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
nltk.download('punkt')
nltk.download("wordnet")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Load data

In [40]:
news_articles_og = pd.read_json("/content/drive/MyDrive/IR Project - Group 49/Dataset/News_Category_Dataset_v2.json", lines = True) # snehal
# news_articles_og = pd.read_pickle('/content/drive/MyDrive/IR Project - Group 49/NewsData/NewsData_07_04') # shambhavi

In [57]:
# loading data from all the files 

files = os.listdir('/content/drive/MyDrive/IR Project - Group 49/NewsData') # snehal
path = '/content/drive/MyDrive/IR Project - Group 49/NewsData/'

# files = os.listdir('/content/drive/MyDrive/NewsData/') # shambhavi
# path = '/content/drive/MyDrive/NewsData/'

news_articles_og = pd.DataFrame()
temp = []
for file in files:
  file_path = path + file
  temp.append(pd.read_pickle(file_path))

news_articles_og = pd.concat(temp)


In [43]:
news_articles_og.head()

Unnamed: 0,short_description,link,headlines,category
0,Uorfi Javed criticised Sonali Kulkarni for her...,https://www.freepressjournal.in/amp/entertainm...,How insensitive: Uorfi on Sonali's 'many girls...,ENTERTAINMENT
1,Shakti Arora spoke about quitting the show 'Ku...,https://www.hindustantimes.com/entertainment/t...,Didn't want to play father to 28-yr-old: Shakt...,ENTERTAINMENT
2,"'Naatu Naatu' singer Kaala Bhairava, who misse...",https://twitter.com/kaalabhairava7/status/1636...,Singer Kaala Bhairava apologises for not thank...,ENTERTAINMENT
3,The makers of Ishaan Khatter and Mrunal Thakur...,https://indianexpress.com/article/entertainmen...,Makers of 'Pippa' refute reports of film's dir...,ENTERTAINMENT
4,"Guneet Monga, the producer of Oscar-winning do...",https://www.news18.com/amp/movies/guneet-monga...,India's moment taken away: Guneet Monga on not...,ENTERTAINMENT


# Applying pre-processing on text data

In [45]:
# Function for data preprocessing

def clean_text(text):
      le=WordNetLemmatizer()
      word_tokens=word_tokenize(text)
      tokens=[le.lemmatize(w) for w in word_tokens if w not in stop_words and len(w)>3]
      tokens_cleaned = [token.translate(str.maketrans('', '', string.punctuation)) for token in tokens]
      cleaned_text=" ".join(tokens_cleaned)
      return cleaned_text


In [46]:
# Creating a new dataframe to store preprocessed data
rev={}
rev['cleaned_text']=news_articles_og['short_description'].apply(clean_text)

# LDA for topic modeling

In [47]:
vect =TfidfVectorizer(max_features=1000,min_df = 0)
vect_text=vect.fit_transform(rev['cleaned_text'])

In [48]:
lda_model=LatentDirichletAllocation(n_components=10,learning_method='online',random_state=42,max_iter=1) 
lda_top=lda_model.fit_transform(vect_text)

In [49]:
print("Document 0: ")
for i,topic in enumerate(lda_top[0]):
  print("Topic ",i,": ",topic*100,"%")

Document 0: 
Topic  0 :  1.97209185574439 %
Topic  1 :  1.9716685491843928 %
Topic  2 :  1.9711594811356623 %
Topic  3 :  82.25683718544012 %
Topic  4 :  1.97099378095114 %
Topic  5 :  1.972350299762419 %
Topic  6 :  1.9710870720507434 %
Topic  7 :  1.97156208175785 %
Topic  8 :  1.9711854743785804 %
Topic  9 :  1.9710642195947075 %


In [50]:
vocab = vect.get_feature_names_out()
for i, comp in enumerate(lda_model.components_):
  vocab_comp = zip(vocab, comp)
  sorted_words = sorted(vocab_comp, key= lambda x:x[1], reverse=True)[:10]
  print("Topic "+str(i)+": ")
  for t in sorted_words:
    print(t[0],end=" ")
  print()

Topic 0: 
gandhi leader said congress rahul kejriwal party delhi study passport 
Topic 1: 
exam question roach class service mcqs secured paper zendaya board 
Topic 2: 
company said rule giant global million group stake private unit 
Topic 3: 
said user added twitter actress outfit also good musk india 
Topic 4: 
allu samantha arjun film ajay actor loan fan debt crore 
Topic 5: 
owaisi bail people tems photo godse view police hyderabad asaduddin 
Topic 6: 
apple google store mahindra over delivery cricket fifty report near 
Topic 7: 
school child student karnataka university book state making saying congress 
Topic 8: 
savarkar gandhi rahul congress leader black price parliament said singhvi 
Topic 9: 
super bank tiktok finance sitharaman match said nirmala 2023 final 


# Creating the final data with topics

In [51]:
df = pd.DataFrame(lda_top)

In [52]:
# create column names
cols = []
txt ='topic_'
for i in range(1,11):
  cols.append(txt+str(i))

cols[:5]

['topic_1', 'topic_2', 'topic_3', 'topic_4', 'topic_5']

In [53]:
df.columns=cols
df.iloc[:10]

Unnamed: 0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,topic_10
0,0.019721,0.019717,0.019712,0.822568,0.01971,0.019724,0.019711,0.019716,0.019712,0.019711
1,0.019491,0.01949,0.019489,0.824594,0.019487,0.019493,0.019489,0.019489,0.019488,0.01949
2,0.026589,0.026579,0.02658,0.760674,0.026581,0.026663,0.026585,0.026589,0.026579,0.02658
3,0.023818,0.023813,0.023811,0.785704,0.023811,0.023807,0.023808,0.023809,0.023811,0.023808
4,0.019966,0.019877,0.019886,0.820991,0.019878,0.019876,0.019877,0.019882,0.019889,0.019878
5,0.022557,0.022546,0.022545,0.797061,0.022544,0.022544,0.022544,0.022563,0.022544,0.022552
6,0.023802,0.312629,0.023798,0.496968,0.023798,0.023804,0.0238,0.02381,0.023796,0.023795
7,0.02383,0.023828,0.02383,0.78554,0.023829,0.023828,0.023829,0.02383,0.023828,0.023828
8,0.02526,0.025374,0.025249,0.772613,0.025258,0.02525,0.025248,0.025251,0.025248,0.025248
9,0.01989,0.019892,0.182312,0.65856,0.01989,0.01989,0.01989,0.019895,0.01989,0.019891


In [54]:
# Creating a column for topic in news_articles_g dataset

topic = []
for li in lda_top:
  max_score = max(li)
  ind = np.where(li==max_score)[0]
  topic.append(ind[0])
  
# len(topic)

In [55]:
news_articles_with_topic = news_articles_og

news_articles_with_topic['topic_article_text'] = topic

In [56]:
news_articles_with_topic.head()

Unnamed: 0,short_description,link,headlines,category,topic_article_text
0,Uorfi Javed criticised Sonali Kulkarni for her...,https://www.freepressjournal.in/amp/entertainm...,How insensitive: Uorfi on Sonali's 'many girls...,ENTERTAINMENT,3
1,Shakti Arora spoke about quitting the show 'Ku...,https://www.hindustantimes.com/entertainment/t...,Didn't want to play father to 28-yr-old: Shakt...,ENTERTAINMENT,3
2,"'Naatu Naatu' singer Kaala Bhairava, who misse...",https://twitter.com/kaalabhairava7/status/1636...,Singer Kaala Bhairava apologises for not thank...,ENTERTAINMENT,3
3,The makers of Ishaan Khatter and Mrunal Thakur...,https://indianexpress.com/article/entertainmen...,Makers of 'Pippa' refute reports of film's dir...,ENTERTAINMENT,3
4,"Guneet Monga, the producer of Oscar-winning do...",https://www.news18.com/amp/movies/guneet-monga...,India's moment taken away: Guneet Monga on not...,ENTERTAINMENT,3


# Storing the data in pickle file

In [None]:
# Creating a pikle file to store the dataset 

# news_articles_with_topic.to_pickle("/content/drive/MyDrive/IR Project - Group 49/NewsData/news_articles_with_topic.pkl")


### Use the pickle module to load the data 

In [38]:
# news_articles_og = pd.read_pickle('/content/drive/MyDrive/IR Project - Group 49/Pickle Files/news_articles_with_sentiment_analysis.pkl')
