In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from gensim.utils import simple_preprocess

In [2]:
# Both CSVs are reduced to the same two-column layout: the headline text and
# its sector label, with `sector_name` renamed to `category`.
wanted_columns = ['headline', 'sector_name']
column_renames = {'sector_name': 'category'}

dataset = pd.read_csv('/Users/test/Desktop/news_cato/dataset/trainset/combined_csv.csv')
dataset = dataset[wanted_columns].rename(columns=column_renames)

ds = pd.read_csv('/Users/test/Desktop/news_cato/dataset/livemint_data_2023.csv')
ds = ds[wanted_columns].rename(columns=column_renames)

In [3]:
# Discard every row of `ds` that has a missing value in any column.
ds = ds.dropna()

In [4]:
# Discard every row of `dataset` that has a missing value in any column.
dataset = dataset.dropna()

In [5]:
# Number of rows per category label in `dataset`, most frequent first.
dataset['category'].value_counts()

FMCG                             12892
Health & Wellness                 8743
Auto & Ancillary                  6447
E-Commerce & IT                   6348
Handsets                          5570
BFSI                              5330
Communications                    5042
OTT, Gaming & Digital Content     3840
Retail                            2862
Construction & Real Estate        2025
Fashion & Lifestyle               1709
Liquor & Tobacco                  1500
Internet based service            1459
Consumer Electronics              1197
Ed-Tech                           1150
E-health                          1075
Media & Entertainment              997
Name: category, dtype: int64

In [6]:
# Number of rows per category label in `ds`, most frequent first.
ds['category'].value_counts()

FMCG                             2192
Health & Wellness                2030
Auto & Ancillary                 1557
Handsets                         1434
Communications                   1323
Internet based service            720
Fashion & Lifestyle               702
E-Commerce & IT                   669
Retail                            666
BFSI                              519
Ed-Tech                           261
OTT, Gaming & Digital Content     162
Liquor & Tobacco                  147
Media & Entertainment              31
Consumer Electronics               26
Construction & Real Estate         13
E-health                           10
Name: category, dtype: int64

In [7]:
# `dataset` supplies the training text/labels; `ds` supplies the evaluation
# text/labels.
X_train, y_train = dataset['headline'], dataset['category']
X_test, y_test = ds['headline'], ds['category']

In [8]:
# TF-IDF vectorization.
# The vectorizer is fitted on the training headlines only and then applied
# unchanged to the evaluation headlines.
TFIDF_MAX_FEATURES = 5000  # upper bound on vocabulary size

tfidf_vectorizer = TfidfVectorizer(max_features=TFIDF_MAX_FEATURES)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [9]:
# Classifier for the TF-IDF features; the solver may run up to 1000 iterations.
logreg_params = {'max_iter': 1000}
logistic_classifier = LogisticRegression(**logreg_params)

In [10]:
# Train the classifier on the TF-IDF matrix of the training headlines.
logistic_classifier.fit(X=X_train_tfidf, y=y_train)

In [11]:
# Predicted category for each evaluation headline (TF-IDF features).
y_pred = logistic_classifier.predict(X=X_test_tfidf)

In [12]:
# Fraction of evaluation headlines whose predicted label equals the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Model Accuracy: {accuracy}")

Model Accuracy: 0.5442144118119082


In [13]:
# Per-category precision / recall / F1 / support as a text table.
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

                               precision    recall  f1-score   support

             Auto & Ancillary       0.76      0.67      0.71      1557
                         BFSI       0.60      0.61      0.61       519
               Communications       0.89      0.53      0.66      1323
   Construction & Real Estate       0.05      0.31      0.08        13
         Consumer Electronics       0.25      0.23      0.24        26
              E-Commerce & IT       0.54      0.49      0.51       669
                     E-health       0.18      0.40      0.25        10
                      Ed-Tech       0.88      0.30      0.45       261
                         FMCG       0.34      0.83      0.48      2192
          Fashion & Lifestyle       0.94      0.42      0.58       702
                     Handsets       0.57      0.41      0.48      1434
            Health & Wellness       0.84      0.43      0.57      2030
       Internet based service       0.78      0.34      0.47       720
     

In [14]:
import fasttext

In [15]:
# Location of the pretrained fastText binary on local disk.
# NOTE(review): the filename suggests 300-dimensional English vectors — confirm.
pretrained_model_path = '/Users/test/Desktop/news_cato/cc.en.300.bin'
# Load the binary into memory; the resulting model object is bound to `ft_model`.
ft_model = fasttext.load_model(pretrained_model_path)



In [16]:

def _embed_headlines(model, texts):
    """Return one fastText sentence vector per headline.

    Parameters
    ----------
    model
        Object exposing ``get_sentence_vector(str)``; here the loaded
        fastText model.
    texts
        Iterable of headline strings.

    Returns
    -------
    list
        One vector per input text, in input order.
    """
    # fastText's get_sentence_vector raises ValueError when the text contains
    # a newline, so embedded line breaks are replaced by spaces first. Texts
    # without newlines are passed through unchanged.
    return [model.get_sentence_vector(text.replace('\n', ' ')) for text in texts]


X_train_embeddings = _embed_headlines(ft_model, X_train)
X_test_embeddings = _embed_headlines(ft_model, X_test)

In [17]:

# Fresh, unfitted classifier for the sentence-embedding features; this rebinds
# `logistic_classifier`. The solver may run up to 1000 iterations.
embedding_logreg_params = {'max_iter': 1000}
logistic_classifier = LogisticRegression(**embedding_logreg_params)

In [18]:

# Train the classifier on the sentence embeddings of the training headlines.
logistic_classifier.fit(X=X_train_embeddings, y=y_train)

In [19]:

# Predicted category for each evaluation headline (sentence-embedding features).
y_pred = logistic_classifier.predict(X=X_test_embeddings)

In [20]:

# Fraction of evaluation headlines whose predicted label equals the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Model Accuracy: {accuracy}")

Model Accuracy: 0.4285026480500722


In [21]:

# Per-category precision / recall / F1 / support as a text table.
# Some categories receive no predictions at all, which makes their precision
# undefined. zero_division=0 reports those scores as 0 (the same value the
# default produces) without emitting a warning for each affected average.
report = classification_report(y_test, y_pred, zero_division=0)
print(report)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                               precision    recall  f1-score   support

             Auto & Ancillary       0.67      0.59      0.63      1557
                         BFSI       0.38      0.49      0.43       519
               Communications       0.82      0.40      0.54      1323
   Construction & Real Estate       0.00      0.00      0.00        13
         Consumer Electronics       0.00      0.00      0.00        26
              E-Commerce & IT       0.34      0.40      0.37       669
                     E-health       0.00      0.00      0.00        10
                      Ed-Tech       0.53      0.03      0.06       261
                         FMCG       0.27      0.80      0.41      2192
          Fashion & Lifestyle       0.93      0.04      0.08       702
                     Handsets       0.50      0.26      0.34      1434
            Health & Wellness       0.80      0.40      0.54      2030
       Internet based service       0.74      0.14      0.23       720
     

  _warn_prf(average, modifier, msg_start, len(result))
