In [39]:
import re

import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

In [40]:
# Load the headlines and keep only the title text and its category label.
# The explicit .copy() makes `data` an independent frame instead of a column
# slice of the raw table, so adding columns to it later does not trigger
# pandas' SettingWithCopyWarning.
data = pd.read_csv("first_batch.csv")[["TITLE", "CATEGORY"]].copy()

data.head()

Unnamed: 0,TITLE,CATEGORY
0,"Fed official says weak data caused by weather,...",b
1,Fed's Charles Plosser sees high bar for change...,b
2,US open: Stocks fall after Fed official hints ...,b
3,"Fed risks falling 'behind the curve', Charles ...",b
4,Fed's Plosser: Nasty Weather Has Curbed Job Gr...,b


In [41]:
# Number of missing entries in each column.
data.isnull().sum()

TITLE       0
CATEGORY    0
dtype: int64

In [42]:
# Distinct category codes present in the file, in order of first appearance.
pd.unique(data["CATEGORY"])

array(['b', 'e', 't'], dtype=object)

In [43]:
# Encode the category letters as integer class ids for the classifier.
CATEGORY_TO_ID = {'b': 0, 'e': 1, 't': 2}
data["NUM_CATEGORY"] = data["CATEGORY"].map(CATEGORY_TO_ID)

# Series.map produces NaN for any value that is missing from the mapping.
# Fail here with a clear message rather than passing NaN labels to the model.
unmapped = data.loc[data["NUM_CATEGORY"].isna(), "CATEGORY"].unique()
assert len(unmapped) == 0, (
    "CATEGORY values without a numeric id: {}".format(list(unmapped))
)

data.head()

Unnamed: 0,TITLE,CATEGORY,NUM_CATEGORY
0,"Fed official says weak data caused by weather,...",b,0
1,Fed's Charles Plosser sees high bar for change...,b,0
2,US open: Stocks fall after Fed official hints ...,b,0
3,"Fed risks falling 'behind the curve', Charles ...",b,0
4,Fed's Plosser: Nasty Weather Has Curbed Job Gr...,b,0


In [44]:
# Features are the raw headline strings; targets are the integer class ids.
X = data["TITLE"]
y = data["NUM_CATEGORY"]

# Hand the seed to the splitter itself so the 70/30 split is reproducible on
# its own, independent of the state of NumPy's global random generator.
# The global seed is still set so any later code that relies on it keeps
# working from a known state.
SEED = 42
np.random.seed(SEED)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=SEED
)

In [45]:
# Report the size of every piece of the split.
for label, part in (
    ("X_train: ", X_train),
    ("X_test: ", X_test),
    ("y_train: ", y_train),
    ("y_test: ", y_test),
):
    print(label, part.shape)

X_train:  (45,)
X_test:  (20,)
y_train:  (45,)
y_test:  (20,)


In [46]:
# Bag-of-bigrams features: ngram_range=(2, 2) counts two-word sequences only.
vect = CountVectorizer(ngram_range=(2, 2))

# The vocabulary is learned from the training titles (left-hand call runs
# first) and then reused unchanged to encode the test titles.
# NOTE(review): both names are overwritten with the encoded matrices, so this
# cell is not meant to be run twice without re-running the split.
X_train, X_test = vect.fit_transform(X_train), vect.transform(X_test)

In [47]:
# Show the encoded training matrix. Each printed entry is
# "(row index, vocabulary column)  count" for one non-zero cell.
print(X_train)

  (0, 71)	1
  (0, 192)	1
  (0, 235)	1
  (0, 175)	1
  (0, 117)	1
  (0, 25)	1
  (0, 254)	1
  (1, 135)	1
  (1, 89)	1
  (2, 71)	1
  (2, 190)	1
  (2, 143)	1
  (2, 273)	1
  (2, 93)	1
  (2, 48)	1
  (2, 111)	1
  (3, 103)	1
  (3, 262)	1
  (3, 225)	1
  (3, 53)	1
  (3, 229)	1
  (3, 75)	1
  (3, 31)	1
  (3, 50)	1
  (4, 106)	1
  :	:
  (42, 45)	1
  (42, 247)	1
  (42, 104)	1
  (43, 35)	1
  (43, 52)	1
  (43, 153)	1
  (43, 83)	1
  (43, 121)	1
  (43, 161)	1
  (43, 186)	1
  (43, 222)	1
  (43, 81)	1
  (43, 30)	1
  (43, 278)	1
  (43, 2)	1
  (44, 89)	1
  (44, 216)	1
  (44, 130)	1
  (44, 136)	1
  (44, 177)	1
  (44, 154)	1
  (44, 209)	1
  (44, 168)	1
  (44, 268)	1
  (44, 133)	1


In [48]:
# Multinomial Naive Bayes on the bigram counts; alpha is the additive
# smoothing parameter. fit() returns the estimator, so it can be chained.
mnb = MultinomialNB(alpha=0.2).fit(X_train, y_train)

# Predicted class ids for the held-out titles.
y_pred = mnb.predict(X_test)
print(y_pred)

[2 2 0 1 1 2 1 1 2 1 1 0 1 1 1 1 0 1 1 0]


In [49]:
# Fraction of held-out titles whose predicted class matches the true class.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.9

In [50]:
def predict_news(news):
    """Classify one or more news headlines with the fitted vectorizer and model.

    Parameters
    ----------
    news : str or iterable of str
        A single headline, or a collection of headlines. A bare string is
        treated as one headline, not as an iterable of characters.

    Returns
    -------
    str or list of str
        The category name when exactly one headline is given (either as a
        string or as a one-element collection). For several headlines, a list
        with one category name per headline, in input order. An empty
        collection yields an empty list. Any predicted class id other than
        0, 1 or 2 is reported as 'no class found'.

    Notes
    -----
    Uses the module-level ``vect`` and ``mnb`` objects, which must already be
    fitted when this function is called.
    """
    category_names = {
        0: 'Business or Politics',
        1: 'Entertainment',
        2: 'Technology',
    }

    # The vectorizer expects an iterable of documents, so wrap a lone string.
    headlines = [news] if isinstance(news, str) else list(news)
    if not headlines:
        return []

    features = vect.transform(headlines)
    predicted_ids = mnb.predict(features)

    # Look each id up individually; comparing the whole prediction array in an
    # `if` only works when it happens to contain exactly one element.
    labels = [
        category_names.get(class_id, 'no class found')
        for class_id in predicted_ids
    ]

    return labels[0] if len(labels) == 1 else labels
    

In [51]:
# Try the helper on one unseen headline, passed as a one-element list.
# NOTE(review): the recorded output for this finance/IT headline is
# "Entertainment" - worth checking how well the model handles titles whose
# word pairs were not seen during training.
x = ["Nifty IT index down nearly 3% on Infosys weak guidance"]
r = predict_news(x)
print(r)

Entertainment


In [52]:
# Rows are the true classes and columns are the predicted classes.
# Listing the labels explicitly keeps the matrix 3x3 and in the fixed order
# 0, 1, 2 even if a class happens to be absent from this test split.
confusion_matrix(y_test, y_pred, labels=[0, 1, 2])

array([[ 4,  1,  0],
       [ 0, 10,  0],
       [ 0,  1,  4]], dtype=int64)