In [1]:
import pandas as pd
import pickle
import re

In [2]:
df = pd.read_csv('df_file.csv')
df.head()

Unnamed: 0,Text,Label
0,Budget to set scene for election\n \n Gordon B...,0
1,Army chiefs in regiments decision\n \n Militar...,0
2,Howard denies split over ID cards\n \n Michael...,0
3,Observers to monitor UK election\n \n Minister...,0
4,Kilroy names election seat target\n \n Ex-chat...,0


In [3]:
X = df['Text']
y = df['Label']

In [4]:
def preprocess_text(text):
    text = re.sub(r'[!@#$(),\n"%^*?\:;~`0-9]', ' ', text)
    text = re.sub(r'[[]]', ' ', text)
    text = text.lower()
    return text

In [5]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()

X_train_preprocessed = [preprocess_text(text) for text in X_train]
X_test_preprocessed = [preprocess_text(text) for text in X_test]
X_train_transformed = cv.fit_transform(X_train_preprocessed).toarray()
X_test_transformed = cv.transform(X_test_preprocessed).toarray()

  text = re.sub(r'[[]]', ' ', text)


In [7]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(X_train_transformed, y_train)

In [8]:
y_pred = model.predict(X_test_transformed)

In [9]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

acc = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
cr = classification_report(y_test, y_pred)

In [10]:
print('Accuracy is: ', acc)

Accuracy is:  0.9797752808988764


In [11]:
from sklearn.pipeline import Pipeline

pipe = Pipeline([('vectorizer', cv), ('multinomialNB', model)])
pipe.fit(X_train_preprocessed, y_train)

In [12]:
with open('classify_news_pipeline-0.1.0.pkl', 'wb') as f:
    pickle.dump(pipe, f)

In [13]:
text = 'Chelsea scored a great goal last night'
y = pipe.predict([text])
y

array([1])

In [14]:
classes = [
    'Politics',
    'Sport',
    'Technology',
    'Entertainment',
    'Business'
]
probabilities = model.predict_proba([text]).tolist()
probs_percentage = [[round(prob * 100, 4) for prob in probs] for probs in probabilities]

class_probs = dict(zip(classes, probs_percentage[0]))

class_probs

{'Politics': 0.0001,
 'Sport': 99.9926,
 'Technology': 0.0,
 'Entertainment': 0.0073,
 'Business': 0.0}