In [1]:
import pandas as pd

In [2]:
#Read csv file 
data = pd.read_csv('bbc-news-data.csv', sep="\t")

In [3]:
data.head()

Unnamed: 0,category,filename,title,content
0,business,001.txt,Ad sales boost Time Warner profit,Quarterly profits at US media giant TimeWarne...
1,business,002.txt,Dollar gains on Greenspan speech,The dollar has hit its highest level against ...
2,business,003.txt,Yukos unit buyer faces loan claim,The owners of embattled Russian oil giant Yuk...
3,business,004.txt,High fuel prices hit BA's profits,British Airways has blamed high fuel prices f...
4,business,005.txt,Pernod takeover talk lifts Domecq,Shares in UK drinks and food firm Allied Dome...


In [4]:
import nltk

In [5]:
nltk.download('stopwords') # Download stopwords from Natural Language Toolkit
nltk.download('punkt')     #Download punctuation from Natural Language Toolkit 

[nltk_data] Downloading package stopwords to /home/nahid/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/nahid/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [6]:
nltk_stopwords = nltk.corpus.stopwords.words('english')    #Choose English language from Stopwords
len(nltk_stopwords)

179

In [7]:
from nltk.stem.snowball import SnowballStemmer    #Using Stemming algorithm for root word

In [8]:
sno = SnowballStemmer('english')    #Provide language as prameter 

In [9]:
sno.stem('grows')

'grow'

In [13]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/nahid/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [14]:
from nltk.stem import WordNetLemmatizer

In [17]:
lem = WordNetLemmatizer()

In [26]:
lem.lemmatize('umberella')

'umberella'

In [19]:
from nltk import word_tokenize

In [28]:
dataset = pd.DataFrame(columns = ('title_body', 'category')) #Create Empty dataframe with two columns

#iterate each row of dataset
for index, row in data.iterrows():
    title_body = row['title'] + ' ' + row['content']
    title_body_tokenized = word_tokenize(title_body)  #Convert string to list 
    title_body_tokenized_filtered = [w for w in title_body_tokenized if not w in nltk_stopwords] #Filtered list by remove stopwords from the list
    title_body_tokenized_filtered_snoStemmed = [sno.stem(w) for w in title_body_tokenized_filtered]  #Use Stemming algorithm to stem filtered list
    title_body_tokenized_filtered_lem = [lem.lemmatize(w) for w in title_body_tokenized_filtered]  #Use Lemmatizing algorithm after Stemmering algorithm 
    dataset.loc[index] = {    #Fill up columns of dataset
        'title_body': ' '.join(title_body_tokenized_filtered_lem) + ' ' + ' '.join(title_body_tokenized_filtered_snoStemmed), 
        'category': row['category']
    }

In [29]:
dataset.head()

Unnamed: 0,title_body,category
0,Ad sale boost Time Warner profit Quarterly pro...,business
1,Dollar gain Greenspan speech The dollar hit hi...,business
2,Yukos unit buyer face loan claim The owner emb...,business
3,High fuel price hit BA 's profit British Airwa...,business
4,Pernod takeover talk lift Domecq Shares UK dri...,business


In [31]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [33]:
vectorizer = TfidfVectorizer(ngram_range=(1,2))

In [34]:
vectorizer.fit(dataset['title_body'])

TfidfVectorizer(ngram_range=(1, 2))

In [35]:
X = vectorizer.transform(dataset['title_body'])

In [36]:
X

<2225x574416 sparse matrix of type '<class 'numpy.float64'>'
	with 1277035 stored elements in Compressed Sparse Row format>

In [37]:
#We need to convert dependent variable to label(numbers)
from sklearn.preprocessing import LabelEncoder

In [38]:
le = LabelEncoder()

In [39]:
y = le.fit_transform(dataset['category'])

In [40]:
len(y)

2225

In [41]:
import numpy as np
print(np.unique(dataset['category']))

['business' 'entertainment' 'politics' 'sport' 'tech']


In [42]:
#Display the number of row and columns
np.shape(X)

(2225, 574416)

In [43]:
#Display the number of row
np.shape(y)

(2225,)

In [44]:
from sklearn.model_selection import train_test_split

In [45]:
X_train, X_test, y_train, y_test = train_test_split(X,y) 

In [46]:
from sklearn import svm

In [47]:
svmc = svm.SVC()
svmc.fit(X_train, y_train)

SVC()

In [48]:
#Display the accuracy 
svmc.score(X_test, y_test)  #Predict X_test and compare it with y_test

0.9658886894075404

In [49]:
from sklearn.metrics import classification_report, confusion_matrix

In [50]:
y_pred = svmc.predict(X_test)

In [51]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.91      0.98      0.95       129
           1       0.99      0.98      0.98        91
           2       0.99      0.93      0.96       101
           3       0.97      0.99      0.98       139
           4       0.99      0.93      0.96        97

    accuracy                           0.97       557
   macro avg       0.97      0.96      0.97       557
weighted avg       0.97      0.97      0.97       557



In [52]:
print(confusion_matrix(y_test, y_pred))

[[127   0   0   1   1]
 [  1  89   1   0   0]
 [  6   0  94   1   0]
 [  1   0   0 138   0]
 [  4   1   0   2  90]]
