
## Build text classification models using scikit-learn
- Use TfidfVectorizer to transform input texts into tfidf encoded float point matrix
- Build a pipeline that include both feature extraction, and classification model
- Build and train models
- Evaluate model performace


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn import metrics

In [3]:
df = pd.read_csv('kaggle_data/bbc-text.csv')
print(df.shape, df['category'].nunique())
df.head(2)

(2225, 2) 5


Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...


In [4]:
df['category'].value_counts()

sport            511
business         510
politics         417
tech             401
entertainment    386
Name: category, dtype: int64

In [5]:
X_train, X_test, y_train, y_test = train_test_split(
    df['text'], df['category'], test_size=.2, stratify=df['category'], random_state=42)

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(1780,) (445,) (1780,) (445,)


In [6]:
sgd = Pipeline(
        [
            (
                "tfidf_vector_com",
                TfidfVectorizer(),
            ),
            (
                "clf",
                SGDClassifier(),
            ),
        ]
    )

In [7]:
def print_metrics(pred_test, y_test, pred_train, y_train):
    print("test accuracy", str(np.mean(pred_test == y_test)))
    print("train accuracy", str(np.mean(pred_train == y_train)))
    print("\n Metrics and Confusion for SVM \n")
    print(metrics.confusion_matrix(y_test, pred_test))
    print(metrics.classification_report(y_test, pred_test))

In [8]:
%%time
sgd.fit(X_train, y_train)
pred_test = sgd.predict(X_test)
pred_train = sgd.predict(X_train)
print_metrics(pred_test, y_test, pred_train, y_train)

test accuracy 0.9820224719101124
train accuracy 1.0

 Metrics and Confusion for SVM 

[[100   0   2   0   0]
 [  0  76   0   0   1]
 [  1   1  81   0   1]
 [  0   0   0 102   0]
 [  1   1   0   0  78]]
               precision    recall  f1-score   support

     business       0.98      0.98      0.98       102
entertainment       0.97      0.99      0.98        77
     politics       0.98      0.96      0.97        84
        sport       1.00      1.00      1.00       102
         tech       0.97      0.97      0.97        80

     accuracy                           0.98       445
    macro avg       0.98      0.98      0.98       445
 weighted avg       0.98      0.98      0.98       445

CPU times: user 2.45 s, sys: 28.7 ms, total: 2.47 s
Wall time: 915 ms


# Understand model coefficient. What are the the most import features/words for classficiation

In [9]:
# the model pipeline
sgd

Pipeline(steps=[('tfidf_vector_com', TfidfVectorizer()),
                ('clf', SGDClassifier())])

In [10]:
# model classes
sgd.classes_

array(['business', 'entertainment', 'politics', 'sport', 'tech'],
      dtype='<U13')

In [11]:
# the feature coefficients for business class
sgd['clf'].coef_[0]

array([-0.05925439,  0.25474777,  0.        , ...,  0.        ,
        0.        ,  0.        ])

In [12]:
# top 10 positive feature coefficients for business class
top_coef_idx = sorted([(v,i) for (i, v) in enumerate(sgd['clf'].coef_[0])], reverse=True)[:10]
top_coef_idx

[(2.4074082816652034, 3128),
 (2.329973528697794, 21702),
 (2.2649871765258487, 13260),
 (2.236753638759535, 8434),
 (1.8363101117575296, 5806),
 (1.830036678535737, 9882),
 (1.5941859779599135, 13107),
 (1.535328853990862, 4489),
 (1.5173086259584452, 15193),
 (1.5162727946923382, 20986)]

In [13]:
# the top indices 
top_idx = [e[1] for e in top_coef_idx]
top_idx

[3128, 21702, 13260, 8434, 5806, 9882, 13107, 4489, 15193, 20986]

In [14]:
# the top word related to the business class
[ word for (word,seq) in sgd['tfidf_vector_com'].vocabulary_.items() if seq in top_idx]

['business',
 'its',
 'economic',
 'bank',
 'firm',
 'market',
 'sales',
 'company',
 'investment',
 'shares']

## Exercise:
The above 10 most import words are not ordered by its importance/coefficient, I leave it as an exercise for you to complete:
- Get the top N words ordered by coefficent for a given class
- report all top words for all class categories
- Aso report top N negative coeeficients for the class, the top N negative coeeficients in the class are most likely to be top words in other class categories
- format the report nicely 

In [59]:
# list(enumerate(sgd['clf'].coef_[0]))

In [53]:
# Convert word to index sequence dictionary to index to word for easy lookup word by index
idx_to_word = {idx:word for (word, idx) in sgd['tfidf_vector_com'].vocabulary_.items()}
# idx_to_word

In [55]:
np.where(sgd.classes_== 'business')

(array([0]),)

In [16]:
def top_n_features_by_coef(n, classname):
    """
        Args:
            n: the top number of words by coefficient,
            classname: the class label
        Returns:
            top or bottom n words with coefficients
    """
    class_idx = np.where(sgd.classes_== classname)[0][0]
    idx_coef = sorted(
        [(i,v) for (i, v) in enumerate(sgd['clf'].coef_[class_idx])], key=lambda e: e[1],reverse=True)
    top_n_idx_coef = idx_coef[:n]
    bottom_n_idx_coef = idx_coef[-n:] # top negative N words, sort asscending
    top_word_coef = list(map(lambda e: (idx_to_word[e[0]], round(e[1], 4)), top_n_idx_coef))
    bottom_word_coef = list(map(lambda e: (idx_to_word[e[0]], round(e[1], 4)), bottom_n_idx_coef))
    df = pd.DataFrame(top_word_coef, columns=[f"{classname}_word", 'coef'])
    df_bottom = pd.DataFrame(bottom_word_coef, columns=[f"{classname}_word", 'coef'])
    return pd.concat([df, df_bottom])

In [17]:
n = 10
df_list = []
for _class in sgd.classes_:
    df_list.append(top_n_features_by_coef(n, _class))
pd.concat(df_list, axis=1)

Unnamed: 0,business_word,coef,entertainment_word,coef.1,politics_word,coef.2,sport_word,coef.3,tech_word,coef.4
0,bank,2.4074,film,3.5343,blair,2.4407,match,1.846,online,2.4196
1,shares,2.33,show,3.0107,party,2.2978,cup,1.7546,software,2.3592
2,its,2.265,music,2.5611,lord,2.051,rugby,1.651,technology,2.3324
3,economic,2.2368,singer,2.4325,secretary,2.0133,athletics,1.5769,computer,2.1125
4,company,1.8363,album,2.3064,government,2.0103,players,1.5422,games,1.9707
5,firm,1.83,band,2.1063,minister,1.9874,season,1.5281,digital,1.9346
6,investment,1.5942,star,1.9524,labour,1.9762,win,1.5194,game,1.7995
7,business,1.5353,festival,1.5706,committee,1.8323,liverpool,1.4837,ink,1.7228
8,market,1.5173,tv,1.5275,mr,1.8213,club,1.4555,internet,1.6473
9,sales,1.5163,ballet,1.5092,straw,1.8144,coach,1.3881,net,1.5992


The tfidf (a common term weighting scheme in information retrieval) values that were fed into SGD classifier

In [60]:
sgd['tfidf_vector_com']

TfidfVectorizer()

In [18]:
print(sgd['tfidf_vector_com'].idf_.shape)
sgd['tfidf_vector_com'].idf_

(26795,)


array([6.69317081, 2.47121513, 7.7917831 , ..., 7.7917831 , 7.7917831 ,
       7.38631799])

In [63]:
feature_output = sgd['tfidf_vector_com'].idf_ * sgd['clf'].coef_
def top_n_features_by_feature_output(n, classname):
    """
        Args:
            n: the top number of words by coefficient,
            classname: the class label
        Returns:
            top or bottom n words with coefficients
    """
    class_idx = np.where(sgd.classes_== classname)[0][0]
    idx_coef = sorted(
        [(i,v) for (i, v) in enumerate(feature_output[class_idx])], key=lambda e: e[1],reverse=True)
    top_n_idx_coef = idx_coef[:n]
    bottom_n_idx_coef = idx_coef[-n:] # top negative N words, sort asscending
    top_word_coef = list(map(lambda e: (idx_to_word[e[0]], round(e[1], 4)), top_n_idx_coef))
    bottom_word_coef = list(map(lambda e: (idx_to_word[e[0]], round(e[1], 4)), bottom_n_idx_coef))
    df = pd.DataFrame(top_word_coef, columns=[f"{classname}_word", 'feature_output'])
    df_bottom = pd.DataFrame(bottom_word_coef, columns=[f"{classname}_word", 'feature_output'])
    return pd.concat([df, df_bottom])

In [64]:
n = 10
df_list = []
for _class in sgd.classes_:
    df_list.append(top_n_features_by_feature_output(n, _class))
pd.concat(df_list, axis=1)

Unnamed: 0,business_word,feature_output,entertainment_word,feature_output.1,politics_word,feature_output.2,sport_word,feature_output.3,tech_word,feature_output.4
0,bank,8.8223,film,11.4067,ict,12.01,hamm,8.1622,ink,11.2653
1,shares,8.5956,ballet,10.3765,straw,9.4835,athletics,8.0164,argonaut,9.6489
2,economic,7.7711,album,10.3696,snooker,8.7586,doping,7.1837,software,8.8646
3,datamonitor,7.7181,gallery,10.1932,lord,8.6891,rugby,7.1168,online,8.7156
4,wto,7.5195,singer,10.1372,councils,8.6832,liverpool,7.0437,simonetti,8.3161
5,boeing,7.4387,hendrix,9.4445,duchy,8.5979,match,6.4889,computer,7.6912
6,crossrail,7.3504,freeview,9.2216,blair,8.4961,cup,6.43,spam,7.5454
7,davos,7.2253,band,8.9844,safety,8.1032,balco,6.3047,technology,7.5275
8,emi,6.5881,show,8.9707,party,7.5264,athens,6.1105,rfid,7.1601
9,tobacco,6.3727,music,8.2,nuclear,7.4859,rangers,6.0413,sigarchi,7.0222


In [65]:
sgd['tfidf_vector_com'].idf_ 

array([6.69317081, 2.47121513, 7.7917831 , ..., 7.7917831 , 7.7917831 ,
       7.38631799])

In [66]:
maxidx = np.argmax(sgd['tfidf_vector_com'].idf_)
maxval = max(sgd['tfidf_vector_com'].idf_ )
maxidx, maxval

(2, 7.791783102729716)

In [67]:
np.where(sgd['tfidf_vector_com'].idf_  > 7)

(array([    2,     4,     5, ..., 26792, 26793, 26794]),)

In [27]:
minidx = np.argmin(sgd['tfidf_vector_com'].idf_)
minval = min(sgd['tfidf_vector_com'].idf_)
minidx, minval

(24092, 1.0)

In [34]:
max_idf_token = idx_to_word[maxidx]
min_idf_token = idx_to_word[minidx]
max_idf_token, min_idf_token

('000bn', 'the')

In [68]:
df[df['text'].str.contains(max_idf_token)]

Unnamed: 0,category,text
855,business,japan bank shares up on link talk shares of su...


In [73]:
df[df['text'].str.contains(min_idf_token)]

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...
...,...,...
2220,business,cars pull down us retail figures us retail sal...
2221,politics,kilroy unveils immigration policy ex-chatshow ...
2222,entertainment,rem announce new glasgow concert us band rem h...
2223,politics,how political squabbles snowball it s become c...


In [74]:
df[df['text'].str.contains(' ict ')]

Unnamed: 0,category,text
158,politics,uk firms embracing e-commerce uk firms are e...


In [75]:
df.iloc[158]['text']

'uk firms  embracing e-commerce  uk firms are embracing internet trading opportunities as never before  e-commerce minister mike o brien says.  a government-commissioned study ranked the uk third in its world index of use of information and communication technology (ict). the report suggests 69% of uk firms are now using broadband and that 30% of micro businesses are trading online. mr o brien said uk businesses were sprinting forward in ict use  but that there were more challenges ahead. the report  carried out independently by consultants booz allen hamilton and hi europe  placed the uk third behind sweden and ireland for business use of ict.  it showed british business brought greater maturity to their ict use  by using broadband in increased numbers  bringing ict into their business plans and using new technologies such as voice activated programmes and desktop video conferences. mr o brien said:  the increase in the proportion of business connected by broadband shows that uk compa