In [1]:
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn import decomposition, ensemble
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import FeatureUnion
from sklearn.metrics import classification_report, confusion_matrix

import pandas as pd
import xgboost, numpy, string
import numpy as np
import re
from collections import Counter, defaultdict
from scipy.sparse import hstack
import gensim
from gensim.models.word2vec import Word2Vec

from keras.preprocessing import text, sequence
from keras import layers, models, optimizers

#import shorttext
import matplotlib.pyplot as plt
from pandas_ml import ConfusionMatrix
%matplotlib inline 

Using TensorFlow backend.


In [2]:
import nltk
import textblob

In [3]:
# load the dataset
data = pd.read_csv('bbc-text.csv', encoding = 'ANSI')

In [4]:
# have a look on the text csv to find out the target feature
data.head(5)

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...


In [5]:
# def a text cleaning method
def clean_text(text):
    text = text.lower()
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "can not ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r"\'scuse", " excuse ", text)
    text = re.sub('\W', ' ', text)
    text = re.sub('\s+', ' ', text)
    text = text.strip(' ')
    return text

In [6]:
# clean the short abbreviation 
data['text'] = data['text'].map(lambda x : clean_text(x))

In [7]:
# stopword is the most common used word which not have a particular meaning such as i, am, you, a...
from nltk.corpus import stopwords
from textblob import Word
new_stopwords = stopwords.words('english')
print(new_stopwords)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [8]:
#lower case
data['text'] = data['text'].apply(lambda x: " ".join(x.lower() for x in x.split()))

In [9]:
##remove punctuation in each phrase such as comma, full stop mark and so on
data['text'] = data['text'].str.replace('[^\w\s]','')

In [10]:
##stopwords removal  --- if you want to add stopwords in or remove stopwords out, you coould modify the stopwords array
data['text'] = data['text'].apply(lambda x: " ".join(x for x in x.split() if x not in new_stopwords))

In [11]:
 ##Lemmatization, use textblob's lemmatize function to do the lenatization. eg: went -- go
data['text'] = data['text'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))

In [12]:
# after cleaning, let's see the dataset again
data.head(5)

Unnamed: 0,category,text
0,tech,tv future hand viewer home theatre system plas...
1,business,worldcom bos left book alone former worldcom b...
2,sport,tiger wary farrell gamble leicester say rushed...
3,sport,yeading face newcastle fa cup premiership side...
4,entertainment,ocean twelve raid box office ocean twelve crim...


In [14]:
# split the dataset into training and validation datasets 
# Attach also labels for filter and predict later

train_x, valid_x, train_y, valid_y = model_selection.train_test_split(data['text'], data['category'], test_size=0.3, random_state=17)

In [15]:
# see the target value distrubution 
train_y.value_counts()

sport            357
business         352
politics         290
entertainment    279
tech             279
Name: category, dtype: int64

In [35]:
# create a tokenizer from text
#token = text.Tokenizer()
#token.fit_on_texts(data['text'])
#word_index = token.word_index
#print(word_index)
#print(token.word_counts)
X, y = [], []
text = data['text']
label = data['category']
text = text.to_frame()
label = label.to_frame()
text.to_csv('text.txt', header=None, index=None, sep=' ', mode='a')
label.to_csv('label.txt', header=None, index=None, sep=' ', mode='a')



In [36]:
TRAIN_SET_x_PATH = "text.txt"
TRAIN_SET_y_PATH = "label.txt"
with open(TRAIN_SET_x_PATH, "r") as infile:
    for line in infile:
        # text = line.split("\t")
        # texts are already tokenized, just split on space
        # in a real case we would use e.g. spaCy for tokenization
        # and maybe remove stopwords etc.
        X.append(line.split())
X= np.array(X)

In [37]:
with open(TRAIN_SET_y_PATH, "r") as infile:
    for line in infile:
        # text = line.split("\t")
        # texts are already tokenized, just split on space
        # in a real case we would use e.g. spaCy for tokenization
        # and maybe remove stopwords etc.
        y.append(line.split())
y= np.array(y)

In [38]:
print(X)
print(y)

[ ['"tv', 'future', 'hand', 'viewer', 'home', 'theatre', 'system', 'plasma', 'high', 'definition', 'tv', 'digital', 'video', 'recorder', 'moving', 'living', 'room', 'way', 'people', 'watch', 'tv', 'radically', 'different', 'five', 'year', 'time', 'according', 'expert', 'panel', 'gathered', 'annual', 'consumer', 'electronics', 'show', 'la', 'vega', 'discus', 'new', 'technology', 'impact', 'one', 'favourite', 'pastime', 'u', 'leading', 'trend', 'programme', 'content', 'delivered', 'viewer', 'via', 'home', 'network', 'cable', 'satellite', 'telecom', 'company', 'broadband', 'service', 'provider', 'front', 'room', 'portable', 'device', 'one', 'talked', 'technology', 'ce', 'digital', 'personal', 'video', 'recorder', 'dvr', 'pvr', 'set', 'top', 'box', 'like', 'u', 'tivo', 'uk', 'sky', 'system', 'allow', 'people', 'record', 'store', 'play', 'pause', 'forward', 'wind', 'tv', 'programme', 'want', 'essentially', 'technology', 'allows', 'much', 'personalised', 'tv', 'also', 'built', 'high', 'defin

In [39]:
#Do the word2vec
#Self-train word embedding
#Train word2vec on all the texts
model1 = Word2Vec(X, size=100, window=5, min_count=5, workers=2)
print(model1.wv.index2word)
print(model1.wv.syn0)
#w2v = {w: vec for w, vec in zip(model.wv.index2word, model.wv.syn0)}

[[ 0.72756851 -0.26057884 -0.73779637 ..., -0.59350276  0.46667406
  -0.34387818]
 [ 0.58914304  0.62779385 -0.57124746 ..., -0.44564831  0.67644763
   0.51555574]
 [ 0.84139132 -1.55693746  0.57134479 ..., -1.0923152  -0.13459912
  -0.59044909]
 ..., 
 [ 0.04383487 -0.03698334 -0.05624707 ..., -0.05358861  0.04042584
  -0.03047162]
 [ 0.04094311 -0.02201137 -0.02418733 ..., -0.03649042  0.01641328
  -0.02548887]
 [ 0.05058155 -0.0317612  -0.01974179 ..., -0.04600464  0.02659057
  -0.02070546]]


In [40]:
w2v = {w: vec for w, vec in zip(model1.wv.index2word, model1.wv.syn0)}
#print(w2v)

In [41]:
len(model1.wv.index2word)

9740

In [42]:
class TfidfEmbeddingVectorizer(object):
    def __init__(self, word2vec):
        self.word2vec = word2vec
        self.word2weight = None
        if len(word2vec)>0:
            self.dim=len(word2vec[next(iter(w2v))])
        else:
            self.dim=0
        
    def fit(self, X):
        tfidf = TfidfVectorizer(analyzer=lambda x: x)
        tfidf.fit(X)
        # if a word was never seen - it must be at least as infrequent
        # as any of the known words - so the default idf is the max of 
        # known idf's
        max_idf = max(tfidf.idf_)
        self.word2weight = defaultdict(
            lambda: max_idf, 
            [(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()])
    
        return self
    
    def transform(self, X):
        return np.array([
                np.mean([self.word2vec[w] * self.word2weight[w]
                         for w in words if w in self.word2vec] or
                        [np.zeros(self.dim)], axis=0)
                for words in X
            ])

In [44]:
#Train word embedding vector
word_embedding_vector = TfidfEmbeddingVectorizer(w2v).fit(data['text'])

In [48]:
#Train TFIDF vector
# word level tf-idf
tfidf_vect_word = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect_word.fit(data['text'])
# ngram level tf-idf 
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram.fit(data['text'])

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=5000, min_df=1,
        ngram_range=(2, 3), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='\\w{1,}', tokenizer=None, use_idf=True,
        vocabulary=None)

In [49]:
def transforming(x):
    #Transform train_x into vectors
    train_x_word_embedding = word_embedding_vector.transform(x)
    train_x_tfidf_word = tfidf_vect_word.transform(x)
    train_x_tfidf_ngram = tfidf_vect_ngram.transform(x)
    #Stack all vectors together
    transformed_train_x = hstack([train_x_word_embedding,train_x_tfidf_word,train_x_tfidf_ngram])
    return transformed_train_x

In [51]:
transformed_train_x = transforming(train_x)

In [53]:
train_x.shape

(1557,)

In [54]:
transformed_train_x.shape

(1557, 10100)

In [59]:
#define algorithm
logistic = linear_model.LogisticRegression()

# Create regularization penalty space
penalty = ['l1', 'l2']

# Create regularization hyperparameter space
C = np.logspace(0, 4, 10)

# Create hyperparameter options
hyperparameters = dict(C=C, penalty=penalty)

# Create grid search using 5-fold cross validation
clf = GridSearchCV(logistic, hyperparameters, cv=5, verbose=0)
# Fit grid search
best_model = clf.fit(transformed_train_x, train_y)

# View best hyperparameters
print('Best Penalty:', best_model.best_estimator_.get_params()['penalty'])
print('Best C:', best_model.best_estimator_.get_params()['C'])

Best Penalty: l2
Best C: 464.158883361


In [61]:
# get transforming text data
transformed_test_x = transforming(valid_x)

In [62]:
predicted_valid_y = best_model.predict(transformed_test_x)
predicted_valid_y_proba = best_model.predict_proba(transformed_test_x)

In [63]:
predicted_valid_y_proba_max = []
for x in predicted_valid_y_proba:
    predicted_valid_y_proba_max.append(max(x))

In [64]:
# check accuracy
np.mean(predicted_valid_y == valid_y)

0.9760479041916168

In [65]:
pd.crosstab(valid_y, predicted_valid_y, rownames=['True'], colnames=['Predicted'], margins=True)

Predicted,business,entertainment,politics,sport,tech,All
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
business,154,0,1,1,2,158
entertainment,0,105,1,0,1,107
politics,3,1,122,1,0,127
sport,1,0,0,153,0,154
tech,0,1,2,1,118,122
All,158,107,126,156,121,668


In [66]:
#create a dataframe for prediction results
result_df = pd.DataFrame(np.column_stack([valid_x,valid_y,
                                         predicted_valid_y, predicted_valid_y_proba_max]),
                         columns = ['Text','Ture_category','PREDICTED_cetegory', '% PREDICTED_cetegory'],
                         index = valid_x.index)


In [67]:
result_df

Unnamed: 0,Text,Ture_category,PREDICTED_cetegory,% PREDICTED_cetegory
942,butler strike gold spain britain kathy butler ...,sport,sport,0.994596
1429,bt offer equal access rival bt moved pre empt ...,business,tech,0.504323
1771,apple laptop greatest gadget apple powerbook 1...,tech,tech,0.998535
701,peer peer net stay peer peer p2p network stay ...,tech,tech,0.997934
459,fuming robinson blast official england coach a...,sport,sport,0.99897
2009,tindall aiming earn lion spot bath england cen...,sport,sport,0.994988
1428,sfa awaits report mikoliunas scottish football...,sport,sport,0.995133
2138,libya take 1bn unfrozen fund libya withdrawn 1...,business,business,0.998218
191,ticking budget facing u budget proposal laid a...,business,business,0.988215
763,share rise new man utd offer share manchester ...,business,business,0.998686


In [68]:
result_df.shape

(668, 4)

In [70]:
error_prediction = result_df[result_df['Ture_category'] != result_df['PREDICTED_cetegory']]
error_prediction

Unnamed: 0,Text,Ture_category,PREDICTED_cetegory,% PREDICTED_cetegory
1429,bt offer equal access rival bt moved pre empt ...,business,tech,0.504323
547,ink help drive democracy asia kyrgyz republic ...,tech,politics,0.352899
384,drive save festive holiday effort made protect...,politics,entertainment,0.558888
1146,bbc lead interactive bafta win bbc national th...,tech,entertainment,0.822778
1565,ferdinand cast doubt glazer rio ferdinand said...,sport,business,0.84811
2072,hatfield executive go trial engineering firm b...,politics,sport,0.729885
2002,salary scandal cameroon cameroon say widesprea...,business,politics,0.804776
2183,piero give rugby perspective bbc sport unveils...,tech,sport,0.829921
1933,fear raised ballet future fewer child uk follo...,entertainment,tech,0.801555
445,hunt ban support decline support ban hunting f...,politics,business,0.766438


In [71]:
# look the prob in prediction
print(predicted_valid_y_proba)

[[  4.25872446e-03   8.53905841e-04   1.19873511e-04   9.94595521e-01
    1.71975518e-04]
 [  4.92008130e-01   6.25402830e-05   2.00253228e-03   1.60338793e-03
    5.04323409e-01]
 [  5.79066406e-05   9.24333679e-04   6.07266090e-05   4.21945065e-04
    9.98535088e-01]
 ..., 
 [  2.01765787e-03   3.58700805e-04   6.38350855e-04   9.90820817e-01
    6.16447366e-03]
 [  6.20598166e-03   9.93660957e-01   4.09462380e-06   2.44300509e-05
    1.04536673e-04]
 [  9.99452824e-01   1.59939155e-04   2.31572352e-04   1.14281151e-04
    4.13834430e-05]]
