# Getting the dataset and evaluation baseline ready

In [1]:
import sys
sys.path.append('..')

In [2]:
import re
import csv
import nltk
import math
import string
import numpy as np
from nltk.probability import FreqDist
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.dummy import DummyClassifier
from sklearn import preprocessing
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
from Chapter01.tokenization import tokenize_nltk

In [3]:
stemmer = SnowballStemmer('english')
bbc_dataset = 'bbc-text.csv'

In [4]:
def tokenize_and_stem(sentence):
    tokens = nltk.word_tokenize(sentence)
    filtered_token = [t for t in tokens if t not in string.punctuation]
    stems = [stemmer.stem(t) for t in filtered_token]
    return stems

In [6]:
stop_words = stopwords.words('english')

def get_stopwrods(stop_words):
    stemmed_stopwords = [stemmer.stem(word) for word in stop_words]
    stop_words = stop_words + stemmed_stopwords
    return stop_words

In [7]:
stop_words = get_stopwrods(stop_words)

In [8]:
def read_in_csv(csv_file):
    with open(csv_file, 'r', encoding='utf-8') as fp:
        reader = csv.reader(fp, delimiter=',', 
                            quotechar='"')
        data_read = [row for row in reader]
        return data_read

In [9]:
def get_data(filename):
    data = read_in_csv(filename)
    data_dict = {}
    for row in data[1:]:
        category = row[0]
        text = row[1]
        if category not in data_dict.keys():
            data_dict[category] = []
        data_dict[category].append(text)
    return data_dict

In [10]:
def get_stats(text, num_words=200):
    word_list = tokenize_nltk(text)
    word_list = [word for word in word_list if word not in stop_words and 
                re.search("[A-Za-z]", word)]
    freq_list = FreqDist(word_list)
    print(freq_list.most_common(num_words))
    return freq_list

In [11]:
data_dict = get_data('bbc-text.csv')

In [12]:
for topic in data_dict.keys():
    print(topic, ':', len(data_dict[topic]))

tech : 401
business : 510
sport : 511
entertainment : 386
politics : 417


In [13]:
business_data = data_dict["business"]
sports_data = data_dict["sport"]

In [14]:
business_string = ' '.join(business_data)
sports_string = ' '.join(sports_data)

In [15]:
get_stats(business_string)
get_stats(sports_string)

[('said', 1680), ('us', 813), ('year', 637), ('mr', 600), ('would', 463), ('also', 440), ('market', 425), ('new', 416), ('company', 415), ('growth', 384), ('last', 365), ('firm', 362), ('economy', 359), ('government', 340), ('bank', 335), ('sales', 316), ('could', 311), ('economic', 310), ('oil', 294), ('shares', 265), ('however', 256), ('world', 252), ('may', 251), ('years', 247), ('prices', 246), ('one', 243), ('chief', 236), ('two', 231), ('china', 223), ('business', 218), ('companies', 212), ('analysts', 209), ('uk', 207), ('deal', 206), ('rise', 203), ('expected', 200), ('group', 199), ('financial', 197), ('yukos', 196), ('firms', 193), ('since', 183), ('dollar', 180), ('december', 173), ('country', 173), ('months', 170), ('people', 170), ('stock', 168), ('first', 165), ('president', 165), ('three', 164), ('still', 164), ('many', 163), ('time', 159), ('european', 159), ('rate', 159), ('state', 158), ('trade', 158), ('told', 155), ('investment', 153), ('demand', 151), ('interest', 

FreqDist({'said': 941, 'game': 476, 'england': 459, 'first': 437, 'win': 415, 'would': 396, 'world': 379, 'last': 376, 'one': 355, 'two': 351, ...})

In [16]:
def create_vectorizer(text_list):
    tfidf_vectorizer = TfidfVectorizer(max_df=0.9, max_features=20000,
                                      min_df=0.05, stop_words='english',
                                      use_idf=True,tokenizer=tokenize_and_stem,
                                      ngram_range=(1, 3))
    tfidf_vectorizer.fit_transform(text_list)
    return tfidf_vectorizer

In [17]:
def split_train_test(data, train_percentage):
    train_test_boarder = math.ceil(train_percentage * len(data))
    train_data = data[0:train_test_boarder]
    test_data = data[train_test_boarder:]
    return train_data, test_data

In [18]:
def get_labels(names):
    le = preprocessing.LabelEncoder()
    le.fit(names)
    return le
    

In [19]:
business_train_data, business_test_data = split_train_test(business_data, 0.8)
sports_train_data, sports_test_data = split_train_test(sports_data, 0.8)

In [20]:
train_data = business_train_data + sports_train_data
tfidf_vec = create_vectorizer(train_data)



In [21]:
le = get_labels(['business', 'sports'])

In [22]:
def create_data_matrix(input_data, vectorizer, label, le):
    vectors = vectorizer.transform(input_data).todense()
    labels = [label] * len(input_data)
    enc_labels = le.transform(labels)
    return vectors, enc_labels

In [23]:
def create_dataset(vectorizer, data_dict, le):
    business_news = data_dict['business']
    sports_news = data_dict['sports']
    
    business_vector, business_label = create_data_matrix(business_news, vectorizer,
                                                        'business', le)
    sports_vector, sports_label = create_data_matrix(sports_news, vectorizer,
                                                        'sports', le)
    
    all_data_matrix = np.vstack((business_vector, sports_vector))
    labels = np.concatenate([business_label, sports_label])
    return all_data_matrix, labels
    

In [24]:
train_data_dict = {'business':business_train_data, 
                   'sports':sports_train_data}
test_data_dict = {'business':business_test_data, 
                  'sports':sports_test_data}

In [25]:
(X_train, y_train) = create_dataset(tfidf_vec, train_data_dict, le)
(X_test, y_test) = create_dataset(tfidf_vec, test_data_dict, le)

In [26]:
def predict_trivial(X_train, y_train, X_test, y_test, le):
    dummy_clf = DummyClassifier(strategy='uniform', random_state=0)
    dummy_clf.fit(X_train, y_train)
    y_pred = dummy_clf.predict(X_test)
    print(dummy_clf.score(X_test, y_test))
    print(classification_report(y_test, y_pred, 
                                labels=le.transform(le.classes_),
                               target_names=le.classes_))

In [27]:
predict_trivial(X_train, y_train, X_test, y_test, le)

0.44607843137254904
              precision    recall  f1-score   support

    business       0.45      0.44      0.44       102
      sports       0.45      0.45      0.45       102

    accuracy                           0.45       204
   macro avg       0.45      0.45      0.45       204
weighted avg       0.45      0.45      0.45       204



# Performing rule-based text classification using keywords

In [40]:
import sys
sys.path.append('..')

In [41]:
import numpy as np
import string
from sklearn import preprocessing
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer
from itertools import repeat
from nltk.probability import FreqDist
from Chapter01.tokenization import tokenize_nltk
from Chapter04.preprocess_bbc_dataset import get_data
from Chapter04.preprocess_bbc_dataset import get_stopwords

In [42]:
business_vocabulary = ["market", "company", "growth", "firm", "economy", "government", "bank", "sales", "oil", "prices", "business", "uk", "financial", "dollar", "stock","trade", "investment", "quarter", "profit", "jobs", "foreign", "tax","euro", "budget", "cost", "money", "investor", "industry", "million", "debt"]
sports_vocabulary = ["game", "england", "win", "player", "cup", "team", "club", "match","set", "final", "coach", "season", "injury", "victory", "league", "play","champion", "olympic", "title", "ball", "sport", "race", "football", "rugby","tennis", "basketball", "hockey"]

In [43]:
business_vectorizer = CountVectorizer(vocabulary=business_vocabulary)
sports_vectorizer = CountVectorizer(vocabulary=sports_vocabulary)

In [44]:
bbc_dataset = './bbc-text.csv'
stopwords_list = get_stopwords()

In [45]:
def get_labels(labels):
    le = preprocessing.LabelEncoder()
    le.fit(labels)
    return le

le = get_labels(['business', 'sport'])

In [46]:
def create_dataset(data_dict, le):
    data_matrix = []
    classifications = []
    gold_labels = []
    for text in data_dict["business"]:
        gold_labels.append(le.transform(["business"]))
        text_vector = transform(text)
        data_matrix.append(text_vector)
    for text in data_dict["sport"]:
        gold_labels.append(le.transform(["sport"]))
        text_vector = transform(text)
        data_matrix.append(text_vector)
    X = np.array(data_matrix)
    y = np.array(gold_labels)
    return (X, y)

In [47]:
def transform(text):
    business_X = business_vectorizer.transform([text])
    sports_X = sports_vectorizer.transform([text])
    business_sum = sum(business_X.todense().tolist()[0])
    sports_sum = sum(sports_X.todense().tolist()[0])
    return np.array([business_sum, sports_sum])

In [48]:
def classify(vector, le):
    lebel = ''
    if vector[0] > vector[1]:
        label = 'business'
    else:
        label = 'sport'
    return le.transform([label])

In [49]:
def evaluate(X, y):
    y_pred = np.array(list(map(classify, X, repeat(le))))
    print(classification_report(y, y_pred, labels=le.transform(le.classes_),
                               target_names=le.classes_))

In [50]:
data_dict = get_data(bbc_dataset)
(X, y) = create_dataset(data_dict, le)
evaluate(X, y)

              precision    recall  f1-score   support

    business       1.00      0.98      0.99       510
       sport       0.98      1.00      0.99       511

    accuracy                           0.99      1021
   macro avg       0.99      0.99      0.99      1021
weighted avg       0.99      0.99      0.99      1021



### Automated process for vacubulary selection

In [51]:
data_dict = get_data(bbc_dataset)

In [56]:
from sklearn.model_selection import train_test_split
def divide_data(data_dict):
    train_dict = {}
    test_dict = {}
    for topic in data_dict.keys():
        text_list = data_dict[topic]
        x_train, x_test = train_test_split(text_list, test_size=0.2)
        train_dict[topic] = x_train
        test_dict[topic] = x_test
    return train_dict, test_dict

In [57]:
train_dict, test_dict = divide_data(data_dict)

In [58]:
le = get_labels(list(data_dict.keys()))

In [92]:
def create_vectorizers(data_dict):
    topic_list = list(data_dict.keys())
    vectorizer_dict = {}
    for topic in topic_list:
        text_array = data_dict[topic]
        text = " ".join(text_array)
        word_list = tokenize_nltk(text)
        word_list = [word for word in word_list if 
                     word not in stop_words]
        freq_dist = FreqDist(word_list)
        top_200 = freq_dist.most_common(200)
        vocab = [wtuple[0] for wtuple in top_200 if 
                 wtuple[0] not in stop_words and 
                 wtuple[0] not in string.punctuation]
        vectorizer_dict[topic] = CountVectorizer(vocabulary=vocab)
    return vectorizer_dict

In [93]:
def transform_auto(text, vect_dict, le):
    number_topics = len(list(vect_dict.keys()))
    sum_list = [0]*number_topics
    for topic in vect_dict.keys():
        vectorizer = vect_dict[topic]
        this_topic_matrix = vectorizer.transform([text])
        this_topic_sum = sum(this_topic_matrix.todense().tolist()[0])
        index = le.transform([topic])[0]
        sum_list[index] = this_topic_sum
    return np.array(sum_list)

In [94]:
def create_dataset_auto(data_dict, le, vectorizer_dict):
    data_matrix = []
    classifications = []
    gold_labels = []
    for topic in data_dict.keys():
        for text in data_dict[topic]:
            gold_labels.append(le.transform([topic]))
            text_vector = transform_auto(text, vectorizer_dict, le)
            data_matrix.append(text_vector)
    X = np.array(data_matrix)
    y = np.array(gold_labels)
    return (X, y)

In [95]:
def classify_auto(vector, le):
    result = np.where(vector == np.amax(vector))
    label = result[0][0]
    return [label]

In [96]:
def evaluate_auto(X, y, le):
    y_pred = np.array(list(map(classify_auto, X, repeat(le))))
    print(classification_report(y, y_pred, 
          labels=le.transform(le.classes_), 
          target_names=le.classes_))

In [97]:
vectorizers = create_vectorizers(train_dict)
X, y = create_dataset_auto(test_dict, le, vectorizers)
evaluate_auto(X, y, le)

               precision    recall  f1-score   support

     business       0.93      0.91      0.92       102
entertainment       0.96      0.95      0.95        78
     politics       0.87      0.98      0.92        84
        sport       0.97      0.97      0.97       103
         tech       0.96      0.88      0.92        81

     accuracy                           0.94       448
    macro avg       0.94      0.94      0.94       448
 weighted avg       0.94      0.94      0.94       448



# Clustering sentences using K-means – unsupervised text classification

In [99]:
import nltk
import re
import string
import pandas as pd
from sklearn.cluster import KMeans
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.probability import FreqDist
from Chapter01.tokenization import tokenize_nltk
from Chapter01.dividing_into_sentences import divide_into_sentences_nltk
from Chapter04.preprocess_bbc_dataset import get_data
from Chapter04.keyword_classification import divide_data
from Chapter04.preprocess_bbc_dataset import get_stopwords

In [100]:
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

bbc_dataset = "./bbc-text.csv"
stop_words = stopwords.words('english')
stop_words = get_stopwords(stop_words)
stemmer = SnowballStemmer('english')

In [101]:
data_dict = get_data(bbc_dataset)
train_dict, test_dict = divide_data(data_dict)

In [103]:
all_training = []
all_test = []
for topic in train_dict.keys():
    all_training = all_training + train_dict[topic]
for topic in test_dict.keys():
    all_test = all_test + test_dict[topic]

In [104]:
def tokenize_and_stem(sentence):
    tokens = nltk.word_tokenize(sentence)
    filtered_tokens = [t for t in tokens if t not in 
                       stop_words and t not in 
                       string.punctuation and 
                       re.search('[a-zA-Z]', t)]
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems

In [109]:
def create_vectorizer(data):
    vec = TfidfVectorizer(max_df=0.90, max_features=200000,
                    min_df=0.05, stop_words=stop_words,
                    use_idf=True,
                    tokenizer=tokenize_and_stem, 
                    ngram_range=(1,3))
    vec.fit(data)
    return vec

In [110]:
vectorizer = create_vectorizer(all_training)
matrix = vectorizer.transform(all_training)



In [111]:
km = KMeans(n_clusters=5, init='k-means++', random_state=0)
km.fit(matrix)

KMeans(n_clusters=5, random_state=0)

In [112]:
def make_predictions(test_data, vectorizer, km):
    predicted_data = {}
    for topic in test_data.keys():
        this_topic_list = test_data[topic]
        
        if topic not in predicted_data.keys():
            predicted_data[topic] = {}
        
        for text in this_topic_list:
            prediction = km.predict(vectorizer.transform([text]))[0]
            if (prediction not in predicted_data[topic].keys()):
                predicted_data[topic][prediction] = []
            predicted_data[topic][prediction].append(text)
    return predicted_data

In [114]:
def print_report(predicted_data):
    for topic in predicted_data.keys():
        print(topic)
        for prediction in predicted_data[topic].keys():
            print("Cluster number: ", prediction, 
                  "number of items: ", 
                  len(predicted_data[topic][prediction]))

In [116]:
predicted_data = make_predictions(test_dict, vectorizer, km)
print_report(predicted_data)

tech
Cluster number:  1 number of items:  70
Cluster number:  4 number of items:  5
Cluster number:  2 number of items:  3
Cluster number:  0 number of items:  3
business
Cluster number:  2 number of items:  101
Cluster number:  3 number of items:  1
sport
Cluster number:  4 number of items:  99
Cluster number:  0 number of items:  1
Cluster number:  2 number of items:  3
entertainment
Cluster number:  0 number of items:  51
Cluster number:  2 number of items:  23
Cluster number:  1 number of items:  3
Cluster number:  4 number of items:  1
politics
Cluster number:  3 number of items:  58
Cluster number:  2 number of items:  25
Cluster number:  4 number of items:  1


In [117]:
def print_most_common_words_by_cluster(all_training, km, num_clusters):
    clusters = km.labels_.tolist()
    docs = {'text': all_training, 'cluster': clusters}
    frame = pd.DataFrame(docs, index = [clusters])
    for cluster in range(0, num_clusters):
        this_cluster_text = frame[frame['cluster'] == cluster]
        all_text = " ".join(this_cluster_text['text'].astype(str))
        top_200 = get_most_frequent_words(all_text)
        print(cluster)
        print(top_200)
    return frame

In [122]:
def get_most_frequent_words(text):
    word_list = tokenize_nltk(text)
    word_list = [word for word in word_list if word not in stop_words and word not in string.punctuation and re.search('[a-zA-Z]', word)]
    freq_dist = FreqDist(word_list)
    top_200 = freq_dist.most_common(200)
    top_200 = [word[0] for word in top_200]
    return top_200

In [123]:
print_most_common_words_by_cluster(all_training, km, 5)

0
['film', 'best', 'said', 'also', 'year', 'one', 'awards', 'us', 'music', 'new', 'show', 'award', 'director', 'last', 'films', 'uk', 'first', 'number', 'years', 'top', 'actor', 'band', 'british', 'star', 'song', 'two', 'tv', 'album', 'three', 'actress', 'including', 'festival', 'bbc', 'people', 'time', 'would', 'prize', 'stars', 'made', 'movie', 'world', 'oscar', 'aviator', 'mr', 'comedy', 'like', 'rock', 'million', 'musical', 'well', 'nominations', 'win', 'record', 'series', 'role', 'singer', 'hit', 'life', 'ceremony', 'hollywood', 'make', 'week', 'took', 'theatre', 'office', 'oscars', 'five', 'box', 'place', 'named', 'love', 'london', 'book', 'radio', 'starring', 'children', 'set', 'four', 'academy', 'chart', 'play', 'nominated', 'success', 'pop', 'could', 'category', 'second', 'include', 'man', 'take', 'list', 'drama', 'night', 'original', 'day', 'good', 'ray', 'charles', 'think', 'single', 'baby', 'performance', 'third', 'told', 'industry', 'next', 'many', 'songs', 'foxx', 'see', 

Unnamed: 0,text,cluster
1,mobiles not media players yet mobiles are no...,1
2,apple attacked over sources row civil libertie...,2
2,junk e-mails on relentless rise spam traffic i...,2
1,broadband set to revolutionise tv bt is starti...,1
1,disney backs sony dvd technology a next genera...,1
...,...,...
3,tory candidate quits over remark a conservativ...,3
3,tory leader cleared over work scottish conse...,3
3,observers to monitor uk election ministers wil...,3
3,what really divides the parties so what is the...,3


In [124]:
import pickle
pickle.dump(km, open('bbc_kmeans.pkl', 'wb'))

In [125]:
km = pickle.load(open("bbc_kmeans.pkl", 'rb'))

# Using SVMs for supervised text classification

In [126]:
a = [3, 6, -2, -5, 7, 3]

In [127]:
a.sort(reverse=True)

In [148]:
def almostIncreasingSequence(sequence):
    initial = sequence[0]
    block = 0
    for i in range(1,len(sequence)):
        print(initial, sequence[i])
        if initial < sequence[i]:
            initial = sequence[i]
        elif i==1:
            initial = sequence[i]
            block += 1
        else: 
            block += 1
    print(block)
    return True if block <=1 else False    
almostIncreasingSequence([1, 2, 5, 3, 5])

1 2
2 5
5 3
5 5
2


False

In [155]:
a = [1, 2, 5, 3, 5]

In [152]:
a

[2, 5, 3, 5]

In [161]:
a[:2-1]+a[2:]

[1, 5, 3, 5]

In [211]:
def almostIncreasingSequence(sequence):
    #status = False
    for i in range(1,len(sequence)+1):

        temp = sequence[:]
        temp.pop(i-1)
        temp.sort()
        print(i)
        print(temp, sequence[:i-1]+sequence[i:])
        
        if temp == sequence[:i-1]+sequence[i:]:
            return True
    return False

almostIncreasingSequence([1, 2, 1, 2])

1
[1, 2, 2] [2, 1, 2]
2
[1, 1, 2] [1, 1, 2]


True

In [243]:
[a - b for a,b in zip([1, 2, 1],[2, 1, 2])]

[-1, 1, -1]

In [None]:
[1, 3, 2]

In [264]:
sequence[1:], sequence[:-1]

([2, 2], [1, 2])

In [265]:
def almostIncreasingSequence(sequence):
    new_s = [ a - b > 0 for a,b in zip(sequence[1:], sequence[:-1])]
    print([ a - b for a,b in zip(sequence[1:], sequence[:-1])])
    print(new_s)
    return  False if new_s.count(False) > 1 else True

In [309]:
def almostIncreasingSequence(sequence):
    start = 0
    end = 1
    status = 0
    block_index = []
    while end <= len(sequence)-1:
        
        print(sequence[start] , sequence[end])
        if sequence[start] >= sequence[end] and status <= 1 :
            status += 1
            block_index.append(start)
            if start !=0:
                start -= 1
            else:
                start += 1
                end += 1
            
            
                   
        else:
            
            start += 1
            if start in block_index:
                start +=1
            end += 1
    print(status)
    return True if status <= 1 else False

In [310]:
sequence = [1, 2, 3, 4, 3, 6]
almostIncreasingSequence(sequence)

1 2
2 3
3 4
4 3
3 3
2 3
4 6
2


False

In [291]:
def almostIncreasingSequence(sequence):
    temp = sequence[0]
    status = 0
    for i in range(1,len(sequence)):
        print(temp , sequence[i])
        if temp >= sequence[i] and status <= 1 :
            status += 1
            if i ==1:
                temp = sequence[1]
            else:
                temp = sequence[i-2]
                
        else:
            temp = sequence[i]
    print(status)
    return True if status <= 1 else False

In [292]:
sequence = [1, 2, 5, 3, 5]
almostIncreasingSequence(sequence)

1 2
2 5
5 3
2 5
1


True

In [273]:
sequence.count(5)

0

In [205]:

i = 1
sequence[:i-1] + sequence[i:]

[3, 2]

In [183]:
sequence = [1, 3, 2, 4]
print(sequence[:2-1])
print(sequence[2:])

[1]
[2, 4]


In [None]:
def almostIncreasingSequence(sequence):
    initial = sequence[0]
    block = 0
    for i in range(1,len(sequence)):
        print(initial, sequence[i])
        if initial < sequence[i]:
            initial = sequence[i]
        elif i==1:
            initial = sequence[i]
            block += 1
        else: 
            block += 1
    print(block)
    return True if block <=1 else False    
almostIncreasingSequence([1, 2, 5, 3, 5])

In [237]:
def almostIncreasingSequence(sequence):
    
    for i in range(len(sequence)):
        status = True
        #print(i)
        temp = sequence[:]
        temp.pop(i)
        
        item = temp[0]
        print(temp)
        print('***')
        for j in temp[1:]:
            print(j, item)
            if j > item:
                item = j
                continue
            else:
                status = False
                item = j
        if status:
            return True
            
        
    
    return False
            
            
almostIncreasingSequence([1, 2, 1, 2])

[2, 1, 2]
***
1 2
2 1
[1, 1, 2]
***
1 1
2 1
[1, 2, 2]
***
2 1
2 2
[1, 2, 1]
***
2 1
1 2


False