# 2. Topic detection
## Get the BBC training data
## Train a model that scores 0.95+
## and save it as a pickle
## Save/show the learning curve

In [1]:
import os

# Working folder for this notebook: downloaded CSVs and the pickled model
# are stored here. Created on first run.
topic_classification_folder = '../topic_classification/'
if os.path.exists(topic_classification_folder) is False:
    os.makedirs(topic_classification_folder)

In [2]:
# Raw-content URLs of the BBC news CSVs (training and test splits).
# Both must point at raw.githubusercontent.com: a github.com/.../blob/... URL
# serves the HTML viewer page rather than the CSV file itself.
url_topic_train = 'https://raw.githubusercontent.com/01-edu/public/master/subjects/ai/nlp-scraper/bbc_news_train.csv'
url_topic_test = 'https://raw.githubusercontent.com/01-edu/public/master/subjects/ai/nlp-scraper/bbc_news_tests.csv'

In [3]:
def fetch_file_stream(folder, file_name, url):
    """Download `url` to `folder + file_name` unless that file already exists.

    The destination path is built by plain string concatenation, so `folder`
    must end with a path separator (e.g. '../data/').

    Parameters
    ----------
    folder : str
        Destination folder; created when missing.
    file_name : str
        Name of the file inside `folder`.
    url : str
        Address to download from.

    Raises
    ------
    requests.HTTPError
        When the server answers with an error status. Nothing is written to
        the destination in that case, so a later call will retry instead of
        treating an error page as a cached download.
    """
    import os
    import requests

    path_destination = folder + file_name
    if folder:
        os.makedirs(folder, exist_ok=True)
    if os.path.exists(path_destination):
        # Already downloaded: the existing file acts as the cache.
        return

    # Stream into a temporary sibling file and rename only on success, so an
    # interrupted download is never mistaken for a complete cached file.
    path_partial = path_destination + '.part'
    response = requests.get(url, stream=True, timeout=30)
    try:
        response.raise_for_status()
        with open(path_partial, 'wb') as file_destination:
            for chunk in response.iter_content(chunk_size=8192):
                file_destination.write(chunk)
    except BaseException:
        if os.path.exists(path_partial):
            os.remove(path_partial)
        raise
    finally:
        response.close()
    os.replace(path_partial, path_destination)

In [4]:
def get_topic_detection_train_and_test_data(topic_training_material_folder):
    """Fetch the BBC news train/test CSVs into a folder and load them.

    Each file is fetched through `fetch_file_stream` and then parsed with
    pandas using the 'pyarrow' engine. Paths are built as
    `topic_training_material_folder + filename`, so the folder string must
    end with a path separator.

    Returns
    -------
    tuple
        `(train_frame, test_frame)` as returned by `pandas.read_csv`.
    """
    import pandas

    # (local file name, remote URL) for the train and test splits, in order.
    sources = (
        (
            'topic_train.csv',
            'https://raw.githubusercontent.com/01-edu/public/master/subjects/ai/nlp-scraper/bbc_news_train.csv',
        ),
        (
            'topic_test.csv',
            'https://raw.githubusercontent.com/01-edu/public/master/subjects/ai/nlp-scraper/bbc_news_tests.csv',
        ),
    )

    # Fetch both files first, then parse them, in train -> test order.
    for local_name, remote_url in sources:
        fetch_file_stream(topic_training_material_folder, local_name, remote_url)

    def load_csv(local_name):
        return pandas.read_csv(
            topic_training_material_folder + local_name,
            sep=',',
            engine='pyarrow',
        )

    train_frame, test_frame = (load_csv(local_name) for local_name, _ in sources)
    return train_frame, test_frame

In [5]:
# Fetch the BBC train/test CSVs into the working folder and load both as DataFrames.
xss_bbc_train_raw, xss_bbc_test_raw = get_topic_detection_train_and_test_data(topic_classification_folder)

In [6]:
# Inspect the column names of the training frame.
xss_bbc_train_raw.columns

Index(['ArticleId', 'Text', 'Category'], dtype='object')

In [7]:
# Shape of the label column, i.e. the number of training rows.
xss_bbc_train_raw['Category'].shape

(1490,)

In [8]:
def get_spacy_stuff(stuff_name='en_core_web_sm'):
    """Load a spaCy pipeline, downloading its package first when needed.

    `stuff_name` is either a full package name or one of the shorthands
    'small', 'medium', 'large', 'transformer', which map to the matching
    `en_core_web_*` package. Any other value is used as the package name
    unchanged.

    The package is downloaded with `spacy.cli.download` when
    `spacy.util.is_package` reports it is not installed; the result of
    `spacy.load` is returned.
    """
    import spacy

    size_aliases = {
        'small': 'en_core_web_sm',
        'medium': 'en_core_web_md',
        'large': 'en_core_web_lg',
        'transformer': 'en_core_web_trf',
    }
    package_name = size_aliases.get(stuff_name, stuff_name)

    if not spacy.util.is_package(package_name):
        spacy.cli.download(package_name)
    return spacy.load(package_name)

## With spaCy and classy_classification

In [9]:
def format_data_for_label_training(xs_text, ys_label):
    """Pair every text with a one-hot `{"cats": {...}}` annotation.

    Parameters
    ----------
    xs_text : iterable of str
        The texts, in the same order as `ys_label`.
    ys_label : pandas.Series
        The label of each text (must provide `.unique()`).

    Returns
    -------
    list of tuple
        One `(text, {"cats": {label: bool, ...}})` entry per text, where the
        inner dict has one key per distinct label and is True only for the
        label of that text.
    """
    each_labels = ys_label.unique()
    # Texts and labels are paired positionally. Looking labels up with
    # `ys_label[i]` would be a label-based lookup and fail (or pick the wrong
    # row) whenever the Series index is not 0..n-1, e.g. after filtering or
    # a train/test split.
    return [
        (text, {"cats": {label: label == topic_of_the_text for label in each_labels}})
        for text, topic_of_the_text in zip(xs_text, ys_label)
    ]

In [10]:
def format_data_for_classy_training(xs_text, ys_label):
    """Group the texts by label.

    Returns a dict with one key per distinct value of `ys_label` (which must
    provide `.unique()`), each mapped to the list of texts carrying that
    label, in their original order. Texts and labels are paired positionally.
    """
    texts_by_label = {}
    for topic in ys_label.unique():
        texts_by_label[topic] = []

    for sample in zip(xs_text, ys_label):
        text, label = sample
        texts_by_label[label].append(text)

    return texts_by_label

In [11]:
def train_topic_classy_classifier_spacy(xs_text, ys_label, nlp=None):
    """Add a `classy_classification` text-categoriser to a spaCy pipeline.

    Parameters
    ----------
    xs_text : iterable of str
        Training texts.
    ys_label : pandas.Series
        Label of each text, in the same order as `xs_text`.
    nlp : spaCy pipeline, optional
        Pipeline to extend. When omitted, the 'medium' pipeline is loaded
        through `get_spacy_stuff`. Note that a pipeline passed in is modified
        in place (`add_pipe`) and is the same object that is returned.

    Returns
    -------
    The pipeline with the "classy_classification" component added.
    """
    # The package is imported only for its side effect -- presumably it
    # registers the "classy_classification" factory used by `add_pipe` below.
    try:
        import classy_classification  # noqa: F401
    except ImportError:
        # Install into the interpreter running this kernel. This is plain
        # Python, unlike the IPython-only `!pip ...` syntax, so the function
        # also works outside a notebook.
        import importlib
        import subprocess
        import sys
        subprocess.check_call(
            [sys.executable, '-m', 'pip', 'install', 'classy_classification']
        )
        importlib.invalidate_caches()
        import classy_classification  # noqa: F401

    if nlp is None:
        nlp = get_spacy_stuff('medium')
    classy_training_data = format_data_for_classy_training(xs_text, ys_label)
    nlp.add_pipe(
        "classy_classification",
        config={
            'data': classy_training_data,
            'model': 'spacy',
        },
    )
    return nlp

In [12]:
# Build the classifier pipeline from the BBC training texts and their categories
# (no `nlp` is passed, so the function loads the 'medium' spaCy pipeline itself).
classifier_model_classy = train_topic_classy_classifier_spacy(
    xss_bbc_train_raw['Text'],
    xss_bbc_train_raw['Category'],
)

2024-03-20 10:53:19.773486: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-03-20 10:53:21.299991: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  _torch_pytree._register_pytree_node(


In [13]:
# Smoke test: per-category scores for a hand-written, business-flavoured sentence.
classifier_model_classy(
    'let make company and sell stuff and buy other organisation and make big profit and fire people ?'
)._.cats

{'business': 0.9648643762488617,
 'entertainment': 0.005311315861420202,
 'politics': 0.010333569915212906,
 'sport': 0.0013157527986121147,
 'tech': 0.01817498517589292}

In [14]:
def most_probable_category(classy_nlp, text):
    """Return the category with the highest score for `text`.

    `classy_nlp` is called once on `text`; the result's `._.cats` mapping
    (category -> score) is scanned for its highest-scoring key.
    """
    # Run the pipeline a single time: calling it once for the keys and again
    # for the `key=` lookup doubles the inference cost of every prediction.
    category_scores = classy_nlp(text)._.cats
    return max(category_scores, key=category_scores.get)
def categories_from_texts(nlp_classy, xs_text):
    """Predict the most probable category of every text in `xs_text`.

    `xs_text` must provide `.apply` (e.g. a pandas Series); the result is
    whatever `.apply` returns for the per-text predictions made by
    `most_probable_category`.
    """
    def predict_category(text):
        return most_probable_category(nlp_classy, text)

    return xs_text.apply(predict_category)

In [15]:
from sklearn.metrics import accuracy_score

# Score the classifier on the held-out BBC test set: compare the true
# categories against the predicted ones.
ys_test_true = xss_bbc_test_raw['Category']
ys_test_predicted = categories_from_texts(
    classifier_model_classy,
    xss_bbc_test_raw['Text'],
)
print(
    "accuracy over the testing set",
    accuracy_score(ys_test_true, ys_test_predicted),
)

accuracy over the testing set 0.9619047619047619


In [16]:
def save_data_as_pickle(data, filepath, overwrite=False):
    """Pickle `data` to `filepath` with pypickle and return its status.

    Parameters
    ----------
    data : object
        The object to pickle.
    filepath : str
        Destination file.
    overwrite : bool, default False
        Forwarded to `pypickle.save`. With the default, an existing file is
        left untouched and the call returns False, so a freshly retrained
        model is NOT persisted over an older pickle. Pass True to replace it.

    Returns
    -------
    The status returned by `pypickle.save`.
    """
    import pypickle
    return pypickle.save(filepath, data, overwrite=overwrite)

In [17]:
# Persist the trained pipeline. NOTE: pypickle does not overwrite an existing
# file (it logs a message and returns False), so delete the old pickle first
# if a retrained model must be saved.
save_data_as_pickle(classifier_model_classy, topic_classification_folder+'topic_classifier_classy.pickle')

[pypickle] File already exists and is not overwritten: [../topic_classification/topic_classifier_classy.pickle]


False

# Apply to the 300-ish articles

In [18]:
# Folder the article CSVs are read from and written back to in this section.
articles_folder = '../articles/'

In [19]:
def get_all_articles():
    """Load the articles CSV from the module-level `articles_folder`.

    Reads 'three_hundredish_articles_1.ignore.csv' with its first column as
    the index and returns the resulting DataFrame.
    """
    import pandas

    csv_path = articles_folder + 'three_hundredish_articles_1.ignore.csv'
    return pandas.read_csv(csv_path, index_col=0)

In [20]:
# Load the articles. The CSV is not created by this notebook -- presumably an
# earlier scraping step produces it; this cell fails with FileNotFoundError
# when it is missing.
articles = get_all_articles()

FileNotFoundError: [Errno 2] No such file or directory: '../articles/three_hundredish_articles_1.ignore.csv'

In [None]:
# Add a 'topic' column holding the predicted category of each article body
# (modifies `articles` in place).
articles['topic'] = categories_from_texts(classifier_model_classy, articles['body'])

In [None]:
# Predicted topic of the first two articles.
articles.head(2)['topic']

In [None]:
# Headlines of the same two articles, to compare by eye with the predicted topics.
print(articles.iloc[0]['headline'])
print(articles.iloc[1]['headline'])

In [None]:
# Write the articles, now including the 'topic' column, to a new CSV.
articles.to_csv( articles_folder+'three_hundredish_articles_2.ignore.csv')