<a href="https://colab.research.google.com/github/claudiaxpreda/SRI-Project/blob/main/SRI_Project_Active.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


We have split the dataset previously into:
* labeled data - 1157 articles - these are going to be used as a seed for the active learning model
* unlabeled data - 3152 articles crawled from the internet - this is the part of the dataset that still needs to be labeled using an active learning model 

The active learning model helps us label fresh articles that were just crawled from news sites (cybersecurity-focused websites), so that we have a larger number of samples for the model we aim to train. The original dataset had a smaller number of articles, approximately 1000, to which we will add about 3000 new articles.

For testing we will use a subset of the original dataset, in order to have a clear statistic of how well our solution performs. We will compare the performance of training the models using only the original dataset with the performance of training them on the enhanced dataset.

Active learning:
*   We will use **Entropy Sampling** - entropy is the average level of “information”, “surprise”, or “uncertainty” inherent in a variable’s possible outcomes; the samples whose predicted class distribution has the highest entropy are queried first
*   We will use the modAL package








In [None]:
!pip install modAL



In [None]:
import nltk
# Corpus read by stopwords.words("english") in standard_cleaning.
nltk.download('stopwords')
# WordNet corpus; no code visible in this notebook reads it yet
# (presumably intended for a lemmatization step - TODO confirm or drop).
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
import pandas as pd
import json
import re

from urllib.parse import urlparse
import seaborn as sns

import numpy as np
from sklearn.model_selection import train_test_split

from modAL.models import ActiveLearner
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

from modAL.uncertainty import entropy_sampling

from nltk.corpus import stopwords

from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# CSV with the already-labeled seed articles; read_data_csv expects it to
# contain a 'Relevance' column.
PATH_LABELED_DATA = 'final_labeled_data.csv'
# CSV with the crawled articles that still need labels (per the introduction).
# NOTE(review): the identifier is misspelled ("UNALBELED"); it is kept as-is
# because other cells may reference it.
PATH_UNALBELED_DATA = 'new_unlabeled_data.csv'

# Flag value for the `dummy` argument of split_labeled_dataset / learner_input;
# learner_input one-hot encodes the labels when dummy == ONE_HOT_ENCODING.
ONE_HOT_ENCODING = 1

def read_data_csv(path):
  """Load an article CSV and, for the labeled file, normalise its labels.

  The first column of the CSV is always dropped (presumably a saved index
  column - TODO confirm). When ``path`` equals ``PATH_LABELED_DATA`` the
  'Relevance' column is additionally cleaned:

  * labels are upper-cased,
  * the misspelling 'IRELEVANT' is mapped to 'IRRELEVANT',
  * rows labeled 'SERVICE' are removed and the index is rebuilt.

  Args:
    path: Path of the CSV file to read.

  Returns:
    A new pandas DataFrame with the cleaned content.
  """
  data = pd.read_csv(path)
  data = data.drop(columns=data.columns[0])

  if path == PATH_LABELED_DATA:
    # The vectorised .str accessor keeps missing labels as NaN; calling
    # item.upper() on each cell raised AttributeError on a float NaN.
    relevance = data['Relevance'].str.upper()
    relevance = relevance.replace('IRELEVANT', 'IRRELEVANT')
    data = data.assign(Relevance=relevance)
    data = data[data['Relevance'] != 'SERVICE'].reset_index(drop=True)

  return data

def standard_cleaning(text):
    """Normalise raw article text before it is vectorised.

    Every non-letter character is replaced by a space, the text is
    lower-cased and split on whitespace, and English stop words are removed.

    Args:
        text: Raw text (str).

    Returns:
        The remaining words joined by single spaces.
    """
    # Digits and punctuation become spaces, e.g. "0-day!" -> "day".
    letters_only = re.sub("[^a-zA-Z]", " ", text)
    # Fix: tokenise the cleaned string. The raw text used to be tokenised
    # here, so the regex result above was computed and then thrown away.
    words = letters_only.lower().split()
    stops = set(stopwords.words("english"))
    return " ".join(w for w in words if w not in stops)

def prepare_dataset(data, path):
  """Add the cleaned 'Text' column and, for labeled data, the 'Category' target.

  'Text' is the title followed by the description, passed through
  ``standard_cleaning``. For the labeled file, 'Category' is 1 where
  'Relevance' equals 'RELEVANT' and 0 otherwise.

  Args:
    data: DataFrame with 'Title' and 'Description' columns (and 'Relevance'
      when ``path`` is the labeled file). It is modified in place.
    path: Path the frame was loaded from; compared with ``PATH_LABELED_DATA``.

  Returns:
    The same DataFrame, with the new column(s) added.
  """
  # Missing titles/descriptions become empty strings instead of turning the
  # whole concatenation into NaN (which astype(str) rendered as 'nan').
  title = data['Title'].fillna('').astype(str)
  description = data['Description'].fillna('').astype(str)
  # Join with a space so the last word of the title and the first word of the
  # description are not fused into a single token.
  data['Text'] = (title + ' ' + description).str.strip()
  data['Text'] = data['Text'].apply(standard_cleaning)
  if path == PATH_LABELED_DATA:
    data['Category'] = np.where(data['Relevance'] == 'RELEVANT', 1, 0)
  return data

def split_labeled_dataset(data, dummy):
  """Split the labeled articles into a 70/30 train/test partition.

  Args:
    data: DataFrame with 'Text' (features) and 'Category' (target) columns.
    dummy: Encoding flag. It has no effect on the split: both of the former
      branches performed exactly the same call. Kept for interface
      compatibility.

  Returns:
    ``(X_train, X_test, y_train, y_test)`` as produced by
    ``train_test_split`` with ``test_size=0.30`` and ``random_state=42``.
  """
  X_train, X_test, y_train, y_test = train_test_split(
    data['Text'], data['Category'], test_size=0.30, random_state=42)

  return X_train, X_test, y_train, y_test


def get_X(data):
  """Return the feature column: whatever ``data`` stores under 'Text'."""
  text_column = data['Text']
  return text_column

def get_Y(data):
  """Return the target column: whatever ``data`` stores under 'Category'."""
  category_column = data['Category']
  return category_column

def define_active_learner(cls,X_initial, y_initial ):
  """Create a modAL ``ActiveLearner`` that queries by entropy sampling.

  Args:
    cls: Estimator instance to wrap (e.g. an ``SVC`` or a random forest).
    X_initial: Feature matrix of the initial (seed) training samples.
    y_initial: Labels of the seed samples.

  Returns:
    The ``ActiveLearner`` built from the estimator and the seed samples.
  """
  # Fix: the strategy used to be `preset_batch`, a name that is not defined
  # anywhere visible in this notebook (NameError on first call).
  # `entropy_sampling` is imported at the top and is the strategy named in
  # the introduction.
  learner = ActiveLearner(
    estimator=cls,
    query_strategy=entropy_sampling,
    X_training=X_initial, y_training=y_initial
  )
  return learner

def learner_input(path, dummy, n_initial=150):
  """Build the seed set, the query pool and the test set for active learning.

  The labeled file is loaded, cleaned and split into train/test. The texts are
  turned into TF-IDF features, and a random subset of the training rows
  becomes the initial seed while the remaining training rows form the pool.

  Args:
    path: Currently unused - ``PATH_LABELED_DATA`` is always loaded. Kept for
      interface compatibility.
    dummy: When equal to ``ONE_HOT_ENCODING`` the labels are one-hot encoded;
      otherwise they are returned as a flat array.
    n_initial: Number of training rows drawn (without replacement) as the
      seed. Clamped to the number of training rows.

  Returns:
    ``(X_initial, X_pool, X_test, y_initial, y_pool, y_test)`` as arrays.
  """
  labeled_data = read_data_csv(PATH_LABELED_DATA)
  labeled_data = prepare_dataset(labeled_data, PATH_LABELED_DATA)
  X_train, X_test, y_train, y_test = split_labeled_dataset(labeled_data, dummy)

  # Fix: learn the vocabulary and IDF weights from the training texts only and
  # reuse them for the test texts. Re-fitting on the test split produced a
  # different vocabulary, so train and test columns did not line up.
  vectorizer = TfidfVectorizer(lowercase=False, max_features=1500)
  X_train = vectorizer.fit_transform(X_train).toarray()
  X_test = vectorizer.transform(X_test).toarray()

  if dummy == ONE_HOT_ENCODING:
    ohe = OneHotEncoder(handle_unknown='ignore')
    y_train = ohe.fit_transform(y_train.values.reshape(-1, 1)).toarray()
    # Same reasoning as above: encode the test labels with the fitted encoder.
    y_test = ohe.transform(y_test.values.reshape(-1, 1)).toarray()
  else:
    y_train = y_train.values.ravel()
    y_test = y_test.values.ravel()

  n_initial = min(n_initial, len(X_train))
  initial_idx = np.random.choice(range(len(X_train)), size=n_initial, replace=False)

  X_initial = X_train[initial_idx]
  y_initial = y_train[initial_idx]

  # Everything that was not drawn for the seed is available for querying.
  y_pool = np.delete(y_train, initial_idx, axis=0)
  X_pool = np.delete(X_train, initial_idx, axis=0)

  return X_initial, X_pool, X_test, y_initial, y_pool, y_test

def query_learner(learner, X_pool, y_pool, n_queries=10):
  """Run up to ``n_queries`` rounds of query-and-teach against a labeled pool.

  In every round the learner picks instance(s) from the pool, is taught their
  labels, and those instances are removed from the local pool so they cannot
  be selected again. The arrays passed in by the caller are not modified.

  Args:
    learner: Object exposing ``query(X)`` -> ``(indices, instances)`` and
      ``teach(X, y)`` (a modAL ``ActiveLearner``).
    X_pool: Array of candidate feature rows.
    y_pool: Array with the label of every pool row.
    n_queries: Maximum number of query rounds.

  Returns:
    None. The learner is updated in place.
  """
  for _ in range(n_queries):
    if len(X_pool) == 0:
      # Nothing left to ask about.
      break
    query_index, _ = learner.query(X_pool)
    learner.teach(X_pool[query_index], y_pool[query_index])
    # Fix: drop the taught rows; previously the pool never shrank, so the same
    # instance could be queried and taught over and over.
    X_pool = np.delete(X_pool, query_index, axis=0)
    y_pool = np.delete(y_pool, query_index, axis=0)


def get_class_weights(yp):
    """Compute 'balanced' class weights for the labels in ``yp``.

    Args:
        yp: Array-like of class labels.

    Returns:
        Dict mapping every distinct label in ``yp`` to its balanced weight as
        computed by scikit-learn's ``compute_class_weight``.
    """
    from sklearn.utils import class_weight

    classes = np.unique(yp)
    weights = class_weight.compute_class_weight(class_weight="balanced",
                                                classes=classes,
                                                y=yp)
    # Fix: key the result by the actual class labels. The dict used to be
    # keyed by position (0..k-1), which is only right when the labels happen
    # to be exactly 0..k-1.
    return dict(zip(classes.tolist(), weights.tolist()))


In [None]:
# clsf = SVC(probability=True)
# X_initial, X_pool, X_test, y_initial, y_pool, y_test = learner_input(PATH_LABELED_DATA, 0)
# learner = define_active_learner(clsf, X_initial, y_initial)
# print(learner.score(X_test, y_test))

In [None]:
  # Rebuild the train/test matrices at notebook scope for the stream-based
  # experiment in the following cells.
  labeled_data = read_data_csv(PATH_LABELED_DATA)
  labeled_data = prepare_dataset(labeled_data, PATH_LABELED_DATA)
  X_train, X_test, y_train, y_test = split_labeled_dataset(labeled_data, ONE_HOT_ENCODING)

  # Fix: fit the encoder and the vectorizer on the training split only and
  # merely transform the test split. Re-fitting on the test data gave the test
  # matrix its own vocabulary, so its columns did not match the training ones.
  ohe = OneHotEncoder(handle_unknown='ignore')
  y_train = ohe.fit_transform(y_train.values.reshape(-1,1)).toarray()
  y_test = ohe.transform(y_test.values.reshape(-1,1)).toarray()

  vectorizer = TfidfVectorizer(lowercase=False, max_features=1500)
  X_train = vectorizer.fit_transform(X_train).toarray()
  X_test = vectorizer.transform(X_test).toarray()

In [None]:
from sklearn.ensemble import RandomForestClassifier
from modAL.models import ActiveLearner

# assembling initial training set
# A dedicated, seeded generator makes the seed sample - and therefore the
# reported initial accuracy - reproducible across "Restart & Run All".
SEED = 42  # same value as the random_state used for the train/test split
n_initial = 10
rng = np.random.RandomState(SEED)
initial_idx = rng.choice(len(X_train), size=n_initial, replace=False)
X_t, y_t = X_train[initial_idx], y_train[initial_idx]

# initialize the learner (the forest is seeded for the same reason)
learner = ActiveLearner(
    estimator=RandomForestClassifier(random_state=SEED),
    X_training=X_t, y_training=y_t
)
# Accuracy on the test split before any additional sample has been taught.
unqueried_score = learner.score(X_test, y_test)

print('Initial prediction accuracy: %f' % unqueried_score)

Initial prediction accuracy: 0.602941


In [None]:
from modAL.uncertainty import classifier_uncertainty

performance_history = [unqueried_score]

UNCERTAINTY_THRESHOLD = 0.4    # minimum uncertainty for a sample to be taught
TARGET_TRAIN_ACCURACY = 0.88   # stop once the training-set accuracy reaches this

# Stream-based sampling: draw random training articles and teach the learner
# only those it is uncertain about, until the target accuracy is reached.
# NOTE(review): there is no cap on the number of draws; if no sample ever
# passes the threshold the loop will not terminate.
new_score = learner.score(X_train, y_train)
while new_score < TARGET_TRAIN_ACCURACY:
    stream_idx = np.random.choice(range(len(X_train)))
    sample = X_train[stream_idx].reshape(1, -1)
    uncertainty = classifier_uncertainty(learner, sample)
    # Fix: compare with the threshold first, then reduce. The previous
    # `uncertainty.all() >= 0.4` compared a boolean with 0.4, so every sample
    # with non-zero uncertainty was taught and the threshold had no effect.
    # np.all is used because the result may hold more than one value for the
    # one-hot labels - TODO confirm the intended reduction.
    if np.all(uncertainty >= UNCERTAINTY_THRESHOLD):
        learner.teach(sample, y_train[stream_idx].reshape(1, -1))
        # The score only changes after teaching, so it is recomputed here
        # rather than on every draw.
        new_score = learner.score(X_train, y_train)
        performance_history.append(new_score)
        print('Article no. %d queried, new accuracy: %f' % (stream_idx, new_score))

In [None]:
# Accuracy of the learner on the test split at this point of the notebook.
# NOTE(review): this reuses the name `unqueried_score`, overwriting the
# pre-query score that was stored under it earlier.
unqueried_score = learner.score(X_test, y_test)


In [None]:
# Show the value currently bound to `unqueried_score`.
print(unqueried_score)