# 3. To increase data, we will use active learning

In [28]:
import pandas as pd
import numpy as np
import re
import time

import bs4 as bs4
import json

import glob
import tqdm

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.model_selection import train_test_split

from scipy.sparse import hstack, vstack

In [2]:
df = pd.read_csv("raw_data_with_labels.csv", index_col=0)
df = df[df['y'].notnull()]
df.shape

(522, 17)

In [4]:
df_clean = pd.DataFrame(index=df.index)
df_clean['title'] = df['watch-title']

## Limpeza de dados e treinamento de ML para o active learning

### Pre processamento da data

In [45]:
r_today = pd.to_datetime('today')
today = r_today.strftime("%Y-%m-%d")

In [6]:
today_post = r_today.strftime("Publicado em %d de %b de %Y")
df = df.replace({'watch-time-text': r"\bhá \b\d+\b hor.+\b"}, {'watch-time-text': today_post}, regex=True)
clean_date = df['watch-time-text'].str.extract(r"(\d+) de ([a-zA-Z]+)\.? de (\d+)")
clean_date[0] = clean_date[0].map(lambda x: "0"+x[0] if len(x) == 1 else x)

month_map = {"jan": "Jan", "fev": "Feb", "mar": "Mar", "abr": "Apr", "mai": "May", "jun": "Jun",
             "jul": "Jul", "ago": "Aug", "set": "Sep", "out": "Oct", "nov": "Nov", "dez": "Dec"}

clean_date[1] = clean_date[1].map(month_map).fillna(r_today.strftime("%b"))
    
clean_date = clean_date.apply(lambda x: ' '.join(x), axis=1)
df_clean['date'] = pd.to_datetime(clean_date, format="%d %b %Y")

### Pre processamento das views

In [7]:
views = df['watch-view-count'].str.extract(r"(\d+\.?\d*)", expand=False).str.replace(".", "").fillna(0).astype(int)
df_clean['views'] = views
df_clean.head()

Unnamed: 0,title,date,views
0,#DataScience #Pandas #python Python Pandas Tut...,2020-04-11,62
1,Machine Learning Course A To Z || Beginner to ...,2018-08-10,174642
2,Python For Data Science Full Course - 9 Hours ...,2020-03-15,14707
3,Michael I. Jordan: Machine Learning: Dynamical...,2019-05-02,4021
4,From UX Designer to Data Scientist | How to Ch...,2020-04-13,20


### Definição das features

In [9]:
features = pd.DataFrame(index=df_clean.index)
y = df['y'].copy()

In [10]:
features['days_since_pub'] = (pd.to_datetime(today) - df_clean['date']) / np.timedelta64(1, 'D')
features['views'] = df_clean['views']
features['daily_views'] = features['views'] / features['days_since_pub']
features = features.drop(['days_since_pub'], axis=1)

### Divisão em dados de treino e validação

In [18]:
mask_train = df_clean['date'] < "2019-03-01'"
mask_val = df_clean['date'] >= "2019-03-01'"

Xtrain, Xval = features[mask_train], features[mask_val]
ytrain, yval = y[mask_train], y[mask_val]
Xtrain.shape, Xval.shape, ytrain.shape, yval.shape

((255, 2), (267, 2), (255,), (267,))

In [20]:
title_train = df_clean[mask_train]['title']
title_val = df_clean[mask_val]['title']

title_vec = TfidfVectorizer(min_df=2, ngram_range=(1,3))
title_bow_train = title_vec.fit_transform(title_train)
title_bow_val = title_vec.transform(title_val)

In [23]:
Xtrain_wtitle = hstack([Xtrain, title_bow_train])
Xval_wtitle = hstack([Xval, title_bow_val])

In [25]:
mdl = RandomForestClassifier(n_estimators=1000, random_state=0, class_weight="balanced", n_jobs=6)
mdl.fit(Xtrain_wtitle, ytrain)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight='balanced',
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=6, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [26]:
p = mdl.predict_proba(Xval_wtitle)[:, 1]

### Active learning

In [34]:
df_unlabeled = pd.read_csv("raw_data_with_labels.csv", index_col=0)
df_unlabeled = df_unlabeled[df_unlabeled['y'].isnull()].dropna(how='all')

In [35]:
df_clean_u = pd.DataFrame(index=df_unlabeled.index)
df_clean_u['title'] = df_unlabeled['watch-title']

In [47]:
r_today = pd.to_datetime('today')
today = r_today.strftime("%Y-%m-%d")
today_post = r_today.strftime("Publicado em %d de %b de %Y")
df = df_unlabeled.replace({'watch-time-text': r"\bhá \b\d+\b hor.+\b"}, {'watch-time-text': today_post}, regex=True)
clean_date = df_unlabeled['watch-time-text'].str.extract(r"(\d+) de ([a-zA-Z]+)\.? de (\d+)")
clean_date[0] = clean_date[0].map(lambda x: "0"+x[0] if len(x) == 1 else x)

month_map = {"jan": "Jan", "fev": "Feb", "mar": "Mar", "abr": "Apr", "mai": "May", "jun": "Jun",
             "jul": "Jul", "ago": "Aug", "set": "Sep", "out": "Oct", "nov": "Nov", "dez": "Dec"}

clean_date[1] = clean_date[1].map(month_map).fillna(r_today.strftime("%b"))
    
clean_date = clean_date.apply(lambda x: ' '.join(x), axis=1)
df_clean_u['date'] = pd.to_datetime(clean_date, format="%d %b %Y")

In [48]:
views = df_unlabeled['watch-view-count'].str.extract(r"(\d+\.?\d*)", expand=False).str.replace(".", "").fillna(0).astype(int)
df_clean_u['views'] = views

In [49]:
features_u = pd.DataFrame(index=df_clean_u.index)

In [50]:
features_u['days_since_pub'] = (pd.to_datetime(today) - df_clean_u['date']) / np.timedelta64(1, 'D')
features_u['views'] = df_clean_u['views']
features_u['daily_views'] = features_u['views'] / features_u['days_since_pub']
features_u = features_u.drop(['days_since_pub'], axis=1)

In [51]:
title_u = df_clean_u['title']
title_bow_u = title_vec.transform(title_u)

In [56]:
pd.isnull(df_clean_u).sum() > 0

title    False
views    False
date     False
dtype: bool

In [57]:
Xu_wtitle = hstack([features_u, title_bow_u])

In [58]:
pu = mdl.predict_proba(Xu_wtitle)[:, 1]

In [59]:
df_unlabeled['p'] = pu

In [60]:
df_unlabeled.head(1)

Unnamed: 0,watch-title,y,og:url,watch-view-count,watch-time-text,content_watch-info-tag-list,watch7-headline,watch7-user-header,watch8-sentiment-actions,og:image,og:image:width,og:image:height,og:description,og:video:width,og:video:height,og:video:tag,channel_link_0,p
75,Why Kaggle for data science practitioners,,https://www.youtube.com/watch?v=1vBJZ9MOBc8,4.038 visualizações,Publicado em 8 de out. de 2017,Educação,Why Kaggle for data science practitioners,Minsuk Heo 허민석\n\n\n\n\n\n\n\n\n\n\n\n\n\nCarr...,4.038 visualizações\n\n\n\n\n\n\n\n31\n\nGosto...,https://i.ytimg.com/vi/1vBJZ9MOBc8/hqdefault.jpg,480,360,This is Kaggle introduction why Kaggle is the ...,960.0,720.0,data science practice,/channel/UCxP77kNgVfiiG6CXZ5WMuAQ,0.577


In [62]:
mask_u = (df_unlabeled['p'] >= 0.45) & (df_unlabeled['p'] <= 0.55)

In [63]:
df_unlabeled[mask_u].sort_values("p").head(3)

Unnamed: 0,watch-title,y,og:url,watch-view-count,watch-time-text,content_watch-info-tag-list,watch7-headline,watch7-user-header,watch8-sentiment-actions,og:image,og:image:width,og:image:height,og:description,og:video:width,og:video:height,og:video:tag,channel_link_0,p
365,5 Must Have Skills To Become Machine Learning ...,,https://www.youtube.com/watch?v=DZ7xuZ1-uh8,357.604 visualizações,Publicado em 6 de fev. de 2018,Pessoas e blogs,5 Must Have Skills To Become Machine Learning ...,Art of Engineer\n\n\n\n\n\n\n\n\n\n\n\n\n\nCar...,357.604 visualizações\n\n\n\n\n\n\n\n14.133\n\...,https://i.ytimg.com/vi/DZ7xuZ1-uh8/hqdefault.jpg,480,360,"If this video is helpful to you, you can suppo...",1280.0,720.0,machine learning job skills,/channel/UCTXLMW9262FBpcQTVNXQ8Aw,0.45
1442,"Backpropagation calculus | Deep learning, chap...",,https://www.youtube.com/watch?v=tIeHLnjs5U8,1.047.347 visualizações,Publicado em 3 de nov. de 2017,Educação,3Blue1Brown series T3 • E4\n\n\n\n Backprop...,3Blue1Brown\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nCarre...,1.047.347 visualizações\n\n\n\n\n\n\n\n21.554\...,https://i.ytimg.com/vi/tIeHLnjs5U8/maxresdefau...,1280,720,Brought to you by you: http://3b1b.co/nn3-than...,1280.0,720.0,3brown1blue,/channel/UCYO_jab_esuFRV4b17AJtAw,0.45
372,Rethinking Business: Data Analytics With Googl...,,https://www.youtube.com/watch?v=DpngHc31a5Y,6.068 visualizações,Publicado em 10 de abr. de 2019,Ciência e tecnologia,Rethinking Business: Data Analytics With Googl...,G Suite\n\n\n\n\n\n\n\n\n\n\n\n\n\nCarregando....,6.068 visualizações\n\n\n\n\n\n\n\n84\n\nGosto...,https://i.ytimg.com/vi/DpngHc31a5Y/maxresdefau...,1280,720,"People, process, & technology: these universal...",1280.0,720.0,official,/channel/UCBmwzQnSoj9b6HzNmFrg_yw,0.45


In [64]:
hard_decisions = df_unlabeled[mask_u]

In [65]:
randoms = df_unlabeled[~mask_u].sample(31)

In [66]:
pd.concat([hard_decisions, randoms]).to_csv("active_label.csv")