In [3]:
import numpy as np 
import pandas as pd 
# !pip install sentence_transformers

In [4]:
# Load the raw title/category table and give its first column the name
# 'Index'; the remaining column names are kept exactly as read from the CSV.
df = pd.read_csv('title_category.csv')
columns = ['Index'] + list(df.columns[1:])
df.columns = columns
df.head()

Unnamed: 0,Index,Title of the video,Type of Video
0,0.0,Everything Happens For A Reason | Muniba Mazari,
1,,Sprituality__##__General Knowledge,
2,1.0,Detachment from Overthinking & Stress | Boost ...,Sprituality__##__Mental Health__##__Chill Mix
3,2.0,I Meditated Every Day & This Is What Happened ...,Sprituality__##__Mental Health
4,3.0,Vipassanā Meditation: a daily meditation timer...,Mental Health


In [5]:
# Show the rows that have no 'Index' value; in the preview above these rows
# carry the category string in the title column instead of a real title.
df.loc[df['Index'].isna()]

Unnamed: 0,Index,Title of the video,Type of Video
1,,Sprituality__##__General Knowledge,
17,,General Knowledge,
19,,General Knowledge,
25,,Sprituality,
27,,Sprituality,
102,,Mental Health__##__Chill Mix,
104,,Mental Health,
106,,Mental Health,
108,,Mental Health,
128,,Sprituality,


In [6]:
# Some videos are spread over two CSV rows: the first row has the index and
# title but no category, and the row below it has no index and holds the
# category string in the title column. Pull that string up into the empty
# 'Type of Video' cell. shift(-1) lines each row up with the title cell of the
# row beneath it and yields NaN for the final row, so a missing category on the
# last row no longer raises IndexError as the positional i+1 lookup did.
df['Type of Video'] = df['Type of Video'].fillna(df['Title of the video'].shift(-1))

# Drop the index-less helper rows and promote 'Index' to the frame's index.
df = df.dropna(subset=['Index']).set_index('Index')

# Categories are delimited by '__##__' in the file; switch to plain commas.
# astype(str) keeps the original behaviour of turning any value that is still
# missing into the literal string 'nan'.
df['Type of Video'] = (
    df['Type of Video'].astype(str).str.replace('__##__', ',', regex=False)
)

In [7]:
# Preview the cleaned frame: 'Index' is now the row index and each row's
# 'Type of Video' is a comma-separated category string.
df.head()

Unnamed: 0_level_0,Title of the video,Type of Video
Index,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,Everything Happens For A Reason | Muniba Mazari,"Sprituality,General Knowledge"
1.0,Detachment from Overthinking & Stress | Boost ...,"Sprituality,Mental Health,Chill Mix"
2.0,I Meditated Every Day & This Is What Happened ...,"Sprituality,Mental Health"
3.0,Vipassanā Meditation: a daily meditation timer...,Mental Health
4.0,Why Meditate? | Change your Brain's Default Mode,"Sprituality,Science,Mental Health"


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Character-level TF-IDF over the video titles (1- to 5-character n-grams that
# occur in at least two titles).
# - Boolean options are passed as real booleans; recent scikit-learn parameter
#   validation does not accept the integer 1 in their place.
# - token_pattern and stop_words are omitted on purpose: both apply only when
#   analyzer='word', so with analyzer='char' they had no effect on the features
#   and merely triggered "will not be used" warnings.
tf_idf = TfidfVectorizer(
    analyzer='char',
    ngram_range=(1, 5),
    min_df=2,
    max_features=None,
    strip_accents='unicode',
    norm='l2',
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
)
# Densify so the matrix can later be stacked with the dense sentence embeddings.
features = tf_idf.fit_transform(df['Title of the video']).toarray()



In [9]:
# Dimensions of the dense TF-IDF matrix: one row per title, one column per
# character n-gram kept by the vectorizer.
features.shape

(625, 14723)

In [10]:
from sentence_transformers import SentenceTransformer

# Encode every title with the 'bert-base-nli-mean-tokens' sentence model.
embedder = SentenceTransformer('bert-base-nli-mean-tokens')
bert_features = embedder.encode(df['Title of the video'].to_list())

  from .autonotebook import tqdm as notebook_tqdm


In [11]:
# Second embedding of the same titles, this time with the
# 'bert-base-nli-stsb-mean-tokens' sentence model.
semantic_embedder = SentenceTransformer('bert-base-nli-stsb-mean-tokens')
semantic_bert_features = semantic_embedder.encode(df['Title of the video'].to_list())

In [12]:
# Make sure both embedding blocks are NumPy arrays before they are stacked.
bert_features, semantic_bert_features = (
    np.array(bert_features),
    np.array(semantic_bert_features),
)

In [13]:
# Join the three per-title feature blocks column-wise, in the order
# TF-IDF, first sentence embedding, second sentence embedding.
final_features = np.concatenate(
    [features, bert_features, semantic_bert_features],
    axis=1,
)
final_features.shape

(625, 16259)

In [14]:
from sklearn.feature_extraction.text import CountVectorizer


def split_topics(text):
    """Split a comma-separated category string into its individual topics."""
    return text.split(',')


# Encode each row's comma-separated categories as a multi-label target matrix.
# - A named function is used as the tokenizer instead of a lambda so the fitted
#   vectorizer can be pickled.
# - token_pattern=None: the pattern is unused once a tokenizer is supplied, and
#   leaving the default in place makes scikit-learn emit a warning.
# - binary=True: a category repeated within one row still yields the label 1
#   rather than a count of 2, keeping every target column a 0/1 indicator.
vectorizer = CountVectorizer(tokenizer=split_topics, token_pattern=None, binary=True)
y = vectorizer.fit_transform(df['Type of Video'])



In [15]:
# Re-display the stacked feature matrix shape so its row count can be compared
# with the label matrix.
final_features.shape

(625, 16259)

In [16]:
# Label matrix shape: one row per title, one column per distinct category
# token learned by the vectorizer.
y.shape

(625, 20)

In [17]:
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import SGDClassifier

# Train one linear SGD classifier per category column of the label matrix.
# random_state is fixed because SGD shuffles the training data between epochs;
# without a seed the fitted model, and therefore the example predictions in
# this notebook, can change from one run to the next.
clf = MultiOutputClassifier(
    SGDClassifier(max_iter=4000, random_state=42)
).fit(final_features, y.toarray())

In [18]:
def generate_embedding(text):
    """Build the feature vector for a single piece of text.

    The text is passed through the fitted TF-IDF vectorizer and both sentence
    encoders, and the three resulting 1-D vectors are joined end to end in
    that order.
    """
    batch = [text]
    parts = (
        tf_idf.transform(batch).toarray()[0],
        embedder.encode(batch, show_progress_bar=False)[0],
        semantic_embedder.encode(batch, show_progress_bar=False)[0],
    )
    return np.concatenate(parts)

def get_terms(pred_list):
    """Turn one row of label predictions back into title-cased topic names."""
    decoded = vectorizer.inverse_transform([pred_list])[0]
    titled = []
    for term in decoded:
        titled.append(term.title())
    return titled

def get_topics(text):
    """Predict the topic names for a single piece of text.

    The text is embedded, the module-level classifier predicts one label row
    for it, and that row is decoded into topic names.
    """
    feature_row = generate_embedding(text)
    predictions = clf.predict([feature_row])
    return get_terms(predictions[0])

def increment_learn(text, topics):
    """Update the classifier with one extra labelled example.

    Parameters
    ----------
    text : str
        The text to learn from.
    topics : str
        Comma-separated topic names. Each one is lower-cased and must already
        be part of the fitted label vocabulary.

    Returns
    -------
    -1 if any topic is not in the vocabulary (the classifier is left
    untouched); otherwise None after the classifier has been updated.
    """
    # vocabulary_ is a dict on every scikit-learn release, so this avoids
    # get_feature_names() (removed in scikit-learn 1.2) and gives constant-time
    # membership checks instead of scanning a list.
    known_topics = vectorizer.vocabulary_
    if any(topic.lower() not in known_topics for topic in topics.split(',')):
        return -1

    text_embedding = generate_embedding(text)
    counts = vectorizer.transform([topics]).toarray()[0]
    # A topic listed twice would produce a count of 2; clamp to a 0/1 indicator
    # so the per-topic classifiers only ever see the labels they were fit on.
    labels = (counts > 0).astype(int)
    clf.partial_fit([text_embedding], [labels])

In [19]:

# Try the trained classifier on a sample video title.
example = 'Eric Weinstein: Revolutionary Ideas in Science, Math, and Society | Artificial Intelligence Podcast'
get_topics(example)

['General Knowledge']

In [20]:
# Second sample title, to check a different topic.
title = 'Consciousness -- the final frontier | Dada Gunamuktananda | TEDxNoosa 2014'
get_topics(title)

['Sprituality']

In [21]:
import pickle

from sklearn.feature_extraction.text import CountVectorizer


def custom_tokenizer(text):
    """Split a comma-separated category string into its individual topics."""
    return text.split(',')


# Rebuild the label vectorizer with a module-level tokenizer function (not a
# lambda) so that the fitted object can be pickled.
# token_pattern=None silences the "token_pattern will not be used" warning that
# scikit-learn emits when a custom tokenizer is supplied; binary=True keeps the
# encoded labels as 0/1 indicators even if a topic is repeated.
vectorizer = CountVectorizer(
    tokenizer=custom_tokenizer, token_pattern=None, binary=True
)

# Fit and transform the data
y = vectorizer.fit_transform(df['Type of Video'])

# Save the classifier and the vectorizer; the context managers make sure each
# file is flushed and closed before the cell finishes.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(clf, model_file)
with open('vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)



In [26]:
import pickle

# Load model
# NOTE: pickle.load can execute arbitrary code contained in the file, so only
# load pickles that were produced by this notebook or another trusted source.
with open('model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)
with open('vectorizer.pkl', 'rb') as vectorizer_file:
    vectorizer = pickle.load(vectorizer_file)

# get_topics() reads the module-level `clf`, so point it at the freshly loaded
# model; otherwise the check below would still run the in-memory classifier
# and would not exercise the pickled one at all.
clf = model

# Use model
example = 'Eric Weinstein: Revolutionary Ideas in Science, Math, and Society | Artificial Intelligence Podcast'
get_topics(example)

['General Knowledge']

In [28]:

# Another sample title.
# NOTE(review): the output recorded for this cell is ['Sprituality'], which
# looks like a misclassification for a machine-learning title - worth checking
# how well this kind of title is represented in the training categories.
title = 'Machine Learning for Humans, Part 1: Introduction'
get_topics(title)

['Sprituality']