In [1]:
import pandas as pd, numpy as np
from tqdm import tqdm
import tensorflow.keras as keras
import tensorflow

In [2]:
# Zapewnienie powtarzalności wyników
from numpy.random import seed
seed(1)
tensorflow.random.set_seed(2)

In [3]:
raw_df = pd.read_csv("ted_talks_en.csv")
raw_df.sample(5)

Unnamed: 0,talk_id,title,speaker_1,all_speakers,occupations,about_speakers,views,recorded_date,published_date,event,native_lang,available_lang,comments,duration,topics,related_talks,url,description,transcript
870,1119,The antidote to apathy,Dave Meslin,{0: 'Dave Meslin'},{0: ['artist and organizer']},"{0: 'Dave Meslin is a ""professional rabble-rou...",1869774,2010-10-10,2011-04-12,TEDxToronto 2010,en,"['ar', 'bg', 'cs', 'de', 'el', 'en', 'es', 'et...",236.0,425,"['TEDx', 'collaboration', 'community', 'cultur...","{1825: 'Why mayors should rule the world', 815...",https://www.ted.com/talks/dave_meslin_the_anti...,"Local politics -- schools, zoning, council ele...",How often do we hear that people just don't ca...
929,1183,Try something new for 30 days,Matt Cutts,{0: 'Matt Cutts'},{0: ['technologist']},"{0: 'An early employee at Google, Matt Cutts w...",12481445,2011-03-03,2011-07-01,TED2011,en,"['ar', 'arq', 'az', 'be', 'bg', 'bn', 'bs', 'c...",916.0,207,"['culture', 'success']","{947: 'Keep your goals to yourself', 282: 'Wha...",https://www.ted.com/talks/matt_cutts_try_somet...,"Is there something you've always meant to do, ...","A few years ago, I felt like I was stuck in a ..."
1669,1972,Be passionate. Be courageous. Be your best.,Gabby Giffords and Mark Kelly,{0: 'Gabby Giffords and Mark Kelly'},{0: ['former u.s. representative and nasa astr...,{0: 'After Rep. Gabby Giffords was wounded by ...,1134014,2014-03-21,2014-04-11,TED2014,en,"['ar', 'el', 'en', 'es', 'fa', 'fr', 'he', 'hu...",80.0,1128,"['violence', 'guns']","{1566: 'A story about knots and surgeons', 104...",https://www.ted.com/talks/gabby_giffords_and_m...,"On January 8, 2011, Congresswoman Gabby Giffor...","Pat Mitchell: That day, January 8, 2011, began..."
701,921,A headset that reads your brainwaves,Tan Le,{0: 'Tan Le'},{0: ['entrepreneur']},"{0: ""Tan Le is the founder & CEO of Emotiv, a ...",2741071,2010-07-16,2010-07-21,TEDGlobal 2010,en,"['ar', 'bg', 'cs', 'da', 'de', 'el', 'en', 'es...",595.0,636,"['brain', 'computers', 'design', 'entertainmen...",{685: 'The thrilling potential of SixthSense t...,https://www.ted.com/talks/tan_le_a_headset_tha...,Tan Le's astonishing new computer interface re...,"Up until now, our communication with machines ..."
2307,2663,"To solve old problems, study new species",Alejandro Sánchez Alvarado,{0: 'Alejandro Sánchez Alvarado'},{0: ['developmental and regeneration biologist']},{0: 'Alejandro Sánchez Alvarado wants to under...,1336563,2016-08-19,2017-01-12,TEDxKC,en,"['ar', 'bg', 'ca', 'de', 'el', 'en', 'es', 'fr...",46.0,759,"['animals', 'biodiversity', 'beauty', 'biology...","{206: 'Underwater astonishments', 2390: 'The s...",https://www.ted.com/talks/alejandro_sanchez_al...,"Nature is wonderfully abundant, diverse and my...","For the past few years, I've been spending my ..."


In [4]:
raw_df.shape

(4005, 19)

In [5]:
raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4005 entries, 0 to 4004
Data columns (total 19 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   talk_id         4005 non-null   int64  
 1   title           4005 non-null   object 
 2   speaker_1       4005 non-null   object 
 3   all_speakers    4001 non-null   object 
 4   occupations     3483 non-null   object 
 5   about_speakers  3502 non-null   object 
 6   views           4005 non-null   int64  
 7   recorded_date   4004 non-null   object 
 8   published_date  4005 non-null   object 
 9   event           4005 non-null   object 
 10  native_lang     4005 non-null   object 
 11  available_lang  4005 non-null   object 
 12  comments        3350 non-null   float64
 13  duration        4005 non-null   int64  
 14  topics          4005 non-null   object 
 15  related_talks   4005 non-null   object 
 16  url             4005 non-null   object 
 17  description     4005 non-null   o

## Oczyszczanie danych
* Usunięcie zbędnych kolumn

In [6]:
all_cols = [col for col in raw_df.columns]
all_cols

['talk_id',
 'title',
 'speaker_1',
 'all_speakers',
 'occupations',
 'about_speakers',
 'views',
 'recorded_date',
 'published_date',
 'event',
 'native_lang',
 'available_lang',
 'comments',
 'duration',
 'topics',
 'related_talks',
 'url',
 'description',
 'transcript']

In [7]:
left_cols = ['title','duration','topics','description','transcript']
del_cols = [item for item in all_cols if item not in left_cols]

In [8]:
for col in del_cols:
    del raw_df[col]

* Tablice jako string w DF

In [9]:
raw_df["topics"][0]

"['alternative energy', 'cars', 'climate change', 'culture', 'environment', 'global issues', 'science', 'sustainability', 'technology']"

In [10]:
import ast
topics = [ast.literal_eval(topic["topics"]) for topic in raw_df.iloc]
raw_df["topics"] = topics 

* Usunięcie kategorii poniżej 400 wystąpień

In [11]:
from collections import defaultdict
categories = defaultdict(int)
length = range(raw_df["topics"].shape[0])
for i in length:
    for j in topics[i]:
        categories[j] += 1
print(f"Categories count: {len(categories)}")

Categories count: 457


In [12]:
more_than = 400
cat_series = pd.Series(categories)
cat_series = cat_series[cat_series>=more_than].sort_values(ascending = False)
try:
    to_del = ["TEDx","TED-Ed"]
    cat_series  = cat_series.drop(to_del)
except:
    print("Lack cats to del.")

print(cat_series)
num_classes = len(cat_series)
print(f"Categories count: {num_classes}")

science          993
technology       979
culture          680
global issues    574
society          557
design           518
social change    512
animation        487
business         443
health           442
history          406
dtype: int64
Categories count: 11


In [13]:
cat_400 = cat_series.index
for row_num,row in enumerate(raw_df.iloc):
    intersect = list(set(row["topics"]) & set(cat_400))
    if intersect:
        raw_df.at[row_num, "topics"] = intersect
    else:
        raw_df = raw_df.drop([row_num])

* Wagi

In [14]:
#weight = number_of_all_samples / (2 * number_of_samples_cl1)
all_sampl_num = cat_series.sum()
class_weight = {}
for num, cat in enumerate(cat_series):
    class_weight[num] = (all_sampl_num / (2 * cat))

num_classes = len(class_weight.keys())

In [15]:
class_weight_norm = [weight/max(class_weight.values()) for weight in class_weight.values()]
for num, weight in enumerate(class_weight_norm):
    class_weight[num] = weight

* Kategorie jako one-hot

In [16]:
raw_df = raw_df.reset_index()

for row_num in range(raw_df.shape[0]):
    temp_cols = raw_df.loc[row_num, "topics"]
    raw_df.loc[row_num, temp_cols] = 1

df = raw_df.fillna(0)

In [17]:
y = df.drop(["title", "duration", "topics", "description", "transcript", "index"], axis = 1)

In [18]:
df

Unnamed: 0,index,title,duration,topics,description,transcript,science,technology,global issues,culture,health,business,design,social change,history,animation,society
0,0,Averting the climate crisis,977,"[science, technology, global issues, culture]",With the same humor and humanity he exuded in ...,"Thank you so much, Chris. And it's truly a gre...",1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,The best stats you've ever seen,1190,"[health, global issues]",You've never seen data presented like this. Wi...,"About 10 years ago, I took on the task to teac...",0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,Simplicity sells,1286,[technology],New York Times columnist David Pogue takes aim...,"(Music: ""The Sound of Silence,"" Simon & Garfun...",0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,Greening the ghetto,1116,[business],"In an emotionally charged talk, MacArthur-winn...",If you're here today — and I'm very happy that...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,4,Do schools kill creativity?,1164,[culture],Sir Ken Robinson makes an entertaining and pro...,Good morning. How are you? (Audience) Good. It...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3396,4000,"Crisis support for the world, one text away",690,[technology],What if we could help people in crisis anytime...,"""I'm 14, and I want to go home."" ""My name is B...",0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3397,4001,The dark history of IQ tests,346,"[animation, history]","In 1905, psychologists Alfred Binet and Théodo...","In 1905, psychologists Alfred Binet and Théodo...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
3398,4002,"How ""policing for profit"" undermines your rights",774,[society],"Many countries have an active, centuries-old l...",Picture yourself driving down the road tomorro...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3399,4003,The electrifying speeches of Sojourner Truth,257,"[animation, history]",Isabella Baumfree was born into slavery in lat...,"In early 1828, Sojourner Truth approached the ...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0


* Tokenizacja i rdzeniowanie metodami Snowball i Porter

In [19]:
# Snowwball stemmer
from nltk.stem.snowball import SnowballStemmer
snow = SnowballStemmer(language='english')

# Porter stemmer
from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()

# Stop words
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop = stopwords.words('english')

[nltk_data] Error loading stopwords: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1108)>


In [20]:
def trans_trim(text,stemmer):
    def text_trimming(word):
        replace_marks = [".",",","?","-",":","[","]","!",")","(","\\"]
        for mark in replace_marks:
            word  = word.replace(mark,"")
        word = word.lower()
        word = stemmer.stem(word)
        return word
    temp = [word for word in text.split()]
    transcript = [text_trimming(word) for word in temp if word not in stop and len(word) > 1]
    return transcript

In [21]:
def get_tokens(list_of_txts,stemmer):
    tokens_arr = [trans_trim(text,stemmer) for text in tqdm(list_of_txts)]
    return tokens_arr

In [22]:
tokenized_tr_s = get_tokens(df["transcript"],snow)

100%|██████████████████████████████████████████████████████████████████████████████| 3401/3401 [01:21<00:00, 41.54it/s]


In [23]:
tokenized_tr_p = get_tokens(df["transcript"],porter)

100%|██████████████████████████████████████████████████████████████████████████████| 3401/3401 [01:55<00:00, 29.33it/s]


In [24]:
# Redukcja rzadich wyrazow
def rare_reduction(tokenized_text,size = 10000):
    vocab = defaultdict(int)
    length = range(sum([len(transcr) for transcr in tokenized_text]))

    for transcript in tokenized_tr_s:
        for word in transcript:
            vocab[word] += 1
    vocab = pd.Series(vocab).sort_values(ascending = False).head(size)
    for transcript in tqdm(tokenized_text):
        for word in transcript:
            if word not in vocab:
                transcript.remove(word)
    return tokenized_text

In [25]:
tokenized_tr_s = rare_reduction(tokenized_tr_s)

100%|█████████████████████████████████████████████████████████████████████████████| 3401/3401 [00:05<00:00, 638.27it/s]


In [26]:
tokenized_tr_p = rare_reduction(tokenized_tr_p)

100%|█████████████████████████████████████████████████████████████████████████████| 3401/3401 [00:06<00:00, 551.34it/s]


In [27]:
def join_again(texts):
    joined_texts = []
    for text in texts:
        joined_texts.append(" ".join(text))
    return joined_texts

transc_joined = join_again(tokenized_tr_s)
transc_joined_p = join_again(tokenized_tr_p)

In [28]:
df.loc[0, "transcript"][:80]

"Thank you so much, Chris. And it's truly a great honor to have the opportunity t"

In [29]:
transc_joined[0][:50]

'thank much chris and truli great honor opportun co'

In [30]:
transc_joined_p[0][:50]

'thank much and truli great honor opportun come sta'

## Podział zbioru i TF-IDF

In [31]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

y = df.drop(["title", "duration", "topics", "description", "transcript", "index"], axis = 1)

def split_data(transc_joined, y):
    return train_test_split(transc_joined, y, test_size=0.2, random_state=1)
train_text, test_text, y_train, y_test = split_data(transc_joined, y)

In [32]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(analyzer='word', norm='l2', ngram_range=(1,1)) #  ngram_range=(1,3),strip_accents='unicode',
vectorizer.fit(train_text)
vectorizer.fit(test_text)
x_train = vectorizer.transform(train_text)
x_test = vectorizer.transform(test_text)

## Klasyfikacja multi-label

### Chain classifier

In [34]:
# Chain classifier
def chain_classifier(x_train,y_train,x_test, y_test):
    from skmultilearn.problem_transform import ClassifierChain
    from sklearn.linear_model import LogisticRegression
    # initialize classifier chains multi-label classifier
    classifier = ClassifierChain(LogisticRegression())
    # Training logistic regression model on train data
    classifier.fit(x_train, y_train)
    # predict
    predictions = classifier.predict(x_test)
    # accuracy
    print("Accuracy = ",accuracy_score(y_test,predictions))
    return predictions

chain_preds = chain_classifier(x_train,y_train,x_test, y_test)

Accuracy =  0.19236417033773862


### LabelPowerset classifier

In [35]:
# using Label Powerset
def powerset_classifier(x_train,y_train,x_test, y_test):
    from sklearn.linear_model import LogisticRegression
    from skmultilearn.problem_transform import LabelPowerset
    # initialize label powerset multi-label classifier
    classifier = LabelPowerset(LogisticRegression())
    # train
    classifier.fit(x_train, y_train)
    # predict
    predictions = classifier.predict(x_test)
    # accuracy
    print("Accuracy = ",accuracy_score(y_test,predictions))
    return predictions

powerset_preds = powerset_classifier(x_train,y_train,x_test, y_test)

Accuracy =  0.2511013215859031


### BinaryRelevance classifier

In [36]:
def brelevance_classifier(x_train,y_train,x_test,y_test):
    from skmultilearn.problem_transform import BinaryRelevance
    from sklearn.naive_bayes import GaussianNB
    classifier = BinaryRelevance(GaussianNB())  
    classifier.fit(x_train, y_train)  
    predictions = classifier.predict(x_test)
    # accuracy
    print("Accuracy = ",accuracy_score(y_test,predictions))
    return predictions

brelevance_preds = brelevance_classifier(x_train,y_train,x_test,y_test)

Accuracy =  0.06754772393538913


* Wyniki dla metody porter

In [37]:
train_text, test_text, y_train, y_test = split_data(transc_joined_p, y)

In [38]:
chain_preds = chain_classifier(x_train,y_train,x_test, y_test)

Accuracy =  0.19236417033773862


In [39]:
powerset_preds_p = powerset_classifier(x_train,y_train,x_test, y_test)

Accuracy =  0.2511013215859031


In [40]:
brelevance_preds = brelevance_classifier(x_train,y_train,x_test,y_test)

Accuracy =  0.06754772393538913


* Wyniki dla poszczególnych klas uzyskane metodą Label Powerset

In [41]:
from sklearn.metrics import classification_report
target_names = cat_series.keys()
print(classification_report(y_test, powerset_preds, target_names=target_names))

               precision    recall  f1-score   support

      science       0.70      0.80      0.75       200
   technology       0.79      0.53      0.63       195
      culture       0.66      0.43      0.52       119
global issues       0.51      0.30      0.38       144
      society       0.93      0.14      0.25        98
       design       0.84      0.39      0.53        82
social change       0.81      0.37      0.51       102
    animation       0.72      0.47      0.57       106
     business       1.00      0.13      0.23        77
       health       0.87      0.49      0.62       107
      history       0.54      0.34      0.42       109

    micro avg       0.71      0.44      0.54      1339
    macro avg       0.76      0.40      0.49      1339
 weighted avg       0.74      0.44      0.52      1339
  samples avg       0.71      0.49      0.55      1339



In [42]:
report_power = classification_report(y_test, powerset_preds, target_names=target_names, output_dict=True)
report_chain = classification_report(y_test, chain_preds, target_names=target_names, output_dict=True)
report_brelevance = classification_report(y_test, brelevance_preds, target_names=target_names, output_dict=True)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [43]:
categories_names = [key for key in report_power.keys()][:num_classes]
f1_power = np.array([report_power[cat]["f1-score"] for cat in categories_names]).round(2)
f1_chain = np.array([report_chain[cat]["f1-score"] for cat in categories_names]).round(2)
f1_brelevance = np.array([report_brelevance[cat]["f1-score"] for cat in categories_names]).round(2)

## Keras

* Model bez uwzględnienia wag klas

In [44]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [45]:
# max_words = max([len(i) for i in tokenized_tr_s])
max_words = 500
maxlen = 1300

In [46]:
df_topics_bin = df.drop(["title", "duration", "topics", "description", "transcript", "index"], axis = 1) 

In [47]:
tokenizer = Tokenizer(num_words=max_words, lower=True)
tokenizer.fit_on_texts(transc_joined) # transc_joined

def get_features(text_series, maxlen = 1300):
    sequences = tokenizer.texts_to_sequences(text_series)
    return pad_sequences(sequences, maxlen=maxlen)
x = get_features(transc_joined) # transc_joined
y = np.array(df_topics_bin)

In [48]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=0)

In [49]:
# Keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Embedding, Flatten, GlobalMaxPool1D, Dropout, Conv1D, Input, BatchNormalization
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint
from tensorflow.keras.losses import binary_crossentropy
from tensorflow.keras.optimizers import Adam, Adagrad
def keras_seq(class_weight):
    model = Sequential()
    model.add(Embedding(input_dim =  max_words, output_dim = 512, input_length=maxlen))
    model.add(Conv1D(512, 12, padding='valid', activation='sigmoid', strides=1))
    model.add(Dropout(0.2))
    model.add(GlobalMaxPool1D())

    model.add(Dense(num_classes))
#     model.add(BatchNormalization())
    model.add(Activation('sigmoid'))

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['categorical_accuracy'])
    model.summary()

    callbacks = [
        ReduceLROnPlateau(), 
        EarlyStopping(patience=4), 
        ModelCheckpoint(filepath='model-conv1d.h5', save_best_only=True)
    ]
    
    history = model.fit(x_train, y_train,
                        class_weight=class_weight,
                        epochs=45,
                        batch_size=256,
                        validation_split=0.1,
                        callbacks=callbacks)
    return history, model

In [50]:
history, model = keras_seq(False)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 1300, 512)         256000    
_________________________________________________________________
conv1d (Conv1D)              (None, 1289, 512)         3146240   
_________________________________________________________________
dropout (Dropout)            (None, 1289, 512)         0         
_________________________________________________________________
global_max_pooling1d (Global (None, 512)               0         
_________________________________________________________________
dense (Dense)                (None, 11)                5643      
_________________________________________________________________
activation (Activation)      (None, 11)                0         
Total params: 3,407,883
Trainable params: 3,407,883
Non-trainable params: 0
______________________________________________

In [51]:
cnn_model = keras.models.load_model('model-conv1d.h5')
predictions = cnn_model.predict(x_test)

In [52]:
def prob_to_class(predictions):
    classes_predictions = []
    for arr in predictions:
        classes_predictions.append([1 if i > 0.5 else 0 for i in arr])
    return classes_predictions

classes_predictions = prob_to_class(predictions)

In [53]:
columns_pred = [f"{col}_pred" for col in df_topics_bin.columns]
pred_df = pd.DataFrame(classes_predictions, columns =  columns_pred)

columns_true = [f"{col}_true" for col in df_topics_bin.columns]
true_df = pd.DataFrame(y_test, columns =  columns_true)

scoring_df = pd.concat([pred_df,true_df], axis = 1)
scoring_df = scoring_df.reindex(sorted(scoring_df.columns), axis=1)
scoring_df.tail(10)

Unnamed: 0,animation_pred,animation_true,business_pred,business_true,culture_pred,culture_true,design_pred,design_true,global issues_pred,global issues_true,...,history_pred,history_true,science_pred,science_true,social change_pred,social change_true,society_pred,society_true,technology_pred,technology_true
331,0,0.0,0,0.0,0,0.0,0,1.0,0,0.0,...,0,0.0,0,0.0,0,0.0,0,0.0,1,1.0
332,1,1.0,0,0.0,0,1.0,0,0.0,0,0.0,...,1,1.0,0,0.0,0,0.0,0,0.0,0,0.0
333,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,...,0,0.0,0,0.0,1,1.0,1,1.0,0,0.0
334,0,0.0,0,0.0,0,0.0,0,0.0,1,1.0,...,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
335,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,...,0,0.0,0,0.0,0,1.0,0,1.0,0,0.0
336,0,0.0,0,0.0,0,0.0,0,0.0,1,1.0,...,0,0.0,0,0.0,1,0.0,1,0.0,0,0.0
337,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,...,0,0.0,1,0.0,0,0.0,0,0.0,1,0.0
338,0,0.0,0,1.0,0,1.0,0,0.0,0,1.0,...,0,1.0,0,0.0,0,0.0,0,0.0,0,0.0
339,0,0.0,0,0.0,0,1.0,0,0.0,0,0.0,...,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
340,0,0.0,0,0.0,0,0.0,0,0.0,0,1.0,...,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0


In [54]:
cnn_model = keras.models.load_model('model-conv1d.h5')
metrics = cnn_model.evaluate(x_test, y_test)
print("{}: {}".format(model.metrics_names[1], metrics[1]))

categorical_accuracy: 0.49266862869262695


In [55]:
from sklearn.metrics import classification_report
target_names = cat_series.keys()
print(classification_report(y_test, classes_predictions, target_names=target_names))

               precision    recall  f1-score   support

      science       0.69      0.70      0.70       101
   technology       0.74      0.55      0.63       109
      culture       0.56      0.31      0.40        65
global issues       0.48      0.15      0.23        66
      society       0.64      0.44      0.52        48
       design       0.65      0.43      0.52        35
social change       0.70      0.42      0.53        50
    animation       0.44      0.32      0.37        38
     business       0.73      0.34      0.47        32
       health       0.94      0.96      0.95        53
      history       0.37      0.21      0.27        48

    micro avg       0.67      0.47      0.55       645
    macro avg       0.63      0.44      0.51       645
 weighted avg       0.64      0.47      0.53       645
  samples avg       0.55      0.50      0.49       645



  _warn_prf(average, modifier, msg_start, len(result))


In [56]:
keras_one = classification_report(y_test, classes_predictions, target_names=target_names, output_dict=True)
f1_keras_one = np.array([keras_one[cat]["f1-score"] for cat in categories_names]).round(2)

  _warn_prf(average, modifier, msg_start, len(result))


* Model uwzględniający wagi klas

In [57]:
history, model = keras_seq(class_weight)

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 1300, 512)         256000    
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 1289, 512)         3146240   
_________________________________________________________________
dropout_1 (Dropout)          (None, 1289, 512)         0         
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 512)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 11)                5643      
_________________________________________________________________
activation_1 (Activation)    (None, 11)                0         
Total params: 3,407,883
Trainable params: 3,407,883
Non-trainable params: 0
____________________________________________

In [58]:
cnn_model = keras.models.load_model('model-conv1d.h5')
metrics = cnn_model.evaluate(x_test, y_test)
print("{}: {}".format(model.metrics_names[1], metrics[1]))

predictions = cnn_model.predict(x_test)
classes_predictions = prob_to_class(predictions)
print(classification_report(y_test, classes_predictions, target_names=target_names))

categorical_accuracy: 0.4457477927207947
               precision    recall  f1-score   support

      science       0.69      0.60      0.65       101
   technology       0.77      0.51      0.62       109
      culture       0.57      0.35      0.44        65
global issues       0.55      0.18      0.27        66
      society       0.74      0.54      0.63        48
       design       0.62      0.46      0.52        35
social change       0.70      0.46      0.55        50
    animation       0.38      0.29      0.33        38
     business       0.73      0.34      0.47        32
       health       0.94      0.94      0.94        53
      history       0.38      0.19      0.25        48

    micro avg       0.68      0.46      0.55       645
    macro avg       0.64      0.44      0.52       645
 weighted avg       0.66      0.46      0.53       645
  samples avg       0.55      0.49      0.49       645



  _warn_prf(average, modifier, msg_start, len(result))


In [59]:
keras_one_w = classification_report(y_test, classes_predictions, target_names=target_names, output_dict=True)
f1_keras_one_w = np.array([keras_one_w[cat]["f1-score"] for cat in categories_names]).round(2)

  _warn_prf(average, modifier, msg_start, len(result))


* Funkcyjny model 3-wejściowy

In [60]:
rows_for_test = 500

In [61]:
tokenized_transcr_s = get_tokens(df["transcript"].iloc[:-rows_for_test],snow)
tokenized_descr_s = get_tokens(df["description"].iloc[:-rows_for_test],snow)
tokenized_title_s = get_tokens(df["title"].iloc[:-rows_for_test],snow)

tokenized_transcr_test = get_tokens(df["transcript"].iloc[-rows_for_test:],snow)
tokenized_descr_test = get_tokens(df["description"].iloc[-rows_for_test:],snow)
tokenized_title_test = get_tokens(df["title"].iloc[-rows_for_test:],snow)

100%|██████████████████████████████████████████████████████████████████████████████| 2901/2901 [01:12<00:00, 39.81it/s]
100%|████████████████████████████████████████████████████████████████████████████| 2901/2901 [00:02<00:00, 1156.37it/s]
100%|████████████████████████████████████████████████████████████████████████████| 2901/2901 [00:00<00:00, 7180.58it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 500/500 [00:09<00:00, 53.60it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 500/500 [00:00<00:00, 904.17it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 500/500 [00:00<00:00, 8333.80it/s]


In [62]:
tokenized_transcr_s = rare_reduction(tokenized_descr_s)
tokenized_descr_s = rare_reduction(tokenized_descr_s)
tokenized_title_s = rare_reduction(tokenized_title_s)

tokenized_transcr_test = rare_reduction(tokenized_descr_test)
tokenized_descr_test = rare_reduction(tokenized_descr_test)
tokenized_title_test = rare_reduction(tokenized_title_test)


100%|███████████████████████████████████████████████████████████████████████████| 2901/2901 [00:00<00:00, 22317.04it/s]
100%|███████████████████████████████████████████████████████████████████████████| 2901/2901 [00:00<00:00, 21976.84it/s]
100%|██████████████████████████████████████████████████████████████████████████| 2901/2901 [00:00<00:00, 145053.60it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 500/500 [00:00<00:00, 19231.63it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 500/500 [00:00<00:00, 20833.39it/s]
100%|████████████████████████████████████████████████████████████████████████████| 500/500 [00:00<00:00, 125016.51it/s]


In [63]:
transc_joined = join_again(tokenized_transcr_s)
descr_joined = join_again(tokenized_descr_s)
title_joined = join_again(tokenized_title_s)

transc_joined_test = join_again(tokenized_transcr_test)
descr_joined_test = join_again(tokenized_descr_test)
title_joined_test = join_again(tokenized_title_test)

In [64]:
text_size_1 = 1500
text_size_2 = 50

x_transcript  = get_features(transc_joined, text_size_1)
x_description = get_features(descr_joined, text_size_2)
x_title = get_features(title_joined, text_size_2)

y = np.array(df_topics_bin.iloc[:-rows_for_test])
y = y.astype('int32')

# Test


x_transcript_test  = get_features(transc_joined_test, text_size_1)
x_description_test = get_features(descr_joined_test, text_size_2)
x_title_test = get_features(title_joined_test, text_size_2)
x_test = [x_transcript_test,x_description_test] # , x_title_test

y_test = np.array(df_topics_bin.iloc[-rows_for_test:])
y_test = y_test.astype('int32')

x_description.shape, x_transcript.shape,  y.shape, y_test.shape # x_title.shape,

((2901, 50), (2901, 1500), (2901, 11), (500, 11))

In [65]:
from tensorflow.keras.layers import Input, Embedding
from tensorflow.keras import layers
from tensorflow.keras.models import Model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Embedding, Flatten, GlobalMaxPool1D, Dropout, Conv1D, LSTM, Input, SpatialDropout1D
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint
from tensorflow.keras.losses import binary_crossentropy
from tensorflow.keras.optimizers import Adam

transcript_input = Input(shape = (None,), dtype = 'int64', name = 'transcription')
description_input = Input(shape = (None,), dtype = 'int64', name = 'description')
title_input = Input(shape = (None,), dtype = 'int64', name = 'title')

embedded_transcript  = Embedding(input_dim = text_size_1, output_dim = 256, input_length=maxlen)(transcript_input)
embedded_description = Embedding(input_dim = 50, output_dim = 32, input_length=50)(description_input)
embedded_title = Embedding(input_dim = 50, output_dim = 32, input_length=50)(title_input)

encoded_transcript  = Conv1D(filters = 256, kernel_size = 12, padding='valid', activation='sigmoid', strides=1)(embedded_transcript)
encoded_description = Conv1D(filters = 32, kernel_size = 6, padding='valid', activation='sigmoid', strides=1)(embedded_description)
encoded_title = Conv1D(filters = 32, kernel_size = 6, padding='valid', activation='sigmoid', strides=1)(embedded_title)

pooled_transcript = GlobalMaxPool1D()(encoded_transcript)
pooled_description = GlobalMaxPool1D()(encoded_description)
pooled_title = GlobalMaxPool1D()(encoded_title)

concat = layers.concatenate([pooled_transcript, pooled_description, pooled_title], axis = -1)
# concat = layers.concatenate([pooled_transcript, pooled_description], axis = -1)


x = Dense(500, activation = "sigmoid")(concat)

output = Dense(num_classes, activation = "sigmoid")(x)

model = Model([transcript_input, description_input, title_input], output)

# model = Model([transcript_input, description_input], output)

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['categorical_accuracy'])


callbacks = [
    ReduceLROnPlateau(), 
    EarlyStopping(patience=4), 
    ModelCheckpoint(filepath='model-conv2d.h5', save_best_only = True)
]

model.fit([x_transcript, x_description, x_title], y, epochs = 30, batch_size=256, validation_split=0.1) #, callbacks = callbacks)
# model.fit([x_transcript, x_description], y, epochs = 15, batch_size=256, validation_split=0.1) #, callbacks = callbacks)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x1b1fa5ec730>

In [66]:
# cnn_model = keras.models.load_model('model-conv2d.h5')

predictions = model.predict([x_transcript_test, x_description_test, x_title_test])
classes_predictions = prob_to_class(predictions)
print(classification_report(y_test, classes_predictions, target_names=target_names))

               precision    recall  f1-score   support

      science       0.57      0.60      0.59       151
   technology       0.51      0.43      0.47        95
      culture       0.29      0.52      0.38        29
global issues       0.58      0.28      0.38        88
      society       0.69      0.29      0.40        70
       design       0.48      0.33      0.39        36
social change       0.58      0.48      0.53        31
    animation       0.54      0.33      0.41        95
     business       0.55      0.39      0.46       111
       health       1.00      0.00      0.01       202
      history       0.59      0.37      0.45       104

    micro avg       0.54      0.33      0.41      1012
    macro avg       0.58      0.37      0.41      1012
 weighted avg       0.65      0.33      0.37      1012
  samples avg       0.43      0.33      0.35      1012



  _warn_prf(average, modifier, msg_start, len(result))


In [67]:
keras_three = classification_report(y_test, classes_predictions, target_names=target_names, output_dict=True)
f1_keras_three = np.array([keras_three[cat]["f1-score"] for cat in categories_names]).round(2)

  _warn_prf(average, modifier, msg_start, len(result))


In [68]:
import pandas as pd
scores_df = pd.DataFrame([categories_names,pd.Series(f1_keras_one), pd.Series(f1_keras_one_w), pd.Series(f1_power),\
                          pd.Series(f1_chain),pd.Series(f1_keras_three), pd.Series(f1_brelevance)]).T

In [69]:
scores_df.columns = ["Classes","Keras one-input", "Keras one-input-weight","S-k Powerset","S-k Chain", "Keras three-input","S-k B-relevance"]

In [70]:
scores_df

Unnamed: 0,Classes,Keras one-input,Keras one-input-weight,S-k Powerset,S-k Chain,Keras three-input,S-k B-relevance
0,science,0.7,0.65,0.75,0.68,0.59,0.54
1,technology,0.63,0.62,0.63,0.59,0.47,0.44
2,culture,0.4,0.44,0.52,0.41,0.38,0.2
3,global issues,0.23,0.27,0.38,0.2,0.38,0.3
4,society,0.52,0.63,0.25,0.34,0.4,0.12
5,design,0.52,0.52,0.53,0.5,0.39,0.13
6,social change,0.52,0.55,0.51,0.47,0.53,0.24
7,animation,0.37,0.33,0.57,0.48,0.41,0.15
8,business,0.47,0.47,0.23,0.25,0.46,0.1
9,health,0.95,0.94,0.62,0.78,0.01,0.12
