In [None]:
import os
import csv
import random
import logging as log
log.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=log.INFO)

import pandas as pd
import numpy as np
import text_normalizer as tn
import nltk
from nltk.tokenize import word_tokenize
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, KFold
from sklearn import tree
from sklearn.svm import LinearSVC
from sklearn.decomposition import IncrementalPCA
from sklearn.manifold import TSNE

import lightgbm as lgbm

import shap

import fasttext

import gensim
from gensim import utils as gensim_utils
from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

pd.set_option('display.max_rows',1000)
pd.set_option('display.max_columns',1000)
pd.set_option('display.max_colwidth', None)

%matplotlib inline

os.environ['PYTHONHASHSEED'] = '0'
np.random.seed(37)
random.seed(17)

stopwords = nltk.corpus.stopwords.words('english')
stopwords = stopwords + ['los', 'must', 'may', 'could','jim','would','without','also','thus','however','ben']

In [None]:
def display_closestwords_tsnescatterplot(model, dim, words, topn=10):
    """Scatter-plot the given words and their nearest neighbours in a 2-D t-SNE projection.

    Parameters
    ----------
    model : word-vector store
        Must support ``model[word]`` (the word's vector) and
        ``model.similar_by_word(word, topn=...)`` (a sequence whose items
        carry the neighbour word at position 0).
    dim : int
        Expected length of every word vector.
    words : iterable of str
        Seed words. The iterable is only read; it is never modified.
    topn : int, default 10
        Number of neighbours requested for each seed word.

    Raises
    ------
    ValueError
        If the collected vectors do not form an ``(n, dim)`` matrix.
    """
    # Work on a copy: the labels list grows with the neighbours and must not
    # leak back into the list object owned by the caller.
    labels = list(words)

    vectors = []
    neighbours = []
    for word in labels:
        vectors.append(model[word])
        neighbours.extend(model.similar_by_word(word, topn=topn))

    # Seed words come first, then every neighbour, in the order they were found.
    for hit in neighbours:
        labels.append(hit[0])
        vectors.append(model[hit[0]])

    # Build the matrix once instead of re-allocating it with np.append per row.
    if vectors:
        arr = np.asarray(vectors, dtype='f')
    else:
        arr = np.empty((0, dim), dtype='f')
    if arr.ndim != 2 or arr.shape[1] != dim:
        raise ValueError(
            'expected word vectors of length %d, got an array of shape %s' % (dim, arr.shape)
        )

    # find tsne coords for 2 dimensions
    # 30 is scikit-learn's default perplexity; recent releases reject a
    # perplexity that is not smaller than the number of samples, so cap it
    # for small word sets (e.g. one seed word plus 20 neighbours).
    perplexity = float(min(30, max(1, arr.shape[0] - 1)))
    tsne = TSNE(n_components=2, random_state=0, perplexity=perplexity)
    Y = tsne.fit_transform(arr)

    x_coords = Y[:, 0]
    y_coords = Y[:, 1]
    # display scatter plot
    plt.scatter(x_coords, y_coords)

    for label, x, y in zip(labels, x_coords, y_coords):
        plt.annotate(label, xy=(x, y), xytext=(0, 0), textcoords='offset points')
    # Pad both ends outwards so the extreme points are not clipped.
    margin = 0.00005
    plt.xlim(x_coords.min() - margin, x_coords.max() + margin)
    plt.ylim(y_coords.min() - margin, y_coords.max() + margin)
    plt.show()
    
def tsne_plot(model, words):
    """Create a t-SNE model for the given words and plot the 2-D projection.

    ``model[word]`` is looked up for every entry of ``words``; each projected
    point is drawn with its own ``scatter`` call and annotated with the word.
    """
    labels = []
    tokens = []
    for word in words:
        vector = model[word]
        tokens.append(vector)
        labels.append(word)

    tsne_model = TSNE(perplexity=40, n_components=2, init='pca', n_iter=2500, random_state=23)
    projected = tsne_model.fit_transform(tokens)

    plt.figure(figsize=(14, 10))
    for label, point in zip(labels, projected):
        px, py = point[0], point[1]
        plt.scatter(px, py)
        plt.annotate(
            label,
            xy=(px, py),
            xytext=(5, 2),
            textcoords='offset points',
            ha='right',
            va='bottom',
        )
    plt.show()

### Preprocessing

In [None]:
# MovieSummaries dataset. Source: http://www.cs.cmu.edu/~ark/personas/
df_meta = pd.read_csv('movie_genre_prediction/movie.metadata.tsv', sep='\t')
df_meta.head()

In [None]:
df_plot = pd.read_csv('movie_genre_prediction/plot_summaries.txt', sep='\t')
#df_plot.head()

In [None]:
df = pd.merge(df_plot, df_meta,on='movie_id',how='left')
#df.head()

In [None]:
df.drop(['x1','title','date','x2','x3','lang','country'],axis=1,inplace=True)
df.shape

In [None]:
df['plot'] = df['plot'].astype(str)
df['tags'] = df['tags'].astype(str)
df['tags'] = df['tags'].str.lower()

In [None]:
df['sci'] = ''
df['sci'] = df['tags'].apply(lambda x : 1 if 'science fiction' in x else 0)

In [None]:
df_sci = df[df['sci'] == 1]
df_sci.shape

In [None]:
df_non_sci = df[df['sci'] == 0][:2500]
df_non_sci.shape

In [None]:
df_train = pd.concat([df_sci,df_non_sci])

In [None]:
df_train['sci'].value_counts()

In [None]:
df_train['cleaned_plot'] = tn.normalize_corpus(corpus=df_train['plot'],stopwords=stopwords)
df_train.drop(['plot','tags'],axis=1,inplace=True)
df_train.to_csv('cleaned_plots.csv',index=False)
df_train.head()

In [None]:
df_train['cleaned_plot'] = df_train['plot']
df_train.drop(['plot','tags'],axis=1,inplace=True)
df_train.to_csv('cleaned_plots_original.csv',index=False)
df_train.head()

### TF IDF features

In [None]:
df = pd.read_csv('movie_genre_prediction/cleaned_plots.csv')

In [None]:
max_features = 700
min_df = 10
max_df = 0.5

In [None]:
df_train, df_test = train_test_split(df, test_size=0.2, random_state=31, shuffle=True, stratify=df['sci'])

In [None]:
tfidf = TfidfVectorizer(
    max_features=max_features,
    min_df=min_df,
    max_df=max_df,
    stop_words=stopwords,
    analyzer='word',
    # Raw string: \W and \d are regex classes, not Python string escapes.
    # The pattern keeps runs of two or more letters (no digits, no underscore).
    token_pattern=r'[^\W\d_]{2,}',
    ngram_range=(1, 3),
    strip_accents='unicode',
)

In [None]:
train_texts = df_train['cleaned_plot'].tolist()

tfidf.fit(train_texts)

tfidf_train = np.array(tfidf.transform(df_train['cleaned_plot']).todense(), dtype=np.float16)

tfidf_feature_names = { v:k for k,v in tfidf.vocabulary_.items() }

for i in range(max_features):
    df_train['tfidf_' + tfidf_feature_names[i]] = tfidf_train[:, i]

In [None]:
tfidf_test = np.array(tfidf.transform(df_test['cleaned_plot']).todense(), dtype=np.float16)

for i in range(max_features):
    df_test['tfidf_' + tfidf_feature_names[i]] = tfidf_test[:, i]

In [None]:
X_train = df_train.drop(['movie_id','sci','cleaned_plot'], axis=1,errors='ignore')
y_train = df_train['sci']
X_test = df_test.drop(['movie_id','sci','cleaned_plot'], axis=1,errors='ignore')
y_test = df_test['sci']

In [None]:
lr = LogisticRegression()
knn = KNeighborsClassifier()
d_tree = DecisionTreeClassifier()
forest = RandomForestClassifier()
svm = LinearSVC()
lgm = lgbm.LGBMClassifier()

for model in [svm, knn,d_tree,lr,lgm,forest]:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print ('%s accuracy score: %f' % (model.__class__.__name__, model.score(X_test, y_test)))
    #print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

In [None]:
# Collect the test rows whose prediction disagrees with the true label.
# NOTE(review): on a top-to-bottom run y_pred is whatever the last iteration of the
# model-comparison loop above left behind, while the SHAP cells below explain `lr`
# -- confirm that both refer to the model you intend to inspect.
df_out = X_test.copy()
df_out['truth'] = y_test
# reset_index moves the original row labels into an 'index' column; the new
# 0..n-1 index is the row position within X_test.
df_out.reset_index(inplace=True)
# y_pred is assigned after the reset, i.e. by position.
df_out['predicted'] = y_pred
df_misclassified = df_out[df_out['truth'] != df_out['predicted']][['index','truth','predicted']]

In [None]:
df_misclassified.head(10)

In [None]:
index = 2986
orig_index = df_misclassified[df_misclassified['index'] == index].index.values[0]
movie_id = df.loc[index]['movie_id']
print(df.loc[index])
print(X_test.loc[index])
print(orig_index)
print(movie_id)

In [None]:
df_meta[df_meta['movie_id'] == movie_id]

In [None]:
model = lr

In [None]:
feature_names = tfidf.get_feature_names_out()
explainer = shap.Explainer(model, X_train, feature_names=feature_names)

In [None]:
shap.initjs()

In [None]:
#shapexplainer = shap.Explainer(forest, X_train, feature_names=feature_names)
shap_values = explainer.shap_values(X_test)

In [None]:
shap_values_exp = explainer(X_test)

#### Global Explanations

In [None]:
try:
    # explainer for tree-based models
    shap.plots.bar(shap_values_exp[:,:,1], max_display=20)
except IndexError:
    # falling back to standard explainer
    print('Falling back to standard explainer')
    shap.plots.bar(shap_values_exp, max_display=20)

In [None]:
shap.summary_plot(shap_values, X_test, plot_type='bar',feature_names=feature_names, max_display=20)

In [None]:
# change the first argument from 0 to 1 to see the chart from other angle

try:
    shap.summary_plot(shap_values[0], X_test, plot_type='violin',feature_names=feature_names, max_display=20)
except AssertionError:
    print('Falling back to standard explainer')
    shap.summary_plot(shap_values, X_test, plot_type='violin',feature_names=feature_names, max_display=20)

In [None]:
# change the first argument from 1 to 0 to see the chart from other angle

try:
    shap.summary_plot(shap_values[1], X_test, plot_type='dot',feature_names=feature_names, max_display=20)
except AssertionError:
    print('Falling back to standard explainer')
    # Keep the plot type of the primary attempt; the fallback only changes which
    # SHAP values are passed, otherwise this cell repeats the violin plot above.
    shap.summary_plot(shap_values, X_test, plot_type='dot',feature_names=feature_names, max_display=20)

#### Local Explainers

In [None]:
try:
    shap.plots.waterfall(shap_values_exp[orig_index,:,1], max_display=20)
except IndexError:
    print('Falling back')
    shap.plots.waterfall(shap_values_exp[orig_index], max_display=20)

In [None]:
try:
    shap.plots.bar(shap_values_exp[orig_index,:,1], max_display=20)
except IndexError:
    print('Falling back')
    shap.plots.bar(shap_values_exp[orig_index], max_display=20)

In [None]:
# TODO: to understand this plot
try:
    shap.dependence_plot(feature_names.tolist().index('like'), shap_values[1], X_test)
except TypeError:
    shap.dependence_plot(feature_names.tolist().index('like'), shap_values, X_test)

In [None]:
def predict(x):
    """Return ``model.predict_proba`` for raw texts.

    The texts are wrapped in a pandas Series, vectorised with the notebook-level
    ``tfidf`` object and passed to the notebook-level ``model``.
    """
    texts = pd.Series(x)
    features = tfidf.transform(texts)
    return model.predict_proba(features)

masker = shap.maskers.Text(r"\W")
corpus = [df.loc[index]['cleaned_plot']]
single_explainer = shap.Explainer(predict, masker, output_names=['Non Sci-Fi','Sci-Fi'])
single_shap_values = single_explainer(corpus)
shap.plots.text(single_shap_values)

In [None]:
choosen_instance = X_test.loc[index]
shap_values2 = explainer.shap_values(choosen_instance)
try:
    plot = shap.force_plot(explainer.expected_value[1], shap_values2[1], choosen_instance)
except IndexError:
    plot = shap.force_plot(explainer.expected_value, shap_values2, choosen_instance)
# the code block did not display the chart in the try-catch so I had to explicitly make the plot to be shown with this last line
plot

In [None]:
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

### Word2vec

In [None]:
df = pd.read_csv('movie_genre_prediction/cleaned_plots.csv')

In [None]:
sentences=[gensim_utils.simple_preprocess(x) for x in df['cleaned_plot'].tolist()]

vector_size = 300
window_size = 10
min_count = 10

w2v_model = gensim.models.Word2Vec(sentences,
                                   vector_size=vector_size,
                                   window=window_size,
                                   min_count=min_count)

In [None]:
w2v_model.wv.most_similar('deep')

In [None]:
w2v_model.build_vocab(sentences)

In [None]:
w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)

In [None]:
# tokenize the documents into words

from nltk.tokenize import word_tokenize
df['tok_plot'] = df['cleaned_plot'].str.lower().apply(word_tokenize)
#df['tok_plot_bi'] = df['tok_plot'].apply(lambda x: [x[0] + ' ' + x[1] for x in list(nltk.bigrams(x))])
#df['tok_plot_sum'] = df['tok_plot'] + df['tok_plot_bi']

In [None]:
df.head(5)

In [None]:
# convert the tokenized words into list of word vectors

words = set(w2v_model.wv.index_to_key )
# Every document yields a matrix with a different number of rows. Wrapping such
# a ragged list in np.array() raises ValueError on NumPy 1.24+, so the
# per-document matrices are stored one by one in an object array instead.
vect_plot = np.empty(len(df), dtype=object)
for pos, tokens in enumerate(df['tok_plot']):
    # tokens missing from the model vocabulary are skipped
    vect_plot[pos] = np.array([w2v_model.wv[tok] for tok in tokens if tok in words])
df['vect_plot'] = vect_plot

In [None]:
df.head()

In [None]:
# Documents differ in length, so their lists of word vectors differ in length too,
# but machine learning needs an input of the same size for every document.
# The token count and the number of word vectors also differ per document, because words
# that are not in the vocabulary (e.g. too few occurrences, or a stop word) have no vector.

for i, v in enumerate(df['vect_plot']):
    print(len(df['tok_plot'].iloc[i]), len(v))

In [None]:
# to have the same size vectors for all document we are generating the averaged document vectors
# the result is a constant size word vector for all documents

text_vect_avg = []
for v in df['vect_plot']:
    # v.size = length of word vector list * word vector size
    if v.size:
        text_vect_avg.append(v.mean(axis=0))
        # axis=0 means it is averaging the values in the same position in all lists. This guarantees the same size.
        # The number of lists are different for each documents but the length of the word vectors are the same.
    else:
        text_vect_avg.append(np.zeros(vector_size, dtype=float)) # the same vector size must be used here as for model training
        
        
df['vect_plot_avg'] = text_vect_avg
df.head()

In [None]:
# now we can see that the vector lengths are constant

for i, v in enumerate(df['vect_plot_avg']):
    print(len(df['tok_plot'].iloc[i]), len(v))

In [None]:
df_train = pd.DataFrame(text_vect_avg)
df_train.columns = ['vec_avg_' + str(i+1) for i in range(0, df_train.shape[1])]
df_train

In [None]:
final_df = pd.concat([df[['sci']], df_train], axis=1, sort=False)
final_df

In [None]:
X = final_df.drop(['sci'], axis=1,errors='ignore')
y = final_df['sci']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42, shuffle=True, stratify=y)

In [None]:
lr = LogisticRegression(max_iter=1000)
knn = KNeighborsClassifier()
d_tree = DecisionTreeClassifier()
forest = RandomForestClassifier()
svm = LinearSVC()

for model in [lr,knn,d_tree,forest,svm]:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print ('%s accuracy score: %f' % (model.__class__.__name__, model.score(X_test, y_test)))
    #print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

In [None]:
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

In [None]:
display_closestwords_tsnescatterplot(w2v_model.wv, 300, ['love'], 20)

### Glove

In [None]:
df = pd.read_csv('movie_genre_prediction/cleaned_plots.csv')

In [None]:
df['tok_plot'] = df['cleaned_plot'].str.lower().apply(word_tokenize)

In [None]:
glove_file = datapath(r'e:\python\nlp\glove.6B.300d.txt')
tmp_file = get_tmpfile('test_word2vec.txt')

_ = glove2word2vec(glove_file, tmp_file)
w2v_model = KeyedVectors.load_word2vec_format(tmp_file)

In [None]:
w2v_model.most_similar('deep', topn=20)

In [None]:
words = set(w2v_model.index_to_key)
words = words - set(stopwords)

In [None]:
# Every document yields a matrix with a different number of rows. Wrapping such
# a ragged list in np.array() raises ValueError on NumPy 1.24+, so the
# per-document matrices are stored one by one in an object array instead.
vect_plot = np.empty(len(df), dtype=object)
for pos, tokens in enumerate(df['tok_plot']):
    # only tokens present in `words` are looked up
    vect_plot[pos] = np.array([w2v_model[tok] for tok in tokens if tok in words])
df['vect_plot'] = vect_plot

In [None]:
df.head()

In [None]:
# Documents differ in length, so their lists of word vectors differ in length too,
# but machine learning needs an input of the same size for every document.
# The token count and the number of word vectors also differ per document, because words
# that are not in the vocabulary (e.g. too few occurrences, or a stop word) have no vector.

for i, v in enumerate(df['vect_plot']):
    print(len(df['tok_plot'].iloc[i]), len(v))

In [None]:
# to have the same size vectors for all document we are generating the averaged document vectors
# the result is a constant size word vector for all documents

text_vect_avg = []
for v in df['vect_plot']:
    # v.size = length of word vector list * word vector size
    if v.size:
        text_vect_avg.append(v.mean(axis=0))
        # axis=0 means it is averaging the values in the same position in all lists. This guarantees the same size.
        # The number of lists are different for each documents but the length of the word vectors are the same.
    else:
        text_vect_avg.append(np.zeros(vector_size, dtype=float)) # the same vector size must be used here as for model training
        
        
df['vect_plot_avg'] = text_vect_avg
df.head()

In [None]:
# now we can see that the vector lengths are constant

for i, v in enumerate(df['vect_plot_avg']):
    print(len(df['tok_plot'].iloc[i]), len(v))

In [None]:
df_train = pd.DataFrame(text_vect_avg)
df_train.columns = ['vec_avg_' + str(i+1) for i in range(0, df_train.shape[1])]
df_train

In [None]:
final_df = pd.concat([df[['sci']], df_train], axis=1, sort=False)
final_df

In [None]:
X = final_df.drop(['sci'], axis=1,errors='ignore')
y = final_df['sci']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42, shuffle=True, stratify=y)

In [None]:
lr = LogisticRegression(max_iter=1000)
knn = KNeighborsClassifier()
d_tree = DecisionTreeClassifier()
forest = RandomForestClassifier()
svm = LinearSVC()

for model in [lr,knn,d_tree,forest,svm]:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print ('%s accuracy score: %f' % (model.__class__.__name__, model.score(X_test, y_test)))
    #print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

In [None]:
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

In [None]:
display_closestwords_tsnescatterplot(w2v_model, 300, ['alien'], 20)

### FastText

#### in standalone mode

In [None]:
# fasttext is a subword based model

df = pd.read_csv('movie_genre_prediction/cleaned_plots.csv')

In [None]:
def concat_target_label(row):
    """Prefix the row's plot text with its fastText label marker.

    Reads ``row['sci']`` and ``row['cleaned_plot']`` and returns
    ``'__label__<sci> <text>'``.
    """
    label_token = '__label__' + str(row['sci'])
    plot_text = row['cleaned_plot']
    return ' '.join((label_token, plot_text))
    
# fasttext expects the label in front of the text of each line, in the format __label__<label_value> <text>
df['cleaned_plot'] = df.apply(concat_target_label, axis=1)

In [None]:
df.head()

In [None]:
df_train, df_test = train_test_split(df[['sci','cleaned_plot']], test_size=0.2, random_state=31, 
                                     shuffle=True, stratify=df['sci'])

In [None]:
df_train.head()

In [None]:
# fasttext reads the training and the test set from separate files

# Only the text column is written: it already starts with the __label__<value>
# prefix. Writing the 'sci' column as well would put the target value in front
# of every line as an ordinary token, from which the classifier could read the
# answer directly (label leakage, also inflating model.test on test.txt).
fasttext_csv_options = dict(
    index = False,
    sep = ' ',
    header = None,
    quoting = csv.QUOTE_NONE,
    quotechar = "",
    escapechar = " ")

df_train[['cleaned_plot']].to_csv('train.txt', **fasttext_csv_options)

df_test[['cleaned_plot']].to_csv('test.txt', **fasttext_csv_options)


# Training the fastText classifier.
# in this case the unigrams resulted in a stronger model than longer n-grams
model = fasttext.train_supervised('train.txt', wordNgrams = 1)


In [None]:
model.test('test.txt')  

In [None]:
model.get_nearest_neighbors('alien', k=20)

In [None]:
model.predict("Earth's future has been riddled by disasters, famines, and droughts. There is only one way to ensure mankind's survival: Interstellar travel. A newly discovered wormhole in the far reaches of our solar system allows a team of astronauts to go where no man has gone before, a planet that may have the right environment to sustain human life")

In [None]:
# predict the data
df_test["predicted"] = df_test["cleaned_plot"].apply(lambda x: int(model.predict(x)[0][0].replace('__label__','')))

# Create the confusion matrix
confusion_matrix(df_test["sci"], df_test["predicted"])

#### as a gensim model

In [None]:
df = pd.read_csv('movie_genre_prediction/cleaned_plots.csv')

In [None]:
from gensim.models import FastText

In [None]:
ft_model = FastText(vector_size=300, window=10, min_count=10)

In [None]:
sentences=[gensim_utils.simple_preprocess(x) for x in df['cleaned_plot'].tolist()]

In [None]:
ft_model.build_vocab(corpus_iterable=sentences)

In [None]:
ft_model.train(corpus_iterable=sentences, total_examples=len(sentences), epochs=10)

In [None]:
ft_model.wv.most_similar('alien')

In [None]:
# tokenize the documents into words

from nltk.tokenize import word_tokenize
df['tok_plot'] = df['cleaned_plot'].str.lower().apply(word_tokenize)

In [None]:
df.head()

In [None]:
words = set(ft_model.wv.index_to_key )
# Every document yields a matrix with a different number of rows. Wrapping such
# a ragged list in np.array() raises ValueError on NumPy 1.24+, so the
# per-document matrices are stored one by one in an object array instead.
vect_plot = np.empty(len(df), dtype=object)
for pos, tokens in enumerate(df['tok_plot']):
    # only tokens present in the model's key list are looked up
    vect_plot[pos] = np.array([ft_model.wv[tok] for tok in tokens if tok in words])
df['vect_plot'] = vect_plot

In [None]:
df.head()

In [None]:
# Documents differ in length, so their lists of word vectors differ in length too,
# but machine learning needs an input of the same size for every document.
# The token count and the number of word vectors also differ per document, because words
# that are not in the vocabulary (e.g. too few occurrences, or a stop word) have no vector.

for i, v in enumerate(df['vect_plot']):
    print(len(df['tok_plot'].iloc[i]), len(v))

In [None]:
text_vect_avg = []
for v in df['vect_plot']:
    # v.size = length of word vector list * word vector size
    if v.size:
        text_vect_avg.append(v.mean(axis=0))
        # axis=0 means it is averaging the values in the same position in all lists. This guarantees the same size.
        # The number of lists are different for each documents but the length of the word vectors are the same.
    else:
        text_vect_avg.append(np.zeros(vector_size, dtype=float)) # the same vector size must be used here as for model training
        
        
df['vect_plot_avg'] = text_vect_avg
df.head()

In [None]:
# now we can see that the vector lengths are constant

for i, v in enumerate(df['vect_plot_avg']):
    print(len(df['tok_plot'].iloc[i]), len(v))

In [None]:
df_train = pd.DataFrame(text_vect_avg)
df_train.columns = ['vec_avg_' + str(i+1) for i in range(0, df_train.shape[1])]
df_train

In [None]:
final_df = pd.concat([df[['sci']], df_train], axis=1, sort=False)
final_df

In [None]:
X = final_df.drop(['sci'], axis=1,errors='ignore')
y = final_df['sci']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42, shuffle=True, stratify=y)

In [None]:
lr = LogisticRegression(max_iter=1000)
knn = KNeighborsClassifier()
d_tree = DecisionTreeClassifier()
forest = RandomForestClassifier()
svm = LinearSVC()

for model in [lr,knn,d_tree,forest,svm]:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print ('%s accuracy score: %f' % (model.__class__.__name__, model.score(X_test, y_test)))
    #print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

In [None]:
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

In [None]:
display_closestwords_tsnescatterplot(ft_model.wv, 300, ['alien'], 20)

### Transformers

In [None]:
from datasets import load_dataset, Dataset, DatasetDict
from transformers import AutoTokenizer, TFAutoModel, TFAutoModelForSequenceClassification, DataCollatorWithPadding 

In [None]:
df = pd.read_csv('movie_genre_prediction/cleaned_plots_original.csv')

In [None]:
df_train, df_test = train_test_split(df, test_size=0.2, random_state=31, shuffle=True, stratify=df['sci'])

In [None]:
df_test, df_val = train_test_split(df_test, test_size=0.1, random_state=31, shuffle=True, stratify=df_test['sci'])

In [None]:
train_ds = Dataset.from_pandas(df_train, split="train")
test_ds = Dataset.from_pandas(df_test, split="test")
val_ds = Dataset.from_pandas(df_val, split="validation")

dataset_dict = DatasetDict({"train":train_ds,"test":test_ds,"val":val_ds})
dataset_dict

In [None]:
print(train_ds[:5])

In [None]:
model_name = "distilbert-base-uncased"

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
def tokenize(batch):
    """Run the notebook-level ``tokenizer`` over the batch's ``cleaned_plot`` texts.

    Padding and truncation are both enabled; the tokenizer's result is returned as is.
    """
    texts = batch["cleaned_plot"]
    options = {"padding": True, "truncation": True}
    return tokenizer(texts, **options)

In [None]:
encoded_datasets = dataset_dict.map(tokenize, batched=True, batch_size=None)

In [None]:
encoded_datasets

#### Transformer as feature extractor

Transformers can extract features from text similarly to word2vec. The extracted features are the hidden states of the neural network under the hood of the transformer model.

In [None]:
tf_model = TFAutoModel.from_pretrained(model_name)

In [None]:
encoded_datasets.reset_format()

def extract_hidden_states(batch):
    """Return the last hidden state at sequence position 0 for every text in the batch.

    Uses the notebook-level ``tokenizer`` and ``tf_model``. The result is a dict
    with the single key ``"hidden_state"`` so it can be used with ``Dataset.map``.
    """
    # Tokenise the texts into TensorFlow tensors
    encoded = tokenizer(batch["cleaned_plot"], padding=True, truncation=True, return_tensors='tf')
    # Forward pass; only the last layer's hidden states are used
    last_hidden = tf_model(encoded).last_hidden_state
    # index 0 on the second axis -- presumably the [CLS] token; confirm for the chosen checkpoint
    first_position = last_hidden[:, 0]
    return {"hidden_state": first_position.numpy()}

In [None]:
# larger batch sizes cause out of resource errors
datasets_hidden = encoded_datasets.map(extract_hidden_states, batched=True, batch_size=16)
datasets_hidden

In [None]:
X_train = np.array(datasets_hidden["train"]["hidden_state"]) 
X_valid = np.array(datasets_hidden["test"]["hidden_state"]) 
y_train = np.array(datasets_hidden["train"]["sci"]) 
y_valid = np.array(datasets_hidden["test"]["sci"]) 
X_train.shape, X_valid.shape

In [None]:
lr_clf = LogisticRegression(max_iter=3000) 
lr_clf.fit(X_train, y_train) 
lr_clf.score(X_valid, y_valid)

In [None]:
y_preds = lr_clf.predict(X_valid)

In [None]:
def plot_cm(y_true, y_pred, figsize=(7, 5)):
    """Draw an annotated confusion-matrix heatmap.

    Parameters
    ----------
    y_true, y_pred : array-like
        True and predicted labels. Only the labels occurring in ``y_true`` are
        used as rows/columns.
    figsize : tuple, default (7, 5)
        Size passed to ``plt.subplots``.

    Each cell shows its share of the row (actual class) in percent and its raw
    count; diagonal cells additionally show the row total, and empty
    off-diagonal cells are left blank.
    """
    labels = np.unique(y_true)
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    cm_sum = np.sum(cm, axis=1, keepdims=True)
    cm_perc = cm / cm_sum.astype(float) * 100
    # Object dtype: a fixed-width string array would silently truncate long annotations.
    annot = np.empty(cm.shape, dtype=object)
    nrows, ncols = cm.shape
    for i in range(nrows):
        # Take the row total as a scalar; formatting a 1-element array with %d
        # relies on a deprecated array-to-scalar conversion.
        row_total = cm_sum[i, 0]
        for j in range(ncols):
            c = cm[i, j]
            p = cm_perc[i, j]
            if i == j:
                annot[i, j] = '%.1f%%\n%d/%d' % (p, c, row_total)
            elif c == 0:
                annot[i, j] = ''
            else:
                annot[i, j] = '%.1f%%\n%d' % (p, c)
    cm = pd.DataFrame(cm, index=labels, columns=labels)
    cm.index.name = 'Actual'
    cm.columns.name = 'Predicted'
    fig, ax = plt.subplots(figsize=figsize)
    # fmt='' because the annotations are already formatted strings
    sns.heatmap(cm, cmap= "YlGnBu", annot=annot, fmt='', ax=ax)
    
df_eval = pd.DataFrame({'y_true': y_valid, 'y_preds': y_preds})
plot_cm(df_eval['y_true'], df_eval['y_preds'])

#### Fine tuning transformer

Another way to classify with a transformer is to train it as part of the classifier,
instead of using its hidden states as fixed features for a separate model.

In [None]:
tf_model = (TFAutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2))

In [None]:
tokenizer_columns = tokenizer.model_input_names
# Define a batch size
batch_size = 8
# collator is here to support batching
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")
tf_train_dataset = encoded_datasets["train"].to_tf_dataset(columns=tokenizer_columns,label_cols=["sci"], 
                                                           shuffle=True,batch_size=batch_size,collate_fn=data_collator)
tf_test_dataset = encoded_datasets["test"].to_tf_dataset(columns=tokenizer_columns, label_cols=["sci"], 
                                                         shuffle=False,batch_size=batch_size,
                                                         collate_fn=data_collator)
tf_train_dataset

In [None]:
tf_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
                 loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), 
                 metrics=tf.metrics.SparseCategoricalAccuracy())
tf_model.fit(tf_train_dataset, validation_data=tf_test_dataset, epochs=2)
loss, eval_accuracy = tf_model.evaluate(tf_test_dataset)
print("Loss: {}\t Test Accuracy: {}".format(loss, eval_accuracy))

In [None]:
output_logits = tf_model.predict(tf_test_dataset).logits
pred_labels = np.argmax(output_logits, axis=-1)

In [None]:
encoded_datasets["test"] = encoded_datasets["test"].add_column("predicted_label", pred_labels)

In [None]:
encoded_datasets

In [None]:
encoded_datasets.set_format("pandas") 
cols = ["cleaned_plot", "sci", "predicted_label"]
df_test = encoded_datasets["test"][:][cols] 
df_test.head(4)

In [None]:
plot_cm(df_test.sci, df_test.predicted_label)