# 2019-2021 DSAA - Text Mining Project 
## Classification of Authors by Texts

**Group members:** 

M20190551  Wenyi Liang <br>
M20190559  Ernesto Madrid <br>
M20190802  Eliane Maria Zanlorense 

###  Access the corpora


In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import os
import warnings
warnings.filterwarnings('ignore')

# Define a function to read the text file
def read_text_file(path, author):
    files = os.listdir(path)
    texts_list = []
    for file in files:
        if file.endswith(".txt"):
            text = open(path+file, 'r', encoding='utf-8').read()
            texts_list.append(text)
    df = pd.DataFrame({'Text': texts_list})
    df['Author_Label'] = author
    return df

# Read text files of each author
df_AN=read_text_file(path='Corpora/train/AlmadaNegreiros/', author='Almada Negreiros')
df_CC=read_text_file(path='Corpora/train/CamiloCasteloBranco/', author='Camilo Castelo Branco')
df_EQ=read_text_file(path='Corpora/train/EcaDeQueiros/', author='Eca De Queiros')
df_JR=read_text_file(path='Corpora/train/JoseRodriguesSantos/', author='Jose Rodrigues Santos')
df_JS=read_text_file(path='Corpora/train/JoseSaramago/', author='Jose Saramago')
df_LM=read_text_file(path='Corpora/train/LuisaMarquesSilva/', author='Luisa Marques Silva')

### Data cleaning and linguistic preprocessing

In [2]:
# Import nltk library and download stopwords
import nltk
nltk.download('stopwords')
import nltk
nltk.download('wordnet')

# Import necessary libraries for text cleaning
import re
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from nltk.stem import SnowballStemmer
from bs4 import BeautifulSoup
import string
from tqdm import tqdm_notebook as tqdm

stop = set(stopwords.words('portuguese'))
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to /Users/iris/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/iris/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
# Define a function to clean the text, including lowercasing, number & punctuation & HTML syntax & URL removal and lemmatization
def clean(text_list, lemmatize, stemmer):
    """
    Function that a receives a list of strings and preprocesses it.
    
    :param text_list: List of strings.
    :param lemmatize: Tag to apply lemmatization if True.
    :param stemmer: Tag to apply the stemmer if True.
    """
    updates = []
    for j in tqdm(range(len(text_list))):
        
        text = text_list[j]
        
        #LOWERCASE TEXT
        text = text.lower()
        
        #REMOVE NUMERICAL DATA AND PUNCTUATION
        text = re.sub("[^a-zA-ZÁÉÍÓÚÀÂÊÔÃÕÜÇáéíóúàâêôãõüç]", ' ', text) 
        
        #COMPILE WHITE SPACE
        white_space = re.compile(r'\s+')
        text = white_space.sub(' ', text).strip()
        
        #REMOVE TAGS（html syntax）
        text = BeautifulSoup(text).get_text()

        #REMOVE URL
        text = re.sub(r"http\S+", "", text)

        if lemmatize:
            text = " ".join(lemma.lemmatize(word) for word in text.split())
        
        if stemmer:
            text = " ".join(snowball_stemmer.stem(word) for word in text.split())
        
        updates.append(text)
        
    return updates

# Define a function to update the cleaned dataframe
def update_df(dataframe, list_updated):
    dataframe.update(pd.DataFrame({"Text": list_updated}))
    
# Define a function to clean and update six dataframes
def clean_and_update(df1, df2, df3, df4, df5, df6):
    updates = clean(df1["Text"], lemmatize = True, stemmer = False)
    update_df(df1, updates)
    updates = clean(df2["Text"], lemmatize = True, stemmer = False)
    update_df(df2, updates)
    updates = clean(df3["Text"], lemmatize = True, stemmer = False)
    update_df(df3, updates)
    updates = clean(df4["Text"], lemmatize = True, stemmer = False)
    update_df(df4, updates)
    updates = clean(df5["Text"], lemmatize = True, stemmer = False)
    update_df(df5, updates)
    updates = clean(df6["Text"], lemmatize = True, stemmer = False)
    update_df(df6, updates)

In [4]:
# Clean and update the texts of the six authors
clean_and_update(df1=df_AN, df2=df_CC, df3=df_EQ, df4=df_JR, df5=df_JS, df6=df_LM)

HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=12.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))




### Break books into chunks

In [5]:
# Define functions to break a big text in chunks
from itertools import islice
from random import randint
def extractDigits(lst): 
    return [[el] for el in lst] 

def random_chunk(li, min_chunk, max_chunk):
    it = iter(li)
    while True:
        nxt = list(islice(it,randint(min_chunk,max_chunk)))
        if nxt:
            yield nxt
        else:
            break

# Define a function to break the texts of each author
def text_to_chunks(df):
    to_list = []
    for i in df['Text'].index:
      if len(df['Text'][i].split()) <= 1050:
        to_list.append(df['Text'][i])

    to_list = extractDigits(to_list)

    for i in df['Text'].index:
      if len(df['Text'][i].split()) > 1050:
        Text_to_chunks = df['Text'][i].split()
        # Splitting a text into chunks
        my_chunk = random_chunk(Text_to_chunks, min_chunk=450, max_chunk=1050)
        chunks = list(my_chunk)
        # Ensuring each chunk has at least 450 words, otherwise combine with the previous chunk 
        if len(chunks[-1]) < 450:
          chunks[-1] = chunks[-2] + chunks[-1]
          del chunks[-2]
        for c in chunks:
          to_list.append(c)
    return to_list

# Define a function to put chunks in a dataframe
def chunks_to_df(author, l):
    data = {'Author_Label': [author for i in range(len(l))]}
    df = pd.DataFrame(data)
    df ['Text'] = l
    df ['Text']= df ['Text'].astype(str)
    return df

In [6]:
# Split the texts of each author into chunks between 450 and 1050 words and put chunks in a dataframe
df_AN = chunks_to_df(author='Almada Negreiros', l=text_to_chunks(df=df_AN))
df_CC = chunks_to_df(author='Camilo Castelo Branco', l=text_to_chunks(df=df_CC))
df_EQ = chunks_to_df(author='Eca De Queiros', l=text_to_chunks(df=df_EQ))
df_JR = chunks_to_df(author='Jose Rodrigues Santos', l=text_to_chunks(df=df_JR))
df_JS = chunks_to_df(author='Jose Saramago', l=text_to_chunks(df=df_JS))
df_LM = chunks_to_df(author='Luisa Marques Silva', l=text_to_chunks(df=df_LM))

### Text re-cleaning


In [7]:
clean_and_update(df1=df_AN, df2=df_CC, df3=df_EQ, df4=df_JR, df5=df_JS, df6=df_LM)

HBox(children=(FloatProgress(value=0.0, max=67.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1071.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=624.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1563.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1375.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=55.0), HTML(value='')))




### Undersampling for a balanced learning

In [8]:
# The number of chunks by author is imabalanced
# To avoid imbalanced learning, we are going to randomly drop chunks from authors who have more chunks
# Undersampling

# Count the least chunks for an author
min_chunks = min([df_AN['Author_Label'].count(),
                 df_CC['Author_Label'].count(),
                 df_EQ['Author_Label'].count(),
                 df_JR['Author_Label'].count(),
                 df_JS['Author_Label'].count(),
                 df_LM['Author_Label'].count()])

print('The smallest number of chunks for an author is: ', min_chunks)

The smallest number of chunks for an author is:  55


In [9]:
# Define a function to drop excessive chunks
def drop_random_chunks(df):
  np.random.seed(10)
  remove_n = df.shape[0] - min_chunks
  drop_indices = np.random.choice(df.index, remove_n, replace=False)
  df_subset = df.drop(drop_indices)
  return df_subset

# Drop chunks for each author dataframe
df_AN_subset = drop_random_chunks(df_AN)
df_CC_subset = drop_random_chunks(df_CC)
df_EQ_subset = drop_random_chunks(df_EQ)
df_JR_subset = drop_random_chunks(df_JR)
df_JS_subset = drop_random_chunks(df_JS)
df_LM_subset = drop_random_chunks(df_LM)

In [10]:
# Show a list of all the number of chunks by author
balanced_chunks = [df_AN_subset.shape[0],
                   df_CC_subset.shape[0],
                   df_EQ_subset.shape[0],
                   df_JR_subset.shape[0],
                   df_JS_subset.shape[0],
                   df_LM_subset.shape[0]]
balanced_chunks

[55, 55, 55, 55, 55, 55]

In [11]:
# Concatenate all the author's chunks in a single dataframe called df_train
pd.options.display.max_colwidth = 500
df_train = pd.concat([df_AN_subset, df_CC_subset, df_EQ_subset, df_JR_subset, df_JS_subset,df_LM_subset], 
                     ignore_index=True)
df_train.head()

Unnamed: 0,Author_Label,Text
0,Almada Negreiros,exposição amadeo de souza cardoso liga naval de lisboa em portugal existe uma unica opinião sobre arte e abrange uma tão colossal maioria que receio que ella impere por esmagamento essa opinião é a do ex mo sr dr josé de figueiredo gago do governo não é porque este snr tenha opinião nem que este snr seja da igualha do resto de portugal ma o resto de portugal e este senhor em materia de opinião são da mesma igualha um dia um senhor grisalho disse me em meia hora o seus conhecimentos sobre art...
1,Almada Negreiros,title o jardim da pierrette author josé de almada negreiros release date september ebook language portuguese argumento do bailado pierrot n aquella noite depois de muito pensar disse que queria morrer pierrot coitado tinha razão um dia inteiro sem ver pierrette não é viver deitou se muito cançado n um jardim qualquer havia luz no palacio e uma linda menina a fazer serão poi logo quiz deus nosso senhor que fôsse o jardim de pierrette pierrot não quiz acreditar ma depois ficou muito contente p...
2,Almada Negreiros,me deixaste nada nem me deixaste a morte zutt poeira pingo microbio que gemes pequenissimo gemidos gigantes gravido de uma dôr profeta colossal zutt elefante berloque parasita do não presta zutt bugiganga celluloide bagatella zutt bêsta zutt bácoro zutt merda e tu tambem vieille roche castello medieval fechado por dentro da tuas ruinas fiel epitaphio da cronicas aduladoras e tu tambem ó sangue azul antigo que já nasceste co a biographia feita ó pagem loiro da cortezias avozinhas ó pergaminho...
3,Almada Negreiros,o grandes o que são estragados por ti e de que serve o livro e a sciencia se a experiencia da vida é que faz comprehender a sciencia e o livro ante não ter sciencias ante não ter livros larga a cidade masturbadora febril rabo decepado de lagartixa labyrintho cego de toupeiras raça de ignobeis myope tysicos tarados anemicos cancerosos e arseniados larga a cidade larga a infamia da ruas e do boulevard esse vae vem cynico de bandidos mudos esse mexer esponjoso de carne viva esse sêr lêsma nojen...
4,Almada Negreiros,o panno verde da meza do bridge diz muito bem e pouco a pouco como doi astros perdidos no infinito e cujas trajectorias antecipadamente traçadas por aquelle que tudo rege forçosamente um dia se hão de cruzar assim tambem a nossas duas almas já por varias vezes o tinha presentido era inevitavel que mais cêdo ou mais tarde não viessem a encontrar se face a face e ainda bem pra mim não me enganei continúa um dia déra lhe pra pintar e voltou pra mim numa tela um torso ancioso na intensão de vici...


## Model Development
### Model evaluation metrics

In [12]:
# Model evaluation
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report

# To evaluate the results, we are going to use also the classification report method
def metrics(y_val, pred_val):
    print('                                     MODEL EVALUATION REPORT                                               ')
    print('-----------------------------------------------------------------------------------------------------------')
    print(classification_report(y_val, pred_val))
    print(confusion_matrix(y_val, pred_val))

### Extract text features by creating Bag of Words (BoW)

In [13]:
# CREATE BAG OF WORDS
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_df=0.9, binary=True, stop_words = stop, ngram_range=(1,2))

# DEFINING THE INDEPENDENT AND DEPENDENT VARIABLE
X = cv.fit_transform(df_train["Text"])
y = np.array(df_train["Author_Label"])

### Stratified K Fold cross validation for BoW


In [14]:
# K-FOLD CROSS VALIDATION
from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits=5, random_state=None, shuffle=False) 

# Split 
for train_index, val_index in skf.split(X, y):
  print("TRAIN:", train_index)
  print("VALIDATION:", val_index, '\n')
  X_train, X_val = X[train_index], X[val_index]
  y_train, y_val = y[train_index], y[val_index]

TRAIN: [ 11  12  13  14  15  16  17  18  19  20  21  22  23  24  25  26  27  28
  29  30  31  32  33  34  35  36  37  38  39  40  41  42  43  44  45  46
  47  48  49  50  51  52  53  54  66  67  68  69  70  71  72  73  74  75
  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90  91  92  93
  94  95  96  97  98  99 100 101 102 103 104 105 106 107 108 109 121 122
 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140
 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158
 159 160 161 162 163 164 176 177 178 179 180 181 182 183 184 185 186 187
 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205
 206 207 208 209 210 211 212 213 214 215 216 217 218 219 231 232 233 234
 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252
 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270
 271 272 273 274 286 287 288 289 290 291 292 293 294 295 296 297 298 299
 300 301 302 303 304 305 306 307 308 309 310

### KNeighbors Classifier using BoW (Baseline)

In [15]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

# Parameter Tuning
model_KNN = KNeighborsClassifier(weights='distance', algorithm='brute', p=2, 
                                 metric='cosine', metric_params=None, n_jobs=1)
n_neighbors = list(range(1,16))
leaf_size = list(range(1,41)) 
hyperparameters = dict(n_neighbors=n_neighbors, leaf_size=leaf_size)

In [16]:
# Fit and search the best parameters
grid = GridSearchCV(estimator=model_KNN, param_grid=hyperparameters)
grid.fit(X, y)

# summarize the results of the parameter search
print('Best Score: ',grid.best_score_)
print('Best n_neighbors: ',grid.best_estimator_.get_params()['n_neighbors'])
print('Best leaf_size: ',grid.best_estimator_.get_params()['leaf_size'])

Best Score:  0.8303030303030303
Best n_neighbors:  14
Best leaf_size:  1




In [17]:
# K neighbors classifier
KNN = KNeighborsClassifier(n_neighbors=grid.best_estimator_.get_params()['n_neighbors'], weights='distance', 
                           algorithm='brute', leaf_size=grid.best_estimator_.get_params()['leaf_size'], 
                           p=2,metric='cosine', metric_params=None, n_jobs=1)
KNN.fit(X_train, y_train)
labels_val = KNN.predict(X_val)

# Call the function metrics() defined previously
metrics(y_val, labels_val)

                                     MODEL EVALUATION REPORT                                               
-----------------------------------------------------------------------------------------------------------
                       precision    recall  f1-score   support

     Almada Negreiros       0.90      0.82      0.86        11
Camilo Castelo Branco       1.00      1.00      1.00        11
       Eca De Queiros       0.83      0.91      0.87        11
Jose Rodrigues Santos       1.00      0.64      0.78        11
        Jose Saramago       0.73      1.00      0.85        11
  Luisa Marques Silva       1.00      1.00      1.00        11

             accuracy                           0.89        66
            macro avg       0.91      0.89      0.89        66
         weighted avg       0.91      0.89      0.89        66

[[ 9  0  2  0  0  0]
 [ 0 11  0  0  0  0]
 [ 1  0 10  0  0  0]
 [ 0  0  0  7  4  0]
 [ 0  0  0  0 11  0]
 [ 0  0  0  0  0 11]]


## Improve the Baseline by training other models

### Multinomial Naive Bayes Classifier using BoW

In [18]:
from sklearn.naive_bayes import MultinomialNB
MNB = MultinomialNB()
MNB.fit(X_train, y_train)
labels_val = MNB.predict(X_val)

metrics(y_val, labels_val)

                                     MODEL EVALUATION REPORT                                               
-----------------------------------------------------------------------------------------------------------
                       precision    recall  f1-score   support

     Almada Negreiros       1.00      1.00      1.00        11
Camilo Castelo Branco       1.00      1.00      1.00        11
       Eca De Queiros       1.00      1.00      1.00        11
Jose Rodrigues Santos       1.00      1.00      1.00        11
        Jose Saramago       0.92      1.00      0.96        11
  Luisa Marques Silva       1.00      0.91      0.95        11

             accuracy                           0.98        66
            macro avg       0.99      0.98      0.98        66
         weighted avg       0.99      0.98      0.98        66

[[11  0  0  0  0  0]
 [ 0 11  0  0  0  0]
 [ 0  0 11  0  0  0]
 [ 0  0  0 11  0  0]
 [ 0  0  0  0 11  0]
 [ 0  0  0  0  1 10]]


### Logistic Regression Classifier using BoW

In [19]:
# Set parameters for Logistic Regression Classifier
from sklearn.linear_model import LogisticRegression
Logis_Reg = LogisticRegression()
Logis_Reg.fit(X_train, y_train)
labels_val = Logis_Reg.predict(X_val)

metrics(y_val, labels_val)

                                     MODEL EVALUATION REPORT                                               
-----------------------------------------------------------------------------------------------------------
                       precision    recall  f1-score   support

     Almada Negreiros       1.00      1.00      1.00        11
Camilo Castelo Branco       1.00      1.00      1.00        11
       Eca De Queiros       1.00      1.00      1.00        11
Jose Rodrigues Santos       0.91      0.91      0.91        11
        Jose Saramago       1.00      1.00      1.00        11
  Luisa Marques Silva       0.91      0.91      0.91        11

             accuracy                           0.97        66
            macro avg       0.97      0.97      0.97        66
         weighted avg       0.97      0.97      0.97        66

[[11  0  0  0  0  0]
 [ 0 11  0  0  0  0]
 [ 0  0 11  0  0  0]
 [ 0  0  0 10  0  1]
 [ 0  0  0  0 11  0]
 [ 0  0  0  1  0 10]]


### Another feature extraction method: TF-IDF

In [20]:
# TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(ngram_range=(1,2))
X = tfidf.fit_transform(df_train["Text"])
y = np.array(df_train["Author_Label"])

### Stratified K Fold cross validation for TF-IDF

In [21]:
# Stratefied K-fold cross validation

skf = StratifiedKFold(n_splits=5, random_state=None, shuffle=False) 

# Split 
for train_index, val_index in skf.split(X, y):
  print("TRAIN:", train_index)
  print("VALIDATION:", val_index, '\n')
  X_train, X_val = X[train_index], X[val_index]
  y_train, y_val = y[train_index], y[val_index]

TRAIN: [ 11  12  13  14  15  16  17  18  19  20  21  22  23  24  25  26  27  28
  29  30  31  32  33  34  35  36  37  38  39  40  41  42  43  44  45  46
  47  48  49  50  51  52  53  54  66  67  68  69  70  71  72  73  74  75
  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90  91  92  93
  94  95  96  97  98  99 100 101 102 103 104 105 106 107 108 109 121 122
 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140
 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158
 159 160 161 162 163 164 176 177 178 179 180 181 182 183 184 185 186 187
 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205
 206 207 208 209 210 211 212 213 214 215 216 217 218 219 231 232 233 234
 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252
 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270
 271 272 273 274 286 287 288 289 290 291 292 293 294 295 296 297 298 299
 300 301 302 303 304 305 306 307 308 309 310

### KNeighbors Classifier using TF-IDF

In [22]:
# Parameter tuning
# Fit and search the best parameters
grid = GridSearchCV(estimator=model_KNN, param_grid=hyperparameters)
grid.fit(X, y)

# summarize the results of the parameter search
print('Best Score: ',grid.best_score_)
print('Best n_neighbors: ',grid.best_estimator_.get_params()['n_neighbors'])
print('Best leaf_size: ',grid.best_estimator_.get_params()['leaf_size'])

Best Score:  0.7212121212121212
Best n_neighbors:  6
Best leaf_size:  1




In [23]:
# K neighbors classifier for TD-IDF
KNN = KNeighborsClassifier(n_neighbors=grid.best_estimator_.get_params()['n_neighbors'], weights='distance', 
                           algorithm='brute', leaf_size=grid.best_estimator_.get_params()['leaf_size'], 
                           p=2, metric='cosine', metric_params=None, n_jobs=1)
KNN.fit(X_train, y_train)
labels_val = KNN.predict(X_val)

# Call the function metrics() defined previously
metrics(y_val, labels_val)

                                     MODEL EVALUATION REPORT                                               
-----------------------------------------------------------------------------------------------------------
                       precision    recall  f1-score   support

     Almada Negreiros       0.85      1.00      0.92        11
Camilo Castelo Branco       0.85      1.00      0.92        11
       Eca De Queiros       1.00      0.64      0.78        11
Jose Rodrigues Santos       0.80      0.36      0.50        11
        Jose Saramago       0.50      0.82      0.62        11
  Luisa Marques Silva       0.80      0.73      0.76        11

             accuracy                           0.76        66
            macro avg       0.80      0.76      0.75        66
         weighted avg       0.80      0.76      0.75        66

[[11  0  0  0  0  0]
 [ 0 11  0  0  0  0]
 [ 2  1  7  0  1  0]
 [ 0  0  0  4  5  2]
 [ 0  1  0  1  9  0]
 [ 0  0  0  0  3  8]]


### Multinomial Naive Bayes Classifier using TF-IDF

In [24]:
# Multinomial Naive Bayes classifier
MNB = MultinomialNB()
MNB.fit(X_train, y_train)
labels_val = MNB.predict(X_val)

metrics(y_val, labels_val)

                                     MODEL EVALUATION REPORT                                               
-----------------------------------------------------------------------------------------------------------
                       precision    recall  f1-score   support

     Almada Negreiros       1.00      0.91      0.95        11
Camilo Castelo Branco       0.85      1.00      0.92        11
       Eca De Queiros       1.00      0.91      0.95        11
Jose Rodrigues Santos       1.00      0.73      0.84        11
        Jose Saramago       0.79      1.00      0.88        11
  Luisa Marques Silva       0.82      0.82      0.82        11

             accuracy                           0.89        66
            macro avg       0.91      0.89      0.89        66
         weighted avg       0.91      0.89      0.89        66

[[10  1  0  0  0  0]
 [ 0 11  0  0  0  0]
 [ 0  1 10  0  0  0]
 [ 0  0  0  8  1  2]
 [ 0  0  0  0 11  0]
 [ 0  0  0  0  2  9]]


### Logistic Regression Classifier using TF-IDF

In [25]:
Logis_Reg = LogisticRegression()
Logis_Reg.fit(X_train, y_train)
labels_val = Logis_Reg.predict(X_val)

metrics(y_val, labels_val)

                                     MODEL EVALUATION REPORT                                               
-----------------------------------------------------------------------------------------------------------
                       precision    recall  f1-score   support

     Almada Negreiros       0.92      1.00      0.96        11
Camilo Castelo Branco       1.00      1.00      1.00        11
       Eca De Queiros       1.00      0.91      0.95        11
Jose Rodrigues Santos       0.73      1.00      0.85        11
        Jose Saramago       0.69      1.00      0.81        11
  Luisa Marques Silva       1.00      0.18      0.31        11

             accuracy                           0.85        66
            macro avg       0.89      0.85      0.81        66
         weighted avg       0.89      0.85      0.81        66

[[11  0  0  0  0  0]
 [ 0 11  0  0  0  0]
 [ 1  0 10  0  0  0]
 [ 0  0  0 11  0  0]
 [ 0  0  0  0 11  0]
 [ 0  0  0  4  5  2]]


### Read text files from the test folder

In [26]:
# define a function to read test files
def read_test_files(path, words):
    path = path
    files = os.listdir(path)
    list_ = []
    list_file = []
    for file in files:
        if file.endswith(".txt"):
            text = open(path+file, 'r').read()
            list_.append(text)
            list_file.append(file)

    df = pd.DataFrame({'Text': list_, 'Text name': list_file})
    df['Nm. of Words'] = words
    return df

df_500 = read_test_files(path='Corpora/test/500Palavras/', words = '500')
df_1000 = read_test_files(path='Corpora/test/1000Palavras/', words = '1000')
df_test = pd.concat([df_500,df_1000], ignore_index=True)

### Clean the test texts

In [27]:
# Clean the test texts
updates = clean(df_test["Text"], lemmatize = True, stemmer = False)
update_df(df_test, updates)

HBox(children=(FloatProgress(value=0.0, max=12.0), HTML(value='')))




## Model selection for prediction on text excerpts
### Selected model: Multinomial Naive Bayes Classifier using BoW

In [28]:
# DEFINING THE INDEPENDENT AND DEPENDENT VARIABLE
X = cv.fit_transform(df_train["Text"])
y = np.array(df_train["Author_Label"])

# Create bag of words for the test set
X_test = cv.transform(df_test["Text"])

In [29]:
# Multinomial Naive Bayes classifier
MNB = MultinomialNB()
MNB.fit(X, y)
labels_test = MNB.predict(X_test)

In [30]:
# Add predicted labels to the df_test
df_test['Predicted_Author'] = labels_test

# Drop Text column for better visulization
df_test_prediction = df_test.drop(['Text'], axis=1)

# Sort the df_test with predicted labels by Text name
df_test_prediction.sort_values(['Text name'])

Unnamed: 0,Text name,Nm. of Words,Predicted_Author
3,text1.txt,500,Jose Saramago
9,text1.txt,1000,Jose Saramago
5,text2.txt,500,Almada Negreiros
11,text2.txt,1000,Almada Negreiros
4,text3.txt,500,Luisa Marques Silva
10,text3.txt,1000,Luisa Marques Silva
2,text4.txt,500,Eca De Queiros
8,text4.txt,1000,Eca De Queiros
1,text5.txt,500,Camilo Castelo Branco
7,text5.txt,1000,Camilo Castelo Branco
