## **Generate Features using TF-IDF**

features are constructed using word representation models and TFIDF vectorisation on the documents, transforming blog post texts into numerical vectors.

#### **Import Libraries**

In [23]:
# general
import pandas as pd
import numpy as np
from tqdm import tqdm

# data preprocessing
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

# train/test split
from sklearn.model_selection import train_test_split

# model
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix

print("> Libraries Imported")

> Libraries Imported


#### **Import Dataset**

In [24]:
alz_df = pd.read_csv("data/alz_texts.csv")
alz_df

Unnamed: 0,doc_id,doc_text,doc_class
0,"Tuesday, January 29, 2019",18 months.In some ways it seems like an eterni...,0
1,"Sunday, July 29, 2018",One year.How can time seem fast and slow at th...,0
2,"Wednesday, May 16, 2018",It's been a while since I've last written.Life...,0
3,"Tuesday, January 16, 2018",It's been a while since my last post.Partly be...,0
4,"Thursday, November 16, 2017",Last Sunday was our 3rd annual Purple Boat Flo...,0
...,...,...,...
3646,"Thursday, June 4, 2009",My Neurologist has recently prescribed Physica...,1
3647,"Thursday, June 4, 2009",Sounds like something high society folks do.La...,1
3648,"Saturday, May 30, 2009","Life, and how it progresses is an interesting ...",1
3649,"Thursday, May 28, 2009",I am a retired Navy Master Chief Gunner's Mate...,1


Delete NAs observations:

In [25]:
alz_df = alz_df.dropna()
alz_df.shape

(3644, 3)

In [26]:
alz_df_class_0 = alz_df.loc[alz_df["doc_class"] == 0]
alz_df_class_1 = alz_df.loc[alz_df["doc_class"] == 1]

In [27]:
alz_df_class_0.shape


(1370, 3)

In [28]:
alz_df_class_1.shape

(2274, 3)

#### **Generate Features with TF-IDF**

In [29]:
# set X and Y
X = alz_df["doc_text"]
Y = alz_df["doc_class"]

In [30]:
count_vect = CountVectorizer()
X_counts = count_vect.fit_transform(X)

X_counts

<3644x28894 sparse matrix of type '<class 'numpy.int64'>'
	with 660208 stored elements in Compressed Sparse Row format>

In [31]:
print("> Words Array:", count_vect.get_feature_names_out())
print("> Total Number of Words:", len(count_vect.get_feature_names_out()))

print("\n> Array 'X_counts'")
display(X_counts.toarray())

> Words Array: ['00' '000' '000000' ... 'zyprexa' 'zzz' 'zzzs']
> Total Number of Words: 28894

> Array 'X_counts'


array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

Apply TF-IDF vectorisation:

In [32]:
tfidf_transformer = TfidfTransformer()
X_tfidf = tfidf_transformer.fit_transform(X_counts)

X_tfidf

<3644x28894 sparse matrix of type '<class 'numpy.float64'>'
	with 660208 stored elements in Compressed Sparse Row format>

In [35]:
tfidf_vectorizer = TfidfVectorizer()

response = tfidf_vectorizer.fit_transform(alz_df["doc_text"])

In [37]:
feature_array = np.array(tfidf_vectorizer.get_feature_names_out())
tfidf_sorting = np.argsort(response.toarray()).flatten()[::-1]

n = 10
top_n = feature_array[tfidf_sorting[:n]]
top_n

array(['beginning', 'the', 'zzzs', 'expectancy', 'exorbitant', 'exorcise',
       'exotic', 'expand', 'expanded', 'expanding'], dtype=object)

Create dataset with words and idfs.

In [38]:
words_list = list(count_vect.get_feature_names_out())
idf_list = list(tfidf_transformer.idf_)

# create df from list
words_tfidf = pd.DataFrame(
    list(zip(words_list, idf_list)),
    columns =['word', 'idf']
    )

words_tfidf.sort_values(by=['idf'], ascending=False)[0:10]

Unnamed: 0,word,idf
14447,kitcehn,8.507964
15531,mabee,8.507964
15511,lyfestyle,8.507964
15512,lyft,8.507964
15514,lymph,8.507964
15515,lymphodema,8.507964
15516,lyn,8.507964
15517,lyndon,8.507964
15520,lynns,8.507964
15521,lyon,8.507964


---
## **Model training and testing**
### **Train, Validation and Test split**

In [39]:
X_train, X_val, Y_train, Y_val = train_test_split(
    X_tfidf,
    Y,
    test_size=0.30,
    random_state=42
)

In [40]:
X_val, X_test, Y_val, Y_test = train_test_split(
    X_val,
    Y_val,
    test_size=0.50,
    random_state=42
)

### **Validation phase**
#### **Linear kernel SVM hyperparameter tuning (cost)**

In [50]:
# define cost range to test
cost_list = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]

Loop over the cost values, for each of them:
- train the model
- test on the validation set
- save accuracy, F1 score, precision and recall

In [231]:
SVC_linear_res = []

for i in cost_list:
    # training
    clf = SVC(C = i, kernel = SVC_KERNEL, verbose = True)
    model = clf.fit(X_train, Y_train)
    
    # validation
    # accuracy
    score_val = clf.score(X_val, Y_val)
    
    predicted_y = clf.predict(X_val)
    
    # confusion matrix
    tn, fp, fn, tp = confusion_matrix(Y_val, predicted_y).ravel()
    # precision
    precision_score_cl_1 = tp / (tp + fp)
    # recall
    recall = tp / (tp + fn)
    # F1 score
    F1_scors = 2 * (precision_score_cl_1 * recall) / (precision_score_cl_1 + recall)
    
    temp_list = ("Linear kernel SVM", i, score_val, F1_scors, precision_score_cl_1, recall)
    SVC_linear_res.append(temp_list)
    print(i, score_val)

[LibSVM]0.0001 0.6014625228519196
[LibSVM]0.001 0.6014625228519196
[LibSVM]0.01 0.6014625228519196
[LibSVM]0.1 0.7404021937842779
[LibSVM]1 0.9744058500914077
[LibSVM]10 0.9762340036563071
[LibSVM]100 0.9762340036563071
[LibSVM]1000 0.9762340036563071


Create results DataFrame:

In [232]:
SVC_linear_df = pd.DataFrame(SVC_linear_res, columns =['Model', 'Cost', 'Accuracy', 'F1 score', 'Precision', 'Recall'])
SVC_linear_df

Unnamed: 0,Model,Cost,Accuracy,F1 score,Precision,Recall
0,Linear kernel SVM,0.0001,0.601463,0.751142,0.601463,1.0
1,Linear kernel SVM,0.001,0.601463,0.751142,0.601463,1.0
2,Linear kernel SVM,0.01,0.601463,0.751142,0.601463,1.0
3,Linear kernel SVM,0.1,0.740402,0.8225,0.698514,1.0
4,Linear kernel SVM,1.0,0.974406,0.978916,0.970149,0.987842
5,Linear kernel SVM,10.0,0.976234,0.980392,0.973054,0.987842
6,Linear kernel SVM,100.0,0.976234,0.980392,0.973054,0.987842
7,Linear kernel SVM,1000.0,0.976234,0.980392,0.973054,0.987842


Save DataFrame:

In [181]:
SVC_linear_df.to_csv('SVC.csv', index=False)

#### **Polynomial kernel SVM hyperparameter tuning (cost and degree)**

In [96]:
# degree range to test
degree_list = [2, 3, 4, 5, 6]

In [171]:
SVC_poly_res = []

for i in cost_list:
    for d in degree_list:
        clf = SVC(C = i, kernel = "poly", degree = d, verbose = True)
        model = clf.fit(X_train, Y_train)
        score_val = clf.score(X_val, Y_val)
        
        predicted_y = clf.predict(X_val)
        tn, fp, fn, tp = confusion_matrix(Y_val, predicted_y).ravel()
        precision_score_cl_1 = tp / (tp + fp)
        recall = tp / (tp + fn)
        F1_scors = 2 * (precision_score_cl_1 * recall) / (precision_score_cl_1 + recall)
    
        temp_list = ("Polynomial kernel SVM", i, d, score_val, F1_scors, precision_score_cl_1, recall)
        SVC_poly_res.append(temp_list)
    
        print(i, "|", d, "|", score_val)

[LibSVM]0.0001 | 2 | 0.6014625228519196
[LibSVM]0.0001 | 3 | 0.6014625228519196
[LibSVM]0.0001 | 4 | 0.6014625228519196
[LibSVM]0.0001 | 5 | 0.6014625228519196
[LibSVM]0.0001 | 6 | 0.6014625228519196
[LibSVM]0.001 | 2 | 0.6014625228519196
[LibSVM]0.001 | 3 | 0.6014625228519196
[LibSVM]0.001 | 4 | 0.6014625228519196
[LibSVM]0.001 | 5 | 0.6014625228519196
[LibSVM]0.001 | 6 | 0.6014625228519196
[LibSVM]0.01 | 2 | 0.6014625228519196
[LibSVM]0.01 | 3 | 0.6014625228519196
[LibSVM]0.01 | 4 | 0.6014625228519196
[LibSVM]0.01 | 5 | 0.6014625228519196
[LibSVM]0.01 | 6 | 0.6014625228519196
[LibSVM]0.1 | 2 | 0.7349177330895795
[LibSVM]0.1 | 3 | 0.6398537477148081
[LibSVM]0.1 | 4 | 0.603290676416819
[LibSVM]0.1 | 5 | 0.6014625228519196
[LibSVM]0.1 | 6 | 0.6014625228519196
[LibSVM]1 | 2 | 0.9744058500914077
[LibSVM]1 | 3 | 0.903107861060329
[LibSVM]1 | 4 | 0.8025594149908593
[LibSVM]1 | 5 | 0.7038391224862889
[LibSVM]1 | 6 | 0.6563071297989032
[LibSVM]10 | 2 | 0.9744058500914077
[LibSVM]10 | 3 | 0.91

In [172]:
SVC_poly_df = pd.DataFrame(SVC_poly_res, columns =['Model', 'Cost', 'Degree','Accuracy', 'F1 score', 'Precision', 'Recall'])
SVC_poly_df

Unnamed: 0,Model,Cost,Degree,Accuracy,F1 score,Precision,Recall
0,Polynomial kernel SVM,0.0001,2,0.601463,0.751142,0.601463,1.0
1,Polynomial kernel SVM,0.0001,3,0.601463,0.751142,0.601463,1.0
2,Polynomial kernel SVM,0.0001,4,0.601463,0.751142,0.601463,1.0
3,Polynomial kernel SVM,0.0001,5,0.601463,0.751142,0.601463,1.0
4,Polynomial kernel SVM,0.0001,6,0.601463,0.751142,0.601463,1.0
5,Polynomial kernel SVM,0.001,2,0.601463,0.751142,0.601463,1.0
6,Polynomial kernel SVM,0.001,3,0.601463,0.751142,0.601463,1.0
7,Polynomial kernel SVM,0.001,4,0.601463,0.751142,0.601463,1.0
8,Polynomial kernel SVM,0.001,5,0.601463,0.751142,0.601463,1.0
9,Polynomial kernel SVM,0.001,6,0.601463,0.751142,0.601463,1.0


In [180]:
SVC_poly_df.sort_values("Accuracy", ascending=False).to_csv('SVC_poly.csv', index=False)

#### **Radial kernel SVM hyperparameter tuning (cost and degree)**

In [1]:
# gamma list to test
gamma_list = [0.0001, 0.001, 0.01, 0.1, 1, 10]

In [173]:
SVC_radial_res = []
for i in cost_list:
    for g in gamma_list:
        clf = SVC(C = i, kernel = "rbf", gamma = g, verbose = True)
        model = clf.fit(X_train, Y_train)
        score_val = clf.score(X_val, Y_val)
        
        predicted_y = clf.predict(X_val)
        tn, fp, fn, tp = confusion_matrix(Y_val, predicted_y).ravel()
        precision_score_cl_1 = tp / (tp + fp)
        recall = tp / (tp + fn)
        F1_scors = 2 * (precision_score_cl_1 * recall) / (precision_score_cl_1 + recall)
    
        temp_list = ("Radial kernel SVM", i, g, score_val, F1_scors, precision_score_cl_1, recall)
        SVC_radial_res.append(temp_list)
        
        print(i, "|", g, "|", score_val)

[LibSVM]0.0001 | 0.0001 | 0.6014625228519196
[LibSVM]0.0001 | 0.001 | 0.6014625228519196
[LibSVM]0.0001 | 0.01 | 0.6014625228519196
[LibSVM]0.0001 | 0.1 | 0.6014625228519196
[LibSVM]0.0001 | 1 | 0.6014625228519196
[LibSVM]0.0001 | 10 | 0.6014625228519196
[LibSVM]0.001 | 0.0001 | 0.6014625228519196
[LibSVM]0.001 | 0.001 | 0.6014625228519196
[LibSVM]0.001 | 0.01 | 0.6014625228519196
[LibSVM]0.001 | 0.1 | 0.6014625228519196
[LibSVM]0.001 | 1 | 0.6014625228519196
[LibSVM]0.001 | 10 | 0.6014625228519196
[LibSVM]0.01 | 0.0001 | 0.6014625228519196
[LibSVM]0.01 | 0.001 | 0.6014625228519196
[LibSVM]0.01 | 0.01 | 0.6014625228519196
[LibSVM]0.01 | 0.1 | 0.6014625228519196
[LibSVM]0.01 | 1 | 0.6014625228519196
[LibSVM]0.01 | 10 | 0.6014625228519196
[LibSVM]0.1 | 0.0001 | 0.6014625228519196
[LibSVM]0.1 | 0.001 | 0.6014625228519196
[LibSVM]0.1 | 0.01 | 0.6014625228519196
[LibSVM]0.1 | 0.1 | 0.6014625228519196
[LibSVM]0.1 | 1 | 0.7440585009140768
[LibSVM]0.1 | 10 | 0.6014625228519196
[LibSVM]1 | 0.00

In [174]:
SVC_radial_df = pd.DataFrame(SVC_radial_res, columns =['Model', 'Cost', 'Gamma','Accuracy', 'F1 score', 'Precision', 'Recall'])
SVC_radial_df

Unnamed: 0,Model,Cost,Gamma,Accuracy,F1 score,Precision,Recall
0,Radial kernel SVM,0.0001,0.0001,0.601463,0.751142,0.601463,1.0
1,Radial kernel SVM,0.0001,0.001,0.601463,0.751142,0.601463,1.0
2,Radial kernel SVM,0.0001,0.01,0.601463,0.751142,0.601463,1.0
3,Radial kernel SVM,0.0001,0.1,0.601463,0.751142,0.601463,1.0
4,Radial kernel SVM,0.0001,1.0,0.601463,0.751142,0.601463,1.0
5,Radial kernel SVM,0.0001,10.0,0.601463,0.751142,0.601463,1.0
6,Radial kernel SVM,0.001,0.0001,0.601463,0.751142,0.601463,1.0
7,Radial kernel SVM,0.001,0.001,0.601463,0.751142,0.601463,1.0
8,Radial kernel SVM,0.001,0.01,0.601463,0.751142,0.601463,1.0
9,Radial kernel SVM,0.001,0.1,0.601463,0.751142,0.601463,1.0


In [113]:
SVC_poly_df.sort_values("Accuracy", ascending=False)[:20] 

Unnamed: 0,Model,Cost,Degree,Accuracy,Precision,Recall
20,Polynomial kernel SVM,1.0,2,0.974406,0.967359,0.990881
35,Polynomial kernel SVM,1000.0,2,0.974406,0.967359,0.990881
25,Polynomial kernel SVM,10.0,2,0.974406,0.967359,0.990881
30,Polynomial kernel SVM,100.0,2,0.974406,0.967359,0.990881
26,Polynomial kernel SVM,10.0,3,0.912249,0.876676,0.993921
31,Polynomial kernel SVM,100.0,3,0.912249,0.876676,0.993921
36,Polynomial kernel SVM,1000.0,3,0.912249,0.876676,0.993921
21,Polynomial kernel SVM,1.0,3,0.903108,0.865079,0.993921
32,Polynomial kernel SVM,100.0,4,0.815356,0.765116,1.0
27,Polynomial kernel SVM,10.0,4,0.815356,0.765116,1.0


In [179]:
SVC_radial_df.to_csv('SVC_rad.csv', index=False)

### **Testing phase**

Final esting of the best model for each kernel (emerged from the previous validation phase) on the test set.

#### **Linear kernel SVM best model**


In [194]:
# setup parameters
SVC_C = 1
SVC_KERNEL = "linear"

# setup model
clf = SVC(C = SVC_C, kernel = SVC_KERNEL, verbose = True)

# fit
model = clf.fit(X_train, Y_train)

[LibSVM]

In [195]:
# accuracy on the test set
score = clf.score(X_test, Y_test)

predicted_y = clf.predict(X_test)
tn, fp, fn, tp = confusion_matrix(Y_test, predicted_y).ravel()
# precision
precision_score_cl_1 = tp / (tp + fp)
# recall
recall = tp / (tp + fn)
# F1 score
F1_scors = 2 * (precision_score_cl_1 * recall) / (precision_score_cl_1 + recall)

# results touple
res_linear = ("Linear kernel SVC", score, F1_scors, precision_score_cl_1, recall)

#### **Polynomial kernel SVM best model**

In [197]:
# setup parameters
SVC_C = 1
SVC_KERNEL = "poly"
DEGREE = 2

# setup model
clf = SVC(C = SVC_C, kernel = SVC_KERNEL, degree = DEGREE, verbose = True)

# fit
model = clf.fit(X_train, Y_train)

# accuracy
score = clf.score(X_test, Y_test)

predicted_y = clf.predict(X_test)
tn, fp, fn, tp = confusion_matrix(Y_test, predicted_y).ravel()
# precision
precision_score_cl_1 = tp / (tp + fp)
# recall
recall = tp / (tp + fn)
# F1 score
F1_scors = 2 * (precision_score_cl_1 * recall) / (precision_score_cl_1 + recall)

# results touple
res_poly = ("Polynomial kernel SVM", score, F1_scors, precision_score_cl_1, recall)

[LibSVM]

#### **Radial kernel SVM best model**

In [199]:
# setup parameters
SVC_C = 100
SVC_KERNEL = "rbf"
GAMMA = 0.1

# setup model
clf = SVC(C = SVC_C, kernel = SVC_KERNEL, gamma = GAMMA, verbose = True)

# fit
model = clf.fit(X_train, Y_train)

# accuracy
score = clf.score(X_test, Y_test)

predicted_y = clf.predict(X_test)
tn, fp, fn, tp = confusion_matrix(Y_test, predicted_y).ravel()
# precision
precision_score_cl_1 = tp / (tp + fp)
# recall
recall = tp / (tp + fn)
# F1 score
F1_scors = 2 * (precision_score_cl_1 * recall) / (precision_score_cl_1 + recall)

# results touple
res_rad = ("Radial kernel SVM", score, F1_scors, precision_score_cl_1, recall)

[LibSVM]

#### **Compose the final test results DataFrame**

In [213]:
results_df = pd.DataFrame([res_linear, res_poly, res_rad], columns =['Model', 'Accuracy', 'F1 score', 'Precision', 'Recall'])

In [214]:
results_df

Unnamed: 0,Model,Accuracy,F1 score,Precision,Recall
0,Linear kernel SVC,0.976234,0.980451,0.978979,0.981928
1,Polynomial kernel SVM,0.967093,0.973214,0.961765,0.98494
2,Radial kernel SVM,0.978062,0.981982,0.979042,0.98494


Sort the DataFrame based on performances

In [216]:
results_df.sort_values("Accuracy", ascending=False)

Unnamed: 0,Model,Accuracy,F1 score,Precision,Recall
2,Radial kernel SVM,0.978062,0.981982,0.979042,0.98494
0,Linear kernel SVC,0.976234,0.980451,0.978979,0.981928
1,Polynomial kernel SVM,0.967093,0.973214,0.961765,0.98494


In [217]:
results_df.sort_values("F1 score", ascending=False)

Unnamed: 0,Model,Accuracy,F1 score,Precision,Recall
2,Radial kernel SVM,0.978062,0.981982,0.979042,0.98494
0,Linear kernel SVC,0.976234,0.980451,0.978979,0.981928
1,Polynomial kernel SVM,0.967093,0.973214,0.961765,0.98494
