Creates the DataFrame from data_relevance_training.csv and extracts the two needed columns.
Then, manipulates the string content in the specified way before saving as df.

In [1]:
import pandas as pd

ENCODING = {'irrelevant': 0, 'relevant': 1}

data: pd.DataFrame = pd.read_csv('../data/labeled/cleaned/combined/data_relevance_training.csv')
encoded: pd.Series = data['relevance'].apply(lambda x: ENCODING[x])

# Splits on the separator ' .' and rejoins the post with its comment.
text: pd.Series = data['text'].apply(lambda x: ''.join(x.split(' .')))
df = pd.DataFrame([text, encoded]).T
df = df.astype({'text': 'string', 'relevance': 'int'})
df.head(5)

Unnamed: 0,text,relevance
0,The Coolest President Heart Suit Green Heart V...,0
1,After YRS Sandro will be the next president sm...,1
2,God bless you always sir pbbm red heart red he...,1
3,I was in tears of sincerity and inday sarah ho...,1
4,Good news you loyalist negative BBM's result i...,1


Vectorizes the text using sklearn Tfidfvectorizer.

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize

vectorizer = TfidfVectorizer(max_df=0.95, lowercase=True)
tf_idf = vectorizer.fit_transform(df['text'])
words = vectorizer.get_feature_names_out()
tf_idf = normalize(tf_idf).toarray()
tf_idf.shape

(500, 1605)

Does an 80-20 train-test split.

In [3]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score

x_train, x_test, y_train, y_test = train_test_split(tf_idf, df.relevance, test_size= .2)

Generates all the classifier models to be tested.

In [4]:
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neural_network import MLPClassifier

svc = SVC(kernel='sigmoid', gamma=1.0)
knc = KNeighborsClassifier(n_neighbors=5)
mnb = MultinomialNB(alpha=0.2)
dtc = DecisionTreeClassifier(min_samples_split=2)
lrc = LogisticRegression(solver='liblinear', penalty='l1')
rfc = RandomForestClassifier(n_estimators=31)
ada = AdaBoostClassifier()
mlp = MLPClassifier()

classifiers = {'SVC': svc, 
               'KNeighborsClassifier': knc,
               'Multinomial Naive Bayes': mnb,
               'Decision Tree': dtc,
               'Logistic Regression': lrc,
               'Random Forest': rfc,
               'AdaBoost': ada,
               'Multi-Layer Perceptron': mlp}

Fits and subsequently evaluates the models.

In [5]:
train_scores = []
for name, model in classifiers.items():
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    train_scores.append((name, accuracy_score(y_test , y_pred)))
    
crossval_scores = []
for name, model in classifiers.items():
    scores = cross_val_score(model, x_test, y_test, cv=5)
    crossval_scores.append((name, scores.mean()))



Prints the resultant scores.

In [6]:
print('Model Training Scores')
for item in train_scores:
    print(f'{item[0]}: {item[1]}')
    
print('\nCross-Validation Scores')
for item in crossval_scores:
    print(f'{item[0]}: {item[1]}')

Model Training Scores
SVC: 0.77
KNeighborsClassifier: 0.67
Multinomial Naive Bayes: 0.78
Decision Tree: 0.74
Logistic Regression: 0.78
Random Forest: 0.74
AdaBoost: 0.77
Multi-Layer Perceptron: 0.77

Cross-Validation Scores
SVC: 0.7200000000000001
KNeighborsClassifier: 0.63
Multinomial Naive Bayes: 0.7100000000000002
Decision Tree: 0.7300000000000001
Logistic Regression: 0.55
Random Forest: 0.65
AdaBoost: 0.71
Multi-Layer Perceptron: 0.7300000000000001


Tests them on the four quotes from the English Reference Approach Jupyter notebook.

In [7]:
TEXT = [["Imagine if late president ferdinand marcos sr was still alive and witnessed this glorious moment sneezing that's the president of the mass sweet and cute dad solid bbm smiling with hearts smiling with hearts smiling with hearts"],
        ["Long live APO UN PBBM Red Heart Red Heart The AFTR shocks are still on the ABRA POEPICENTER STAYSAFE FOLDED HANDS FOLDED HANDS Bangonabrenios Red Heart"],
        ["The amount of appreciation gratitude and respect he has for the frontliners Don't Irene Marcos unn seconds turned shade"],
        ["random word"]]

DECODING = ['Irrelevant', 'Relevant']

for tweet in TEXT:
    tweettfidf = vectorizer.transform(tweet).toarray()
    print(f'"{tweet[0]}"')
    for name, model in classifiers.items():
        print(f'\t{name}: {DECODING[model.predict(tweettfidf)[0]]}')
    print()

"Imagine if late president ferdinand marcos sr was still alive and witnessed this glorious moment sneezing that's the president of the mass sweet and cute dad solid bbm smiling with hearts smiling with hearts smiling with hearts"
	SVC: Relevant
	KNeighborsClassifier: Relevant
	Multinomial Naive Bayes: Relevant
	Decision Tree: Relevant
	Logistic Regression: Relevant
	Random Forest: Relevant
	AdaBoost: Relevant
	Multi-Layer Perceptron: Relevant

"Long live APO UN PBBM Red Heart Red Heart The AFTR shocks are still on the ABRA POEPICENTER STAYSAFE FOLDED HANDS FOLDED HANDS Bangonabrenios Red Heart"
	SVC: Relevant
	KNeighborsClassifier: Relevant
	Multinomial Naive Bayes: Relevant
	Decision Tree: Relevant
	Logistic Regression: Relevant
	Random Forest: Relevant
	AdaBoost: Relevant
	Multi-Layer Perceptron: Relevant

"The amount of appreciation gratitude and respect he has for the frontliners Don't Irene Marcos unn seconds turned shade"
	SVC: Irrelevant
	KNeighborsClassifier: Irrelevant
	Multin