Creates the DataFrame from data_relevance_training.csv and extracts the two needed columns.
Then, manipulates the string content in the specified way before saving as df.

In [None]:
import pandas as pd
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet, stopwords
import string

stops = set(word.lower() for word in list(stopwords.words('english')))

lemmatizer = WordNetLemmatizer()

def getPOS(word):
    """
    Gets the part of speech for some word.
    """
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)



: 

In [None]:
ENCODING = {'irrelevant': 0, 'relevant': 1}

def processText(text: str) -> str:
    text = (text.translate(str.maketrans('', '', string.punctuation))
                .split(' . ')[0]
                .lower())
    split = [w for w in text.split(' ') if (not w in stops and len(w) > 1)]
    uniques = ' '.join(sorted(set(split), key=split.index))
    lemmatized = []
    for w in nltk.word_tokenize(uniques):
        lemmatizedW = lemmatizer.lemmatize(w, getPOS(w))
        if len(lemmatizedW) > 1:
            lemmatized.append(lemmatizedW)
    return ' '.join(lemmatized)

data: pd.DataFrame = pd.read_csv('../data/labeled/cleaned/combined/data_relevance_training.csv')
encoded: pd.Series = data['relevance'].apply(lambda x: ENCODING[x])

text: pd.Series = data['text'].apply(processText)

df = pd.DataFrame([text, encoded]).T
df = df.astype({'text': 'string', 'relevance': 'int'})
print(len(df))
df.head(5)

: 

Vectorizes the text using sklearn Tfidfvectorizer.

In [170]:

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import normalize

vectorizer = CountVectorizer(max_df=0.95, lowercase=True)\
tf_idf = vectorizer.fit_transform(df['text'])
words = vectorizer.get_feature_names_out()
tf_idf = normalize(tf_idf).toarray()
tf_idf.shape

(1000, 2022)

Does an 80-20 train-test split.

In [171]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score

x_train, x_test, y_train, y_test = train_test_split(tf_idf, df.relevance, test_size= .2)

Generates all the classifier models to be tested.

In [172]:
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neural_network import MLPClassifier

svc = SVC(kernel='sigmoid', gamma=1.0)
knc = KNeighborsClassifier(n_neighbors=5)
mnb = MultinomialNB(alpha=0.2)
dtc = DecisionTreeClassifier(min_samples_split=2)
lrc = LogisticRegression(solver='liblinear', penalty='l1')
rfc = RandomForestClassifier(n_estimators=31)
ada = AdaBoostClassifier()
mlp = MLPClassifier()

classifiers = {'SVC': svc, 
               'KNeighborsClassifier': knc,
               'Multinomial Naive Bayes': mnb,
               'Decision Tree': dtc,
               'Logistic Regression': lrc,
               'Random Forest': rfc,
               'AdaBoost': ada,
               'Multi-Layer Perceptron': mlp}

Fits and subsequently evaluates the models.

In [173]:
train_scores = []
for name, model in classifiers.items():
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    train_scores.append((name, accuracy_score(y_test , y_pred)))
    
crossval_scores = []
for name, model in classifiers.items():
    scores = cross_val_score(model, x_test, y_test, cv=5)
    crossval_scores.append((name, scores.mean()))



Prints the resultant scores.

In [174]:
print('Model Training Scores')
for item in train_scores:
    print(f'{item[0]}: {item[1]}')
    
print('\nCross-Validation Scores')
for item in crossval_scores:
    print(f'{item[0]}: {item[1]}')

Model Training Scores
SVC: 0.875
KNeighborsClassifier: 0.825
Multinomial Naive Bayes: 0.88
Decision Tree: 0.865
Logistic Regression: 0.865
Random Forest: 0.865
AdaBoost: 0.845
Multi-Layer Perceptron: 0.875

Cross-Validation Scores
SVC: 0.875
KNeighborsClassifier: 0.76
Multinomial Naive Bayes: 0.8299999999999998
Decision Tree: 0.82
Logistic Regression: 0.82
Random Forest: 0.835
AdaBoost: 0.7949999999999999
Multi-Layer Perceptron: 0.795


Tests them on the four quotes from the English Reference Approach Jupyter notebook.

In [178]:
TEXT = ["Imagine if late president ferdinand marcos sr was still alive and witnessed this glorious moment sneezing",
        "Long live APO UN PBBM Red Heart Red Heart The AFTR shocks are still on the ABRA POEPICENTER STAYSAFE FOLDED HANDS FOLDED HANDS Bangonabrenios",
        "The amount of appreciation gratitude and respect he has for the frontliners",
        "random word"]

DECODING = ['Irrelevant', 'Relevant']

for tweet in TEXT:
    tweettfidf = vectorizer.transform([processText(tweet)]).toarray()
    print(f'"{tweet}"')
    print(f'"{processText(tweet)}"')
    for name, model in classifiers.items():
        print(f'\t{name}: {DECODING[model.predict(tweettfidf)[0]]}')
    print()

"Imagine if late president ferdinand marcos sr was still alive and witnessed this glorious moment sneezing"
"imagine late president ferdinand marcos sr still alive witness glorious moment sneeze"
	SVC: Relevant
	KNeighborsClassifier: Relevant
	Multinomial Naive Bayes: Relevant
	Decision Tree: Relevant
	Logistic Regression: Relevant
	Random Forest: Relevant
	AdaBoost: Relevant
	Multi-Layer Perceptron: Relevant

"Long live APO UN PBBM Red Heart Red Heart The AFTR shocks are still on the ABRA POEPICENTER STAYSAFE FOLDED HANDS FOLDED HANDS Bangonabrenios"
"long live apo un pbbm red heart aftr shock still abra poepicenter staysafe fold hand bangonabrenios"
	SVC: Relevant
	KNeighborsClassifier: Relevant
	Multinomial Naive Bayes: Relevant
	Decision Tree: Relevant
	Logistic Regression: Relevant
	Random Forest: Relevant
	AdaBoost: Relevant
	Multi-Layer Perceptron: Relevant

"The amount of appreciation gratitude and respect he has for the frontliners"
"amount appreciation gratitude respect front