<a href="https://www.kaggle.com/code/fengyuansun/feng-nlp-challenge?scriptVersionId=136282592" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
# Import packages
import os
import os.path as osp 

import re
import nltk
import string
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns # fancier plotting
import matplotlib.pyplot as plt # plotting

from tqdm import tqdm
from wordcloud import WordCloud, STOPWORDS
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.tokenize.casual import TweetTokenizer

# nltk.download('omw-1.4')
# nltk.download('wordnet')
# nltk.download('wordnet2022')

# ! cp -rf /usr/share/nltk_data/corpora/wordnet2022 /usr/share/nltk_data/corpora/wordnet # temp fix for lookup error.

In [None]:
# Raise the column display width to 200 characters so long text cells are shown in DataFrame output
pd.set_option("display.max_colwidth", 200) 

In [None]:
"""
/kaggle/input/nlp-getting-started/sample_submission.csv
/kaggle/input/nlp-getting-started/train.csv
/kaggle/input/nlp-getting-started/test.csv
"""
# Load training dataset
file_train = "/kaggle/input/nlp-getting-started/train.csv"
df = pd.read_csv(file_train)

# Preview the first rows to check that the file parsed as expected
df.head()

In [None]:
# Show the first five rows of each class side by side: target == 0 first, then target == 1.
first_of_class_0 = df.loc[df['target'] == 0].iloc[:5]
first_of_class_1 = df.loc[df['target'] == 1].iloc[:5]
pd.concat([first_of_class_0, first_of_class_1])

In [None]:
# Looking at dataset info
df.info()

In [None]:
# # Drop useless columns
# df.drop(['id', 'keyword', 'location'], axis=1, inplace=True)
# df.head()

In [None]:
# Removing unwanted pattern from the tweets
def remove_pattern(text, pattern):
    """Return ``text`` with every match of the regular expression ``pattern`` deleted."""
    stripped = re.compile(pattern).sub('', text)
    return stripped

# Strip Twitter-specific noise from the raw text, applying the regexes in this order:
#   1. @mentions   2. the '#' character (the tag word itself is kept)   3. http(s) links
strip_matches = np.vectorize(remove_pattern)
cleaned = df['text']
for noise_pattern in (r"@[\w]*", r"#", r'https?://\S+'):
    cleaned = strip_matches(cleaned, noise_pattern)
df['clean_tweet'] = cleaned


In [None]:
# Keep only whitespace-separated words strictly longer than `min_word_len` characters,
# i.e. words of one or two characters are dropped.
min_word_len = 2
df['clean_tweet'] = df['clean_tweet'].map(
    lambda tweet: ' '.join(word for word in tweet.split() if len(word) > min_word_len)
)

df.head(5)

In [None]:
# Spacy
import spacy

# Load English language model
nlp = spacy.load('en_core_web_sm')

def preprocess_spacy(text, nlp):
    """Lowercase ``text`` and drop its stop words and punctuation.

    ``nlp`` is called on ``text`` and must yield tokens exposing ``text``,
    ``is_stop`` and ``is_punct``. Tokens that are neither stop words nor
    punctuation are lowercased and joined with single spaces.
    """
    kept = []
    for token in nlp(text):
        if token.is_stop or token.is_punct:
            continue
        kept.append(token.text.lower())
    return ' '.join(kept)

# Run every tweet through preprocess_spacy (one nlp call per row) and overwrite the column in place
df['clean_tweet'] = df['clean_tweet'].apply(preprocess_spacy, nlp=nlp)

df.head(5)

In [None]:
def lemmatize_text(text, nlp_model=None):
    """Replace every token of ``text`` with its lemma.

    Args:
        text: String to lemmatize.
        nlp_model: Optional callable that turns ``text`` into an iterable of
            tokens exposing ``lemma_``. When omitted, the notebook-level ``nlp``
            pipeline is used, so existing single-argument calls keep working.

    Returns:
        The lemmas joined with single spaces.
    """
    # The global is only looked up when no pipeline was passed in.
    pipeline = nlp if nlp_model is None else nlp_model
    return ' '.join(token.lemma_ for token in pipeline(text))

df['clean_tweet'] = df['clean_tweet'].apply(lemmatize_text)

In [None]:
# Split each cleaned tweet into a list of tokens with NLTK's TweetTokenizer,
# configured with preserve_case=False, reduce_len=True and strip_handles=True.
tokenizer = TweetTokenizer(preserve_case=False,
                           reduce_len=True,
                           strip_handles=True)

# The bound method can be applied directly; no wrapper lambda is needed.
df['clean_tweet'] = df['clean_tweet'].apply(tokenizer.tokenize)

# Preview the first rows only, like the other cells, instead of echoing the whole frame.
df.head(5)

In [None]:
# Word cloud over the whole corpus.
# `all_words` is a single space-separated string holding every token of every tweet.
all_words = ' '.join(' '.join(tokens) for tokens in df['clean_tweet'])
wordcloud = WordCloud(width=800, height=500, random_state=21, max_font_size=110).generate(all_words)

fig, ax = plt.subplots(figsize=(10, 7))
ax.imshow(wordcloud, interpolation="bilinear")
ax.axis('off')
plt.show()

In [None]:
# Disaster tweets: word cloud built only from rows with target == 1.

disaster_rows = df.loc[df['target'] == 1, 'clean_tweet']
negative_words = ' '.join(' '.join(tokens) for tokens in disaster_rows)
wordcloud = WordCloud(width=800, height=500, random_state=21, max_font_size=110).generate(negative_words)

fig, ax = plt.subplots(figsize=(10, 7))
ax.imshow(wordcloud, interpolation="bilinear")
ax.axis('off')
plt.show()

In [None]:
# Non disaster tweets: word cloud built only from rows with target == 0.
# The figure is also written to 'Non_disaster.png' before being shown.

non_disaster_rows = df.loc[df['target'] == 0, 'clean_tweet']
normal_words = ' '.join(' '.join(tokens) for tokens in non_disaster_rows)
wordcloud = WordCloud(width=800, height=500, random_state=21, max_font_size=110).generate(normal_words)

fig, ax = plt.subplots(figsize=(10, 7))
ax.imshow(wordcloud, interpolation="bilinear")
ax.axis('off')
fig.savefig('Non_disaster.png', bbox_inches='tight')
plt.show()

In [None]:
# Tokenization leaves some very short fragments behind, most of which carry no meaning.
# Inspect a few of them, then drop every token of length <= min_word_len.
min_word_len = 2

short_words = [x for x in all_words.split() if len(x) <= min_word_len]
print(short_words[:10])


def _drop_short_tokens(tokens, min_len=min_word_len):
    """Join the tokens longer than ``min_len`` characters into one string.

    ``tokens`` is normally a list of tokens. A plain string is split on
    whitespace first, so re-running this cell (when the column already holds
    joined strings) does not iterate over single characters and wipe the text.
    """
    if isinstance(tokens, str):
        tokens = tokens.split()
    return ' '.join(w for w in tokens if len(w) > min_len)


df['clean_tweet'] = df['clean_tweet'].apply(_drop_short_tokens)

df.head(5)

#### Test:
1. Compare feeding the raw (lightly processed) tweets versus the fully pre-processed tweets into the embedding model.

# 5. Model Selection and Training
   - Select appropriate machine learning models for text classification, such as Naive Bayes, Support Vector Machines (SVM), or deep learning models like Recurrent Neural Networks (RNNs) or Transformers.
   - Split the preprocessed data into training and validation sets.
   - Train the selected models using the training data and evaluate their performance using appropriate evaluation metrics.

In [None]:
from xgboost import XGBClassifier

# Instantiate a baseline XGBoost classifier; it is only constructed here, not fitted
model = XGBClassifier(n_estimators=1000, learning_rate=0.01, n_jobs=-1, 
                      early_stopping_rounds=5)

In [None]:
from sklearn.metrics import f1_score

def train_model(X, y, X_val, y_val, model='SVC'):
    """Fit and return a classifier selected by name.

    Args:
        X: Training features.
        y: Training labels.
        X_val: Validation features. Only the XGBoost branch uses them, as the
            ``eval_set`` that drives early stopping.
        y_val: Validation labels (same usage as ``X_val``).
        model: ``'SVC'`` (default) or ``'XGBClassifier'``.

    Returns:
        The fitted estimator.

    Raises:
        ValueError: If ``model`` is not one of the supported names.
    """
    supported = ('SVC', 'XGBClassifier')
    if model not in supported:
        # Without this check an unknown name fell through both branches and the
        # string itself was returned, failing only later when used as a model.
        raise ValueError(f"Unknown model {model!r}; expected one of {supported}")

    if model == 'SVC':
        # Imported locally so the function does not rely on an import made in a later cell.
        from sklearn.svm import SVC
        estimator = SVC()
        estimator.fit(X, y)
    else:
        estimator = XGBClassifier(n_estimators=1000, learning_rate=0.05, n_jobs=-1,
                                  early_stopping_rounds=5,
                                  max_depth=3)
        estimator.fit(X, y,
                      eval_set=[(X_val, y_val)],
                      verbose=False)
    return estimator

def eval_model(model, X, y):
    """Return the F1 score of ``model``'s predictions on ``X`` against the labels ``y``."""
    return f1_score(y, model.predict(X))

In [None]:
!pip install sentence-transformers

In [None]:
# K-fold cross validation
from sklearn.model_selection import KFold
from sentence_transformers import SentenceTransformer
from sklearn.svm import SVC


# 5-fold cross-validation of the XGBoost classifier on sentence embeddings of the raw
# 'text' column (the 'clean_tweet' column is not used here).
X = df['text']
y = df['target']
k = 5
kf = KFold(n_splits=k, shuffle=True, random_state=42)

# Embed all tweets once; each fold then only slices rows out of the embedding array.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
X_embeddings = embedding_model.encode(X.tolist())

train_scores = []
val_scores = []
for train_index, val_index in tqdm(kf.split(X_embeddings), total=k):
    X_train, X_val = X_embeddings[train_index], X_embeddings[val_index]
    # KFold yields positional indices, so the labels are selected by position.
    # Plain y[...] is label-based and only agrees while df keeps its default 0..n-1 index.
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]

    # Perform model training and evaluation on the current fold.
    # NOTE: train_model also hands this validation split to XGBoost as the
    # early-stopping eval_set, so the validation score is not fully independent.
    model = train_model(X_train, y_train, X_val, y_val, model="XGBClassifier")
    train_score = eval_model(model, X_train, y_train)
    val_score = eval_model(model, X_val, y_val)

    # Store the train and validation F1 scores for the current fold
    train_scores.append(train_score)
    val_scores.append(val_score)

# Print the scores for each fold
for fold in range(k):
    print(f"Fold {fold+1}: Train Score = {train_scores[fold]}")
print(f"Avg Train Score = {sum(train_scores) / len(train_scores)}")

for fold in range(k):
    print(f"Fold {fold+1}: Validation Score = {val_scores[fold]}")
print(f"Avg Validation Score = {sum(val_scores) / len(val_scores)}")

# Base XGBClassifier gives: 
# Avg Train Score = 0.9890328150989525
# Avg Validation Score = 0.7744712508570869

# Without GPU ~1:22 and ~2:58

In [None]:
# from sklearn.model_selection import GridSearchCV

# # Define the parameter grid
# param_grid = {
#     'max_depth': [3, 5, 7],
#     'learning_rate': [0.1, 0.01],
#     'n_estimators': [100, 200, 300],
# }

# # Create the XGBClassifier model
# xgb_model = XGBClassifier()

# # Perform grid search with cross-validation
# grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, scoring='accuracy', cv=5)
# grid_search.fit(X_embeddings, y)

# # Get the best hyperparameter combination
# best_params = grid_search.best_params_

# X_train, X_test, y_train, y_test = train_test_split(X_embeddings, y, test_size=0.2, random_state=42)

# # Train the final model using the best hyperparameters
# final_model = XGBClassifier(**best_params)
# final_model.fit(X_train, y_train)

# # Evaluate the final model on the test set
# y_pred = final_model.predict(X_test)
# test_f1_score = f1_score(y_test, y_pred)

# print("Best Hyperparameters:", best_params)
# print("Test F1-score:", test_f1_score)

### Ideas to further boost performance

- There exists a small data imbalance between positive and negative samples. We could generate some positive samples using the synonym replacement method. Source: https://neptune.ai/blog/data-augmentation-nlp
- feature engineering

In [None]:
### You can look into 'texthero' package for user-friendly preprocessing of text
# example: https://www.kaggle.com/code/aravindanr22052001/stackoverflowrun/notebook

# Generate submission

In [None]:
# # Load training dataset
# sub_test = "test.csv"
# sub_df = pd.read_csv(sub_test)
# sub_df.drop(['keyword', 'location'], axis=1, inplace=True)

# # pre-process text
# corpus = preprocess_text(sub_df, stop=stopwords)
# sub_df['text'] = [" ".join(l) for l in corpus]

# sub_df.head()

In [None]:
# sub_df = df
# sub_df

In [None]:
# # Load your sentence encoding model
# embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# # Encode the testing data
# X_sub_embeddings = embedding_model.encode(sub_df['text'].tolist())

# # y_sub_pred = model.predict(X_sub_embeddings)
# y_sub_pred = predictor.predict(X_sub_embeddings)
# y_sub_pred


In [None]:
# import csv

# with open('submission.csv', 'w', newline='') as f:
#     writer = csv.writer(f)
    
#     writer.writerow(['id','target'])
    
#     for i, j in zip(sub_df['id'].tolist(), y_sub_pred):
#         writer.writerow([i, j])


In [None]:
from sentence_transformers import SentenceTransformer

# Hold out 20% of the raw tweets (fixed seed) as a test set.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['target'],
    test_size=0.2,
    random_state=42,
)

# Sentence encoder used to turn each tweet into an embedding.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode the train split first, then the test split (takes ~1min on CPU).
X_train_embeddings, X_test_embeddings = (
    embedding_model.encode(split.tolist()) for split in (X_train, X_test)
)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

def report_metrics(predictor, y_train, y_train_pred, y_test, y_test_pred):
    """Print train/test classification reports and the test accuracy.

    ``predictor`` is only used for its class name in the header line. The
    reports come from ``classification_report``; the accuracy is the mean of
    the element-wise comparison ``y_test_pred == y_test``.
    """
    # Compute everything before printing so the output comes out as one block.
    header = f"Performance for {type(predictor).__name__}"
    train_report = classification_report(y_train, y_train_pred)
    test_report = classification_report(y_test, y_test_pred)
    test_accuracy = (y_test_pred == y_test).mean()

    for line in (header, "Train Statistics:", train_report, "\nTest Statistics:", test_report):
        print(line)

    print("Test Accuracy:", test_accuracy)


# Fit and report each candidate estimator in the list (currently only SVC)
for predictor in [SVC()]:   # LogisticRegression(), 
    
    # Use the current candidate estimator as the classifier for this iteration
    classifier = predictor

    classifier.fit(X_train_embeddings, y_train)

    # Predict the labels for the training and testing data
    y_train_pred = classifier.predict(X_train_embeddings)
    y_test_pred = classifier.predict(X_test_embeddings)

    # Print classification reports and test accuracy for this estimator
    report_metrics(classifier, y_train, y_train_pred, y_test, y_test_pred)

    
""" Spacy, TweetTokenizer, and custom regex pre-processing
Performance for SVC
Train Statistics:
              precision    recall  f1-score   support

           0       0.89      0.96      0.92      3468
           1       0.95      0.84      0.89      2622

    accuracy                           0.91      6090
   macro avg       0.92      0.90      0.91      6090
weighted avg       0.91      0.91      0.91      6090


Test Statistics:
              precision    recall  f1-score   support

           0       0.83      0.89      0.86       874
           1       0.84      0.76      0.80       649

    accuracy                           0.84      1523
   macro avg       0.84      0.83      0.83      1523
weighted avg       0.84      0.84      0.84      1523

Test Accuracy: 0.8365068942875903
"""
""" with same processing steps
Performance for XGBClassifier
Train Statistics:
              precision    recall  f1-score   support

           0       0.98      0.99      0.99      3468
           1       0.99      0.97      0.98      2622

    accuracy                           0.98      6090
   macro avg       0.99      0.98      0.98      6090
weighted avg       0.98      0.98      0.98      6090


Test Statistics:
              precision    recall  f1-score   support

           0       0.83      0.88      0.86       874
           1       0.83      0.75      0.79       649

    accuracy                           0.83      1523
   macro avg       0.83      0.82      0.82      1523
weighted avg       0.83      0.83      0.83      1523

Test Accuracy: 0.8286277084701248
"""