<div class="alert alert-info alertinfo" style="margin-top: 0px">
<h1> Natural Language Processing with Disaster Tweets </h1>
part 3 - machine learning
</div>

<div class="alert-success" style="margin-top: 0px">
<h1> Imports </h1>
</div> 

In [1]:
# standard
import pandas as pd
import numpy as np
from collections import Counter

# NLP
import re
import nltk
from nltk.corpus import words
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk import word_tokenize, pos_tag, ne_chunk

# visualization
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# models
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from catboost import CatBoostClassifier

# others
import joblib
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer


### 1. read data set

In [2]:
# Input files are read from the notebook's working directory.
TRAIN_CSV = 'train.csv'
TEST_CSV = 'test.csv'

df = pd.read_csv(TRAIN_CSV)
test_set = pd.read_csv(TEST_CSV)
df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


<div class="alert-success" style="margin-top: 0px">
<h1> Data cleaning </h1>
</div> 

In [3]:
# Replace every missing value with an empty string so the text helpers
# below always receive `str` input.
df = df.fillna('')

### pre-clean and corpus set-up
A first set of cleaning functions; these will be updated later on.

In [4]:
# Definitions 1
# NLTK's English stop-word list, materialised once as a set.
STOPWORDS = {*stopwords.words('english')}

# Any run of non-whitespace that starts with "http", "www" or "https".
_URL_PATTERN = re.compile(r"http\S+|www\S+|https\S+", flags=re.MULTILINE)
# Shortest possible "<...>" span (does not cross line breaks).
_TAG_PATTERN = re.compile(r'<.*?>')


def remove_urls(text):
    """Delete URL-like tokens and then ``<...>`` tags from ``text``.

    Whitespace around the removed pieces is left as it was, so the result
    may contain double spaces.
    """
    without_links = _URL_PATTERN.sub('', text)
    return _TAG_PATTERN.sub('', without_links)

def remove_emails(text):
    """Remove e-mail-like tokens and ``@mention`` handles from ``text``.

    First every ``something@something`` token is deleted, then every
    ``@name`` that sits at the start of the string or right after whitespace
    (that whitespace is removed with it). The result is stripped of leading
    and trailing whitespace.
    """
    no_addresses = re.sub(r'\S+@\S+', '', text, flags=re.MULTILINE)
    no_mentions = re.sub(r'(^|\s)@\w+', '', no_addresses)
    return no_mentions.strip()

def remove_foreign_characters(text):
    """Return ``text`` with every non-ASCII character (above 0x7F) removed."""
    return re.sub(r'([^\x00-\x7F])+', '', text)

def remove_short_words(text):
    """Keep only whitespace-separated tokens of three or more characters.

    The surviving tokens are re-joined with single spaces.
    """
    long_enough = (token for token in text.split() if len(token) >= 3)
    return ' '.join(long_enough)

def remove_stopwords(text):
    """Drop tokens whose lower-cased form is in the module-level ``STOPWORDS``.

    Tokens are split on whitespace and the kept ones are re-joined with
    single spaces; their original casing is preserved.
    """
    kept = []
    for token in text.split():
        if token.lower() in STOPWORDS:
            continue
        kept.append(token)
    return ' '.join(kept)

def remove_symbols_and_numbers(text):
    """Replace every non-alphabetic character with a space, then squeeze spaces.

    Digits, punctuation and whitespace all become separators, so
    ``"wild-fires"`` turns into ``"wild fires"``.
    """
    letters_only = ''.join(ch if ch.isalpha() else ' ' for ch in text)
    return ' '.join(letters_only.split())

def clean_phase_1(text):
    """Lower-case ``text`` and run it through the first-pass cleaning steps.

    The steps are applied in a fixed order: URLs/tags, e-mails/mentions,
    non-ASCII characters, symbols and digits, stop words, short words.
    """
    # Order matters: symbols are stripped before the stop-word and
    # short-word filters so those see plain alphabetic tokens.
    steps = (
        remove_urls,
        remove_emails,
        remove_foreign_characters,
        remove_symbols_and_numbers,
        remove_stopwords,
        remove_short_words,
    )
    cleaned = text.lower()
    for step in steps:
        cleaned = step(cleaned)
    return cleaned

In [5]:
# Reference vocabulary: the NLTK English word list as a set.
english_words = set(words.words())

# find 'important words' - 'keywords' in our training set
data = df.copy()  # work on a copy so `df` itself is not modified here
feature_name = 'keyword'
data[feature_name] = data[feature_name].apply(clean_phase_1)
set_values = set(data[feature_name].values)
# discard() instead of remove(): no KeyError when no keyword cleans to ''.
set_values.discard('')
important_words = set()
for value in set_values:
    # The tokens are deliberately NOT bound to a variable called `words`:
    # that would overwrite the `nltk.corpus.words` import used at the top of
    # this cell and make `words.words()` fail when the cell is run again.
    important_words.update(value.split())

examples = list(important_words)[:5]
examples

['destroyed', 'destroy', 'tsunami', 'landslide', 'meltdown']

In [6]:
# find 'unimportant words' - 'locations' in our training set
feature_name = 'location'
data[feature_name] = data[feature_name].apply(clean_phase_1)
# Flatten all cleaned location strings into individual tokens. The tokens
# are not stored in a variable called `words`, which would overwrite the
# `nltk.corpus.words` import.
location_tokens = ' '.join(data[feature_name].dropna()).split()
set_values = {token for token in location_tokens if len(token) > 2}
# Each entry is already a single token, so no further splitting is needed.
unimportant_words = set(set_values)
# Location words that are also disaster keywords stay in the vocabulary.
words_to_keep = unimportant_words.intersection(important_words)
unimportant_words = unimportant_words - words_to_keep

examples = list(unimportant_words)[:5]
examples

['doflamingo', 'brazos', 'cairo', 'bandar', 'berhati']

In [7]:
# updating corpus: add the keyword vocabulary, then drop location-only words
english_words = (english_words | important_words) - unimportant_words

In [8]:
# train data words
MIN_WORD_COUNT = 5  # a word must occur at least this often in the tweets to be kept

feature_name = 'text'
data[feature_name] = data[feature_name].apply(clean_phase_1)
all_text = ' '.join(data['text'].dropna())
# Named `text_tokens` rather than `words` so the `nltk.corpus.words` import
# stays usable after this cell has run.
text_tokens = all_text.split()
word_counts = Counter(text_tokens)
# most_common() yields (word, count) pairs already sorted by descending count.
word_counts_df = pd.DataFrame(word_counts.most_common(), columns=['word', 'count'])
data_words_df = word_counts_df[word_counts_df['count'] >= MIN_WORD_COUNT]
train_data_words = set(data_words_df['word'].values)

# Frequent tweet words that are / are not already in the reference vocabulary.
train_words_common = train_data_words.intersection(english_words)
train_words_uncommon = train_data_words - train_words_common

# updating corpus
english_words.update(train_data_words) # all words on the first phase

### 3. prepare for Machine Learning

In [9]:
# prepare_for_ml
def filter_words(text, english_words, filter=True):
    """Keep only the tokens of ``text`` whose lower-cased form is in ``english_words``.

    When ``filter`` is falsy the text is returned unchanged; otherwise the
    surviving whitespace-separated tokens are re-joined with single spaces.
    """
    if not filter:
        return text
    return ' '.join(token for token in text.split() if token.lower() in english_words)

def stem_text(text):
    """Tokenise ``text`` with NLTK and return the Porter stems joined by spaces."""
    stemmer = PorterStemmer()
    tokens = word_tokenize(text)
    return ' '.join(map(stemmer.stem, tokens))

def prepare_for_ml(data, english_words, filter=True):
    """Clean, vocabulary-filter and stem the text columns of ``data``.

    The frame is modified in place and also returned:
    missing values are replaced by ``''``, ``'keyword'`` is overwritten with
    its processed form, and a new ``'processed text'`` column is derived from
    ``'text'`` (which is left untouched).

    Parameters
    ----------
    data : DataFrame with ``'keyword'`` and ``'text'`` columns.
    english_words : collection of allowed lower-case words.
    filter : when true (the default) tokens outside ``english_words`` are
        dropped; when false the vocabulary filter is skipped. The name
        mirrors the ``filter`` argument of ``filter_words``.

    Previously ``filter`` was not a parameter, so the name resolved to the
    builtin ``filter`` function. That object is always truthy, which meant
    filtering happened only by accident and could not be turned off.
    """
    data.fillna('', inplace=True)

    def _process(column):
        # clean -> keep known words (optional) -> stem
        return (
            column.apply(clean_phase_1)
            .apply(lambda x: filter_words(x, english_words, filter))
            .apply(stem_text)
        )

    data['keyword'] = _process(data['keyword'])
    data['processed text'] = _process(data['text'])
    return data

# Both frames go through the same preprocessing with the same vocabulary.
df = prepare_for_ml(df, english_words)
test_set = prepare_for_ml(test_set, english_words)
df.head()

Unnamed: 0,id,keyword,location,text,target,processed text
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,reason earthquak may allah forgiv
1,4,,,Forest fire near La Ronge Sask. Canada,1,forest fire near canada
2,5,,,All residents asked to 'shelter in place' are ...,1,resid ask shelter place notifi offic evacu she...
3,6,,,"13,000 people receive #wildfires evacuation or...",1,peopl receiv wildfir evacu order california
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,got sent photo rubi alaska smoke wildfir school


<div class="alert-success" style="margin-top: 0px">
<h1> Vectorising </h1>
</div> 

In [10]:
# Fit the TF-IDF vocabulary on the processed training tweets only.
vectorizer = TfidfVectorizer()
corpus = df['processed text'].to_list()
tfidf_matrix = vectorizer.fit_transform(corpus)

# Dense copy of the TF-IDF matrix for inspection (optional)
feature_names = vectorizer.get_feature_names_out()
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)
tfidf_df

Unnamed: 0,aal,aba,abandon,abc,abil,abject,abl,ablaz,aboard,abomin,...,zaman,zar,zeal,zionist,zip,zipper,zodiac,zombi,zone,zoom
0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7608,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7609,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7610,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7611,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# Reuse the already fitted vectorizer (transform, not fit_transform).
test_corpus = test_set['processed text'].to_list()
test_matrix = vectorizer.transform(test_corpus)
test_tifi = pd.DataFrame(
    test_matrix.toarray(),
    columns=vectorizer.get_feature_names_out(),
)
# Sanity check: both dense frames expose the same feature columns.
assert (test_tifi.columns == tfidf_df.columns).all()
test_tifi

Unnamed: 0,aal,aba,abandon,abc,abil,abject,abl,ablaz,aboard,abomin,...,zaman,zar,zeal,zionist,zip,zipper,zodiac,zombi,zone,zoom
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3258,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3259,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3260,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3261,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


<div class="alert-success" style="margin-top: 0px">
<h1> Train test split </h1>
</div> 

In [12]:
# Features are the sparse TF-IDF matrix; the label is the 'target' column.
X, y = tfidf_matrix, df['target']

# Hold out 20% of the rows for evaluation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

<div class="alert-success" style="margin-top: 0px">
<h1> Model comparison</h1>
</div> 

In [13]:
# Define models
RANDOM_STATE = 42  # fixed seed so the random forest scores are repeatable

models = {
    'Logistic Regression': LogisticRegression(),
    'Multinomial Naive Bayes': MultinomialNB(),
    # A random forest bootstraps rows and subsamples features at random;
    # without a seed its reported accuracy changes from run to run.
    'Random Forest': RandomForestClassifier(random_state=RANDOM_STATE),
    # with_mean=False keeps the sparse TF-IDF input sparse while scaling.
    'Support Vector Machine': make_pipeline(StandardScaler(with_mean=False), SVC()),
    'CatBoost': CatBoostClassifier(iterations=100, depth=5, learning_rate=0.1, loss_function='Logloss')
}

# Train and evaluate models
for name, model in models.items():
    print('starting new model', name, model)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    # Evaluate model performance on the held-out split
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    
    # Save the trained model (file name is derived from the display name,
    # spaces included). Only the estimator is written by this cell; the
    # fitted `vectorizer` is not, so whoever loads the model must supply
    # features built with the same fitted TF-IDF vocabulary.
    model_filename = f'{name}_model.joblib'
    joblib.dump(model, model_filename)
    
    # Display results
    print(f'\nModel: {name}')
    print(f'Accuracy: {accuracy:.4f}')
    print('Classification Report:\n', report)
    print(f'Model saved as: {model_filename}')
    print('--------------------------------------------------')


starting new model Logistic Regression LogisticRegression()

Model: Logistic Regression
Accuracy: 0.8024
Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.88      0.84       874
           1       0.82      0.69      0.75       649

    accuracy                           0.80      1523
   macro avg       0.81      0.79      0.79      1523
weighted avg       0.80      0.80      0.80      1523

Model saved as: Logistic Regression_model.joblib
--------------------------------------------------
starting new model Multinomial Naive Bayes MultinomialNB()

Model: Multinomial Naive Bayes
Accuracy: 0.8024
Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.89      0.84       874
           1       0.82      0.69      0.75       649

    accuracy                           0.80      1523
   macro avg       0.81      0.79      0.79      1523
weighted avg       0.80      0.80      0

### Best accuracy of 80.24% was achieved by the Multinomial Naive Bayes model (same accuracy as Logistic Regression, with slightly better class-0 recall: 0.89 vs 0.88)
### A re-evaluation of the model can be found in part 4