# Natural Language Processing with Disaster Tweets

This is the link of the competition: https://www.kaggle.com/competitions/nlp-getting-started/overview

In [1]:
# Importing libraries
import pandas as pd
import string
from nltk.corpus import stopwords 
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

# scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

In [2]:
# This is the sample format way to submit the contest
file_loc = 'data/sample_submission.csv'
df = pd.read_csv(file_loc)
df.head()

Unnamed: 0,id,target
0,0,0
1,2,0
2,3,0
3,9,0
4,11,0


# Machine Learning

In [3]:
file_loc = 'data/train.csv'
df = pd.read_csv(file_loc)

In [4]:
df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [5]:
stop_w = set(stopwords.words('english'))

def preprocessing(sentence):
    
    def custom_lem(word):
        dict_lem = [{'fire':['wildfires', 'wildfire', '#wildfires']}]
        lem_word = [{k for k,v in di.items() if word in v} for di in dict_lem][0]
        
        if lem_word:
            return iter(lem_word)
        else:
            return word
    
    wl = WordNetLemmatizer()
    sentence = sentence.strip()
    sentence = sentence.lower()
    list_word = sentence.split()
    list_word = [wl.lemmatize(''.join(
                    [
                        c for c in custom_lem(char) if not c.isdigit() 
                          and c not in string.punctuation

                ]               
            ), pos='v' )
        for char in list_word
        if char not in stop_w
        ]
    
    return ' '.join(list_word)

In [6]:
preprocessing('dogs were on #wildfire')

'dog wildfire'

In [7]:
df['clean_text'] = [preprocessing(s) for s in df['text']]

In [8]:
count = 0
for sentence in df['clean_text']:
    if 'fire' in sentence:
        count += 1
print(count)

474


In [9]:
# The vectorizer model
vectorizer = TfidfVectorizer()

# Transform the text data to feature vectors
X = vectorizer.fit_transform(df['text'])
y = df['target']

In [10]:


# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize and train the classifier
clf = SVC(kernel='linear')
clf.fit(X_train, y_train)

SVC(kernel='linear')

In [11]:
# Predict on the test set
y_pred = clf.predict(X_test)

# Evaluate the performance
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy:.4f}')
print('Classification Report:')
print(report)

Accuracy: 0.8122
Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.90      0.85      1318
           1       0.84      0.69      0.76       966

    accuracy                           0.81      2284
   macro avg       0.82      0.80      0.80      2284
weighted avg       0.82      0.81      0.81      2284

