# Natural Language Processing with Disaster Tweets

This is the link of the competition: https://www.kaggle.com/competitions/nlp-getting-started/overview

In [1]:
# Importing libraries
import pandas as pd
import string
import re
from nltk.corpus import stopwords 
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

# scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

pd.set_option('display.max_colwidth', None)

In [2]:
# This is the sample format way to submit the contest
file_loc = 'data/sample_submission.csv'
df = pd.read_csv(file_loc)
df.head()


Unnamed: 0,id,target
0,0,0
1,2,0
2,3,0
3,9,0
4,11,0


# Machine Learning

In [3]:
file_loc = 'data/train.csv'
df = pd.read_csv(file_loc)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [4]:
df.head(100).loc[df['target']== 1, 'text']

0                                                                              Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all
1                                                                                                             Forest fire near La Ronge Sask. Canada
2              All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected
3                                                                                  13,000 people receive #wildfires evacuation orders in California 
4                                                           Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school 
5                                     #RockyFire Update => California Hwy. 20 closed in both directions due to Lake County fire - #CAfire #wildfires
6                                                    #flood #disaster Heavy rain causes flash flooding of 

In [5]:
stop_w = set(stopwords.words('english'))

def preprocessing(sentence):
    
    wl = WordNetLemmatizer()
    sentence = sentence.strip()
    sentence = sentence.lower()
    list_word = sentence.split()
    pattern = 'http\S+'
    list_word = [wl.lemmatize(''.join(
                    [
                        c for c in char if not c.isdigit() 
                          and c not in string.punctuation
                    ]               
            ), pos='v' )
        for char in list_word
        if char not in stop_w
        and not re.match(pattern, char)
        ]
    
    return ' '.join(list_word)

In [6]:
df['clean_text'] = [preprocessing(s) for s in df['text']]

In [13]:
df.head(100).loc[df['target']== 1, 'clean_text']

0                                                                 deeds reason earthquake may allah forgive us
1                                                                        forest fire near la ronge sask canada
2                             residents ask shelter place notify officer evacuation shelter place order expect
3                                                         people receive wildfires evacuation order california
4                                                       get send photo ruby alaska smoke wildfires pour school
5                    rockyfire update  california hwy  close directions due lake county fire  cafire wildfires
6                            flood disaster heavy rain cause flash flood streets manitou colorado spring areas
7                                                                                   im top hill see fire woods
8                                                       theres emergency evacuation happen build across street
9

In [7]:
count = 0
for sentence in df['clean_text']:
    if 'fire' in sentence:
        count += 1
print(count)

474


In [8]:
df['clean_text'].unique()

array(['deeds reason earthquake may allah forgive us',
       'forest fire near la ronge sask canada',
       'residents ask shelter place notify officer evacuation shelter place order expect',
       ...,
       'officials say quarantine place alabama home possible ebola case develop symptoms',
       'flip side im walmart bomb everyone evacuate stay tune blow',
       'suicide bomber kill  saudi security site mosque  reuters via world  google news  wall '],
      dtype=object)

In [9]:
# The vectorizer model
vectorizer = TfidfVectorizer()

# Transform the text data to feature vectors
X = vectorizer.fit_transform(df['clean_text'])
y = df['target']

In [10]:


# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize and train the classifier
clf = SVC(kernel='linear')
clf.fit(X_train, y_train)

SVC(kernel='linear')

In [11]:
# Predict on the test set
y_pred = clf.predict(X_test)

# Evaluate the performance
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy:.4f}')
print('Classification Report:')
print(report)

Accuracy: 0.8039
Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.90      0.84      1318
           1       0.83      0.68      0.75       966

    accuracy                           0.80      2284
   macro avg       0.81      0.79      0.79      2284
weighted avg       0.81      0.80      0.80      2284



In [12]:
"""
Logistic Regression model: improved 0
list of keywords in TfidVectorizer(): improved -

"""

'\nLogistic Regression model: improved 0\nlist of keywords in TfidVectorizer(): improved -\n\n'