# Natural Language Processing with Disaster Tweets

Competition link: https://www.kaggle.com/competitions/nlp-getting-started/overview

In [1]:
# Importing libraries
import pandas as pd
import string
import re
from nltk.corpus import stopwords 
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

# scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Do not truncate column contents when displaying frames, so full tweets stay readable
pd.set_option('display.max_colwidth', None)

In [2]:
# Peek at the sample submission to see the format the competition expects.
# Kept under its own name so it is not confused with the training frame `df`
# that is loaded in the next section.
file_loc = '../data/sample_submission.csv'
df_sample_submission = pd.read_csv(file_loc)
df_sample_submission.head()


Unnamed: 0,id,target
0,0,0
1,2,0
2,3,0
3,9,0
4,11,0


# Machine Learning

In [3]:
# Read the labelled training tweets and the unlabelled competition test tweets
file_loc_train, file_loc_test = '../data/train.csv', '../data/test.csv'
df, df_test = (pd.read_csv(csv_path) for csv_path in (file_loc_train, file_loc_test))

# Column names, dtypes and non-null counts of the training frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [4]:
# Raw text of the rows with target == 1 among the first 100 training rows
df.head(100).loc[lambda rows: rows['target'] == 1, 'text']

0                                                                              Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all
1                                                                                                             Forest fire near La Ronge Sask. Canada
2              All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected
3                                                                                  13,000 people receive #wildfires evacuation orders in California 
4                                                           Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school 
5                                     #RockyFire Update => California Hwy. 20 closed in both directions due to Lake County fire - #CAfire #wildfires
6                                                    #flood #disaster Heavy rain causes flash flooding of 

In [5]:
stop_w = set(stopwords.words('english'))

# Raw string so that `\S` is read as a regex class instead of an invalid string
# escape; compiled once rather than on every call.
_URL_PATTERN = re.compile(r'http\S+')
# Punctuation characters stripped from every token (string.punctuation).
_PUNCTUATION = frozenset(string.punctuation)
# One shared lemmatizer instead of a new instance per tweet.
_LEMMATIZER = WordNetLemmatizer()


def preprocessing(sentence):
    """Normalise one tweet into a space-separated string of lemmatized tokens.

    The sentence is stripped, lower-cased and split on whitespace. For each
    token, in order:

    1. tokens found in ``stop_w`` or starting with ``http`` are dropped
       (the check runs on the raw token, before punctuation is removed);
    2. digits and ``string.punctuation`` characters are removed;
    3. tokens left empty by step 2 are dropped, so the result never
       contains runs of consecutive spaces;
    4. the remaining text is lemmatized as a noun (``pos='n'``).

    Parameters
    ----------
    sentence : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces (possibly empty).
    """
    cleaned_words = []
    for word in sentence.strip().lower().split():
        # match() anchors at the start of the token, so only tokens that begin
        # with "http" are treated as URLs.
        if word in stop_w or _URL_PATTERN.match(word):
            continue

        stripped = ''.join(
            c for c in word
            if not c.isdigit() and c not in _PUNCTUATION
        )
        # Tokens such as "=>" or "20" vanish entirely; skip them instead of
        # emitting empty strings that turn into double spaces after the join.
        if not stripped:
            continue

        cleaned_words.append(_LEMMATIZER.lemmatize(stripped, pos='n'))

    return ' '.join(cleaned_words)

In [6]:
# Add a cleaned copy of every training tweet next to the raw text
df['clean_text'] = df['text'].map(preprocessing)

In [7]:
# Cleaned text of the rows with target == 1 among the first 100 training rows
df.head(100).loc[lambda rows: rows['target'] == 1, 'clean_text']

0                                                                       deed reason earthquake may allah forgive u
1                                                                            forest fire near la ronge sask canada
2                            resident asked shelter place notified officer evacuation shelter place order expected
3                                                              people receive wildfire evacuation order california
4                                                           got sent photo ruby alaska smoke wildfire pours school
5                         rockyfire update  california hwy  closed direction due lake county fire  cafire wildfire
6                               flood disaster heavy rain cause flash flooding street manitou colorado spring area
7                                                                                        im top hill see fire wood
8                                                      there emergency evacuatio

In [8]:
# TF-IDF vectorizer with scikit-learn's default settings
vectorizer = TfidfVectorizer()

# Learn the vocabulary from the cleaned tweets and encode them as a feature matrix
X = vectorizer.fit_transform(df['clean_text'])
y = df['target']

# Inspection table: one row per tweet (labelled with its target), one column per term.
# Built straight from the sparse matrix: X.toarray() would allocate a dense
# (tweets x vocabulary) float array that is almost entirely zeros.
words_weight = pd.DataFrame.sparse.from_spmatrix(
    X, columns=vectorizer.get_feature_names_out()
)
words_weight.index = y

Unnamed: 0_level_0,aa,aaaa,aaaaaaallll,aaaaaand,aaarrrgghhh,aaceorg,aal,aampb,aampw,aan,...,ûò,ûòthe,ûòåêcnbc,ûó,ûóbbc,ûóher,ûókody,ûónegligence,ûótech,ûówe
target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:


# Hold out 30% of the rows for validation; the fixed seed keeps the split reproducible.
# NOTE(review): X comes from a vectorizer that was fitted on every row before this
# split, so the held-out rows already influenced the vocabulary / IDF weights and
# the validation score may be slightly optimistic.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Fit a logistic-regression classifier with default hyper-parameters
lg = LogisticRegression()
lg.fit(X_train, y_train)


LogisticRegression()

In [10]:
# Label the held-out rows with the fitted classifier
y_pred = lg.predict(X_test)

# Overall accuracy plus the per-class precision / recall / F1 table
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy:.4f}', 'Classification Report:', report, sep='\n')

Accuracy: 0.8021
Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.92      0.84      1318
           1       0.86      0.64      0.73       966

    accuracy                           0.80      2284
   macro avg       0.82      0.78      0.79      2284
weighted avg       0.81      0.80      0.80      2284



In [13]:
# Build the submission: clean the test tweets, encode them with the already-fitted
# vectorizer, and predict with the trained classifier
df_test['clean_text'] = df_test['text'].map(preprocessing)
pred = vectorizer.transform(df_test['clean_text'])
predictions = lg.predict(pred)

# Two-column frame (id, target), written without the index column
df_submit = df_test[['id']].assign(target=predictions)
df_submit.to_csv('submission_09_09_2024.csv', index=False)
df_submit.head()

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1


In [None]:
"""
Logistic Regression model: the accuracy stayed the same
List of keywords in TfidfVectorizer(): it got worse

"""

In [None]:
# Accuracy score was 0.79436