## Import Necessary Libraries

In [5]:


import pandas as pd
import numpy as np
import re
import warnings
import emoji
import random
import nltk
from nltk.corpus import stopwords
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras import layers, models, callbacks

# One seed for the whole notebook, pushed into every random number generator
# used here (NumPy, Python's `random`, TensorFlow) so runs are repeatable.
random_seed = 2024
for _seed_rng in (np.random.seed, random.seed, tf.random.set_seed):
    _seed_rng(random_seed)

In [None]:
# Central place for the settings the cells below read
class Config:
    """Static notebook settings, accessed as class attributes."""

    # Locations of the competition CSV files
    train_path = "/kaggle/input/nlp-getting-started/train.csv"
    test_path = "/kaggle/input/nlp-getting-started/test.csv"
    # How many of the most frequent training tokens are kept as `top_words`
    word_count = 1000
    # Seed handed to the random number generators
    seed = random_seed

# Strip anything shaped like an HTML tag from a string
def remove_html_tags(text):
    """Return `text` with every ``<...>`` span deleted.

    The match is non-greedy, so each tag is removed on its own and the
    characters between two tags are kept. A span containing a newline is
    not matched.
    """
    return re.sub(r'<.*?>', r'', text)

# Lazily-built cache of NLTK's English stop words (filled on first use)
_STOP_WORDS_CACHE = None


def _english_stop_words():
    """Return the English stop-word set, loading it from NLTK only once."""
    global _STOP_WORDS_CACHE
    if _STOP_WORDS_CACHE is None:
        _STOP_WORDS_CACHE = set(stopwords.words('english'))
    return _STOP_WORDS_CACHE


# Function for data preprocessing
def preprocess_text(text):
    """Clean one tweet and return its tokens with stop words removed.

    Steps, in order: lowercase, drop HTML tags, spell emojis out as words,
    drop @mentions, drop digits, drop punctuation, drop "http..." words,
    drop isolated single letters, tokenize, filter English stop words.

    Args:
        text: The tweet as a string.

    Returns:
        A list of token strings.
    """
    text = text.lower()
    text = remove_html_tags(text)
    text = emoji.demojize(text, delimiters=(" ", " "))
    # Raw strings: \w, \d and \s are regex escapes, not Python string escapes.
    text = re.sub(r"@\w+", '', text)
    text = re.sub(r"'\d+", '', text)
    text = re.sub(r"\d+", '', text)
    text = re.sub(r"[^\w\s]", '', text)
    # Punctuation is already gone here, so a URL has collapsed into a single
    # word starting with "http" and is removed as a whole.
    text = re.sub(r"http\w+", '', text)
    # Substitute a space, not '': the match includes the whitespace on both
    # sides, so an empty replacement would glue the neighbouring words.
    text = re.sub(r"\s[a-z]\s", ' ', text)
    text = text.strip()
    stop_words = _english_stop_words()
    return [token for token in nltk.word_tokenize(text) if token not in stop_words]

In [None]:
# Re-seed every generator so this cell behaves the same when re-run alone
for _reseed in (np.random.seed, random.seed, tf.random.set_seed):
    _reseed(Config.seed)

# Training data: load the CSV and swap the 'text' column for its token lists
train_df = pd.read_csv(Config.train_path).assign(
    text=lambda frame: frame['text'].apply(preprocess_text)
)
print(f"len(train_df): {len(train_df)}")

# Test data: identical treatment
test_df = pd.read_csv(Config.test_path).assign(
    text=lambda frame: frame['text'].apply(preprocess_text)
)
print(f"len(test_df): {len(test_df)}")

# Create word dictionary
from collections import Counter
from itertools import chain

text = train_df['text'].values
# Counter tallies every token of every tweet in one pass; dict() keeps
# `word_dict` a plain dict, with keys in first-seen order.
word_dict = dict(Counter(chain.from_iterable(text)))

# Most frequent first. The sort is stable, so equally frequent words stay in
# first-seen order.
sorted_items = sorted(word_dict.items(), key=lambda item: item[1], reverse=True)
sorted_list = list(sorted_items)
# Vocabulary: the Config.word_count most frequent training tokens
top_words = [word for word, _ in sorted_list[:Config.word_count]]

In [None]:
def _bag_of_words(token_lists, vocabulary):
    """Return a 0/1 matrix: one row per document, one column per vocabulary word.

    Each document is turned into a set first so that the membership test per
    vocabulary word does not rescan the whole token list.
    """
    return np.array([
        [int(word in present) for word in vocabulary]
        for present in map(set, token_lists)
    ])


train_text = train_df['text'].values
test_text = test_df['text'].values
# Binary presence features over `top_words` for both data sets
X = _bag_of_words(train_text, top_words)
y = train_df['target'].values
test_X = _bag_of_words(test_text, top_words)
print(f"X.shape:{X.shape},y.shape:{y.shape}")

In [None]:
# Display the training frame; its 'text' column was replaced by the output of preprocess_text
train_df

In [None]:
# Rows whose target is 0
train_df.loc[train_df["target"] == 0]

In [None]:
# 'text' column of the rows whose target is 0
train_df.loc[train_df["target"] == 0, "text"]

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
# Bar chart of how many rows fall into each target class
fig, ax = plt.subplots(figsize=(8, 5))
ax.set_title("Count of Target Classes")
sns.countplot(y=train_df["target"], linewidth=2, edgecolor='black', ax=ax)

plt.show()

# Let's start by analysing total number of characters in text.

In [None]:
def _char_count(entry):
    """Character length of a tweet given as a raw string or as a token list."""
    return len(entry if isinstance(entry, str) else ' '.join(entry))


# After preprocessing, 'text' holds token lists, so `.str.len()` would count
# tokens. The tokens are joined with single spaces to measure characters, as
# the plot titles promise.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5))
char_len_dis = train_df[train_df['target'] == 1]['text'].apply(_char_count)
ax1.hist(char_len_dis, color='red', edgecolor='black', linewidth=1.2)
ax1.set_title('Disaster Tweets')
char_len_ndis = train_df[train_df['target'] == 0]['text'].apply(_char_count)
ax2.hist(char_len_ndis, color='blue', edgecolor='black', linewidth=1.2)
ax2.set_title('Non-Disaster Tweets')
plt.suptitle("Length of Characters in text")
plt.tight_layout()
plt.show()

In [None]:
# Creating sample corpus for further analysis.
def create_corpus(target):
    """Collect every word of the tweets labelled `target` into one flat list.

    Reads the module-level ``train_df``. A 'text' entry may be a token list
    (what ``preprocess_text`` returns) or a raw string, which is split on
    whitespace.

    Args:
        target: Label value to select rows by.

    Returns:
        A list of words in row order.
    """
    corpus = []
    for entry in train_df.loc[train_df['target'] == target, 'text']:
        # `.str.split()` yields NaN for list entries, so branch on the type
        # instead of going through the string accessor.
        corpus.extend(entry.split() if isinstance(entry, str) else entry)
    return corpus

In [None]:
import matplotlib.pyplot as plt
from collections import defaultdict

# Function to create corpus
def create_corpus(text):
    """Flatten an iterable of word sequences into a single list of words."""
    return [word for sentence in text for word in sentence]

# Function to analyze top stop words in text
def analyze_stopwords(data, func, target):
    """Plot the ten most frequent words per label as two horizontal bar charts.

    Args:
        data: DataFrame with a 'target' column and a 'text' column.
        func: Callable that turns the selected 'text' Series into a flat
            iterable of words.
        target: Iterable of label values. The first label is drawn in the
            left panel ("Non-Disaster Tweets"), the second in the right
            panel ("Disaster Tweets"); any further labels are not drawn.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    # One (words, counts) pair per label. Keeping the pair together avoids
    # slicing a flat list at fixed offsets, which misaligns whenever a label
    # has fewer than ten distinct words.
    top_per_label = []
    for label in target:
        frequencies = defaultdict(int)
        for word in func(data[data['target'] == label]['text']):
            frequencies[word] += 1
        top = sorted(frequencies.items(), key=lambda item: item[1], reverse=True)[:10]
        # Built with comprehensions so an empty corpus gives two empty lists
        # instead of failing to unpack.
        words = [word for word, _ in top]
        counts = [count for _, count in top]
        top_per_label.append((words, counts))

    # Pad so both panels are always drawn and titled.
    top_per_label += [([], [])] * (2 - len(top_per_label))

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))
    panels = (
        (ax1, top_per_label[0], "Non-Disaster Tweets", "lightblue"),
        (ax2, top_per_label[1], "Disaster Tweets", "lightgreen"),
    )
    for ax, (words, counts), title, color in panels:
        ax.barh(words, counts, color=color, edgecolor='black', linewidth=1.2)
        ax.set_title(title)

    plt.suptitle("Top Stop words in text")
    plt.show()

# Plot the most frequent words for targets 0 and 1 of the training data
analyze_stopwords(data=train_df, func=create_corpus, target=[0, 1])


In [None]:
# Top 20 most frequent keywords among rows with target == 1
plt.figure(figsize=(10, 7))
disaster_keywords = train_df.loc[train_df['target'] == 1, 'keyword']
disaster_keywords.value_counts().head(20).plot(
    kind='barh',
    fontsize=12,
    title='Top 20 Disastrous Keywords in Text',
    color='#0096FF',
    edgecolor='black',
    linewidth=1.2,
)
plt.show()

In [None]:
# Top 20 most frequent locations among rows with target == 1
plt.figure(figsize=(10, 7))
disaster_locations = train_df.loc[train_df["target"] == 1, "location"]
disaster_locations.value_counts().head(20).plot(
    kind='barh',
    fontsize=12,
    title='Top 20 Disastrous Locations in Text',
    color='#66ff00',
    edgecolor='black',
    linewidth=1.2,
)
plt.show()

## Classification

In [None]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
import emoji
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout, Flatten
from sklearn.metrics import accuracy_score

# Split data into train and validation sets (80% / 20%).
# The split is seeded from Config.seed, the notebook-wide seed, instead of
# repeating the literal 2024 here.
X_train, X_val, y_train, y_val = train_test_split(
    train_df['text'], train_df['target'], test_size=0.2, random_state=Config.seed
)

# Convert text to sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

max_words, max_len = 1000, 100

# The tokenizer's vocabulary is fitted on the training split only
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)

# Encode both splits as integer sequences
X_train_seq, X_val_seq = (
    tokenizer.texts_to_sequences(split) for split in (X_train, X_val)
)

# Pad sequences to a common length of max_len
X_train_pad, X_val_pad = (
    pad_sequences(sequences, maxlen=max_len)
    for sequences in (X_train_seq, X_val_seq)
)

# Build CNN model
embedding_dim = 100
vocab_size = max_words

# Embedding -> 1D convolution -> global max pooling -> dense head with a
# single sigmoid output
cnn_model = Sequential()
cnn_model.add(Embedding(vocab_size, embedding_dim))
cnn_model.add(Conv1D(128, 5, activation='relu'))
cnn_model.add(GlobalMaxPooling1D())
cnn_model.add(Dense(64, activation='relu'))
cnn_model.add(Dropout(0.5))
cnn_model.add(Dense(1, activation='sigmoid'))

cnn_model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Train CNN model, reporting validation metrics after every epoch
history = cnn_model.fit(
    X_train_pad,
    y_train,
    validation_data=(X_val_pad, y_val),
    batch_size=64,
    epochs=10,
)

# Evaluate CNN model on validation set
val_loss, val_accuracy = cnn_model.evaluate(X_val_pad, y_val)
print(f'Validation accuracy: {val_accuracy}')

# Predict on test set: encode and pad the same way as the training data,
# then threshold the sigmoid output at 0.5
X_test_seq = tokenizer.texts_to_sequences(test_df['text'])
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len)
test_proba = cnn_model.predict(X_test_pad)
test_pred = (test_proba > 0.5).astype(int)



In [None]:
# Load the `target` column of the competition's sample submission file.
# NOTE(review): a sample submission is normally a format template with
# placeholder targets, not ground-truth labels for test.csv — confirm before
# trusting any metric computed against `true_labels`.
true_labels = pd.read_csv("/kaggle/input/nlp-getting-started/sample_submission.csv")['target']

# Fraction of test predictions that equal the corresponding `true_labels` entry
test_accuracy = accuracy_score(true_labels, test_pred)
print(f'Test accuracy: {test_accuracy}')


In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc, confusion_matrix
import seaborn as sns

# ROC and confusion matrix need ground-truth labels, so they are computed on
# the held-out validation split (X_val_pad / y_val).
# NOTE(review): sample_submission.csv is presumably a template with
# placeholder targets, which is why it is not used as ground truth here —
# confirm.
y_pred_proba = cnn_model.predict(X_val_pad).ravel()  # flatten (n, 1) -> (n,)
y_pred = (y_pred_proba > 0.5).astype(int)

# Calculate ROC curve
fpr, tpr, _ = roc_curve(y_val, y_pred_proba)
roc_auc = auc(fpr, tpr)

# Plot ROC curve
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
# Diagonal reference line (TPR equal to FPR)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC)')
plt.legend(loc="lower right")
plt.show()

# Generate confusion matrix
conf_matrix = confusion_matrix(y_val, y_pred)

# Plot confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, cmap='Blues', fmt='g', cbar=False)
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix')
plt.show()
