# Natural Language Processing with Disaster Tweets

competition: https://www.kaggle.com/competitions/nlp-getting-started/overview

code: https://www.kaggle.com/code/faressayah/natural-language-processing-nlp-for-beginners

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Render matplotlib figures inline in the notebook output
%matplotlib inline
# Plot styling: seaborn's "whitegrid" style is set first, then matplotlib's
# "fivethirtyeight" style sheet is applied after it
sns.set_style("whitegrid")
plt.style.use("fivethirtyeight")

## Representing text as numerical data

In [2]:
# Toy training corpus: three short SMS-style messages
simple_train = [
    'call you tonight',
    'Call me a cab',
    'Please call me... PLEASE!',
]

In [3]:
# Import CountVectorizer and create an instance of it
from sklearn.feature_extraction.text import CountVectorizer

vect = CountVectorizer()

# Fit the CountVectorizer to the training data (this learns the vocabulary)
vect.fit(simple_train)

# Examine the fitted vocabulary (the learned feature names); no transform
# to a document-term matrix happens in this cell
vect.get_feature_names_out()

array(['cab', 'call', 'me', 'please', 'tonight', 'you'], dtype=object)

In [4]:
# Transform the text messages in the training data to a document-term matrix
# (one row per message, one column per vocabulary term; the result is a sparse matrix)
simple_train_dtm = vect.transform(simple_train)
simple_train_dtm


<3x6 sparse matrix of type '<class 'numpy.int64'>'
	with 9 stored elements in Compressed Sparse Row format>

In [5]:
# Convert the sparse matrix to a dense NumPy array so every count,
# including the zeros, is visible
simple_train_dtm.toarray()

array([[0, 1, 0, 0, 1, 1],
       [1, 1, 1, 0, 0, 0],
       [0, 1, 1, 2, 0, 0]])

In [6]:
# View the counts as a labelled table: one row per training message,
# one column per vocabulary term
pd.DataFrame(
    data=simple_train_dtm.toarray(),
    columns=vect.get_feature_names_out(),
)

Unnamed: 0,cab,call,me,please,tonight,you
0,0,1,0,0,1,1
1,1,1,1,0,0,0
2,0,1,1,2,0,0


In [7]:
# Check the type of the document-term matrix (print shows the full class path)
print(type(simple_train_dtm))

<class 'scipy.sparse._csr.csr_matrix'>


In [8]:
# Print the sparse matrix: print() lists each stored entry as
# (row, column) followed by its count
print(simple_train_dtm)

  (0, 1)	1
  (0, 4)	1
  (0, 5)	1
  (1, 0)	1
  (1, 1)	1
  (1, 2)	1
  (2, 1)	1
  (2, 2)	1
  (2, 3)	2


In [9]:
# A single unseen message used to try out the fitted vectorizer
simple_test = [
    "Please don't call me",
]

In [10]:
# Transform the test data to a document-term matrix with the already-fitted
# vectorizer (transform only, no re-fit), so the columns follow the training
# vocabulary; words that are not in that vocabulary are not counted
simple_test_dtm = vect.transform(simple_test)
simple_test_dtm.toarray()

array([[0, 1, 1, 1, 0, 0]])

In [13]:
# Examine the vocabulary and the test document-term matrix together
# as a labelled table
pd.DataFrame(
    data=simple_test_dtm.toarray(),
    columns=vect.get_feature_names_out(),
)

Unnamed: 0,cab,call,me,please,tonight,you
0,0,1,1,1,0,0


## Reading a text-based dataset into pandas

In [14]:
# Read the CSV (path is anchored at the user's home directory, decoded as
# latin-1) and keep only the columns that contain no missing values
sms = (
    pd.read_csv('~/aiffel/data/spam.csv', encoding='latin-1')
    .dropna(axis=1, how="any")
)
# Give the two remaining columns descriptive names
sms.columns = ['label', 'message']

sms.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


## 🔍 Exploratory Data Analysis (EDA)

In [15]:
# Summary of both text columns: row count, number of unique values,
# most frequent value and how often it occurs
sms.describe()

Unnamed: 0,label,message
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


In [16]:
# Per-label summary of the messages (count, unique, top, freq for each label value)
sms.groupby('label').describe()

Unnamed: 0_level_0,message,message,message,message
Unnamed: 0_level_1,count,unique,top,freq
label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,653,Please call our customer service representativ...,4
