In [32]:
!python --version

Python 3.10.14


In [33]:
import requests, zipfile, io

url = 'https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip'

In [34]:
r = requests.get(url)
z = zipfile.ZipFile(io.BytesIO(r.content))

z.extractall()

In [35]:
import pandas as pd
df = pd.read_table('SMSSpamCollection', header=None, encoding='UTF-8')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       5572 non-null   object
 1   1       5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [36]:
classes = df[0]
classes

0        ham
1        ham
2       spam
3        ham
4        ham
        ... 
5567    spam
5568     ham
5569     ham
5570     ham
5571     ham
Name: 0, Length: 5572, dtype: object

In [37]:
df.head()

Unnamed: 0,0,1
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [38]:
import re
pattern = re.compile(r'[\d@_!#$%&^*()<>?/|{}~:\\/,.;?!]')
text_messages = df[1]
messages = text_messages[0:10]
for line in messages:
    words = line.split(' ')
    for word in words:
        if re.search(pattern, word):
            print(word)

point,
crazy..
buffet...
wat...
lar...
oni...
2
21st
2005.
87121
question(std
rate)T&C's
08452810075over18's
hor...
say...
usf,
3
back!
still?
ok!
send,
£1.50
me.
patent.
(Oru
Vettam)'
Callers.
*9
WINNER!!
£900
reward!
09061701461.
KL341.
12
only.
11
more?
Free!
08002986030


In [None]:
processed_lines = text_messages.str.lower()
patterns = [
    # Substituir endereços de email por 'emailaddress'
    (r'^.+@[^\.].*\.[a-z]{2,}$', 'emailaddress'),
    # Substituir URLs por 'webaddress'
    (r'^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$',
    'webaddress'),
    # Substituir símbolos monetários (Dólar e Euro) por 'moneysymb'
    (r'£|\$', 'moneysymb'),
    # Substituir números de telefone por 'phonenumbr'
    (r'^\(?[\d]{3}\)?[\s-]?[\d]{3}[\s-]?[\d]{4}$', 'phonenumbr'),
    # Substituir números por 'numbr'
    (r'\d+(\.\d+)?', 'numbr'),
    # Remover pontuação (! ?)
    (r'[^\w\d\s]', ' '),
    # Substitua dois ou mais espaços em branco por um único espaço
    (r'\s+', ' '),
    # Remova os espaços em branco à esquerda e à direita
    (r'^\s+|\s+?$', '')
]
# Substituir no texto os padrões encontrados
for pattern, new_word in patterns:
    processed_lines = processed_lines.str.replace(pattern, new_word, regex=True)

In [47]:
text_messages.str.lower()

0       go until jurong point, crazy.. available only ...
1                           ok lar... joking wif u oni...
2       free entry in 2 a wkly comp to win fa cup fina...
3       u dun say so early hor... u c already then say...
4       nah i don't think he goes to usf, he lives aro...
                              ...                        
5567    this is the 2nd time we have tried 2 contact u...
5568                 will ü b going to esplanade fr home?
5569    pity, * was in mood for that. so...any other s...
5570    the guy did some bitching but i acted like i'd...
5571                           rofl. its true to its name
Name: 1, Length: 5572, dtype: object

In [48]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

stop_words = stopwords.words('english')
ps = PorterStemmer()
for i in range(len(processed_lines)):
    word_tokens = word_tokenize(processed_lines[i])
    filtered_sentence = []

    for word in word_tokens:
        if word not in stop_words:
            stemmed_word = ps.stem(word)
            filtered_sentence.append(stemmed_word)
    
    processed_lines[i] = ' '.join(filtered_sentence)

[nltk_data] Downloading package stopwords to /home/eliabe/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/eliabe/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/eliabe/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [52]:
all_words = []

for line in processed_lines:
    word_tokens = word_tokenize(line)
    for word in word_tokens:
        all_words.append(word)

all_words = nltk.FreqDist(all_words)

print(f'Total de palavras: {len(all_words)}')

Total de palavras: 7995


In [55]:
print(f'Palavras mais comuns: {all_words.most_common(10)}')
word_features = list(all_words.keys())[0:1500]

print(f'Lista de características: {word_features[:10]}')

Palavras mais comuns: [('.', 4776), (',', 1939), ('?', 1550), ('!', 1397), ('...', 1146), ('u', 1138), ('&', 922), (';', 768), (':', 722), ('..', 697)]
Lista de características: ['go', 'jurong', 'point', ',', 'crazi', '..', 'avail', 'bugi', 'n', 'great']


In [51]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'