In [2]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
import pandas as pd
import string
import re

In [3]:
# Fetch the NLTK resources used below: tokenizer models, the stop-word
# lists and the WordNet data backing the lemmatizer.
for _resource in ('punkt', 'stopwords', 'wordnet'):
  nltk.download(_resource)

# Shared normalizer instances reused by stem_word() and lemma_words().
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


# SUB-FUNCTIONS TO PRE-PROCESS A STRING:

In [29]:
# Lowercasing:

def text_lower_case(text):
  """Return a lower-cased copy of ``text``."""
  lowered = text.lower()
  return lowered


# Removing numbers/digits:

def remove_numbers(text):
  """Strip every run of digits from ``text``.

  Args:
    text: String to clean.

  Returns:
    ``text`` with all digit sequences deleted. Characters around the
    digits (including whitespace) are left untouched, so removing a
    number between two spaces leaves a double space behind.
  """
  # Raw string literal: "\d" inside a normal string is an invalid escape
  # sequence that newer Python versions warn about at compile time.
  return re.sub(r"\d+", "", text)


# Removing punctuation & special characters:

def remove_punctuation(text):
  """Delete every ASCII punctuation character from ``text``.

  Only the characters listed in ``string.punctuation`` are removed; other
  symbols (for example ``£``) are kept. Leading and trailing whitespace is
  trimmed from the result.
  """
  # Translation table that maps each punctuation character to "deleted".
  drop_table = str.maketrans("", "", string.punctuation)
  without_punct = text.translate(drop_table)
  return without_punct.strip()


# Handling double whitespace:

def remove_whitespace(text):
  """Collapse each run of whitespace to one space and trim both ends."""
  # split() with no argument discards empty pieces, so leading, trailing
  # and repeated whitespace all disappear before the pieces are re-joined.
  pieces = text.split()
  return " ".join(pieces)


# Define a function to remove URLs from text:

def remove_url(text):
  """Remove URLs from ``text``.

  A URL is any whitespace-delimited token starting with ``http://``,
  ``https://`` or ``www.``.

  Args:
    text: String to clean.

  Returns:
    ``text`` with every URL deleted. The whitespace that surrounded a URL
    is kept, so callers that need tidy spacing should collapse whitespace
    afterwards.
  """
  # The previous version called pattern.sub() but discarded its result, so
  # nothing was ever removed. Strings are immutable: the substituted text
  # must be returned. The separate 'http?://' pattern (which actually
  # matched 'htt://') is covered by 'https?://' and has been folded in.
  return re.sub(r'(?:https?://|www\.)\S+', '', text)


# Define a function to remove stop-words from a list of words:

def remove_stopwords(words_list):
  """Return the words of ``words_list`` that are not English stop-words.

  The comparison is case-insensitive (each word is lower-cased before the
  lookup), but the surviving words are returned exactly as given and in
  their original order.
  """
  english_stopwords = set(stopwords.words('english'))

  kept = []
  for token in words_list:
    if token.lower() in english_stopwords:
      continue
    kept.append(token)

  return kept


# Stemming

def stem_word(text):
  """Tokenize ``text`` and return the list of stems, one per token.

  Tokens come from ``word_tokenize`` and are stemmed with the shared
  module-level ``stemmer``.
  """
  stems = []
  for token in word_tokenize(text):
    stems.append(stemmer.stem(token))
  return stems


# Lemmatization:

def lemma_words(text):
  """Tokenize ``text`` and return the list of lemmas, one per token.

  Tokens come from ``word_tokenize`` and are lemmatized with the shared
  module-level ``lemmatizer`` using its default settings.
  """
  return [lemmatizer.lemmatize(token) for token in word_tokenize(text)]

# FUNCTION THAT APPLIES ALL SUB-FUNCTIONS TO A DATAFRAME:

In [30]:
# Function to pre-process dataframe (.csv) with case normalization, noise removal, tokenization, stop-word, stemming and lemmatization:

def pre_processing(path_file_csv, encoding):
  """Load a CSV of messages and add one column per pre-processing stage.

  The text is read from the ``Message_body`` column. The following columns
  are added, in this order:

  * ``Case Normalization`` - lower-cased message.
  * ``Noise Removal`` - lower-cased message passed through remove_url,
    remove_numbers, remove_punctuation and remove_whitespace.
  * ``Tokenization`` - ``nltk.wordpunct_tokenize`` of the cleaned text.
  * ``Stopwords`` - the tokens after remove_stopwords.
  * ``Stemming`` - result of stem_word on the cleaned text.
  * ``Lemmatization`` - result of lemma_words on the cleaned text.

  The four list-valued columns are stored as the ``str()`` of the list.

  Args:
    path_file_csv: Path of the CSV file to read.
    encoding: Text encoding passed to ``pandas.read_csv``.

  Returns:
    The loaded DataFrame with the six columns above appended.

  Raises:
    KeyError: If the CSV has no ``Message_body`` column.
  """
  df = pd.read_csv(path_file_csv, encoding=encoding)

  # An empty cell is read as NaN (a float), which has no .lower(); treat
  # missing messages as empty text instead of crashing mid-file.
  messages = df['Message_body'].fillna('').astype(str)

  def _denoise(text):
    # URL removal has to run first: once punctuation is stripped,
    # "https://" and "www." no longer exist for the URL patterns to match.
    text = remove_url(text)
    text = remove_numbers(text)
    text = remove_punctuation(text)
    # Collapse whitespace last so gaps left by the earlier removals
    # are cleaned up as well.
    return remove_whitespace(text)

  # Whole-column transforms replace the per-row iterrows()/df.loc writes.
  lowered = messages.map(text_lower_case)
  df['Case Normalization'] = lowered

  cleaned = lowered.map(_denoise)
  df['Noise Removal'] = cleaned

  tokens = cleaned.map(nltk.wordpunct_tokenize)
  df['Tokenization'] = tokens.map(str)
  df['Stopwords'] = tokens.map(remove_stopwords).map(str)

  df['Stemming'] = cleaned.map(stem_word).map(str)
  df['Lemmatization'] = cleaned.map(lemma_words).map(str)

  return df

In [31]:
# Run the full pipeline on the sample SMS file; the returned DataFrame is
# the cell's displayed output.
pre_processing(
  path_file_csv='/content/drive/MyDrive/Colab Notebooks/SMS_test.csv',
  encoding='latin',
)

Unnamed: 0,S. No.,Message_body,Label,Case Normalization,Noise Removal,Tokenization,Stopwords,Stemming,Lemmatization
0,1,"UpgrdCentre Orange customer, you may now claim...",Spam,"upgrdcentre orange customer, you may now claim...",upgrdcentre orange customer you may now claim ...,"['upgrdcentre', 'orange', 'customer', 'you', '...","['upgrdcentre', 'orange', 'customer', 'may', '...","['upgrdcentr', 'orang', 'custom', 'you', 'may'...","['upgrdcentre', 'orange', 'customer', 'you', '..."
1,2,"Loan for any purpose £500 - £75,000. Homeowner...",Spam,"loan for any purpose £500 - £75,000. homeowner...",loan for any purpose £ £ homeowners tenants we...,"['loan', 'for', 'any', 'purpose', '£', '£', 'h...","['loan', 'purpose', '£', '£', 'homeowners', 't...","['loan', 'for', 'ani', 'purpos', '£', '£', 'ho...","['loan', 'for', 'any', 'purpose', '£', '£', 'h..."
2,3,Congrats! Nokia 3650 video camera phone is you...,Spam,congrats! nokia 3650 video camera phone is you...,congrats nokia video camera phone is your call...,"['congrats', 'nokia', 'video', 'camera', 'phon...","['congrats', 'nokia', 'video', 'camera', 'phon...","['congrat', 'nokia', 'video', 'camera', 'phone...","['congrats', 'nokia', 'video', 'camera', 'phon..."
3,4,URGENT! Your Mobile number has been awarded wi...,Spam,urgent! your mobile number has been awarded wi...,urgent your mobile number has been awarded wit...,"['urgent', 'your', 'mobile', 'number', 'has', ...","['urgent', 'mobile', 'number', 'awarded', '£',...","['urgent', 'your', 'mobil', 'number', 'ha', 'b...","['urgent', 'your', 'mobile', 'number', 'ha', '..."
4,5,Someone has contacted our dating service and e...,Spam,someone has contacted our dating service and e...,someone has contacted our dating service and e...,"['someone', 'has', 'contacted', 'our', 'dating...","['someone', 'contacted', 'dating', 'service', ...","['someon', 'ha', 'contact', 'our', 'date', 'se...","['someone', 'ha', 'contacted', 'our', 'dating'..."
...,...,...,...,...,...,...,...,...,...
120,121,7 wonders in My WORLD 7th You 6th Ur style 5th...,Non-Spam,7 wonders in my world 7th you 6th ur style 5th...,wonders in my world th you th ur style th ur s...,"['wonders', 'in', 'my', 'world', 'th', 'you', ...","['wonders', 'world', 'th', 'th', 'ur', 'style'...","['wonder', 'in', 'my', 'world', 'th', 'you', '...","['wonder', 'in', 'my', 'world', 'th', 'you', '..."
121,122,Try to do something dear. You read something f...,Non-Spam,try to do something dear. you read something f...,try to do something dear you read something fo...,"['try', 'to', 'do', 'something', 'dear', 'you'...","['try', 'something', 'dear', 'read', 'somethin...","['tri', 'to', 'do', 'someth', 'dear', 'you', '...","['try', 'to', 'do', 'something', 'dear', 'you'..."
122,123,Sun ah... Thk mayb can if dun have anythin on....,Non-Spam,sun ah... thk mayb can if dun have anythin on....,sun ah thk mayb can if dun have anythin on thk...,"['sun', 'ah', 'thk', 'mayb', 'can', 'if', 'dun...","['sun', 'ah', 'thk', 'mayb', 'dun', 'anythin',...","['sun', 'ah', 'thk', 'mayb', 'can', 'if', 'dun...","['sun', 'ah', 'thk', 'mayb', 'can', 'if', 'dun..."
123,124,"SYMPTOMS when U are in love: ""1.U like listeni...",Non-Spam,"symptoms when u are in love: ""1.u like listeni...",symptoms when u are in love u like listening s...,"['symptoms', 'when', 'u', 'are', 'in', 'love',...","['symptoms', 'u', 'love', 'u', 'like', 'listen...","['symptom', 'when', 'u', 'are', 'in', 'love', ...","['symptom', 'when', 'u', 'are', 'in', 'love', ..."
