# **A Comprehensive Survey of Machine Learning Methods for Text Classification**
## Data Preprocessing

#### *Carlos Santiago Bañón, Moazam Soomro*
#### *CAP 6307, Fall '21*

In [1]:
import nltk
import pandas as pd
import re
import os
from nltk import word_tokenize
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from string import punctuation
from textblob import Word

In [2]:
# Download the necessary NLTK components.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

## **Loading the Dataset**
---

In [4]:
# Load the dataset.
data = pd.read_csv(file, encoding='latin-1')[[X_label, y_label]]
data.head()

Unnamed: 0,v2,v1
0,"Go until jurong point, crazy.. Available only ...",ham
1,Ok lar... Joking wif u oni...,ham
2,Free entry in 2 a wkly comp to win FA Cup fina...,spam
3,U dun say so early hor... U c already then say...,ham
4,"Nah I don't think he goes to usf, he lives aro...",ham


## **Transforming the Data**
---

In [5]:
# Rename the columns
data = data.rename(columns={X_label: "text", y_label: "label"})
data.head()

Unnamed: 0,text,label
0,"Go until jurong point, crazy.. Available only ...",ham
1,Ok lar... Joking wif u oni...,ham
2,Free entry in 2 a wkly comp to win FA Cup fina...,spam
3,U dun say so early hor... U c already then say...,ham
4,"Nah I don't think he goes to usf, he lives aro...",ham


In [6]:
# Encode the labels.
le = LabelEncoder()
data['label'] = le.fit_transform(data['label'].values)
data.head()

Unnamed: 0,text,label
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


In [7]:
def transformations(data):
  """
  Preprocess the text in 'dataframe' with the defined transformations.
  """

  # Remove the HTML tags.
  data['text'] = data['text'].apply(lambda words: re.sub('<.*?>', '', words))

  # Tokenize the words.
  data['text'] = data['text'].apply(word_tokenize)

  # Convert all words to lowercase.
  data['text'] = data['text'].apply(lambda words: [w.lower() for w in words])

  # Remove all punctuation.
  data['text'] = data['text'].apply(lambda words: [w for w in words if not w in punctuation])

  # Remove all stopwords.
  data['text'] = data['text'].apply(lambda words: [w for w in words if w not in stopwords.words('english')])

  # Perform lemmatization.
  data['text'] = data['text'].apply(lambda words: " ".join([Word(w).lemmatize() for w in words]))

  return data

In [8]:
# Apply the preprocessing transformations.
data = transformations(data)
data.head()

Unnamed: 0,text,label
0,go jurong point crazy.. available bugis n grea...,0
1,ok lar ... joking wif u oni ...,0
2,free entry 2 wkly comp win fa cup final tkts 2...,1
3,u dun say early hor ... u c already say ...,0
4,nah n't think go usf life around though,0


In [9]:
# Split the data into training and testing sets.
train, test = train_test_split(data, test_size=0.2, random_state=42)
print("Training Set:", train.shape)
print("Test Set:", test.shape)

Training Set: (4457, 2)
Test Set: (1115, 2)


## **Saving the Data**
---

In [10]:
# Save the data as CSV files.
train.to_csv(train_file, index=False)
test.to_csv(test_file, index=False)