In [2]:
import pandas as pd

# **Data loading**

In [None]:
# Load train.csv into a DataFrame
train_data = pd.read_csv(filepath_or_buffer='train.csv')

# Preview the first 5 rows as the cell output
train_data.head(n=5)

In [None]:
# Load val.csv into a DataFrame
val_data = pd.read_csv(filepath_or_buffer='val.csv')

# Preview the first 5 rows as the cell output
val_data.head(n=5)

In [None]:
# Pre-process the data

import re
import nltk

from string import punctuation
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


In [None]:
def clean_message(m: object) -> str:
    """Normalize one raw message into lower-case, space-separated ASCII words.

    Steps, in order:
      1. URLs (http / https / ftp) are removed up to the next whitespace.
      2. E-mail addresses are removed wherever they occur in the text.
      3. Every run of characters outside ``a-z`` / ``A-Z`` (digits,
         punctuation, symbols, whitespace, non-ASCII letters) becomes a
         single space.
      4. Leading/trailing spaces are trimmed and the text is lower-cased.

    Args:
        m: The raw message. Non-string values are converted with ``str``;
            ``None`` and float NaN are treated as an empty message.

    Returns:
        The cleaned text, or ``''`` when nothing alphabetic is left.
    """
    # Missing values would otherwise be stringified and leak the bogus
    # tokens "none" / "nan" into the corpus. `m != m` is true only for NaN.
    if m is None or (isinstance(m, float) and m != m):
        return ''

    text = str(m)

    # URLs are stripped first so that credentials embedded in a URL
    # (user@host) are not half-consumed by the e-mail pattern below.
    text = re.sub(r'(?:https?|ftp)://\S+', ' ', text)

    # Unanchored on purpose: addresses must be removed anywhere in the
    # message, not only when the whole message is a single address.
    text = re.sub(r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9.-]+', ' ', text)

    # One pass replaces digits, punctuation and whitespace runs alike.
    text = re.sub(r'[^a-zA-Z]+', ' ', text)

    return text.strip().lower()

def clean_column(data: pd.DataFrame, col_name: str, language: str = 'english') -> pd.DataFrame:
    """Clean and lemmatize the text in one column of ``data``.

    Each value of ``data[col_name]`` is processed as follows:
      1. ``clean_message`` is applied to it.
      2. Stop words are dropped.
      3. Words shorter than 3 or longer than 15 characters are dropped
         to reduce noise.
      4. Each remaining word is lemmatized as a verb, and the result is
         lemmatized again as a noun.

    The column is overwritten in place: the caller's DataFrame is mutated
    and the same object is returned.

    Args:
        data: DataFrame holding the text column.
        col_name: Name of the column to clean.
        language: Stop-word list requested from the NLTK stopwords corpus.
            Defaults to ``'english'`` -- NOTE(review): the dataset language
            is not visible here; confirm this default fits the data.

    Returns:
        ``data`` itself, with ``col_name`` replaced by the cleaned text.
    """
    # `stopwords` as imported from nltk.corpus is a corpus reader, not a
    # collection of words, so a direct `word in stopwords` test raises
    # TypeError. Materialize the word list once as a set. If the name has
    # been rebound elsewhere to a plain iterable of words, use that as-is.
    if hasattr(stopwords, 'words'):
        stop_words = set(stopwords.words(language))
    else:
        stop_words = set(stopwords)

    lem = WordNetLemmatizer()

    def _normalize(message):
        # Clean, filter and lemmatize a single message in one pass.
        kept = [
            word
            for word in clean_message(message).split()
            if word not in stop_words and 3 <= len(word) <= 15
        ]
        # Verb form first, then noun form, applied word by word.
        return ' '.join(
            lem.lemmatize(lem.lemmatize(word, pos='v'), pos='n')
            for word in kept
        )

    data[col_name] = data[col_name].apply(_normalize)
    return data
