In [53]:
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import pandas as pd
import numpy as np

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

import torchvision.transforms as transforms
from torchvision.datasets import CIFAR10
from torch.utils.data import DataLoader, Dataset

from collections import Counter

import os
import re

In [54]:
# Fetch the NLTK resources this notebook relies on: the punkt tokenizer
# models and the stopword lists.
for resource in ('punkt', 'stopwords'):
  nltk.download(resource)
# Kept as a bare final expression so its return value stays the cell's output.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Steve\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Steve\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Steve\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [55]:
# Seed PyTorch's default random number generator so torch-based randomness is
# repeatable between runs. Only torch is seeded in the visible cells. The
# trailing `;` keeps the call's return value out of the cell output.
torch.manual_seed(1);

In [56]:
# Empty Counter, presumably filled with word frequencies further down the
# notebook (it is not used in the cells visible here -- confirm).
word_counter = Counter()

In [57]:
# Location of the tab-separated sentence-pair dump, relative to the working directory.
file_directory = "ml-nlp/dataset"
file_name = "/C4_200M.tsv"
dataset_path = file_directory + file_name

# header=0 tells pandas to use the file's first line as the column labels.
# The displayed frame shows those labels are themselves a sentence pair, i.e.
# the file has no real header row.
df = pd.read_csv(dataset_path, sep='\t', header=0)

In [58]:
# Preview the freshly loaded frame. Note that the column labels in the output
# are a sentence pair: header=0 consumed the file's first record as the header.
df

Unnamed: 0,"Bitcoin is for $7,094 this morning, which CoinDesk says.","Bitcoin goes for $7,094 this morning, according to CoinDesk."
0,The effect of widespread dud targets two face ...,"1. The effect of ""widespread dud"" targets two ..."
1,tax on sales of stores for non residents are s...,Capital Gains tax on the sale of properties fo...
2,Much many brands and sellers still in the market.,Many brands and sellers still in the market.
3,this is is the latest Maintenance release of S...,This is is the latest maintenance release of S...
4,"Fairy Or Not, I'm the Godmother: no just look,...","Fairy Or Not, I'm the Godmother: Not just a lo..."
...,...,...
18386516,"Filming for BBC my world kitchen ""","Filming for BBC ""my world kitchen """
18386517,These posts categorizeed under Earing and post...,This post categorized under Earing and posted ...
18386518,NEWS: Mann Gallery featured in: American Style...,NEWS: Mann Gallery has been featured in: Ameri...
18386519,6Let stand a king for 5 mins before serving.,6Leave to stand for 5 mins before serving.


In [59]:
## Add a header data to the new row, salvaging as much data as we can.
# The file has no header row, so read_csv(header=0) turned its first sentence
# pair into the column labels. Recover that pair as a regular data row and
# give the columns meaningful names.
#
# Guard: only salvage while the frame still carries the sentence-pair labels.
# Without it, re-running this cell would append a bogus
# ("incorrect", "correct") row to the data.
#
# NOTE(review): pandas rewrites duplicate or empty header labels (e.g. adds a
# ".1" suffix), so if the first pair had identical sentences the salvaged text
# would differ slightly from the file -- verify against the raw TSV.
if list(df.columns[:2]) != ["incorrect", "correct"]:
  new_row = list(df.columns.values)
  df.rename(columns={new_row[0]:"incorrect", new_row[1]:"correct"}, inplace=True)
  # Label-based enlargement: assigning to a label one past the current maximum
  # appends the recovered pair as a new last row.
  df.loc[df.index.max() + 1] = new_row

In [60]:
# Re-inspect the frame after renaming the columns and appending the salvaged row.
df

Unnamed: 0,incorrect,correct
0,The effect of widespread dud targets two face ...,"1. The effect of ""widespread dud"" targets two ..."
1,tax on sales of stores for non residents are s...,Capital Gains tax on the sale of properties fo...
2,Much many brands and sellers still in the market.,Many brands and sellers still in the market.
3,this is is the latest Maintenance release of S...,This is is the latest maintenance release of S...
4,"Fairy Or Not, I'm the Godmother: no just look,...","Fairy Or Not, I'm the Godmother: Not just a lo..."
...,...,...
18386517,These posts categorizeed under Earing and post...,This post categorized under Earing and posted ...
18386518,NEWS: Mann Gallery featured in: American Style...,NEWS: Mann Gallery has been featured in: Ameri...
18386519,6Let stand a king for 5 mins before serving.,6Leave to stand for 5 mins before serving.
18386520,Preheat oven to 356 degrees Fahrenheit. (180 &...,Preheat oven to 356 degrees Fahrenheit (180 de...


In [61]:
# Data preprocessing plan
#   Done in this cell:
#   - Drop duplicate rows
#   - Drop rows with a missing value in any column
#   Done by the cleaning helpers in the following cells:
#   - Lowercase every sentence
#   - Remove special symbols / characters
#   Listed as planned, not implemented in the cells visible here:
#   - Tokenize the incorrect and correct sentences
#   - Normalize punctuation

# Source text columns to clean. Derived "*_cleaned" columns are filtered out so
# that re-running this cell after the cleaning cell does not make the cleaning
# loop produce "*_cleaned_cleaned" columns.
column_names = [name for name in df.columns.tolist() if not str(name).endswith("_cleaned")]

df.drop_duplicates(inplace=True)  # remove rows that are exact duplicates
df.dropna(inplace=True)  # remove rows with a missing value in any column

In [62]:
def lowercase_words(text):
  """Return `text` with all cased characters converted to lowercase."""
  return text.lower()

def clean_extra_spaces(text):
  """Trim `text` and collapse every run of whitespace into a single space.

  Tabs and newlines count as whitespace, so they are replaced by a space too.
  """
  trimmed = text.strip()
  return re.sub(r'\s+', ' ', trimmed)

def remove_repeated_words(text):
  """Collapse a word repeated back-to-back into a single occurrence.

  Only whole words separated by exactly one space are merged ("is is" -> "is",
  "the the the" -> "the"). Matching is case-sensitive, and digits and
  underscores count as word characters, so "5 5" also becomes "5".
  """
  return re.sub(r'\b(\w+)( \1\b)+', r'\1', text)

def clean_unwanted_characters(text):
  """Delete every character that is not a word character, whitespace, or common punctuation.

  The punctuation that is kept: comma, period, exclamation mark, question
  mark, apostrophe and double quote. Removed characters are not replaced by
  anything, so their neighbours are joined ("non-resident" -> "nonresident").
  """
  disallowed = r'[^\w\s,.!?\'"]+'
  return re.sub(disallowed, '', text)

In [63]:
def data_prerocessing(text):
  """Run the full cleaning pipeline on one sentence and return the cleaned string.

  Steps, in order:
    1. lowercase the text
    2. trim it and collapse whitespace runs
    3. collapse immediately repeated words ("is is" -> "is")
    4. remove symbols other than common punctuation
    5. normalize whitespace again

  Step 5 is needed because step 4 deletes characters without replacing them:
  removing a standalone symbol ("a & b") or one at the edge of the sentence
  would otherwise leave double, leading or trailing spaces in the result.
  """
  text = lowercase_words(text)
  text = clean_extra_spaces(text)
  text = remove_repeated_words(text)
  text = clean_unwanted_characters(text)
  # Symbol removal can re-introduce extra spaces; tidy them up once more.
  text = clean_extra_spaces(text)

  return text

# Add a "<name>_cleaned" companion column for every column listed in column_names,
# holding the result of running the cleaning pipeline on each value.
for source_column in column_names:
  cleaned_column = source_column + "_cleaned"
  cleaned_values = df[source_column].apply(data_prerocessing)
  df[cleaned_column] = cleaned_values

In [64]:
# Inspect the frame with the newly added "*_cleaned" columns next to the originals.
df

Unnamed: 0,incorrect,correct,incorrect_cleaned,correct_cleaned
0,The effect of widespread dud targets two face ...,"1. The effect of ""widespread dud"" targets two ...",the effect of widespread dud targets two face ...,"1. the effect of ""widespread dud"" targets two ..."
1,tax on sales of stores for non residents are s...,Capital Gains tax on the sale of properties fo...,tax on sales of stores for non residents are s...,capital gains tax on the sale of properties fo...
2,Much many brands and sellers still in the market.,Many brands and sellers still in the market.,much many brands and sellers still in the market.,many brands and sellers still in the market.
3,this is is the latest Maintenance release of S...,This is is the latest maintenance release of S...,this is the latest maintenance release of samb...,this is the latest maintenance release of samb...
4,"Fairy Or Not, I'm the Godmother: no just look,...","Fairy Or Not, I'm the Godmother: Not just a lo...","fairy or not, i'm the godmother no just look, ...","fairy or not, i'm the godmother not just a loo..."
...,...,...,...,...
18386517,These posts categorizeed under Earing and post...,This post categorized under Earing and posted ...,these posts categorizeed under earing and post...,this post categorized under earing and posted ...
18386518,NEWS: Mann Gallery featured in: American Style...,NEWS: Mann Gallery has been featured in: Ameri...,news mann gallery featured in american style m...,news mann gallery has been featured in america...
18386519,6Let stand a king for 5 mins before serving.,6Leave to stand for 5 mins before serving.,6let stand a king for 5 mins before serving.,6leave to stand for 5 mins before serving.
18386520,Preheat oven to 356 degrees Fahrenheit. (180 &...,Preheat oven to 356 degrees Fahrenheit (180 de...,preheat oven to 356 degrees fahrenheit. 180 d...,preheat oven to 356 degrees fahrenheit 180 deg...
