# Modules import

In [4]:
import os.path

import pandas as pd
import nltk
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Piotrek\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Piotrek\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Data import

In [5]:
DATA_PATH = os.path.join('..', 'data', 'preprocessed')
STOPWORDS = set(stopwords.words('english'))
train = pd.read_csv(os.path.join(DATA_PATH, 'train.csv'))
train.head()

Unnamed: 0,screen_name,text,account.type,class_type
0,bot#9,YEA now that note GOOD,bot,others
1,human#17,Listen to This Charming Man by The Smiths <URL>,human,human
2,bot#23,wish i can i would be seeing other hoes on the...,bot,others
3,bot#1,The decade in the significantly easier schedul...,bot,others
4,bot#11,""" Theim class =\ "" alignnone size-full wp-imag...",bot,rnn


In [6]:
validation = pd.read_csv(os.path.join(DATA_PATH, 'validation.csv'))
validation.head()

Unnamed: 0,screen_name,text,account.type,class_type
0,human#1,"TIGHT , TIGHT , TIGHT , YEAH ! ! ! <URL>",human,human
1,human#11,India has millennia old relations with Oman . ...,human,human
2,human#8,Anxious Teenagers,human,human
3,human#9,Our top priority is keeping Canadians safe . W...,human,human
4,bot#9,nah bro You ’ re taking sis so much I ’ m just...,bot,others


In [7]:
test = pd.read_csv(os.path.join(DATA_PATH, 'test.csv'))
test.head()

Unnamed: 0,screen_name,text,account.type,class_type
0,human#17,justin timberlake really one of the goats if y...,human,human
1,human#11,Thank you <MENTION> for your gracious prayers ...,human,human
2,human#1,Theory : the number of red lights you will hit...,human,human
3,bot#12,Respects on the Upt of the I good with the peo...,bot,rnn
4,human#10,Might give the BASIC #10Liner game contest ano...,human,human


# Data preprocessing
## Tokenization And Stopwords removal

In [8]:
def tokenize_remove_stop_words(dataset: pd.DataFrame) -> pd.DataFrame:
    tknzr = TweetTokenizer(strip_handles=True)
    dataset['tokens'] = dataset['text'].apply(lambda x: tknzr.tokenize(x))
    dataset['tokens'] = dataset['tokens'].apply(lambda x: [w for w in x if not w in STOPWORDS])
    return dataset

train = tokenize_remove_stop_words(train)
train.head()

Unnamed: 0,screen_name,text,account.type,class_type,tokens
0,bot#9,YEA now that note GOOD,bot,others,"[YEA, note, GOOD]"
1,human#17,Listen to This Charming Man by The Smiths <URL>,human,human,"[Listen, This, Charming, Man, The, Smiths, <URL>]"
2,bot#23,wish i can i would be seeing other hoes on the...,bot,others,"[wish, would, seeing, hoes, worst, part]"
3,bot#1,The decade in the significantly easier schedul...,bot,others,"[The, decade, significantly, easier, schedule,..."
4,bot#11,""" Theim class =\ "" alignnone size-full wp-imag...",bot,rnn,"["", Theim, class, =\, "", alignnone, size-full,..."


In [9]:
validation = tokenize_remove_stop_words(validation)
validation.head()

Unnamed: 0,screen_name,text,account.type,class_type,tokens
0,human#1,"TIGHT , TIGHT , TIGHT , YEAH ! ! ! <URL>",human,human,"[TIGHT, ,, TIGHT, ,, TIGHT, ,, YEAH, !, !, !, ..."
1,human#11,India has millennia old relations with Oman . ...,human,human,"[India, millennia, old, relations, Oman, ., We..."
2,human#8,Anxious Teenagers,human,human,"[Anxious, Teenagers]"
3,human#9,Our top priority is keeping Canadians safe . W...,human,human,"[Our, top, priority, keeping, Canadians, safe,..."
4,bot#9,nah bro You ’ re taking sis so much I ’ m just...,bot,others,"[nah, bro, You, ’, taking, sis, much, I, ’, go..."


In [10]:
test = tokenize_remove_stop_words(test)
test.head()

Unnamed: 0,screen_name,text,account.type,class_type,tokens
0,human#17,justin timberlake really one of the goats if y...,human,human,"[justin, timberlake, really, one, goats, think]"
1,human#11,Thank you <MENTION> for your gracious prayers ...,human,human,"[Thank, <MENTION>, gracious, prayers, wishes, ..."
2,human#1,Theory : the number of red lights you will hit...,human,human,"[Theory, :, number, red, lights, hit, driving,..."
3,bot#12,Respects on the Upt of the I good with the peo...,bot,rnn,"[Respects, Upt, I, good, people, West, Bengal,..."
4,human#10,Might give the BASIC #10Liner game contest ano...,human,human,"[Might, give, BASIC, #10Liner, game, contest, ..."


## Stemming

In [11]:
PS = PorterStemmer()
def stemming(dataset: pd.DataFrame) -> pd.DataFrame:
    new_dataset = dataset.copy()
    new_dataset['tokens'] = new_dataset['tokens'].apply(lambda x: [PS.stem(w) for w in x])
    new_dataset['new_text'] = new_dataset['tokens'].apply(lambda x: ' '.join(x))
    return new_dataset
stemmed_train = stemming(train)
stemmed_validation = stemming(validation)
stemmed_test = stemming(test)
stemmed_train.head()

Unnamed: 0,screen_name,text,account.type,class_type,tokens,new_text
0,bot#9,YEA now that note GOOD,bot,others,"[yea, note, good]",yea note good
1,human#17,Listen to This Charming Man by The Smiths <URL>,human,human,"[listen, thi, charm, man, the, smith, <url>]",listen thi charm man the smith <url>
2,bot#23,wish i can i would be seeing other hoes on the...,bot,others,"[wish, would, see, hoe, worst, part]",wish would see hoe worst part
3,bot#1,The decade in the significantly easier schedul...,bot,others,"[the, decad, significantli, easier, schedul, i...",the decad significantli easier schedul i don't...
4,bot#11,""" Theim class =\ "" alignnone size-full wp-imag...",bot,rnn,"["", theim, class, =\, "", alignnon, size-ful, w...",""" theim class =\ "" alignnon size-ful wp-imag -..."


### Saving stemmed data

In [12]:
STEMMED_DATA_PATH = os.path.join('..', 'data', 'stemmed')
if not os.path.exists(STEMMED_DATA_PATH):
    os.mkdir(STEMMED_DATA_PATH)
stemmed_train.to_csv(os.path.join(STEMMED_DATA_PATH, 'train.csv'), index=False)
stemmed_validation.to_csv(os.path.join(STEMMED_DATA_PATH, 'validation.csv'), index=False)
stemmed_test.to_csv(os.path.join(STEMMED_DATA_PATH, 'test.csv'), index=False)

## Lemmatization

In [13]:
LEMMATIZER = WordNetLemmatizer()
def lemmatization(dataset: pd.DataFrame) -> pd.DataFrame:
    new_dataset = dataset.copy()
    new_dataset['tokens'] = new_dataset['tokens'].apply(lambda x: [LEMMATIZER.lemmatize(w) for w in x])
    new_dataset['new_text'] = new_dataset['tokens'].apply(lambda x: ' '.join(x))
    return new_dataset
lemmatized_train = lemmatization(train)
lemmatized_validation = lemmatization(validation)
lemmatized_test = lemmatization(test)

### Saving lemmatized data

In [14]:
LEMMATIZED_DATA_PATH = os.path.join('..', 'data', 'lemmatized')
if not os.path.exists(LEMMATIZED_DATA_PATH):
    os.mkdir(LEMMATIZED_DATA_PATH)
stemmed_train.to_csv(os.path.join(LEMMATIZED_DATA_PATH, 'train.csv'), index=False)
stemmed_validation.to_csv(os.path.join(LEMMATIZED_DATA_PATH, 'validation.csv'), index=False)
stemmed_test.to_csv(os.path.join(LEMMATIZED_DATA_PATH, 'test.csv'), index=False)