In [1]:
import pandas as pd
import numpy as np
import time
import datetime
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import words
nltk.download('punkt')
nltk.download('words')
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import string
from translate import Translator

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sri01\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\sri01\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


In [2]:
# show output from all steps in a cell instead of only the last step
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
pd.set_option('display.max_colwidth', 100)

In [3]:
# read the processed training and testing data
train = pd.read_csv('train_processed_non_text.csv')
test = pd.read_csv('test_processed_non_text.csv')

In [4]:
# shape of data
train.shape
test.shape

(442961, 6)

(238323, 5)

In [5]:
# convert to categorical
train.gender = train.gender.astype('category')
train.topic = train.topic.astype('category')
train.sign = train.sign.astype('category')

test.gender = test.gender.astype('category')
test.topic = test.topic.astype('category')
test.sign = test.sign.astype('category')

In [6]:
train.dtypes
test.dtypes

userid       int64
gender    category
topic     category
sign      category
text        object
age          int64
dtype: object

userid       int64
gender    category
topic     category
sign      category
text        object
dtype: object

In [7]:
# train: split into [userid, gender, topic, sign, age] and [userid, text]
non_text = train.loc[:, ['userid', 'gender', 'topic', 'sign', 'age']]
non_text.head()

text = train.loc[:, ['userid', 'text']]
text.head()

# combine non_text by userid
non_text = non_text.drop_duplicates()
non_text = non_text.sort_values(by = 'userid')
non_text.reset_index(drop=True, inplace=True)
non_text.head()
non_text.shape

# combine text by userid
combined_text = pd.DataFrame(train.groupby('userid')['text'].agg(lambda x: ' '.join(x)))
combined_text['userid'] = combined_text.index
combined_text.reset_index(drop=True, inplace=True)
combined_text.head()
combined_text.shape

# join
combined_train = non_text
combined_train['text'] = text.text
combined_train.head()
combined_train.shape

Unnamed: 0,userid,gender,topic,sign,age
0,11869,male,student,leo,15
1,11869,male,student,leo,15
2,11869,male,student,leo,15
3,11869,male,student,leo,15
4,16332,male,investmentbanking,aquarius,33


Unnamed: 0,userid,text
0,11869,"info has been found (+/- 100 pages, and 4.5 mb of .pdf files) now i have to wait unti..."
1,11869,these are the team members: drewes van der laag urllink mail ruiyu xie ...
2,11869,in het kader van kernfusie op aarde: maak je eigen waterstofbom how to build an h-...
3,11869,testing!!! testing!!!
4,16332,thanks to yahoo!'s toolbar i can now 'capture' the urls of popups...which means now...


Unnamed: 0,userid,gender,topic,sign,age
0,1,male,indunk,sagittarius,17
1,3,female,indunk,leo,16
2,4,male,student,aquarius,17
3,5,female,accounting,pisces,47
4,6,male,student,pisces,16


(12880, 5)

Unnamed: 0,text,userid
0,doritos fuel of space urllink : ian myers nasa's new plan urllink posted by u...,1
1,"yeah, i set up this blog so heather would be happy...she better be! lol well, i'm not s...",3
2,"not too much exciting has happened this weekend, i've just been really busy. i've wor...",4
3,european pilgrimage for high school students of st. michael and all angels escor...,5
4,last night's dream is very blurry. the only details i can remember: patty quit the yo...,6


(12880, 2)

Unnamed: 0,userid,gender,topic,sign,age,text
0,1,male,indunk,sagittarius,17,"info has been found (+/- 100 pages, and 4.5 mb of .pdf files) now i have to wait unti..."
1,3,female,indunk,leo,16,these are the team members: drewes van der laag urllink mail ruiyu xie ...
2,4,male,student,aquarius,17,in het kader van kernfusie op aarde: maak je eigen waterstofbom how to build an h-...
3,5,female,accounting,pisces,47,testing!!! testing!!!
4,6,male,student,pisces,16,thanks to yahoo!'s toolbar i can now 'capture' the urls of popups...which means now...


(12880, 6)

In [8]:
# test: split into [userid, gender, topic, sign] and [userid, text]
non_text = test.loc[:, ['userid', 'gender', 'topic', 'sign']]

text = test.loc[:, ['userid', 'text']]

# combine non_text by userid
non_text = non_text.drop_duplicates()
non_text = non_text.sort_values(by = 'userid')
non_text.reset_index(drop=True, inplace=True)
non_text.shape

# combine text by userid
combined_text = pd.DataFrame(test.groupby('userid')['text'].agg(lambda x: ' '.join(x)))
combined_text['userid'] = combined_text.index
combined_text.reset_index(drop=True, inplace=True)
combined_text.shape

# join
combined_test = non_text
combined_test['text'] = text.text
combined_test.shape

(6440, 4)

(6440, 2)

(6440, 5)

In [9]:
# train: remove punctuation
combined_train.head()
print(string.punctuation)
table = str.maketrans('', '', string.punctuation)
combined_train.text = pd.DataFrame(sentence.translate(table) for sentence in combined_train.text)
combined_train.head()

# test: remove punctuation
combined_test.text = pd.DataFrame(sentence.translate(table) for sentence in combined_test.text)

Unnamed: 0,userid,gender,topic,sign,age,text
0,1,male,indunk,sagittarius,17,"info has been found (+/- 100 pages, and 4.5 mb of .pdf files) now i have to wait unti..."
1,3,female,indunk,leo,16,these are the team members: drewes van der laag urllink mail ruiyu xie ...
2,4,male,student,aquarius,17,in het kader van kernfusie op aarde: maak je eigen waterstofbom how to build an h-...
3,5,female,accounting,pisces,47,testing!!! testing!!!
4,6,male,student,pisces,16,thanks to yahoo!'s toolbar i can now 'capture' the urls of popups...which means now...


!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


Unnamed: 0,userid,gender,topic,sign,age,text
0,1,male,indunk,sagittarius,17,info has been found 100 pages and 45 mb of pdf files now i have to wait untill our t...
1,3,female,indunk,leo,16,these are the team members drewes van der laag urllink mail ruiyu xie ...
2,4,male,student,aquarius,17,in het kader van kernfusie op aarde maak je eigen waterstofbom how to build an hbo...
3,5,female,accounting,pisces,47,testing testing
4,6,male,student,pisces,16,thanks to yahoos toolbar i can now capture the urls of popupswhich means now i can ...


In [10]:
# train: remove numbers
combined_train.text = combined_train.text.str.replace('\d+', '')
combined_train.head()

# test: remove numbers
combined_test.text = combined_test.text.str.replace('\d+', '')

Unnamed: 0,userid,gender,topic,sign,age,text
0,1,male,indunk,sagittarius,17,info has been found pages and mb of pdf files now i have to wait untill our team l...
1,3,female,indunk,leo,16,these are the team members drewes van der laag urllink mail ruiyu xie ...
2,4,male,student,aquarius,17,in het kader van kernfusie op aarde maak je eigen waterstofbom how to build an hbo...
3,5,female,accounting,pisces,47,testing testing
4,6,male,student,pisces,16,thanks to yahoos toolbar i can now capture the urls of popupswhich means now i can ...


In [11]:
# train: convert to words
combined_train.text = combined_train.text.apply(word_tokenize)
combined_train.head()

# test: convert to words
combined_test.text = combined_test.text.apply(word_tokenize)

Unnamed: 0,userid,gender,topic,sign,age,text
0,1,male,indunk,sagittarius,17,"[info, has, been, found, pages, and, mb, of, pdf, files, now, i, have, to, wait, untill, our, te..."
1,3,female,indunk,leo,16,"[these, are, the, team, members, drewes, van, der, laag, urllink, mail, ruiyu, xie, urllink, mai..."
2,4,male,student,aquarius,17,"[in, het, kader, van, kernfusie, op, aarde, maak, je, eigen, waterstofbom, how, to, build, an, h..."
3,5,female,accounting,pisces,47,"[testing, testing]"
4,6,male,student,pisces,16,"[thanks, to, yahoos, toolbar, i, can, now, capture, the, urls, of, popupswhich, means, now, i, c..."


In [12]:
# train: remove stop words
stop_words = stopwords.words('english')
stop_words = [word.translate(table) for word in stop_words]
stop_words = set(stop_words)
print(stop_words)

def rmstp(x):
    wrd_lst = []
    for y in x:
        if y not in stop_words:
            wrd_lst.append(y)
    return wrd_lst

combined_train.text = combined_train.text.apply(lambda x: rmstp(x))
combined_train.head()

# test: remove stop words
combined_test.text = combined_test.text.apply(lambda x: rmstp(x))

{'more', 'while', 'don', 'his', 'isn', 'so', 'shan', 'or', 'ma', 'she', 'themselves', 'below', 'o', 'our', 'mightn', 'y', 'wouldnt', 'hasn', 'under', 'who', 'for', 'shouldve', 'we', 'through', 'havent', 'your', 'those', 'on', 'mustnt', 'was', 'against', 'with', 'why', 'nor', 'is', 'mustn', 's', 'doing', 'wouldn', 'than', 'in', 'there', 'because', 'they', 'this', 'youll', 'youve', 'whom', 'above', 'hadnt', 'ain', 'being', 'during', 'didnt', 'these', 'shes', 'couldn', 'should', 'myself', 'when', 'few', 'll', 'hadn', 'into', 'isnt', 'what', 'no', 'me', 'been', 'the', 'again', 'now', 're', 'arent', 'couldnt', 'hasnt', 'youd', 'an', 'her', 'be', 'himself', 'itself', 'are', 'm', 'aren', 'wasn', 'has', 'yourselves', 'does', 'dont', 'doesn', 'had', 'my', 'him', 'i', 'shant', 'you', 'which', 'between', 'here', 'own', 'won', 'where', 'thatll', 'yours', 'of', 'just', 'herself', 'needn', 'will', 'having', 'yourself', 'theirs', 'about', 'd', 'most', 'once', 'didn', 'not', 'other', 't', 'neednt', 'h

Unnamed: 0,userid,gender,topic,sign,age,text
0,1,male,indunk,sagittarius,17,"[info, found, pages, mb, pdf, files, wait, untill, team, leader, processed, learns, html]"
1,3,female,indunk,leo,16,"[team, members, drewes, van, der, laag, urllink, mail, ruiyu, xie, urllink, mail, bryan, aalderi..."
2,4,male,student,aquarius,17,"[het, kader, van, kernfusie, op, aarde, maak, je, eigen, waterstofbom, build, hbomb, ascotttarta..."
3,5,female,accounting,pisces,47,"[testing, testing]"
4,6,male,student,pisces,16,"[thanks, yahoos, toolbar, capture, urls, popupswhich, means, show, cool, links, korean, pop, kpo..."


In [13]:
# train: lemmatize words
wordnet_lemmatizer = WordNetLemmatizer()
def lmmtze(x):
    wrd_lst = []
    for y in x:
        wrd_lst.append(wordnet_lemmatizer.lemmatize(y))
    return wrd_lst

combined_train.text = combined_train.text.apply(lambda x: lmmtze(x))
combined_train.head()

# test: lemmatize words
combined_test.text = combined_test.text.apply(lambda x: lmmtze(x))

Unnamed: 0,userid,gender,topic,sign,age,text
0,1,male,indunk,sagittarius,17,"[info, found, page, mb, pdf, file, wait, untill, team, leader, processed, learns, html]"
1,3,female,indunk,leo,16,"[team, member, drewes, van, der, laag, urllink, mail, ruiyu, xie, urllink, mail, bryan, aalderin..."
2,4,male,student,aquarius,17,"[het, kader, van, kernfusie, op, aarde, maak, je, eigen, waterstofbom, build, hbomb, ascotttarta..."
3,5,female,accounting,pisces,47,"[testing, testing]"
4,6,male,student,pisces,16,"[thanks, yahoo, toolbar, capture, url, popupswhich, mean, show, cool, link, korean, pop, kpop, a..."


In [14]:
# remove non-english words
from nltk.corpus import wordnet

# words = set(nltk.corpus.words.words())

def fltreng(x):
    wrd_lst = []
    for y in x:
        if wordnet.synsets(y):
            wrd_lst.append(y)
    return wrd_lst

combined_train.text = combined_train.text.apply(lambda x: fltreng(x))
combined_train.head()

combined_test.text = combined_test.text.apply(lambda x: fltreng(x))

Unnamed: 0,userid,gender,topic,sign,age,text
0,1,male,indunk,sagittarius,17,"[info, found, page, mb, file, wait, team, leader, processed, learns, html]"
1,3,female,indunk,leo,16,"[team, member, van, der, mail, mail, bryan, mail]"
2,4,male,student,aquarius,17,"[het, van, eigen, build, andrew, scott, subject, build, humorous, date, feb, gmt, organization, ..."
3,5,female,accounting,pisces,47,"[testing, testing]"
4,6,male,student,pisces,16,"[thanks, yahoo, capture, url, mean, show, cool, link, korean, pop, audio, video, need, relate, i..."


In [15]:
# remove words less than one character in length

# train:
def rmshrt(x):
    wrd_lst = []
    for y in x:
        if len(y) > 1:
            wrd_lst.append(y)
    return wrd_lst

combined_train.text = combined_train.text.apply(lambda x: rmshrt(x))
combined_train.head()

# test: 
combined_test.text = combined_test.text.apply(lambda x: rmshrt(x))

Unnamed: 0,userid,gender,topic,sign,age,text
0,1,male,indunk,sagittarius,17,"[info, found, page, mb, file, wait, team, leader, processed, learns, html]"
1,3,female,indunk,leo,16,"[team, member, van, der, mail, mail, bryan, mail]"
2,4,male,student,aquarius,17,"[het, van, eigen, build, andrew, scott, subject, build, humorous, date, feb, gmt, organization, ..."
3,5,female,accounting,pisces,47,"[testing, testing]"
4,6,male,student,pisces,16,"[thanks, yahoo, capture, url, mean, show, cool, link, korean, pop, audio, video, need, relate, i..."


In [16]:
# count of words 
def cntwrds(x):
        return len(x)
    
combined_train['word_count'] = combined_train.text.apply(lambda x: cntwrds(x))
combined_train.head()

combined_test['word_count'] = combined_test.text.apply(lambda x: cntwrds(x))

Unnamed: 0,userid,gender,topic,sign,age,text,word_count
0,1,male,indunk,sagittarius,17,"[info, found, page, mb, file, wait, team, leader, processed, learns, html]",11
1,3,female,indunk,leo,16,"[team, member, van, der, mail, mail, bryan, mail]",8
2,4,male,student,aquarius,17,"[het, van, eigen, build, andrew, scott, subject, build, humorous, date, feb, gmt, organization, ...",2078
3,5,female,accounting,pisces,47,"[testing, testing]",2
4,6,male,student,pisces,16,"[thanks, yahoo, capture, url, mean, show, cool, link, korean, pop, audio, video, need, relate, i...",28


In [17]:
# # remove rows with word count less than 10
# combined_train = combined_train[combined_train.word_count > 9]
# min(combined_train.word_count)
# combined_train.head()

# combined_test = combined_test[combined_test.word_count > 9]

In [18]:
combined_train.shape
combined_test.shape
train.shape
test.shape

(12880, 7)

(6440, 6)

(442961, 6)

(238323, 5)

In [19]:
# output files
combined_train.to_csv('train_processed_text.csv', index = False)
combined_test.to_csv('test_processed_text.csv', index = False)