In [1]:
import pandas as pd
import numpy as np
import time
import datetime

In [2]:
# Show the result of every top-level expression in a cell instead of only the last one.
# The setting is applied on the InteractiveShell class; cells further down rely on it
# to render two results (one for train, one for test) from a single cell.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [3]:
# load the training and testing splits from CSV files in the working directory
train = pd.read_csv(filepath_or_buffer='train.csv')
test = pd.read_csv(filepath_or_buffer='test.csv')

In [4]:
# swap the dotted id column names for plain ones (user.id -> userid, post.id -> postid)
# so the columns are easier to reference; the same mapping is applied to both frames
id_column_names = {'user.id': 'userid', 'post.id': 'postid'}
train = train.rename(mapper=id_column_names, axis='columns')
test = test.rename(mapper=id_column_names, axis='columns')

In [5]:
# remove the 'date' and 'postid' columns from both frames
train = train.drop(['date', 'postid'], axis='columns')
test = test.drop(['date', 'postid'], axis='columns')

In [6]:
# explore data: preview the first 20 rows of the training frame
train.head(20)

Unnamed: 0,userid,gender,topic,sign,text,age
0,11869,male,Student,Leo,"Info has been found (+/- 100 pages,...",15
1,11869,male,Student,Leo,These are the team members: Drewe...,15
2,11869,male,Student,Leo,In het kader van kernfusie op aarde...,15
3,11869,male,Student,Leo,testing!!! testing!!!,15
4,16332,male,InvestmentBanking,Aquarius,Thanks to Yahoo!'s Toolbar I can ...,33
5,16332,male,InvestmentBanking,Aquarius,I had an interesting conversation...,33
6,16332,male,InvestmentBanking,Aquarius,Somehow Coca-Cola has a way of su...,33
7,16332,male,InvestmentBanking,Aquarius,"If anything, Korea is a country o...",33
8,16332,male,InvestmentBanking,Aquarius,Take a read of this news article ...,33
9,16332,male,InvestmentBanking,Aquarius,I surf the English news sites a l...,33


In [7]:
# find null values: each line yields a single boolean that is True when
# at least one cell anywhere in the frame is null (train first, then test)
train.isnull().values.any()
test.isnull().values.any()

False

False

In [8]:
# find missing values (train first, then test)
# NOTE(review): isna() is an alias of isnull(), so this repeats the null check made
# with isnull() earlier in the notebook; it does NOT flag empty strings such as ''.
# TODO: add an explicit blank-string check if "empty" values need to be detected.
train.isna().values.any()
test.isna().values.any()

False

False

In [9]:
# view type of fields: per-column dtypes of the training frame, then of the test frame
train.dtypes
test.dtypes

userid     int64
gender    object
topic     object
sign      object
text      object
age        int64
dtype: object

userid     int64
gender    object
topic     object
sign      object
text      object
dtype: object

In [10]:
# convert all text fields to lowercase
def lowercase_strings(frame):
    """Return a copy of ``frame`` in which every ``str`` value is lower-cased.

    Numeric columns are skipped altogether. In the remaining columns only
    values that are ``str`` instances are changed; anything else is passed
    through untouched. A per-column ``Series.map`` is used instead of
    ``DataFrame.applymap``, which is deprecated since pandas 2.1.

    Parameters
    ----------
    frame : pandas.DataFrame
        Input frame; it is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns and index.
    """
    lowered = frame.copy()
    for column in lowered.columns:
        if pd.api.types.is_numeric_dtype(lowered[column]):
            # nothing to lower-case here; skipping also leaves the dtype untouched
            continue
        lowered[column] = lowered[column].map(
            lambda value: value.lower() if isinstance(value, str) else value
        )
    return lowered


train = lowercase_strings(train)
test = lowercase_strings(test)
train.head(20)

Unnamed: 0,userid,gender,topic,sign,text,age
0,11869,male,student,leo,"info has been found (+/- 100 pages,...",15
1,11869,male,student,leo,these are the team members: drewe...,15
2,11869,male,student,leo,in het kader van kernfusie op aarde...,15
3,11869,male,student,leo,testing!!! testing!!!,15
4,16332,male,investmentbanking,aquarius,thanks to yahoo!'s toolbar i can ...,33
5,16332,male,investmentbanking,aquarius,i had an interesting conversation...,33
6,16332,male,investmentbanking,aquarius,somehow coca-cola has a way of su...,33
7,16332,male,investmentbanking,aquarius,"if anything, korea is a country o...",33
8,16332,male,investmentbanking,aquarius,take a read of this news article ...,33
9,16332,male,investmentbanking,aquarius,i surf the english news sites a l...,33


In [11]:
# strip leading/trailing white space from the categorical fields
# The vectorised .str.strip() returns a Series that carries the frame's own index,
# so the assignment stays row-aligned whatever the index is (wrapping a Python list
# in a fresh DataFrame only lines up when the frame has a default 0..n-1 index).
categorical_columns = ['gender', 'topic', 'sign']
for frame in (train, test):
    for column in categorical_columns:
        frame[column] = frame[column].str.strip()

In [12]:
# cast the categorical variables (gender, topic, sign) to the pandas 'category'
# dtype, i.e. factors, in the training frame first and then in the test frame
for frame in (train, test):
    for factor_name in ('gender', 'topic', 'sign'):
        frame[factor_name] = frame[factor_name].astype('category')

In [13]:
# view type of fields again to confirm gender, topic and sign are now 'category'
# (training frame first, then test frame)
train.dtypes
test.dtypes

userid       int64
gender    category
topic     category
sign      category
text        object
age          int64
dtype: object

userid       int64
gender    category
topic     category
sign      category
text        object
dtype: object

In [14]:
# preview the first rows of the processed training frame (head() with its default row count)
train.head()

Unnamed: 0,userid,gender,topic,sign,text,age
0,11869,male,student,leo,"info has been found (+/- 100 pages,...",15
1,11869,male,student,leo,these are the team members: drewe...,15
2,11869,male,student,leo,in het kader van kernfusie op aarde...,15
3,11869,male,student,leo,testing!!! testing!!!,15
4,16332,male,investmentbanking,aquarius,thanks to yahoo!'s toolbar i can ...,33


In [15]:
# write both processed frames to CSV, leaving the row index out of the files
# NOTE(review): CSV is plain text, so the 'category' dtype will presumably need to be
# re-applied by whatever reads these files back in.
for frame, out_path in (
    (train, 'train_processed_non_text.csv'),
    (test, 'test_processed_non_text.csv'),
):
    frame.to_csv(out_path, index=False)

In [16]:
# final (rows, columns) of the training frame, then of the test frame
train.shape
test.shape

(442961, 6)

(238323, 5)