# NLP Basics: Reading in text data & why do we need to clean the text?

### Read in semi-structured text data

In [1]:
# Read in the raw text
# The file is pulled in as-is, with no structure imposed on it, to get a first look at the data.
# A `with` block is used so the file handle is closed as soon as the read finishes.
with open("SMSSpamCollection.tsv") as raw_file:
    rawData = raw_file.read()

# At this point there is no notion of format: no rows, columns or words are known.
# Python sees the whole file as one long string.
# Show the first 500 characters of the raw data
rawData[0:500]

"ham\tI've been searching for the right words to thank you for this breather. I promise i wont take your help for granted and will fulfil my promise. You have been wonderful and a blessing at all times.\nspam\tFree entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's\nham\tNah I don't think he goes to usf, he lives around here though\nham\tEven my brother is not like to speak with me. They treat me like aid"

In [9]:
# Turn the single long string into a flat list of fields:
#   1. .replace(old, new) swaps every tab for a newline, so one separator is left
#   2. .split('\n') cuts the string at each newline and returns the pieces as a list
# Note: split() keeps empty pieces, so a separator at the very end of the string
# produces a trailing '' entry.
parsedData = (
    rawData
    .replace('\t', '\n')
    .split('\n')
)

In [10]:
# Peek at the first five list entries; labels and message bodies now alternate
parsedData[:5]

['ham',
 "I've been searching for the right words to thank you for this breather. I promise i wont take your help for granted and will fulfil my promise. You have been wonderful and a blessing at all times.",
 'spam',
 "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's",
 'ham']

In [13]:
# Even positions (0, 2, 4, ...) hold the 'ham' / 'spam' labels
labelList = parsedData[::2]
# Odd positions (1, 3, 5, ...) hold the message text
textList = parsedData[1::2]

In [15]:
# Show the first five labels, then the first five messages
print(labelList[:5])
print(textList[:5])

['ham', 'spam', 'ham', 'ham', 'ham']
["I've been searching for the right words to thank you for this breather. I promise i wont take your help for granted and will fulfil my promise. You have been wonderful and a blessing at all times.", "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's", "Nah I don't think he goes to usf, he lives around here though", 'Even my brother is not like to speak with me. They treat me like aids patent.', 'I HAVE A DATE ON SUNDAY WITH WILL!!']


In [16]:
# Now we have the information we need separated into two lists
# We can now think about combining them so that we can actually use them for analysis

In [17]:
import pandas as pd

# Create DF
# Building a DataFrame from lists of different lengths raises ValueError.
# The error is the point of this cell, so it is caught and its message printed:
# the failure stays visible, but "Restart & Run All" can still continue to the
# cells below that diagnose and fix the length mismatch.
try:
    fullCorpus = pd.DataFrame({
        'labal': labelList, # first column
        'body_list': textList, # second column
    })
except ValueError as length_error:
    print("ValueError:", length_error)
else:
    # Only reached when both lists happen to have the same length
    print(fullCorpus.head()) # first 5 rows by default


ValueError: arrays must all be same length

In [18]:
# The DataFrame constructor needs every column to have the same number of entries.
# Compare the two list lengths (labels first, then texts), one per line,
# to see where they disagree.
print(len(labelList), len(textList), sep='\n')

5571
5570


In [19]:
# Print the last five items of labelList
print(labelList[-5:]) # a negative start counts back from the end, so this slice is the final five entries

['ham', 'ham', 'ham', 'ham', '']


In [21]:
# The final label is an empty string, so leave it out when building the DataFrame;
# labelList[:-1] is every element except the last one.
# NOTE(review): 'labal' looks like a typo for 'label'; the key is kept unchanged
# because later cells may refer to the column by this name.
fullCorpus = pd.DataFrame(
    data={'labal': labelList[:-1], 'body_list': textList},
)
# Preview the first rows of the combined corpus
fullCorpus.head()

Unnamed: 0,labal,body_list
0,ham,I've been searching for the right words to tha...
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...
2,ham,"Nah I don't think he goes to usf, he lives aro..."
3,ham,Even my brother is not like to speak with me. ...
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!


In [23]:
# The \t characters show that the file is tab-delimited, and pandas can read
# tab-separated files directly with read_csv().
#   sep="\t"     -> fields are separated by tabs
#   header=None  -> the raw dataset has no column names; without this, pandas
#                   would treat the first row as the header
dataset = pd.read_csv(
    "SMSSpamCollection.tsv",
    sep="\t",
    header=None,
)
dataset.head()

Unnamed: 0,0,1
0,ham,I've been searching for the right words to tha...
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...
2,ham,"Nah I don't think he goes to usf, he lives aro..."
3,ham,Even my brother is not like to speak with me. ...
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!
