## NLTK BASICS

In [1]:
import nltk
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [2]:
dir(nltk)

['AbstractLazySequence',
 'AffixTagger',
 'AlignedSent',
 'Alignment',
 'AnnotationTask',
 'ApplicationExpression',
 'Assignment',
 'BigramAssocMeasures',
 'BigramCollocationFinder',
 'BigramTagger',
 'BinaryMaxentFeatureEncoding',
 'BlanklineTokenizer',
 'BllipParser',
 'BottomUpChartParser',
 'BottomUpLeftCornerChartParser',
 'BottomUpProbabilisticChartParser',
 'Boxer',
 'BrillTagger',
 'BrillTaggerTrainer',
 'CFG',
 'CRFTagger',
 'CfgReadingCommand',
 'ChartParser',
 'ChunkParserI',
 'ChunkScore',
 'ClassifierBasedPOSTagger',
 'ClassifierBasedTagger',
 'ClassifierI',
 'ConcordanceIndex',
 'ConditionalExponentialClassifier',
 'ConditionalFreqDist',
 'ConditionalProbDist',
 'ConditionalProbDistI',
 'ConfusionMatrix',
 'ContextIndex',
 'ContextTagger',
 'ContingencyMeasures',
 'CoreNLPDependencyParser',
 'CoreNLPParser',
 'Counter',
 'CrossValidationProbDist',
 'DRS',
 'DecisionTreeClassifier',
 'DefaultTagger',
 'DependencyEvaluator',
 'DependencyGrammar',
 'DependencyGraph',
 'Depen

In [3]:
from nltk.corpus import stopwords

stopwords.words('english')[:15]

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours']

### Reading in semi-structured raw data 

In [5]:
rawData = open("SMSSpamCollection.tsv").read()
rawData[:500]

"ham\tI've been searching for the right words to thank you for this breather. I promise i wont take your help for granted and will fulfil my promise. You have been wonderful and a blessing at all times.\nspam\tFree entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's\nham\tNah I don't think he goes to usf, he lives around here though\nham\tEven my brother is not like to speak with me. They treat me like aid"

In [6]:
parsedData = rawData.replace('\t', '\n').split('\n')
parsedData[:5]

['ham',
 "I've been searching for the right words to thank you for this breather. I promise i wont take your help for granted and will fulfil my promise. You have been wonderful and a blessing at all times.",
 'spam',
 "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's",
 'ham']

In [8]:
labelList = parsedData[0::2] #this will extract string at position 0 and 2, 4, etc..till last
textList = parsedData[1::2] #Similarly for text 

print(labelList[:5])
print(textList[:5])

['ham', 'spam', 'ham', 'ham', 'ham']
["I've been searching for the right words to thank you for this breather. I promise i wont take your help for granted and will fulfil my promise. You have been wonderful and a blessing at all times.", "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's", "Nah I don't think he goes to usf, he lives around here though", 'Even my brother is not like to speak with me. They treat me like aids patent.', 'I HAVE A DATE ON SUNDAY WITH WILL!!']


### Combining both lists for analysis using pandas DF

In [9]:
import pandas as pd 

fullCorpus = pd.DataFrame({
    'label' : labelList,
    'text' : textList
})

fullCorpus.head()

ValueError: arrays must all be same length

In [10]:
print(len(labelList))
print(len(textList))

5571
5570


In [11]:
print(labelList[-5:]) #The extra entry which is empty str, will drop it so the array length matches

['ham', 'ham', 'ham', 'ham', '']


In [13]:
fullCorpus = pd.DataFrame({
    'label' : labelList[:-1],
    'text' : textList
})

fullCorpus.head()

Unnamed: 0,label,text
0,ham,I've been searching for the right words to tha...
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...
2,ham,"Nah I don't think he goes to usf, he lives aro..."
3,ham,Even my brother is not like to speak with me. ...
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!


### Alternate way to store semi-structured as dataframe

In [22]:
fullCorpus = pd.read_csv("SMSSpamCollection.tsv", sep = '\t', header = None)
fullCorpus.head()

Unnamed: 0,0,1
0,ham,I've been searching for the right words to tha...
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...
2,ham,"Nah I don't think he goes to usf, he lives aro..."
3,ham,Even my brother is not like to speak with me. ...
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!


In [23]:
fullCorpus.columns = ['label', 'text']

In [24]:
fullCorpus.head()

Unnamed: 0,label,text
0,ham,I've been searching for the right words to tha...
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...
2,ham,"Nah I don't think he goes to usf, he lives aro..."
3,ham,Even my brother is not like to speak with me. ...
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!


### Exploring the dataset

In [25]:
# Shape of the dataset:
print("The corpus has {} rows and {} columns.".format(fullCorpus.shape[0], fullCorpus.shape[1]))

The corpus has 5568 rows and 2 columns.


In [34]:
# How many spam and ham are there?
print('Number of spam in corpus: {} '.format(len(fullCorpus[fullCorpus['label']=='spam'])))
print('Number of ham in corpus: {} '.format(len(fullCorpus[fullCorpus['label']=='ham'])))

Number of spam in corpus: 746 
Number of ham in corpus: 4822 


### Are there any missing data?

In [35]:
print('Number of null in label: {}'.format(fullCorpus['label'].isnull().sum()))
print('Number of null in text: {}'.format(fullCorpus['text'].isnull().sum()))

Number of null in label: 0
Number of null in text: 0
