In [226]:
'''
Prediction 1:
scenario_1: The word test appears in at least in one field
scenario_2: All fields are at most the same
scenario_3: One word field
scenario_4: Random characters in at least in one field
'''
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import words
import random
nltk.download('words')

[nltk_data] Downloading package words to
[nltk_data]     /Users/darrylbalderas/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [227]:
# import spam collection messages
column_names = ['label', 'title']
message_data = pd.read_table("SMSSpamCollection", names=column_names)

In [228]:
# Remove rows that contain spam label and reset indexes
message_data = message_data.drop(message_data[message_data.label == "spam"].index).reset_index(drop=True)

In [229]:
# Drop label column
message_data = message_data.drop('label', axis=1)

In [230]:
message_data.head(10)

Unnamed: 0,title
0,"Go until jurong point, crazy.. Available only ..."
1,Ok lar... Joking wif u oni...
2,U dun say so early hor... U c already then say...
3,"Nah I don't think he goes to usf, he lives aro..."
4,Even my brother is not like to speak with me. ...
5,As per your request 'Melle Melle (Oru Minnamin...
6,I'm gonna be home soon and i don't want to tal...
7,I've been searching for the right words to tha...
8,I HAVE A DATE ON SUNDAY WITH WILL!!
9,Oh k...i'm watching here:)


In [231]:
english_words = dict.fromkeys(words.words(), None)

In [232]:
def is_word(word):
    try:
        x = english_words[word]
        return True
    except:
        return False


def get_false_word_indexes(data, percentage):
    indexes = []
    for index, message in enumerate(data['title']):
        false_word_count = 0
        for word in message.split():
            if not is_word(word):
                false_word_count += 1
                if false_word_count/len(message) > false_word_percentage:
                    indexes.append(index)
                    break
    return indexes

def remove_single_words(data):
    indexes = []
    for index, message in enumerate(data['title']):
        if len(message.split()) > 1:
            continue
        indexes.append(index)
    return indexes
    

In [233]:
%%time
false_word_percentage = 0.02
indexes = get_false_word_indexes(message_data, false_word_percentage)
print('{} number of indexes will be deleted'.format(len(indexes)))

4741 number of indexes will be deleted
CPU times: user 17.3 ms, sys: 648 µs, total: 18 ms
Wall time: 17.7 ms


In [234]:
%%time
original_size = message_data.shape[0]
real_messages = message_data.drop(message_data.index[indexes]).reset_index(drop=True)
print('Data size went from {} to {}'.format(original_size, real_messages.shape[0]))

Data size went from 4825 to 84
CPU times: user 3.59 ms, sys: 1.89 ms, total: 5.48 ms
Wall time: 3.81 ms


In [235]:
real_messages.head()

Unnamed: 0,title
0,I see the letter B on my car
1,Ok i am on the way to home hi hi
2,I see a cup of coffee animation
3,Keep yourself safe for me because I need you a...
4,Or ill be a little closer like at the bus stop...


In [236]:
%%time
indexes = remove_single_words(real_messages)
print('{} number of indexes will be deleted'.format(len(indexes)))
no_single_messages = real_messages.drop(real_messages.index[indexes]).reset_index(drop=True)
print('Data size went from {} to {}'.format(real_messages.shape[0], no_single_messages.shape[0]))

8 number of indexes will be deleted
Data size went from 84 to 76
CPU times: user 1.4 ms, sys: 124 µs, total: 1.52 ms
Wall time: 1.45 ms


In [237]:
def add_test_message(sentence):
    words = sentence.split()
    choice = random.randint(0,len(words))
    update_sentence = " ".join(words[:choice]) + " test " + " ".join(words[choice:])
    return update_sentence.strip()

In [238]:
# Show that the test message in a random index of the sentence
add_test_message("Hello my name")

'Hello my name test'

In [239]:
def shuffle_dataframe(df, num_shuffles):
    for x in range(num_shuffles):
        for column in df.columns:
            df[column] = df[column].sample(frac=1).reset_index(drop=True)
    return df

In [240]:
def create_default_scenario(df):
    scenario = pd.DataFrame(df)
    scenario['subtitle'] = scenario['title']
    scenario['message'] = scenario['title']
    return scenario

In [241]:
def create_scenario_one(messages):
    df = create_default_scenario(messages)
    df = shuffle_dataframe(scenario_one, num_shuffles=30)
    for index in df.index:
        prob = random.random()
        choice = random.choice(df.columns)
        if prob < 0.33:
            df.loc[index][choice] = add_test_message(df.loc[index][choice])
        elif prob < 0.66:
            columns = list(df.columns)
            df.loc[index][choice] = add_test_message(df.loc[index][choice])
            columns.remove(choice)
            choice = random.choice(columns)
            df.loc[index][choice] = add_test_message(df.loc[index][choice])
        else:
            for column in df.columns:
                df.loc[index][column] = add_test_message(df.loc[index][column])
        return df

In [242]:
def create_scenario_two(messages):
    return create_default_scenario(messages)

In [243]:
def create_scenario_three(messages):
    df = create_default_scenario(messages)
    for column in df.columns:
        df[column] = df[column].apply(lambda x: x.split()[0])
    return df

In [267]:
import string
def create_random_chars():
    letters = string.ascii_letters
    num_chars = random.randint(10, 30)
    return "".join([random.choice(string.ascii_letters) for _ in range(num_chars)])

def create_scenario_four(messages):
    df = create_default_scenario(messages)
    for column in df.columns:
        df[column] = df[column].apply(lambda x: create_random_chars())
    return df

Unnamed: 0,title,subtitle,message
0,fsSuIRfecjFfuvm,kpdeLJiwYnehOXdSwRYmDfCMBksXQa,wUjaOwSaZRXQ
1,AImLPoEudbnSVUbeiTaFvkiG,CFAyuwfDxXaxMpe,GKmxkyyOunIBwpCRjJGfANsHcUkz
2,XjiiXSRYIIkaR,kvDGKdPYNVUX,ultfmgFZQdhBcmMHvUlsdUZMNdeReR
3,ClsbkkqskHhcFvzFPYgD,RgjRGqJqkwInlo,RiMIOdkGiMwTHuTpcPqTsfeHFj
4,VehRJqJovFLGzIvWGN,YKeYJmzIPfkkV,kSZZByXRiLQf
5,bdlVHwNTTLnvwMWQcO,tDKTiiRQFQAMhYAxiKagPvK,CmVOfLRHMpKzWeUsnyglOLwTbDyxh
6,eMxuejyPMxRiPYBIYRDoKDqxB,mLzzRGKxeFrKypUSsE,JlbMAcrtfDQNmJpHXOuWTiHGovyZu
7,hhBwxFqwCvODInfSdP,trpkDzezxhKDNFhnuzoUXarreX,emZhyhFNplcYTYSzVgwFWUqpEDXz
8,JqezTaAQDpvrPxAaDvJb,ImLnYZYZixYYjPJPiWf,OJSRdQvNLgfYdmrAnsZBIShGH
9,icKZnRcqYbaplNIkW,alZvAqnIGSTdfuqGTqrqDCmKbsRN,QkCHsxpJNFEv


In [245]:
create_scenario_one(no_single_messages).head()

Unnamed: 0,title,subtitle,message
0,Stupid.its not test test possible,"Madam,regret disturbance.might receive a refer...",Why must we sit around and test wait for summe...
1,U should have made an appointment test,Or maybe my fat fingers just press all these b...,"Cool, want me to go to kappa or should I meet ..."
2,Have a great trip to India. And bring the ligh...,i thought we were doing a king of the hill thi...,No dear i was sleeping test :-P
3,I will reach ur home in &lt;#&gt; minutes,I meant as an apology from me for texting you ...,What is important test is that you prevent deh...
4,"So anyways, you can just go to your gym or wha...",Carlos is down but test I have to pick it up f...,Why you keeping me away like this


In [246]:
create_scenario_two(no_single_messages).head()

Unnamed: 0,title,subtitle,message
0,I see the letter B on my car,I see the letter B on my car,I see the letter B on my car
1,Ok i am on the way to home hi hi,Ok i am on the way to home hi hi,Ok i am on the way to home hi hi
2,I see a cup of coffee animation,I see a cup of coffee animation,I see a cup of coffee animation
3,Keep yourself safe for me because I need you a...,Keep yourself safe for me because I need you a...,Keep yourself safe for me because I need you a...
4,Or ill be a little closer like at the bus stop...,Or ill be a little closer like at the bus stop...,Or ill be a little closer like at the bus stop...


In [247]:
create_scenario_three(no_single_messages).head()

Unnamed: 0,title,subtitle,message
0,I,I,I
1,Ok,Ok,Ok
2,I,I,I
3,Keep,Keep,Keep
4,Or,Or,Or


In [268]:
create_scenario_four(no_single_messages).head()

Unnamed: 0,title,subtitle,message
0,lxyUQoOhTxtZsuoGlaoj,fOMqZngfVhfUSILYHdcPEQnMtMor,PLLxwTorZivbxBGrMRnyGvRXpdYG
1,JLGUbZdObj,ppLbGfIwLfEWqORUT,aVeQGHxDSh
2,ODGzpeyAdzqcilCbJGH,VsuUDzJRnkvbdrJBID,gyzGzxLdAqWgnFdQPHwXEYIsc
3,qMsMihZPSYDzXAKsQVroBL,KxCBpHUBPteUZBCypaGAbPGWl,yMMflanNdoWvWAPVB
4,GlYJtSspTmmxOvmDBigPRVZWNAupD,GxbSQTTQKomE,XIYapIUzSJPgdrJD
