In [125]:
import pandas as pd
import numpy as np

train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

print(train.head())
print(test.head())

print(np.where(pd.isnull(train)))
print(np.where(pd.isnull(test)))

print(train.shape)
print(test.shape)

   Index  Sentiment                                               Text
0      0          0  @switchfoot http://twitpic.com/2y1zl - Awww, t...
1      1          0  is upset that he can't update his Facebook by ...
2      2          0  @Kenichan I dived many times for the ball. Man...
3      3          0    my whole body feels itchy and like its on fire 
4      4          0  @nationwideclass no, it's not behaving at all....
   Index  Sentiment                                               Text
0      0          1  @stellargirl I loooooooovvvvvveee my Kindle2. ...
1      1          1  Reading my kindle2...  Love it... Lee childs i...
2      2          1  Ok, first assesment of the #kindle2 ...it fuck...
3      3          1  @kenburbary You'll love your Kindle2. I've had...
4      4          1  @mikefish  Fair enough. But i have the Kindle2...
(array([], dtype=int64), array([], dtype=int64))
(array([], dtype=int64), array([], dtype=int64))
(1048575, 3)
(359, 3)


In [126]:
'''

Part 2 - Text Preprocessing

'''


# 1: lower-casing
train['Text'] = train['Text'].str.lower()
print(train.head())

test['Text'] = test['Text'].str.lower()
print(test.head())

# 2: remove digital numbers

import re #python regular expression library

train['Text'] = train['Text'].apply(lambda x: re.sub('[0-9]', '', x).strip())
test['Text'] = test['Text'].apply(lambda x: re.sub('[0-9]', '', x).strip())
print(train.head())
print(test.head())


# 3: Remove urls
                    
train['Text'] = train['Text'].apply(lambda x: re.sub('http\S+', ' ', x).strip())
test['Text'] = test['Text'].apply(lambda x: re.sub('http\S+', ' ', x).strip())
print(train.head())
print(test.head())


# 4: Remove username

train['Text'] = train['Text'].apply(lambda x: re.sub('@[^\s]+', '', x).strip())
test['Text'] = test['Text'].apply(lambda x: re.sub('@[^\s]+', '', x).strip())
print(train.head())
print(test.head())



# 5: Remove special character and puncation 
train['Text'] = train['Text'].apply(lambda x: re.sub('[^a-z0-9<>\']', ' ', x).strip())
test['Text'] = test['Text'].apply(lambda x: re.sub('[^a-z0-9<>\']', ' ', x).strip())
print(train.head())
print(test.head())


# 6: Stemming

# train['Text'] = train['Text'].apply(porter.stem)
# import pandas as pd
# from nltk.stem.snowball import SnowballStemmer

# stemmer = SnowballStemmer("english")


# print(train.head())


# from nltk.tokenize import TweetTokenizer
# from nltk.stem.wordnet import WordNetLemmatizer

# tt = TweetTokenizer()
# train['Text'] = train['Text'].apply(tt.tokenize)
# print(train.head())

# lmtzr = WordNetLemmatizer()
# train['Text'] = train['Text'].apply(lambda lst:[lmtzr.lemmatize(word) for word in lst])
# print(train.head())







   Index  Sentiment                                               Text
0      0          0  @switchfoot http://twitpic.com/2y1zl - awww, t...
1      1          0  is upset that he can't update his facebook by ...
2      2          0  @kenichan i dived many times for the ball. man...
3      3          0    my whole body feels itchy and like its on fire 
4      4          0  @nationwideclass no, it's not behaving at all....
   Index  Sentiment                                               Text
0      0          1  @stellargirl i loooooooovvvvvveee my kindle2. ...
1      1          1  reading my kindle2...  love it... lee childs i...
2      2          1  ok, first assesment of the #kindle2 ...it fuck...
3      3          1  @kenburbary you'll love your kindle2. i've had...
4      4          1  @mikefish  fair enough. but i have the kindle2...
   Index  Sentiment                                               Text
0      0          0  @switchfoot http://twitpic.com/yzl - awww, tha...
1     

KeyboardInterrupt: 

In [None]:
# Stemming
from nltk.stem.snowball import SnowballStemmer
stemmer=SnowballStemmer("english") #define stemming dict
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

def stem_sentences(sentence):
    tokens = sentence.split()
    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    return stemmed_tokens

train['Text'] = train['Text'].apply(lambda x : stem_sentences(x))
test['Text'] = test['Text'].apply(lambda x : stem_sentences(x))

print(train.head())
print(test.head())



   Index  Sentiment                                               Text
0      0          0  [awww, that, a, bummer, you, shoulda, got, dav...
1      1          0  [is, upset, that, he, can't, updat, his, faceb...
2      2          0  [i, dive, mani, time, for, the, ball, manag, t...
3      3          0  [my, whole, bodi, feel, itchi, and, like, it, ...
4      4          0  [no, it, not, behav, at, all, i'm, mad, whi, a...
   Index  Sentiment                                               Text
0      0          1  [i, loooooooovvvvvvee, my, kindl, not, that, t...
1      1          1  [read, my, kindl, love, it, lee, child, is, go...
2      2          1  [ok, first, asses, of, the, kindl, it, fuck, r...
3      3          1  [you'll, love, your, kindl, i'v, had, mine, fo...
4      4          1  [fair, enough, but, i, have, the, kindl, and, ...


In [None]:
'''

Part 3 - Linguistic Feature Extraction

'''

# 1. Bag of Words
import numpy as np

wordCount = {}
for tokens in train['Text']:
    for word in tokens:
        if word not in wordCount:
            wordCount[word] = 1
        else:
            wordCount[word] += 1

unique_words = list(wordCount.keys())

print(unique_words)

bag_of_words = []

for tokens in train['Text']:
    bag_vector = np.zeros(len(unique_words))
    for words in tokens:
        for i, word in enumerate(unique_words):
            if word == words:
                bag_vector[i] += 1
    print(bag_vector)
    bag_of_words.append(bag_vector)


print(bag_of_words[0])


['awww', 'that', 'a', 'bummer', 'you', 'shoulda', 'got', 'david', 'carr', 'of', 'third', 'day', 'to', 'do', 'it', 'd', 'is', 'upset', 'he', "can't", 'updat', 'his', 'facebook', 'by', 'text', 'and', 'might', 'cri', 'as', 'result', 'school', 'today', 'also', 'blah', 'i', 'dive', 'mani', 'time', 'for', 'the', 'ball', 'manag', 'save', 'rest', 'go', 'out', 'bound', 'my', 'whole', 'bodi', 'feel', 'itchi', 'like', 'on', 'fire', 'no', 'not', 'behav', 'at', 'all', "i'm", 'mad', 'whi', 'am', 'here', 'becaus', 'see', 'over', 'there', 'crew', 'need', 'hug', 'hey', 'long', 'yes', 'rain', 'bit', 'onli', 'lol', 'fine', 'thank', 'how', 'nope', 'they', "didn't", 'have', 'que', 'me', 'muera', 'spring', 'break', 'in', 'plain', 'citi', 'snow', 'just', 're', 'pierc', 'ear', "couldn't", 'bear', 'watch', 'thought', 'ua', 'loss', 'was', 'embarrass', 'count', 'idk', 'did', 'either', 'never', 'talk', 'anymor', "would'v", 'been', 'first', 'but', 'gun', 'realli', 'though', 'zac', 'snyder', 'doucheclown', 'wish', 

Traceback (most recent call last):
  File "C:\Users\chris\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\IPython\core\interactiveshell.py", line 3442, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "C:\Users\chris\AppData\Local\Temp\ipykernel_9468\481969913.py", line -1, in <module>
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\chris\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\IPython\core\interactiveshell.py", line 2057, in showtraceback
    stb = self.InteractiveTB.structured_traceback(
  File "C:\Users\chris\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\IPython\core\ultratb.py", line 1118, in structured_traceback
    return FormattedT