In [57]:
import re
from tqdm import tqdm
from collections import Counter

import pandas as pd

from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer

from sklearn.model_selection import train_test_split

### 1. Load the tweets file from pandas package

In [31]:
data = pd.read_csv('./Input/TwitterHate.csv')

In [32]:
data.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [33]:
data.shape

(31962, 3)

### 2. Get the tweets into a list for easy text cleanup and manipulation

In [34]:
tweets = list(data['tweet'])
tweets

[' @user when a father is dysfunctional and is so selfish he drags his kids into his dysfunction.   #run',
 "@user @user thanks for #lyft credit i can't use cause they don't offer wheelchair vans in pdx.    #disapointed #getthanked",
 '  bihday your majesty',
 '#model   i love u take with u all the time in urð\x9f\x93±!!! ð\x9f\x98\x99ð\x9f\x98\x8eð\x9f\x91\x84ð\x9f\x91\x85ð\x9f\x92¦ð\x9f\x92¦ð\x9f\x92¦  ',
 ' factsguide: society now    #motivation',
 '[2/2] huge fan fare and big talking before they leave. chaos and pay disputes when they get there. #allshowandnogo  ',
 ' @user camping tomorrow @user @user @user @user @user @user @user dannyâ\x80¦',
 "the next school year is the year for exams.ð\x9f\x98¯ can't think about that ð\x9f\x98\xad #school #exams   #hate #imagine #actorslife #revolutionschool #girl",
 'we won!!! love the land!!! #allin #cavs #champions #cleveland #clevelandcavaliers  â\x80¦ ',
 " @user @user welcome here !  i'm   it's so #gr8 ! ",
 ' â\x86\x9d #ireland consume

In [35]:
tweets[4000]

"i'm still in a little bit of disbelief that in a couple of months i'm going to be staing my year abroad in the us  "

### 3. To cleanup

#### 3.1. Normalize casing

In [36]:
tweets = [tweet.lower() for tweet in tqdm(tweets)]

100%|██████████| 31962/31962 [00:00<00:00, 2992640.96it/s]


#### 3.2. Using regular expressions, remove user handles. These begin with '@’.

In [37]:
tweets = [re.sub(r'@', '', tweet) for tweet in tqdm(tweets)]

100%|██████████| 31962/31962 [00:00<00:00, 1838379.97it/s]


In [38]:
tweets[0]

' user when a father is dysfunctional and is so selfish he drags his kids into his dysfunction.   #run'

#### 3.3. Using regular expressions, remove URLs.

In [39]:
tweets = [re.sub(r'http\S+|www\S+', '', tweet) for tweet in tqdm(tweets)]

100%|██████████| 31962/31962 [00:00<00:00, 1012770.04it/s]


#### 3.4 Using TweetTokenizer from NLTK, tokenize the tweets into individual terms.

In [40]:
tokenizer = TweetTokenizer()

tokens = [tokenizer.tokenize(tweet) for tweet in tqdm(tweets)]

100%|██████████| 31962/31962 [00:01<00:00, 29286.13it/s]


In [41]:
tokens

[['user',
  'when',
  'a',
  'father',
  'is',
  'dysfunctional',
  'and',
  'is',
  'so',
  'selfish',
  'he',
  'drags',
  'his',
  'kids',
  'into',
  'his',
  'dysfunction',
  '.',
  '#run'],
 ['user',
  'user',
  'thanks',
  'for',
  '#lyft',
  'credit',
  'i',
  "can't",
  'use',
  'cause',
  'they',
  "don't",
  'offer',
  'wheelchair',
  'vans',
  'in',
  'pdx',
  '.',
  '#disapointed',
  '#getthanked'],
 ['bihday', 'your', 'majesty'],
 ['#model',
  'i',
  'love',
  'u',
  'take',
  'with',
  'u',
  'all',
  'the',
  'time',
  'in',
  'urð',
  '\x9f',
  '\x93',
  '±',
  '!',
  '!',
  '!',
  'ð',
  '\x9f',
  '\x98',
  '\x99',
  'ð',
  '\x9f',
  '\x98',
  '\x8e',
  'ð',
  '\x9f',
  '\x91',
  '\x84',
  'ð',
  '\x9f',
  '\x91',
  'ð',
  '\x9f',
  '\x92',
  '¦',
  'ð',
  '\x9f',
  '\x92',
  '¦',
  'ð',
  '\x9f',
  '\x92',
  '¦'],
 ['factsguide', ':', 'society', 'now', '#motivation'],
 ['[',
  '2/2',
  ']',
  'huge',
  'fan',
  'fare',
  'and',
  'big',
  'talking',
  'before',
  'th

#### 3.5  Remove stop words.

In [42]:
sw = stopwords.words('english')
sw

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [43]:
tokens = [[word for word in t if word not in sw] for t in tqdm(tokens)] 


100%|██████████| 31962/31962 [00:00<00:00, 55638.82it/s]


#### 3.6 Remove redundant terms like ‘amp’, ‘rt’, etc.

In [44]:
for i, token in enumerate(tokens):
    for word in token:
        if re.search(r'&', word) or re.search(r'retweet', word):
            print(i, word)

17 retweet
22 &
64 &
82 &
111 &
131 &
141 retweeted
163 &
185 &
188 &
219 &
219 &
259 &
261 &
293 &
298 &
316 &
322 &
332 &
335 &
335 &
335 &
370 &
398 &
405 &
422 &
431 &
436 &
442 #retweet
472 &
474 &
498 &
498 #retweet
565 &
587 &
604 &
620 &
622 &
689 &
699 &
701 &
701 &
701 &
701 &
723 &
723 &
745 &
759 &
792 &
815 &
815 &
861 &
916 &
926 &
927 &
932 &
972 &
1010 &
1074 &
1100 &
1136 &
1142 &
1142 &
1170 &
1170 &
1173 &
1186 &
1194 &
1201 &
1229 &
1229 &
1250 &
1250 &
1270 &
1281 &
1308 &
1320 &
1329 &
1373 &
1425 &
1445 &
1445 &
1465 &
1487 &
1500 &
1502 &
1562 &
1564 &
1564 &
1583 &
1583 &
1610 &
1635 &
1650 retweet
1669 &
1695 &
1699 &
1705 &
1705 &
1705 &
1744 &
1759 &
1760 &
1801 &
1822 &
1858 &
1858 #retweet
1868 &
1876 &
1888 &
1896 &
1903 &
1931 &
1953 &
1953 &
1978 &
2014 &
2048 &
2048 &
2048 &
2065 &
2065 &
2066 &
2075 &
2144 &
2145 &
2206 &
2223 &
2252 &
2255 &
2301 &
2313 &
2348 &
2349 &
2384 &
2414 &
2415 &
2433 &
2437 &
2438 &
2453 &
2457 &
2468 &
2493 &
2512 &
2522 

In [45]:
tokens[17]

['retweet', 'agree', '!']

In [46]:
tokens[22]

['product',
 'day',
 ':',
 'happy',
 'man',
 '#wine',
 'tool',
 "who's",
 '#weekend',
 '?',
 'time',
 'open',
 '&',
 'drink',
 '!']

In [47]:
# remove amp from tokens and print the index of that token
for token in (tokens):
    for i, word in enumerate(token):
        if re.search(r'&', word):
            token[i] = re.sub(r'&', '', word)
        if re.search(r'retweet', word):
            token[i] = re.sub(r'retweet', '', word)
  

In [48]:
tokens[22]

['product',
 'day',
 ':',
 'happy',
 'man',
 '#wine',
 'tool',
 "who's",
 '#weekend',
 '?',
 'time',
 'open',
 '',
 'drink',
 '!']

In [49]:
tokens[17]

['', 'agree', '!']

#### 3.7 Remove ‘#’ symbols from the tweet while retaining the term.

In [50]:
tokens = [[re.sub(r'#', '', word) for word in token] for token in tqdm(tokens)]

100%|██████████| 31962/31962 [00:00<00:00, 215548.68it/s]


In [51]:
tokens

[['user',
  'father',
  'dysfunctional',
  'selfish',
  'drags',
  'kids',
  'dysfunction',
  '.',
  'run'],
 ['user',
  'user',
  'thanks',
  'lyft',
  'credit',
  "can't",
  'use',
  'cause',
  'offer',
  'wheelchair',
  'vans',
  'pdx',
  '.',
  'disapointed',
  'getthanked'],
 ['bihday', 'majesty'],
 ['model',
  'love',
  'u',
  'take',
  'u',
  'time',
  'urð',
  '\x9f',
  '\x93',
  '±',
  '!',
  '!',
  '!',
  'ð',
  '\x9f',
  '\x98',
  '\x99',
  'ð',
  '\x9f',
  '\x98',
  '\x8e',
  'ð',
  '\x9f',
  '\x91',
  '\x84',
  'ð',
  '\x9f',
  '\x91',
  'ð',
  '\x9f',
  '\x92',
  '¦',
  'ð',
  '\x9f',
  '\x92',
  '¦',
  'ð',
  '\x9f',
  '\x92',
  '¦'],
 ['factsguide', ':', 'society', 'motivation'],
 ['[',
  '2/2',
  ']',
  'huge',
  'fan',
  'fare',
  'big',
  'talking',
  'leave',
  '.',
  'chaos',
  'pay',
  'disputes',
  'get',
  '.',
  'allshowandnogo'],
 ['user',
  'camping',
  'tomorrow',
  'user',
  'user',
  'user',
  'user',
  'user',
  'user',
  'user',
  'dannyâ',
  '\x80',
  '

### 4. Extra cleanup by removing terms with a length of 1.

In [52]:
tokens = [[word for word in token if len(word) > 1] for token in tqdm(tokens)]

100%|██████████| 31962/31962 [00:00<00:00, 1307720.43it/s]


In [53]:
tokens

[['user',
  'father',
  'dysfunctional',
  'selfish',
  'drags',
  'kids',
  'dysfunction',
  'run'],
 ['user',
  'user',
  'thanks',
  'lyft',
  'credit',
  "can't",
  'use',
  'cause',
  'offer',
  'wheelchair',
  'vans',
  'pdx',
  'disapointed',
  'getthanked'],
 ['bihday', 'majesty'],
 ['model', 'love', 'take', 'time', 'urð'],
 ['factsguide', 'society', 'motivation'],
 ['2/2',
  'huge',
  'fan',
  'fare',
  'big',
  'talking',
  'leave',
  'chaos',
  'pay',
  'disputes',
  'get',
  'allshowandnogo'],
 ['user',
  'camping',
  'tomorrow',
  'user',
  'user',
  'user',
  'user',
  'user',
  'user',
  'user',
  'dannyâ'],
 ['next',
  'school',
  'year',
  'year',
  'exams',
  "can't",
  'think',
  'school',
  'exams',
  'hate',
  'imagine',
  'actorslife',
  'revolutionschool',
  'girl'],
 ['love',
  'land',
  'allin',
  'cavs',
  'champions',
  'cleveland',
  'clevelandcavaliers'],
 ['user', 'user', 'welcome', "i'm", 'gr8'],
 ['ireland',
  'consumer',
  'price',
  'index',
  'mom',
 

### 5. Check out the top terms in the tweets

#### 5.1 First, get all the tokenized terms into one large list

In [54]:
tokens_list = []
for token in tokens:
    for word in token:
        tokens_list.append(word)

In [55]:
tokens_list

['user',
 'father',
 'dysfunctional',
 'selfish',
 'drags',
 'kids',
 'dysfunction',
 'run',
 'user',
 'user',
 'thanks',
 'lyft',
 'credit',
 "can't",
 'use',
 'cause',
 'offer',
 'wheelchair',
 'vans',
 'pdx',
 'disapointed',
 'getthanked',
 'bihday',
 'majesty',
 'model',
 'love',
 'take',
 'time',
 'urð',
 'factsguide',
 'society',
 'motivation',
 '2/2',
 'huge',
 'fan',
 'fare',
 'big',
 'talking',
 'leave',
 'chaos',
 'pay',
 'disputes',
 'get',
 'allshowandnogo',
 'user',
 'camping',
 'tomorrow',
 'user',
 'user',
 'user',
 'user',
 'user',
 'user',
 'user',
 'dannyâ',
 'next',
 'school',
 'year',
 'year',
 'exams',
 "can't",
 'think',
 'school',
 'exams',
 'hate',
 'imagine',
 'actorslife',
 'revolutionschool',
 'girl',
 'love',
 'land',
 'allin',
 'cavs',
 'champions',
 'cleveland',
 'clevelandcavaliers',
 'user',
 'user',
 'welcome',
 "i'm",
 'gr8',
 'ireland',
 'consumer',
 'price',
 'index',
 'mom',
 'climbed',
 'previous',
 '0.2',
 '0.5',
 'may',
 'blog',
 'silver',
 'gold

#### 5.2 Use the counter and find the 10 most common terms.

In [56]:
counter = Counter(tokens_list)
counter.most_common(10)

[('user', 17534),
 ('...', 2809),
 ('love', 2748),
 ('day', 2274),
 ('happy', 1684),
 ('time', 1131),
 ('life', 1118),
 ('like', 1047),
 ("i'm", 1018),
 ('today', 1013)]

### 6. Data formatting for predictive modeling:

#### 6.1 Join the tokens back to form strings. This will be required for the vectorizers.

#### 6.2 assign x and y.

#### 6.3 Perform train_test_split using sklearn