In [11]:
import pandas as pd
import numpy as np
import re
import string

from collections import Counter
from nltk.corpus import stopwords

from keras_preprocessing import text

from itertools import groupby

from tqdm.autonotebook import tqdm

pd.set_option('display.max_colwidth', -1)



In [2]:
pos_df = pd.read_csv("Datasets/twitter-datasets/train_pos_full.txt", delimiter="\\n", header=None, 
                     names = ["tweets"], engine='python')
neg_df = pd.read_csv("Datasets/twitter-datasets/train_neg_full.txt", delimiter="\\n", header=None,
                     names = ["tweets"], engine='python')

In [3]:
test_df = pd.read_csv("Datasets/twitter-datasets/test_data.txt", delimiter="\\n", header=None,
                     names = ["tweets"], engine='python')
test_df = pd.DataFrame(test_df.tweets.str.split(',',1).tolist(), columns = ['id','tweets'])

In [4]:
pos_df['label'] = 1
neg_df['label'] = 0

Remove numbers

In [5]:
pos_df.tweets = pos_df.tweets.str.replace('\d+', '')
neg_df.tweets = neg_df.tweets.str.replace('\d+', '')
test_df.tweets = test_df.tweets.str.replace('\d+', '')

Remove `<user>`

In [6]:
pos_df.tweets = pos_df.tweets.str.replace("<user>", "")
neg_df.tweets = neg_df.tweets.str.replace("<user>", "")
test_df.tweets = test_df.tweets.str.replace("<user>", "")

Remove `<url>`

In [7]:
pos_df.tweets = pos_df.tweets.str.replace("<url>", "")
neg_df.tweets = neg_df.tweets.str.replace("<url>", "")
test_df.tweets = test_df.tweets.str.replace("<url>", "")

In [9]:
pos_df.head()

Unnamed: 0,tweets,label
0,i dunno justin read my mention or not . only ...,1
1,"because your logic is so dumb , i won't even c...",1
2,""" just put casper in a box ! "" looved the bat...",1
3,thanks sir > > don't trip lil mama ... just ...,1
4,visiting my brother tmr is the bestest birthda...,1


Remove punctuation

In [12]:
pos_df.tweets = pos_df.tweets.str.translate(str.maketrans('', '', string.punctuation.replace('#','')))
neg_df.tweets = neg_df.tweets.str.translate(str.maketrans('', '', string.punctuation.replace('#','')))
test_df.tweets = test_df.tweets.str.translate(str.maketrans('', '', string.punctuation.replace('#','')))

In [15]:
pos_df.head()

Unnamed: 0,tweets,label
0,i dunno justin read my mention or not only justin and god knows about that but i hope you will follow me #believe,1
1,because your logic is so dumb i wont even crop out your name or your photo tsk,1
2,just put casper in a box looved the battle #crakkbitch,1
3,thanks sir dont trip lil mama just keep doin ya thang,1
4,visiting my brother tmr is the bestest birthday gift eveerrr,1


In [16]:
results_pos = Counter()
pos_df.tweets.str.lower().str.split().apply(results_pos.update)
results_neg = Counter()
neg_df.tweets.str.lower().str.split().apply(results_neg.update)

0          None
1          None
2          None
3          None
4          None
5          None
6          None
7          None
8          None
9          None
10         None
11         None
12         None
13         None
14         None
15         None
16         None
17         None
18         None
19         None
20         None
21         None
22         None
23         None
24         None
25         None
26         None
27         None
28         None
29         None
           ... 
1249970    None
1249971    None
1249972    None
1249973    None
1249974    None
1249975    None
1249976    None
1249977    None
1249978    None
1249979    None
1249980    None
1249981    None
1249982    None
1249983    None
1249984    None
1249985    None
1249986    None
1249987    None
1249988    None
1249989    None
1249990    None
1249991    None
1249992    None
1249993    None
1249994    None
1249995    None
1249996    None
1249997    None
1249998    None
1249999    None
Name: tweets, Length: 12

In [39]:
count_pos = results_pos.most_common(5000)
count_neg = results_neg.most_common(5000)

dict_count_pos = dict(count_pos)
dict_count_neg = dict(count_neg)

both = dict_count_pos.keys() & (dict_count_neg.keys())

count_total = [(word, dict_count_pos[word]+dict_count_neg[word]) for word in both]
dict_both = dict(count_total)

In [40]:
pos_90 = [(word, dict_count_pos[word]) for word in both if 100*dict_count_pos[word]/dict_both[word]>90]
neg_90 = [(word, dict_count_neg[word]) for word in both if 100*dict_count_neg[word]/dict_both[word]>90]

In [41]:
pos_90

[('thank', 36670),
 ('goodmorning', 2540),
 ('thanks', 62337),
 ('thankyou', 4418),
 ('congrats', 3488),
 ('shoutout', 6033)]

In [42]:
neg_90

[('digital', 6435),
 ('co', 2252),
 ('series', 14385),
 ('technology', 2499),
 ('headache', 2831),
 ('large', 4193),
 ('print', 3650),
 ('product', 5271),
 ('inch', 9441),
 ('classic', 5530),
 ('system', 3996),
 ('mb', 6945),
 ('professional', 1794),
 ('design', 4414),
 ('collection', 5022),
 ('throat', 2566),
 ('silver', 5361),
 ('cd', 13578),
 ('kit', 10439),
 ('health', 6076),
 ('gb', 11486),
 ('edition', 22579),
 ('cap', 1862),
 ('guide', 9125),
 ('brand', 8899),
 ('japanese', 1964),
 ('cried', 3409),
 ('ap', 1891),
 ('screen', 5161),
 ('cell', 3775),
 ('sports', 4703),
 ('studies', 2349),
 ('kindle', 9346),
 ('memory', 9868),
 ('pro', 4083),
 ('volume', 4172),
 ('toy', 5171),
 ('screw', 2974),
 ('size', 6600),
 ('mah', 5945),
 ('oz', 6958),
 ('dvd', 14668),
 ('mobile', 2899),
 ('pack', 31995),
 ('kitchen', 4877),
 ('mm', 5315),
 ('pc', 8871),
 ('battery', 10358),
 ('fits', 2882),
 ('metal', 1687),
 ('ca', 1672),
 ('sad', 24209),
 ('poster', 26372),
 ('black', 32352),
 ('hurts', 56

In [43]:
pos_80_90 = [(word, dict_count_pos[word]) for word in both if ((100*dict_count_pos[word]/dict_both[word]>80) & (100*dict_count_pos[word]/dict_both[word]<=90))]
neg_80_90 = [(word, dict_count_neg[word]) for word in both if ((100*dict_count_neg[word]/dict_both[word]>80) & (100*dict_count_neg[word]/dict_both[word]<=90))]

In [44]:
pos_80_90

[('hehehe', 1303),
 ('pussy', 1322),
 ('glad', 8093),
 ('goodnight', 4806),
 ('happy', 31629),
 ('xoxo', 2003),
 ('lovely', 5848),
 ('retweet', 5151),
 ('excited', 8302),
 ('yay', 5265),
 ('directioners', 1553),
 ('following', 13919),
 ('hey', 22355),
 ('goood', 1553),
 ('smile', 10834),
 ('proud', 5693),
 ('followback', 1326),
 ('wonderful', 2835),
 ('spread', 1138),
 ('vote', 2328),
 ('shout', 3782),
 ('awesome', 10923),
 ('welcome', 7195),
 ('follower', 2275),
 ('hehe', 6099),
 ('kissing', 1254),
 ('birthday', 19452),
 ('enjoyed', 1279),
 ('appreciate', 2498)]

In [45]:
neg_80_90

[('china', 919),
 ('accident', 710),
 ('sadly', 1557),
 ('tape', 1472),
 ('broke', 3243),
 ('gr', 1111),
 ('badly', 1321),
 ('worse', 2359),
 ('heavy', 1666),
 ('bags', 1990),
 ('feet', 4140),
 ('blackberry', 1206),
 ('awful', 1335),
 ('foods', 786),
 ('however', 1621),
 ('science', 2865),
 ('toe', 713),
 ('died', 3437),
 ('lg', 1256),
 ('press', 3251),
 ('net', 716),
 ('multiple', 982),
 ('flash', 1680),
 ('olive', 656),
 ('killing', 2164),
 ('g', 8864),
 ('map', 1167),
 ('oil', 1656),
 ('lifetime', 1209),
 ('sales', 951),
 ('sore', 2314),
 ('ash', 1693),
 ('shoe', 1051),
 ('calendar', 662),
 ('mountain', 897),
 ('dragon', 760),
 ('information', 1604),
 ('web', 1608),
 ('case', 12919),
 ('research', 1350),
 ('camera', 5941),
 ('pocket', 1929),
 ('current', 1385),
 ('value', 1339),
 ('mouse', 1178),
 ('windows', 1179),
 ('entertainment', 784),
 ('foot', 1774),
 ('parts', 1798),
 ('written', 2812),
 ('broken', 1846),
 ('style', 3925),
 ('smooth', 2239),
 ('limited', 1336),
 ('release', 

In [46]:
pos_70_80 = [(word, dict_count_pos[word]) for word in both if ((100*dict_count_pos[word]/dict_both[word]>70) & (100*dict_count_pos[word]/dict_both[word]<80))]
neg_70_80 = [(word, dict_count_neg[word]) for word in both if ((100*dict_count_neg[word]/dict_both[word]>70) & (100*dict_count_neg[word]/dict_both[word]<=80))]

In [47]:
pos_70_80

[('bday', 3439),
 ('unless', 1491),
 ('yup', 1783),
 ('checking', 733),
 ('luck', 6047),
 ('ahah', 884),
 ('surprise', 1183),
 ('aye', 957),
 ('good', 72422),
 ('check', 7301),
 ('#thoughtsduringschool', 1447),
 ('beautiful', 14209),
 ('trend', 1845),
 ('hahaa', 843),
 ('hah', 830),
 ('sharing', 936),
 ('marry', 1123),
 ('sure', 13298),
 ('alll', 2412),
 ('lls', 924),
 ('interested', 749),
 ('cool', 10291),
 ('kay', 720),
 ('laugh', 4161),
 ('ya', 10385),
 ('hi', 11714),
 ('selena', 1203),
 ('lemme', 752),
 ('tune', 666),
 ('ha', 5376),
 ('agree', 2373),
 ('hello', 5281),
 ('anytime', 1481),
 ('amazing', 15384),
 ('#np', 920),
 ('loving', 2546),
 ('yesss', 1856),
 ('singer', 1307),
 ('followers', 9231),
 ('surf', 706),
 ('hoes', 1022),
 ('girlfriend', 3290),
 ('btw', 4071),
 ('aha', 2861),
 ('#teamfollowback', 1620),
 ('chilling', 747),
 ('lets', 9335),
 ('niggas', 1004),
 ('fave', 729),
 ('jk', 2297),
 ('relax', 768),
 ('trending', 2262),
 ('funny', 7606),
 ('freak', 782),
 ('yah', 68

In [51]:
positive = pos_70_80 + pos_90
negative = neg_70_80 + neg_90

In [52]:
positive

[('bday', 3439),
 ('unless', 1491),
 ('yup', 1783),
 ('checking', 733),
 ('luck', 6047),
 ('ahah', 884),
 ('surprise', 1183),
 ('aye', 957),
 ('good', 72422),
 ('check', 7301),
 ('#thoughtsduringschool', 1447),
 ('beautiful', 14209),
 ('trend', 1845),
 ('hahaa', 843),
 ('hah', 830),
 ('sharing', 936),
 ('marry', 1123),
 ('sure', 13298),
 ('alll', 2412),
 ('lls', 924),
 ('interested', 749),
 ('cool', 10291),
 ('kay', 720),
 ('laugh', 4161),
 ('ya', 10385),
 ('hi', 11714),
 ('selena', 1203),
 ('lemme', 752),
 ('tune', 666),
 ('ha', 5376),
 ('agree', 2373),
 ('hello', 5281),
 ('anytime', 1481),
 ('amazing', 15384),
 ('#np', 920),
 ('loving', 2546),
 ('yesss', 1856),
 ('singer', 1307),
 ('followers', 9231),
 ('surf', 706),
 ('hoes', 1022),
 ('girlfriend', 3290),
 ('btw', 4071),
 ('aha', 2861),
 ('#teamfollowback', 1620),
 ('chilling', 747),
 ('lets', 9335),
 ('niggas', 1004),
 ('fave', 729),
 ('jk', 2297),
 ('relax', 768),
 ('trending', 2262),
 ('funny', 7606),
 ('freak', 782),
 ('yah', 68

In [53]:
positive = [i[0] for i in positive]
negative = [i[0] for i in negative]


In [54]:
positive

['bday',
 'unless',
 'yup',
 'checking',
 'luck',
 'ahah',
 'surprise',
 'aye',
 'good',
 'check',
 '#thoughtsduringschool',
 'beautiful',
 'trend',
 'hahaa',
 'hah',
 'sharing',
 'marry',
 'sure',
 'alll',
 'lls',
 'interested',
 'cool',
 'kay',
 'laugh',
 'ya',
 'hi',
 'selena',
 'lemme',
 'tune',
 'ha',
 'agree',
 'hello',
 'anytime',
 'amazing',
 '#np',
 'loving',
 'yesss',
 'singer',
 'followers',
 'surf',
 'hoes',
 'girlfriend',
 'btw',
 'aha',
 '#teamfollowback',
 'chilling',
 'lets',
 'niggas',
 'fave',
 'jk',
 'relax',
 'trending',
 'funny',
 'freak',
 'yah',
 'weed',
 'alright',
 'ladies',
 'follow',
 'follows',
 'liam',
 'yours',
 'interesting',
 'loves',
 'nice',
 'sometime',
 'mmm',
 'haha',
 'sweet',
 'worry',
 'youu',
 'obviously',
 'youre',
 'gorgeous',
 'ahaha',
 'fantastic',
 'hows',
 'smell',
 'mention',
 'fav',
 'cutest',
 'finally',
 'hun',
 'sounds',
 'bitches',
 'cute',
 'avi',
 'followed',
 'course',
 'conversations',
 'ooh',
 'yeahh',
 'hahaha',
 'umm',
 'info'

In [58]:
dic = {}
for el in positive:
    dic[el] = 'happy'
    
for el in negative:
    dic[el] = 'sad'

In [59]:
#pos_df.tweet_as_list = pos_df.tweet_as_list.apply(lambda tweet_list: [dic(word) if word in dic.keys else word for word i])

pos_df.tweets = pos_df.tweets.apply(lambda tweet: " ".join([dic[word] if word in dic.keys() else word for word in tweet.split(" ")]))
                                    
neg_df.tweets = neg_df.tweets.apply(lambda tweet: " ".join([dic[word] if word in dic.keys() else word for word in tweet.split(" ")]))
                                    
test_df.tweets = test_df.tweets.apply(lambda tweet: " ".join([dic[word] if word in dic.keys() else word for word in tweet.split(" ")]))
                                    
                                    

In [61]:
np.savetxt("Datasets/twitter-datasets/train_pos_full_cleaned.txt", pos_df.values, fmt='%s')
np.savetxt("Datasets/twitter-datasets/train_neg_full_cleaned.txt", neg_df.values, fmt='%s')

In [62]:
pos_df.to_csv("Datasets/twitter-datasets/train_pos_full_cleaned.csv")
neg_df.to_csv("Datasets/twitter-datasets/train_neg_full_cleaned.csv")
test_df.to_csv("Datasets/twitter-datasets/test_data_cleaned.csv")