In [7]:
# Import pandas, load the tweet CSV into `train`, and print a preview of the first rows.
import pandas as pd

train = pd.read_csv('georgefloyd-tweets.csv')
# A single print call; the newline separator reproduces the output of two consecutive prints.
print('\n\nDATA\n\n', train.head(), sep='\n')




DATA


                                                Text
0  Do you feel used by God? \nHebrews 13:20 Now m...
1                            https://t.co/9xu5kNtKuS
2  Perhaps God loves a good story with a really d...
3  The Red Dragon \n Revelation 12:3 And another ...
4  Scott Morrison: Increase the Newstart payment ...


In [8]:
# Lower-case every tweet. Splitting on whitespace and re-joining with single
# spaces also collapses newlines and repeated blanks inside each tweet.
train['Text'] = train['Text'].apply(
    lambda tweet: " ".join(map(str.lower, tweet.split()))
)
train['Text'].head()

0    do you feel used by god? hebrews 13:20 now may...
1                              https://t.co/9xu5kntkus
2    perhaps god loves a good story with a really d...
3    the red dragon revelation 12:3 and another sig...
4    scott morrison: increase the newstart payment ...
Name: Text, dtype: object

In [9]:
# Removing the punctuation from the text.
#
# The pattern is a regular-expression character class (anything that is neither
# a word character nor whitespace), so it is written as a raw string and
# regex=True is passed explicitly: recent pandas releases treat the pattern as a
# literal string unless regex=True is given, which would leave the text unchanged.
train['Text'] = train['Text'].str.replace(r'[^\w\s]', '', regex=True)
train['Text'].head()

0    do you feel used by god hebrews 1320 now may t...
1                                   httpstco9xu5kntkus
2    perhaps god loves a good story with a really d...
3    the red dragon revelation 123 and another sign...
4    scott morrison increase the newstart payment b...
Name: Text, dtype: object

In [10]:
# Removal of the stop words.
from nltk.corpus import stopwords

# `stop` stays a list, exactly as before, in case other cells use it.
stop = stopwords.words('english')
# Each token is tested for membership; a set makes that lookup constant-time
# instead of a linear scan of the stop-word list per token.
stop_lookup = set(stop)
# NOTE(review): punctuation is stripped from the tweets in an earlier cell, so
# contractions appear as e.g. "doesnt"; presumably the apostrophe forms in the
# stop-word list no longer match them — confirm whether that is intended.
train['Text'] = train['Text'].apply(
    lambda tweet: " ".join(word for word in tweet.split() if word not in stop_lookup)
)
train['Text'].head()

0    feel used god hebrews 1320 may god peace broug...
1                                   httpstco9xu5kntkus
2    perhaps god loves good story really dramatic c...
3    red dragon revelation 123 another sign appeare...
4    scott morrison increase newstart payment 75 pe...
Name: Text, dtype: object

In [11]:
# Removal of the common words, step 1: count every token across all tweets and
# keep the ten most frequent ones (value_counts sorts by descending count).
freq = (
    pd.Series(' '.join(train['Text']).split())
    .value_counts()
    .head(10)
)
freq

hi             29
swommers       22
free           22
via            19
swom           16
httpswomcom    15
get            12
site           10
new             9
traffic         8
dtype: int64

In [12]:
# Step 2: keep only the words themselves (the index of the counts Series) and
# drop every occurrence of them from each tweet.
freq = freq.index.tolist()
train['Text'] = train['Text'].apply(
    lambda tweet: " ".join(token for token in tweet.split() if token not in freq)
)
train['Text'].head()

0    feel used god hebrews 1320 may god peace broug...
1                                   httpstco9xu5kntkus
2    perhaps god loves good story really dramatic c...
3    red dragon revelation 123 another sign appeare...
4    scott morrison increase newstart payment 75 pe...
Name: Text, dtype: object

In [13]:
# Removal of rare words, step 1: recount the tokens and take the last ten
# entries of the descending-frequency table, i.e. ten of the least frequent words.
freq = (
    pd.Series(' '.join(train['Text']).split())
    .value_counts()
    .tail(10)
)
freq

5                                                1
blades                                           1
here2helpservic                                  1
personal                                         1
httptinyurlcomgpf1234                            1
httpswomcomkra1d                                 1
updated                                          1
httpwwwwndcompageid125439                        1
httpwwwlucypringlecoukphotos2009uk2009bmshtml    1
whoohoo                                          1
dtype: int64

In [14]:
# Step 2: turn the rare-word counts into a plain list of words and filter
# those words out of every tweet.
freq = [word for word in freq.index]
train['Text'] = train['Text'].apply(
    lambda tweet: " ".join(filter(lambda token: token not in freq, tweet.split()))
)
train['Text'].head()

0    feel used god hebrews 1320 may god peace broug...
1                                   httpstco9xu5kntkus
2    perhaps god loves good story really dramatic c...
3    red dragon revelation 123 another sign appeare...
4    scott morrison increase newstart payment 75 pe...
Name: Text, dtype: object

In [15]:
# Spelling correction, previewed on the first five tweets only.
# The corrected text is displayed but not written back to `train`.
from textblob import TextBlob

train['Text'].head(5).apply(lambda tweet: str(TextBlob(tweet).correct()))

0    feel used god hebrew 1320 may god peace brough...
1                                   httpstco9xu5kntkus
2    perhaps god loves good story really dramatic c...
3    red dragon revelation 123 another sign appeare...
4    scott morrison increase newstart payment 75 pe...
Name: Text, dtype: object

In [16]:
# Tokenization: show the word list TextBlob extracts from the tweet labelled 1.
TextBlob(train.loc[1, 'Text']).words

WordList(['httpstco9xu5kntkus'])

In [17]:
# Stemming, previewed on the first five tweets (the result is not stored).
from nltk.stem import PorterStemmer

st = PorterStemmer()
train['Text'].head(5).apply(
    lambda tweet: " ".join(st.stem(token) for token in tweet.split())
)

0    feel use god hebrew 1320 may god peac brought ...
1                                    httpstco9xu5kntku
2    perhap god love good stori realli dramat clima...
3    red dragon revel 123 anoth sign appear heaven ...
4    scott morrison increas newstart payment 75 per...
Name: Text, dtype: object

In [18]:
# Lemmatization: replace every token of every tweet with its lemma and store
# the result back in the Text column, then preview the first twenty rows.
from textblob import Word

train['Text'] = train['Text'].apply(
    lambda tweet: " ".join(Word(token).lemmatize() for token in tweet.split())
)
train['Text'].head(20)

0     feel used god hebrew 1320 may god peace brough...
1                                    httpstco9xu5kntkus
2     perhaps god love good story really dramatic cl...
3     red dragon revelation 123 another sign appeare...
4     scott morrison increase newstart payment 75 pe...
5          denialisahoax genesismining seen payouts yet
6                   genesismining classic scam response
7                                          zcash zcrash
8     poloniex im getting there problem website secu...
9                            zayahas httpstcojn5ftopu93
10    rt gitcoingg 100 mill 600 giveaway 100 winner ...
11    moon bitcoin bitcoin faucet decide claim https...
12    mininghere thank giving burst mine burst minin...
13    joined linkedin created professional profile j...
14                there rumour there prison break hakea
15                                start httptcorwob6k46
16    success doesnt show didnt work today building ...
17    wind power without big pic discovery news 

In [19]:
# Sentiment of a full example phrase; the result carries a polarity and a
# subjectivity value.
from textblob import TextBlob

example_blob = TextBlob("not a very great calculation")
example_blob.sentiment

Sentiment(polarity=-0.3076923076923077, subjectivity=0.5769230769230769)

In [20]:
# POSITIVITY: sentiment of a single positive word.
positive_blob = TextBlob("great")
positive_blob.sentiment

Sentiment(polarity=0.8, subjectivity=0.75)

In [21]:
# NEGATION: the same word preceded by "not".
negated_blob = TextBlob("not great")
negated_blob.sentiment

Sentiment(polarity=-0.4, subjectivity=0.75)

In [22]:
# MODIFIER WORDS: the same word preceded by the intensifier "very".
intensified_blob = TextBlob("very great")
intensified_blob.sentiment

Sentiment(polarity=1.0, subjectivity=0.9750000000000001)

In [23]:
# MODIFIER + NEGATION: intensifier and negation combined.
negated_intensified_blob = TextBlob("not very great")
negated_intensified_blob.sentiment

Sentiment(polarity=-0.3076923076923077, subjectivity=0.5769230769230769)

In [24]:
# MODIFIER + NEGATION + UNKNOWN WORD: as above, with an extra word ("a")
# inserted between the negation and the intensifier.
with_unknown_word_blob = TextBlob("not a very great")
with_unknown_word_blob.sentiment

Sentiment(polarity=-0.3076923076923077, subjectivity=0.5769230769230769)

In [25]:
# Preview the full sentiment result for the first twenty tweets.
from textblob import TextBlob

train['Text'].head(20).apply(lambda tweet: TextBlob(tweet).sentiment)


0                                    (-0.2, 0.4)
1                                     (0.0, 0.0)
2      (0.39166666666666666, 0.6375000000000001)
3                    (0.26666666666666666, 0.25)
4                                     (0.0, 0.0)
5                                     (0.0, 0.0)
6     (0.16666666666666666, 0.16666666666666666)
7                                     (0.0, 0.0)
8                                     (0.0, 0.0)
9                                     (0.0, 0.0)
10                                    (0.0, 0.0)
11                                    (0.0, 0.0)
12                                    (0.0, 0.0)
13                                    (0.1, 0.1)
14                                    (0.0, 0.0)
15                                    (0.0, 0.0)
16                                    (0.3, 0.0)
17                                    (0.0, 0.1)
18                                    (0.4, 0.5)
19                    (0.0, 0.35714285714285715)
Name: Text, dtype: o

In [26]:
# Store each tweet's polarity (the first field of TextBlob's sentiment result)
# in a new `sentiment` column, then show text and score side by side.
train['sentiment'] = train['Text'].apply(
    lambda tweet: TextBlob(tweet).sentiment.polarity
)
train.loc[:, ['Text', 'sentiment']].head(200)

Unnamed: 0,Text,sentiment
0,feel used god hebrew 1320 may god peace brough...,-0.200000
1,httpstco9xu5kntkus,0.000000
2,perhaps god love good story really dramatic cl...,0.391667
3,red dragon revelation 123 another sign appeare...,0.266667
4,scott morrison increase newstart payment 75 pe...,0.000000
...,...,...
125,ok set teprofits 25 letter series auto responder,0.500000
126,ha set series letter put squeeze pg splash lis...,0.000000
127,learning auto responder trafficwave signed aff...,0.000000
128,big88twin yeah raining use desert,0.000000
