In [2]:
import pandas as pd
import numpy as np
import re
import string
import nltk
import matplotlib.pyplot as plt

Read Cleaned Poverty Tweets

In [32]:
train = pd.read_csv("CleanedPovertyTweets_v1.csv")

train.head()

Unnamed: 0.1,Unnamed: 0,Date,Tweets,uncleaned_len,cleaned_len
0,1,2018-09-30 19:39,Growing Food and Faith in Impoverished Brazil...,247,89
1,2,2018-09-30 19:37,In yrs Vietnam from poverty to an emerging ma...,429,220
2,3,2018-09-30 19:37,The Politics of Poverty Officials grapple wit...,309,96
3,4,2018-09-30 19:29,Not just talking doing kenyalendahand kenya n...,292,166
4,5,2018-09-30 19:24,Raiders Poverty,25,17


Convert all tweets to lower case

In [33]:
train['Tweets'] = train['Tweets'].apply(lambda x: x.lower())
train.Tweets.head(10)

0     growing food and faith in impoverished brazil...
1     in yrs vietnam from poverty to an emerging ma...
2     the politics of poverty officials grapple wit...
3     not just talking doing kenyalendahand kenya n...
4                                     raiders poverty 
5     for all those who believe the world has gone ...
6     even breathing is a risk in one of orlando s ...
7     maybe it s time to rethink the idea that we k...
8     this is why i volunteer with big bros big sis...
9     americas children in brief key national indic...
Name: Tweets, dtype: object

Stop Words Removal

In [7]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/Nugrahan/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [34]:
from nltk.corpus import stopwords
stop = stopwords.words('english')
train['Tweets'] = train['Tweets'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))

In [35]:
train.head()

Unnamed: 0.1,Unnamed: 0,Date,Tweets,uncleaned_len,cleaned_len
0,1,2018-09-30 19:39,growing food faith impoverished brazil ben dem...,247,89
1,2,2018-09-30 19:37,yrs vietnam poverty emerging market ans oi moi...,429,220
2,3,2018-09-30 19:37,politics poverty officials grapple works doesn...,309,96
3,4,2018-09-30 19:29,talking kenyalendahand kenya nairobi kibera ja...,292,166
4,5,2018-09-30 19:24,raiders poverty,25,17


Spell Correction

In [36]:
from textblob import TextBlob

In [37]:
train.Tweets.apply(lambda x: TextBlob(x).correct);

Tokenize

In [15]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/Nugrahan/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [38]:
from nltk.tokenize import sent_tokenize, word_tokenize
train['Tokensize']= train.Tweets.apply(lambda x: word_tokenize(x))

In [39]:
train.head(10)

Unnamed: 0.1,Unnamed: 0,Date,Tweets,uncleaned_len,cleaned_len,Tokensize
0,1,2018-09-30 19:39,growing food faith impoverished brazil ben dem...,247,89,"[growing, food, faith, impoverished, brazil, b..."
1,2,2018-09-30 19:37,yrs vietnam poverty emerging market ans oi moi...,429,220,"[yrs, vietnam, poverty, emerging, market, ans,..."
2,3,2018-09-30 19:37,politics poverty officials grapple works doesn...,309,96,"[politics, poverty, officials, grapple, works,..."
3,4,2018-09-30 19:29,talking kenyalendahand kenya nairobi kibera ja...,292,166,"[talking, kenyalendahand, kenya, nairobi, kibe..."
4,5,2018-09-30 19:24,raiders poverty,25,17,"[raiders, poverty]"
5,6,2018-09-30 19:23,believe world gone bonkers need evolve better ...,234,212,"[believe, world, gone, bonkers, need, evolve, ..."
6,7,2018-09-30 19:13,even breathing risk one orlando poorest neighb...,207,151,"[even, breathing, risk, one, orlando, poorest,..."
7,8,2018-09-30 19:09,maybe time rethink idea know better people nee...,192,125,"[maybe, time, rethink, idea, know, better, peo..."
8,9,2018-09-30 19:00,volunteer big bros big sisters many american k...,191,166,"[volunteer, big, bros, big, sisters, many, ame..."
9,10,2018-09-30 19:00,americas children brief key national indicator...,279,211,"[americas, children, brief, key, national, ind..."


Pos Tagging

In [19]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/Nugrahan/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [40]:
from nltk import pos_tag
train['POS_TAG'] = train['Tokensize'].apply(lambda x: pos_tag(x))

In [41]:
train[['Tweets', 'POS_TAG']].head(10)

Unnamed: 0,Tweets,POS_TAG
0,growing food faith impoverished brazil ben dem...,"[(growing, VBG), (food, NN), (faith, NN), (imp..."
1,yrs vietnam poverty emerging market ans oi moi...,"[(yrs, NN), (vietnam, NNP), (poverty, NN), (em..."
2,politics poverty officials grapple works doesn...,"[(politics, NNS), (poverty, NN), (officials, N..."
3,talking kenyalendahand kenya nairobi kibera ja...,"[(talking, VBG), (kenyalendahand, NN), (kenya,..."
4,raiders poverty,"[(raiders, NNS), (poverty, VBP)]"
5,believe world gone bonkers need evolve better ...,"[(believe, JJ), (world, NN), (gone, VBN), (bon..."
6,even breathing risk one orlando poorest neighb...,"[(even, RB), (breathing, VBG), (risk, NN), (on..."
7,maybe time rethink idea know better people nee...,"[(maybe, RB), (time, NN), (rethink, VB), (idea..."
8,volunteer big bros big sisters many american k...,"[(volunteer, NN), (big, JJ), (bros, NN), (big,..."
9,americas children brief key national indicator...,"[(americas, JJ), (children, NNS), (brief, JJ),..."


Lemmetization

In [23]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/Nugrahan/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [42]:
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [43]:
#https://github.com/KT12/tag-lemmatize/blob/master/tag-lemmatize.py

#penn to wordnet, takes care of only 5 POS, rest converted to noun 

part = {
    'N' : 'n',
    'V' : 'v',
    'J' : 'a',
    'S' : 's',
    'R' : 'r'
}

def convert_tag(penn_tag):
    '''
    convert_tag() accepts the **first letter** of a Penn part-of-speech tag,
    then uses a dict lookup to convert it to the appropriate WordNet tag.
    '''
    if penn_tag in part.keys():
        return part[penn_tag]
    else:
        # other parts of speech will be tagged as nouns
        return 'n'
    

def tag_and_lem():
    '''
    tag_and_lem() accepts a string, tokenizes, tags, converts tags,
    lemmatizes, and returns a string
    '''
    ""
    

    
    lemm = []
    i = 0
    #sent = pos_tag(word_tokenize(element)) # must tag in context
    for tweet in train.POS_TAG:
        lemm = []
        for words in tweet:
            text = words[0]
            tag = words[1]
            lemm.append("".join(lemmatiser.lemmatize(text, pos = convert_tag(tag))))
        train.at[i, "Lemmas"] = lemm
        i+= 1


from nltk.stem import WordNetLemmatizer
train.insert(train.shape[1], 'Lemmas', '0')   #use this to create a default column 
lemmatiser = WordNetLemmatizer()
tag_and_lem()


Display Tweets after NLP

In [44]:
train.head()

Unnamed: 0.1,Unnamed: 0,Date,Tweets,uncleaned_len,cleaned_len,Tokensize,POS_TAG,Lemmas
0,1,2018-09-30 19:39,growing food faith impoverished brazil ben dem...,247,89,"[growing, food, faith, impoverished, brazil, b...","[(growing, VBG), (food, NN), (faith, NN), (imp...","[growing, food, faith, impoverished, brazil, b..."
1,2,2018-09-30 19:37,yrs vietnam poverty emerging market ans oi moi...,429,220,"[yrs, vietnam, poverty, emerging, market, ans,...","[(yrs, NN), (vietnam, NNP), (poverty, NN), (em...","[yr, vietnam, poverty, emerging, market, an, o..."
2,3,2018-09-30 19:37,politics poverty officials grapple works doesn...,309,96,"[politics, poverty, officials, grapple, works,...","[(politics, NNS), (poverty, NN), (officials, N...","[politics, poverty, official, grapple, work, d..."
3,4,2018-09-30 19:29,talking kenyalendahand kenya nairobi kibera ja...,292,166,"[talking, kenyalendahand, kenya, nairobi, kibe...","[(talking, VBG), (kenyalendahand, NN), (kenya,...","[talking, kenyalendahand, kenya, nairobi, kibe..."
4,5,2018-09-30 19:24,raiders poverty,25,17,"[raiders, poverty]","[(raiders, NNS), (poverty, VBP)]","[raider, poverty]"


Sentiment Analysis

In [46]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [47]:
analyser = SentimentIntensityAnalyzer()

In [48]:
#sentimental analysis based on compund score of VADER
sentiment = []
i= 0 
for tweet in train.Tweets:
    vs = analyser.polarity_scores(tweet)
    if vs['compound'] >= 0.5:
        sentiment.append(1)
    elif vs['compound'] <= -0.5:
        sentiment.append(-1)
    elif vs['compound'] > - 0.5 and vs['compound'] < 0.5:
        sentiment.append(0)

In [49]:
newtrain = pd.DataFrame(columns = ['Date', 'Review', 'Sentiment'])

In [50]:
#create a new dataframe for storing sentiments values
newtrain['Date'] = train.Date
newtrain['Review'] = train.Tweets
newtrain['Sentiment'] = sentiment

In [51]:
#dataset after sentimental analysis
newtrain.head(10)

Unnamed: 0,Date,Review,Sentiment
0,2018-09-30 19:39,growing food faith impoverished brazil ben dem...,0
1,2018-09-30 19:37,yrs vietnam poverty emerging market ans oi moi...,-1
2,2018-09-30 19:37,politics poverty officials grapple works doesn...,-1
3,2018-09-30 19:29,talking kenyalendahand kenya nairobi kibera ja...,-1
4,2018-09-30 19:24,raiders poverty,-1
5,2018-09-30 19:23,believe world gone bonkers need evolve better ...,-1
6,2018-09-30 19:13,even breathing risk one orlando poorest neighb...,-1
7,2018-09-30 19:09,maybe time rethink idea know better people nee...,0
8,2018-09-30 19:00,volunteer big bros big sisters many american k...,-1
9,2018-09-30 19:00,americas children brief key national indicator...,1


Convert time stamp column

In [52]:
import datetime
from dateutil.parser import parse

newtrain['Date'] = pd.to_datetime(newtrain.Date, dayfirst=True)  #Y/M/D default

In [53]:
newtrain.to_csv("Sentiment_Twitter_Poverty.csv")