# Activity 4: Extracting data from Twitter

Extract 100 tweets containing the hashtag '#WorldWaterDay' using the tweepy library, then do the following: <br>
i) Keep only the tweets that are written in English <br>
ii) Calculate a sentiment score for each of them using the textblob library <br>
iii) Visualize these tweets as a word cloud containing at most 100 words <br>

In [None]:
#!pip install tweepy

Log in to Twitter <br>
Go to https://developer.twitter.com/en/apps <br>
Create an app <br>
Once the app is created, go to the 'Keys and tokens' tab and copy the consumer_key, consumer_secret, access_token and access_token_secret from there <br>

In [None]:
import os

# Twitter API credentials. Prefer supplying them through environment variables
# so real secrets are never saved in (or committed with) the notebook. The
# placeholder strings are only used when the corresponding variable is not set.
consumer_key = os.environ.get('TWITTER_CONSUMER_KEY', 'your consumer key here')
consumer_secret = os.environ.get('TWITTER_CONSUMER_SECRET', 'your consumer secret key here')
access_token = os.environ.get('TWITTER_ACCESS_TOKEN', 'your access token here')
access_token_secret = os.environ.get('TWITTER_ACCESS_TOKEN_SECRET', 'your access token secret here')

In [None]:
import pandas as pd
import numpy as np
import pickle
import json
from pprint import pprint
from textblob import TextBlob
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt

In [None]:
import tweepy

# Create the tweepy OAuth handler from the app's consumer key/secret and
# attach the access token/secret to it.
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)

# Authenticated API client used by the following cells to query Twitter.
api = tweepy.API(auth)

In [None]:
# Number of tweets to collect for the activity.
MAX_TWEETS = 100

# tweepy 4.x renamed API.search to API.search_tweets; use whichever one this
# installation provides so the cell runs on both old and new releases.
search_tweets = getattr(api, 'search_tweets', None) or api.search

# `count` is the results-per-page parameter of the standard search endpoint
# (the previous `rpp` argument is not one of its parameters).
# Cursor.items(limit) stops after `limit` tweets, which replaces the manual
# counter and `break`.
tweet_list = list(
    tweepy.Cursor(search_tweets, q='#WorldWaterDay', count=MAX_TWEETS).items(MAX_TWEETS)
)

In [None]:
# Number of tweets that were collected.
len(tweet_list)

In [None]:
# Show the first collected tweet object.
tweet_list[0]

In [None]:
# Take the first tweet, serialise its `_json` attribute to a JSON string, then
# parse that string back and pretty-print it to inspect the available fields.
status = tweet_list[0]
json_str = json.dumps(status._json)
pprint(json.loads(json_str))

In [None]:
# Read the 'text' field from the JSON string of the first tweet.
json.loads(json_str)['text']

In [None]:
# Collect the text of every downloaded tweet. `status._json` is already the
# parsed response data, so the 'text' field is read from it directly instead of
# serialising each tweet to a JSON string and parsing it back.
tweet_text = [status._json['text'] for status in tweet_list]

In [None]:
# De-duplicate while keeping first-seen order: dict keys are unique and
# insertion-ordered, whereas list(set(...)) returns the texts in an arbitrary
# order that can differ between kernel sessions.
unique_tweet_text = list(dict.fromkeys(tweet_text))

## Creating a dataframe consisting of the tweet texts

In [None]:
# One-column DataFrame holding the de-duplicated tweet texts.
tweet_text_df = pd.DataFrame({'tweet_text' : unique_tweet_text})
# Preview the first rows.
tweet_text_df.head()

In [None]:
# Language tag that Twitter attaches to each tweet, keyed by tweet text. Used
# as a fallback when TextBlob cannot detect the language.
twitter_lang_by_text = {
    status._json['text']: status._json.get('lang') for status in tweet_list
}


def detect_tweet_language(text):
    """Return a language code for `text` (for example 'en').

    TextBlob's `detect_language()` is tried first. It calls a remote
    translation service and has been deprecated and then removed in newer
    TextBlob releases, so any failure (missing method, network error, text too
    short) falls back to the `lang` value from the tweet's own metadata, or
    'und' (undetermined) when there is none.
    """
    text = str(text)
    try:
        # The text is passed as-is: the old `'u' + str(x)` prepended a literal
        # "u" to every tweet instead of marking it as unicode.
        return str(TextBlob(text).detect_language())
    except Exception:
        # Deliberately broad: the failure may be an AttributeError (method
        # removed), a network error or a TextBlob translator error.
        return str(twitter_lang_by_text.get(text) or 'und')


tweet_text_df['language_detected'] = tweet_text_df['tweet_text'].apply(detect_tweet_language)

In [None]:
# Preview the first 20 rows, now including the detected language column.
tweet_text_df.head(20)

## Non-English tweets

In [None]:
# Rows whose detected language is anything other than 'en'.
tweet_text_df[tweet_text_df['language_detected']!='en']

## Keep only the English tweets

In [None]:
# Keep only the rows detected as English. `.copy()` makes this an independent
# DataFrame, so adding columns to it later does not trigger pandas'
# SettingWithCopyWarning or write to a temporary slice of `tweet_text_df`.
tweet_text_df_eng = tweet_text_df[tweet_text_df['language_detected']=='en'].copy()
tweet_text_df_eng.shape

## Extract sentiment scores of the English tweets using TextBlob

In [None]:
# TextBlob polarity ranges from -1.0 (negative) to 1.0 (positive). It is kept
# as a float (not converted to a string) so the scores can be sorted, averaged
# or plotted. The tweet text is passed unchanged: the old `'u' + str(x)`
# prepended a literal "u" to every tweet. `.assign` returns a new DataFrame,
# which avoids SettingWithCopyWarning when `tweet_text_df_eng` is a filtered
# slice of another frame.
tweet_text_df_eng = tweet_text_df_eng.assign(
    sentiment_score=tweet_text_df_eng['tweet_text'].apply(
        lambda text: TextBlob(str(text)).sentiment.polarity
    )
)
# None means "do not truncate column contents"; the previous value -1 has been
# deprecated since pandas 1.0 and is no longer accepted by recent releases.
pd.set_option('display.max_colwidth', None)
tweet_text_df_eng[['tweet_text', 'sentiment_score']].head(20)

## Creating a word cloud

In [None]:
# Extra tokens that are Twitter/URL noise rather than content words.
other_stopwords_to_remove = ['https', 'amp','co', 'rt']
# Build a separate set instead of rebinding the imported STOPWORDS name, so
# re-running this cell always starts from the library's original stop words.
stopwords = set(STOPWORDS).union(other_stopwords_to_remove)

text=tweet_text_df_eng["tweet_text"]
# Join the actual tweet texts into one string. str(text) would instead give
# the printed representation of the Series (index labels, truncated rows and
# the trailing "Name: ..., dtype: ..." line), which pollutes the word counts.
corpus = ' '.join(text.astype(str))

wordcloud = WordCloud(width = 800, height = 800,
                background_color ='white',
                max_words=100,
                stopwords = stopwords,
                min_font_size = 10).generate(corpus)

# Explicit figure/axes so the plot does not depend on pyplot's implicit state.
fig, ax = plt.subplots(figsize=(8, 8))
ax.imshow(wordcloud, interpolation='bilinear')
ax.set_title('#WorldWaterDay tweets')
ax.axis("off")
plt.show()