# Tweet Word Cloud

Fetches the tweets of a given user, analyses their most frequent words, and produces a word cloud.

### Requirements:

Python 3 (you can use Anaconda to get this - see https://www.anaconda.com/download/) 

- pandas
- nltk
- wordcloud
- tweepy
- matplotlib

You can `pip install` all of these. In addition, nltk needs its "popular" data collection, which you can fetch by running `nltk.download('popular')` once before running the notebook.

You will need to create an application through https://apps.twitter.com/ to get the Twitter API credentials.

---

Edit the cell below, then run all the cells.


In [None]:
#CHANGE THESE
# User-editable settings; the cells below read these module-level names.

# Screen name of the account whose timeline is fetched (passed to get_all_tweets).
twitter_handle = 'nzsecretsanta'

#Twitter API credentials
# All four values are used by get_all_tweets to build the tweepy OAuth handler,
# so each one must be filled in before running the rest of the notebook.
consumer_key = ""
consumer_secret = ""
access_key = ""
access_secret = ""

In [None]:
#NO TOUCHIE

#!/usr/bin/env python
# encoding: utf-8
% matplotlib inline

import tweepy #https://github.com/tweepy/tweepy

import re
import string
import nltk
import matplotlib.pyplot as plt
import pandas as pd

from time import sleep
from nltk import word_tokenize
from nltk.corpus import stopwords

from wordcloud import WordCloud


def get_all_tweets(screen_name, alltweets=None, max_id=0):
    """Fetch a user's timeline page by page and return every tweet found.

    Pages of up to 200 tweets are requested (newest first) until the API
    returns an empty page. According to the original author's note, Twitter
    only exposes roughly a user's most recent 3240 tweets through this method.

    Args:
        screen_name: Twitter handle whose timeline is fetched.
        alltweets: Optional list to append the fetched tweets to. It is
            extended in place and returned. When omitted, a fresh list is
            created for each call (a shared ``[]`` default would keep
            accumulating tweets across calls, e.g. when a cell is re-run).
        max_id: When non-zero, passed as ``max_id`` to the first request so
            paging resumes from that tweet id. ``0`` means "start from the
            most recent tweet".

    Returns:
        The list of tweet objects returned by ``api.user_timeline``.
    """
    if alltweets is None:
        alltweets = []

    #authorize twitter, initialize tweepy (once, not once per page)
    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_key, access_secret)
    api = tweepy.API(auth)

    while True:
        #200 is the maximum allowed count per request
        request = {'screen_name': screen_name, 'count': 200, 'tweet_mode': 'extended'}
        if max_id != 0:
            request['max_id'] = max_id
        new_tweets = api.user_timeline(**request)

        #an empty page means there is nothing older left to fetch
        if len(new_tweets) == 0:
            return alltweets

        alltweets.extend(new_tweets)
        #pause between pages (presumably to stay clear of API rate limits)
        sleep(2)
        #next page: everything strictly older than the oldest tweet held so far
        max_id = alltweets[-1].id - 1


def clean_tweet(tweet):
    """Strip URLs, @mentions, #hashtags, retweet text and apostrophes from a tweet.

    Args:
        tweet: The raw tweet text.

    Returns:
        The tweet text with the following removed, in this order:
        1. ``http://`` / ``https://`` URLs (only the URL itself, up to the next
           whitespace; text after the URL on the same line is kept).
        2. ``@`` mentions.
        3. ``#`` hashtags.
        4. Everything from a standalone ``RT`` token to the end of that line.
        5. Apostrophes (so "it's" becomes "its").
    """
    # \S+ stops at the first whitespace, so words following a URL survive.
    cleaned_text = re.sub(r'https?://\S+', '', tweet)  # remove urls
    cleaned_text = re.sub(r'@\w*', '', cleaned_text)  # remove the @twitter mentions
    cleaned_text = re.sub(r'#\w*', '', cleaned_text)  # remove the hashtags
    # \b...\b keeps words that merely contain "RT" (e.g. "SMART") intact.
    cleaned_text = re.sub(r'\bRT\b.*', '', cleaned_text)  # delete the retweets
    cleaned_text = cleaned_text.replace("'", '')  # drop apostrophes
    return cleaned_text


def tweet_to_tokens(tweet):
    """Turn tweet text into a list of lower-cased word tokens.

    The text is lower-cased, tokenised with nltk's ``word_tokenize`` and then
    filtered: English stop words, single punctuation characters and a few
    multi-character punctuation tokens are dropped.

    Args:
        tweet: The tweet text to tokenise.

    Returns:
        A list of the remaining tokens, in their original order.
    """
    # A set gives constant-time membership tests; the previous list was
    # scanned linearly for every token.
    stop = set(stopwords.words('english'))
    stop.update(string.punctuation)
    stop.update(['`', '\'\'', '...', '``'])
    return [token for token in word_tokenize(tweet.lower()) if token not in stop]


def make_cloud_from_text(text, max_words=30):
    """Draw two word clouds for ``text`` and show them.

    The first cloud uses WordCloud's default font sizing and is drawn on the
    current figure; the second caps the font size at 40 and is drawn on a new
    figure. Both are limited to ``max_words`` words.

    Args:
        text: Space-separated words to build the clouds from.
        max_words: Maximum number of words in each cloud.
    """
    # First cloud: default font sizing, drawn on the current figure.
    default_cloud = WordCloud(max_words=max_words)
    plt.imshow(default_cloud.generate(text), interpolation='bilinear')
    plt.axis("off")

    # Second cloud: same text with a lower max_font_size, on its own figure.
    capped_cloud = WordCloud(max_font_size=40, max_words=max_words)
    capped_image = capped_cloud.generate(text)
    plt.figure()
    plt.imshow(capped_image, interpolation="bilinear")
    plt.axis("off")

    plt.show()

In [None]:
#get all the tweets
# Fetches the timeline of twitter_handle via get_all_tweets; the resulting
# list is reused by the word-cloud cells below.
my_tweets = get_all_tweets(twitter_handle)



In [None]:
#make word clouds based on all words
list_of_words = []
for tweet in my_tweets:
    # strip urls/mentions/hashtags first, then tokenise and drop stop words
    tokens = tweet_to_tokens(clean_tweet(tweet.full_text))
    list_of_words.extend(tokens)

# make_cloud_from_text expects one string, so join the tokens with spaces
make_cloud_from_text(' '.join(list_of_words))


In [None]:
#make word clouds just for nouns

#collect the nouns of every tweet using nltk part-of-speech tagging
list_of_nouns = []
for tweet in my_tweets:
    # clean urls, mentions and hashtags off the raw text
    text = clean_tweet(tweet.full_text)

    # drop punctuation characters and lower-case what is left
    kept_chars = (ch.lower() for ch in text if ch not in string.punctuation)
    plain_text = ''.join(kept_chars)

    # pos_tag needs a token sequence and yields (word, tag) pairs
    tagged = nltk.pos_tag(nltk.word_tokenize(plain_text))

    # keep only words whose tag starts with 'N', i.e. the nouns
    noun_words = [word for word, tag in tagged if tag.startswith('N')]

    # crude plural folding so singular and plural count as one word
    # NOTE(review): this drops a trailing 's' from every noun, so "bus"
    # also becomes "bu"
    singular_words = [word[:-1] if word.endswith('s') else word for word in noun_words]

    list_of_nouns.extend(singular_words)


make_cloud_from_text(' '.join(list_of_nouns))