# Twitter Data Extraction

In [None]:
import os
import tweepy as tw
import pandas as pd
import requests

In [None]:
# Twitter API credentials are read from environment variables so that no
# secret is ever stored in the notebook. Export these before starting the
# kernel; a variable that is not set yields None.
consumer_key = os.environ.get("TWITTER_CONSUMER_KEY")
consumer_secret = os.environ.get("TWITTER_CONSUMER_SECRET")
access_token = os.environ.get("TWITTER_ACCESS_TOKEN")
access_token_secret = os.environ.get("TWITTER_ACCESS_TOKEN_SECRET")
bearer_token = os.environ.get("TWITTER_BEARER_TOKEN")

In [None]:
# Build the tweepy auth handler from the consumer key/secret pair.
auth = tw.OAuthHandler(consumer_key, consumer_secret)
# Attach the access token/secret to the same handler.
auth.set_access_token(access_token, access_token_secret)
# `api` is the handle used by get_related_tweets below.
# NOTE(review): wait_on_rate_limit=True presumably makes tweepy wait instead of
# failing when the rate limit is hit -- confirm against the installed version.
api = tw.API(auth, wait_on_rate_limit=True)
# Second client built from the bearer token; it is not referenced again in the
# visible cells of this notebook.
client = tw.Client(bearer_token)

In [None]:
import pandas as pd  
# Raise the display limits far enough that frames are shown without truncation.
for option_name, option_limit in (
    ('display.max_rows', 10000000),
    ('display.max_columns', 1000000),
):
    pd.set_option(option_name, option_limit)



def get_related_tweets(key_word, max_items=1000):
    """Search tweets matching ``key_word`` and return the English, non-retweet ones.

    Parameters
    ----------
    key_word : str
        Query string passed to the search endpoint as ``q``.
    max_items : int, optional
        Upper bound on the number of tweets pulled from the cursor before
        filtering (default 1000, the value previously hard-coded).

    Returns
    -------
    pandas.DataFrame
        Columns ``name`` (author display name), ``time`` (creation timestamp)
        and ``tweet`` (tweet text). Empty, with the same columns, when nothing
        passes the filters.
    """
    records = []
    # The module is imported as `tw`; the bare name `tweepy` is never defined.
    # `count` is the page size; the search endpoint documents a maximum of 100
    # per page, so the cursor pages through results up to `max_items`.
    cursor = tw.Cursor(api.search_tweets, q=key_word, count=100)
    for tweet in cursor.items(max_items):
        # Skip retweets (flagged or manually prefixed) and non-English tweets.
        if tweet.retweeted or 'RT @' in tweet.text:
            continue
        if tweet.lang != "en":
            continue
        records.append({
            'name': tweet.user.name,
            'time': tweet.created_at,
            'tweet': tweet.text,
        })

    # Passing `columns` keeps the schema stable even when `records` is empty.
    return pd.DataFrame(records, columns=['name', 'time', 'tweet'])

In [None]:
# Fetch tweets for the search phrase and persist them without the row index.
# NOTE(review): this writes 'sss.csv', while the cleaning section below reads
# "VMA.csv" -- confirm which file the rest of the notebook is meant to use.
df = get_related_tweets("Superbowl Halftime Show")
df.to_csv('sss.csv', index=False)

# Data Cleaning

In [None]:
import pandas as pd

# Load the raw tweets; the first CSV column becomes the index.
# NOTE(review): presumably "VMA.csv" was saved with an index column -- a file
# written with index=False would lose its first data column here. Verify.
data_df = pd.read_csv("VMA.csv",index_col=0)
data_df

In [None]:
import nltk
# Resources used by clean_text_1: the stop-word corpus (stopwords.words) and
# the 'punkt' models (presumably what word_tokenize relies on).
nltk.download('stopwords')
nltk.download('punkt')

In [None]:
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

def clean_text_1(text):
    """Normalize one tweet into a space-separated string of content words.

    Steps, in order: lowercase; drop anything in square brackets; strip ASCII
    punctuation; drop tokens containing digits; strip curly quotes and
    ellipsis characters; turn newlines into spaces; tokenize; discard English
    stop words and single-character tokens.

    Parameters
    ----------
    text : str
        Raw tweet text. Any non-string value (e.g. the NaN pandas produces for
        an empty CSV cell) is treated as an empty tweet.

    Returns
    -------
    str
        The kept tokens joined by single spaces, with no leading or trailing
        space; ``""`` when nothing survives.
    """
    # Missing cells arrive as float NaN and have no .lower(); treat as empty.
    if not isinstance(text, str):
        return ""

    text = text.lower()
    # Raw strings avoid the invalid-escape warnings of '\[' and '\w' literals.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    text = re.sub('[‘’“”…]', '', text)
    text = text.replace('\n', ' ')

    # A set gives constant-time membership tests for every token.
    stop_words = set(stopwords.words('english'))
    kept = [w for w in word_tokenize(text) if w not in stop_words and len(w) > 1]
    # join() avoids the leading space that repeated concatenation produced.
    return " ".join(kept)
    

In [None]:
# Apply clean_text_1 to every tweet and wrap the resulting Series in a DataFrame.
# Nothing is displayed here; evaluate `data_clean` in a cell to inspect it.
data_clean = pd.DataFrame(data_df.tweet.apply(clean_text_1))

# Lemmatization

In [None]:
import nltk

from nltk.tokenize import sent_tokenize, word_tokenize

from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer

In [None]:
porter = PorterStemmer()
lancaster = LancasterStemmer()

# Small illustrative word lists. No earlier cell defines them, so without
# these the comparison below fails with NameError on a fresh kernel.
verb_list = ['perform', 'performing', 'performed', 'sings', 'singing']
noun_list = ['tweets', 'artists', 'performances', 'awards']
adjec_list = ['amazing', 'better', 'happier', 'loudest']

# One row per word: the word itself, its Porter stem and its Lancaster stem.
row_format = "%-20s %-20s %-20s"
print(row_format % ("Word","Porter Stemmer","lancaster Stemmer"))
for position, word_group in enumerate((verb_list, noun_list, adjec_list)):
    # Separator between groups, but not before the first one.
    if position:
        print("--")
    for word in word_group:
        print(row_format % (word, porter.stem(word), lancaster.stem(word)))

In [None]:
import nltk
# Tokenizer and POS-tagger models used by word_tokenize / pos_tag below.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
# WordNetLemmatizer needs the WordNet corpus; without it the first call to
# lemmatize() raises LookupError. 'omw-1.4' is additionally required by some
# NLTK releases when loading WordNet.
nltk.download('wordnet')
nltk.download('omw-1.4')

In [None]:
# POS tagging

from nltk import word_tokenize, pos_tag

# Demo: tokenize a sample sentence and display its (token, tag) pairs.
txt = "Remember when you were young, you shone like the sun Shine on you crazy diamond"
pos_tag(word_tokenize(txt))

In [None]:
from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()

def lemmatize_tag(text):
    """Lemmatize every token of ``text`` using its part-of-speech tag.

    The text is tokenized and POS-tagged; the lower-cased first letter of each
    tag selects the lemmatizer's ``pos`` argument ('j' -> 'a', 'n' -> 'n',
    'v' -> 'v'). Tokens with any other tag are lemmatized with the
    lemmatizer's default part of speech.

    Returns the lemmas joined by single spaces.
    """
    # First letter of the tag -> pos code handed to the lemmatizer.
    tag_initial_to_pos = {'j': 'a', 'n': 'n', 'v': 'v'}

    def _lemma(token, tag):
        pos = tag_initial_to_pos.get(tag[0].lower())
        if pos is None:
            return wnl.lemmatize(token)
        return wnl.lemmatize(token, pos)

    tagged_tokens = pos_tag(word_tokenize(text))
    return ' '.join(_lemma(token, tag) for token, tag in tagged_tokens)



In [None]:
# Lemmatize every cleaned tweet. This rebinds `data_clean` to its own
# transformed copy, so re-running the cell lemmatizes already-lemmatized text.
data_clean = pd.DataFrame(data_clean.tweet.apply(lemmatize_tag))

In [None]:
# Persist the cleaned tweets (index included); the next section reloads this file.
data_clean.to_csv('vma_clean.csv')

# Data Representation

In [None]:
import pandas as pd

# Reload the cleaned tweets; reset_index() moves the saved index into a
# regular column and gives the frame a fresh 0..n-1 index.
data_clean = pd.read_csv("vma_clean.csv",index_col=0)
data_clean = data_clean.reset_index()
# Tweets that were cleaned down to an empty string are written as empty CSV
# fields and come back as NaN. The vectorizers below reject NaN documents and
# ' '.join() rejects floats, so restore them to empty strings.
data_clean['tweet'] = data_clean['tweet'].fillna('')
data_clean

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Interactive reference only: prints the CountVectorizer documentation.
help(CountVectorizer)

In [None]:
# We are going to create a document-term matrix using CountVectorizer, and exclude common English stop words
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(stop_words='english')
data_cv = cv.fit_transform(data_clean.tweet)
# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back to the old method only on releases that predate it.
if hasattr(cv, 'get_feature_names_out'):
    dtm_columns = cv.get_feature_names_out()
else:
    dtm_columns = cv.get_feature_names()
# One row per tweet, one column per vocabulary term, cells hold term counts.
data_dtm = pd.DataFrame(data_cv.toarray(), columns=dtm_columns)
data_dtm.index = data_clean.index
data_dtm

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
tfidf = vectorizer.fit_transform(data_clean.tweet)
# Vocabulary size. `vocabulary_` maps each term to its column index, so its
# length equals the number of features on every scikit-learn release
# (get_feature_names() itself was removed in 1.2).
len(vectorizer.vocabulary_)

In [None]:
# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back to the old method only on releases that predate it.
if hasattr(vectorizer, 'get_feature_names_out'):
    tfidf_columns = vectorizer.get_feature_names_out()
else:
    tfidf_columns = vectorizer.get_feature_names()
# One row per tweet, one column per term, cells hold TF-IDF weights.
data_tfidf = pd.DataFrame(tfidf.toarray(), columns=tfidf_columns)
data_tfidf.index = data_clean.index
data_tfidf

# Most Common Words

In [None]:
import sys
#!conda install --yes --prefix {sys.prefix} wordcloud

In [None]:
# For every document (one column of the transposed matrix), keep its 30
# highest-count terms as (term, count) pairs.

data = data_dtm.transpose()


def _top_terms(term_counts, how_many=30):
    """Return the ``how_many`` largest entries of a Series as (label, value) pairs."""
    ranked = term_counts.sort_values(ascending=False).head(how_many)
    return list(zip(ranked.index, ranked.values))


top_dict = {column: _top_terms(data[column]) for column in data.columns}

top_dict

In [None]:
# Look at the most common top words 
from collections import Counter

# Flatten the per-document top-30 lists into one list of words (counts dropped).
words = [
    word
    for column in data.columns
    for (word, count) in top_dict[column]
]

In [None]:
# How many top-30 lists each word appears in, most frequent first.
Counter(words).most_common()

In [None]:
# Words that occur more than 8 times across the collected top-word lists are
# treated as additional stop words.
top_word_frequencies = Counter(words)
add_stop_words = [
    term
    for term, occurrences in top_word_frequencies.most_common()
    if occurrences > 8
]
add_stop_words

In [None]:
# Let's update our document-term matrix with the new list of stop words
from sklearn.feature_extraction import text 
from sklearn.feature_extraction.text import CountVectorizer

# Add new stop words: scikit-learn's built-in English list plus the frequent
# words found above.
stop_words = text.ENGLISH_STOP_WORDS.union(add_stop_words)

# Recreate document-term matrix.
# Current scikit-learn validates `stop_words` and accepts only 'english', a
# list or None, so the frozenset produced by union() is converted first.
cv = CountVectorizer(stop_words=list(stop_words))
data_cv = cv.fit_transform(data_clean.tweet)
# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back to the old method only on releases that predate it.
if hasattr(cv, 'get_feature_names_out'):
    stop_columns = cv.get_feature_names_out()
else:
    stop_columns = cv.get_feature_names()
data_stop = pd.DataFrame(data_cv.toarray(), columns=stop_columns)
data_stop.index = data_clean.index

In [None]:
# Plain-list copy of the combined stop words, passed to WordCloud below.
stop_words2 = list(stop_words)

#stop_words2.extend(['la','li','ooh','bird','number'])

In [None]:
# Let's make some word clouds!
# Terminal / Anaconda Prompt: conda install -c conda-forge wordcloud
import sys
#!conda install --yes --prefix {sys.prefix} wordcloud

from wordcloud import WordCloud

# Configured generator: custom stop words, white background, fixed random_state.
# NOTE(review): `wc` is not referenced again in the visible cells -- the plot
# at the end builds a separate WordCloud. Confirm which one is intended.
wc = WordCloud(collocations=False,stopwords=stop_words2, background_color="white", colormap="Dark2",
               max_font_size=150, random_state=42)

In [None]:
# Display the cleaned tweets before the topic-specific terms are stripped.
data_clean

In [None]:
# Topic-specific words and phrases to strip from the tweets before plotting.
tag = ['vma','dont','freedman','mtv vma', 'video', 'mtv','vma performance', 'fba', 'look', 'video', 'dont', 'think', 'britney', 'grammy', 'know', 'still', 'perform', 'need', 'one', 'go', 'vmas', 'day', 'even', 'year', 'say', 'time', 'pokemongoapp']
# Regex alternation takes the first alternative that matches, so longer
# phrases must come first; otherwise 'vma' wins before 'vma performance' is
# tried. dict.fromkeys() drops the duplicate entries while keeping order.
ordered_tags = sorted(dict.fromkeys(tag), key=len, reverse=True)
pat = r'\b(?:{})\b'.format('|'.join(ordered_tags))
# regex=True is required: pandas >= 2.0 treats the pattern as a literal
# string by default, which would leave every tweet unchanged.
data_clean['tweet'] = data_clean['tweet'].str.replace(pat, ' ', regex=True)

In [None]:
import matplotlib.pyplot as plt

text2 = data_clean
# Concatenate every tweet into one corpus string and build the cloud from it.
corpus = ' '.join(text2['tweet'])
wordcloud2 = WordCloud(background_color="white").generate(corpus)
# Render the cloud on an explicit axes with the axis frame hidden.
figure, axes = plt.subplots()
axes.imshow(wordcloud2)
axes.axis("off")
plt.show()