In [14]:
from gensim import corpora, models
import pandas as pd
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from nltk import bigrams, trigrams
import numpy as np
from collections import Counter, defaultdict
import string
import re
#from stop_words import get_stop_words
from nltk.stem.porter import PorterStemmer

In [15]:
# Load the five scraped-tweet CSV batches; each file becomes its own DataFrame (t1..t5).
tweet_files = [
    "scraped_tweets1.csv",
    "scraped_tweets2.csv",
    "scraped_tweets3.csv",
    "scraped_tweets4.csv",
    "scraped_tweets5.csv",
]
t1, t2, t3, t4, t5 = [pd.read_csv(csv_path) for csv_path in tweet_files]

In [16]:
# Stack the five batches row-wise into one frame.
# Each batch keeps its own row labels here, so labels can repeat across batches.
frames = [t1, t2, t3, t4, t5]
tweets = pd.concat(objs=frames)

In [17]:
# Keep tweets whose language tag is 'en' or 'und'; every other language is dropped.
# The surviving rows are renumbered 0..n-1 and the old row labels are discarded.
is_en_or_und = tweets['language'].isin(['en', 'und'])
tweets_eng = tweets[is_en_or_und].reset_index(drop=True)

In [18]:
# Keep only the accounts (distinct 'query' values) that have at least 100 tweets.
tweets_eng['count'] = 1  # helper column: one per tweet, summed per account below
tweet_count_grouped = tweets_eng.groupby('query')
tweet_count = tweet_count_grouped['count'].agg([np.sum])  # single column named 'sum'
tweet_count['keep'] = tweet_count['sum'].ge(100)
tweet_keep = tweet_count[tweet_count['keep']]
users_tweet_keep = tweet_keep.index.tolist()
has_enough_tweets = tweets_eng['query'].isin(users_tweet_keep)
tweets_eng_keep = tweets_eng[has_enough_tweets]

#len(tweets_eng_keep), len(tweets_eng)

In [19]:
# Drop potential bots: accounts whose mean friends count is more than 3x their
# mean followers count. Means are taken over all of an account's tweets.
MAX_FRIENDS_PER_FOLLOWER = 3

friend_count_grouped = tweets_eng_keep.groupby('query')
followers_count = friend_count_grouped['num_followers'].agg(['mean'])
friends_count = friend_count_grouped['num_friends'].agg(['mean'])

# Friends-to-followers ratio, kept as a column for inspection. Both frames come
# from the same groupby, so they share the 'query' index and divide per account.
# (Previously the friends mean was divided by itself, giving a ratio of 1 -- or
# NaN -- for every account, so the followers count never entered the filter.)
# The ratio is inf when followers == 0 < friends and NaN when both are 0.
friends_count['ratio'] = friends_count['mean'] / followers_count['mean']

# Decide with a multiplication rather than the ratio so that zero-follower
# accounts are handled explicitly: 0 friends / 0 followers is kept, while any
# friends with 0 followers is dropped. Accounts with a NaN mean compare False
# and are therefore dropped.
friends_count['keep'] = (
    friends_count['mean'] <= MAX_FRIENDS_PER_FOLLOWER * followers_count['mean']
)

ratio_keep = friends_count[friends_count['keep']]
users_ratio_keep = list(ratio_keep.index)
tweets_trimmed = tweets_eng_keep[tweets_eng_keep['query'].isin(users_ratio_keep)]
# Renumber rows 0..n-1 after filtering.
tweets_trimmed = tweets_trimmed.reset_index(drop=True)

In [20]:
# Replace missing hashtag / URL entity values with 0 so later code sees no NaN there.
for entity_col in ('entities_hashtags', 'entities_urls'):
    tweets_trimmed[entity_col] = tweets_trimmed[entity_col].fillna(0)

In [21]:
# Work on two accounts only: select each account's rows, stack them
# (first account's rows first), and renumber the rows 0..n-1.
query_col = tweets_trimmed['query']
tweets_subset1 = tweets_trimmed[query_col == "ForeverMoreVids"]
tweets_subset2 = tweets_trimmed[query_col == "brianjdixon"]
subsets = [tweets_subset1, tweets_subset2]
tweets_subset = pd.concat(subsets).reset_index(drop=True)

In [22]:
# Token filters: punctuation characters plus regexes for URLs, hashtags,
# @mentions, numbers and emoticons.
blank = ["", " "]
punctuation = set(string.punctuation)

# Pattern sources in the order they are stored in `remove_regex`.
_pattern_sources = (
    # url -- must start the token
    r'^http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+',
    # hashtag -- must start the token; needs at least two word characters after the '#'
    r'^(?:\#+[\w_]+[\w\'_\-]*[\w_]+)',
    # @mention -- must start the token
    r'^(?:@[\w_]+)',
    # number -- any digit run, optionally followed by further groups (includes phone numbers)
    r'(\d+)\D*(\d*)\D*(\d*)\D*(\d*)',
    # emoticon -- eyes, optional nose, mouth
    r'[:=;\|\)\(\[\]\{\}][oO\-]?[D\)\]\(\[/\\OpP\|\{\}:]',
)
remove_regex = [re.compile(source) for source in _pattern_sources]
url, hashtags, mentions, numbers, smiley = remove_regex

In [23]:
# Porter stemmer applied to every kept token.
p_stemmer = PorterStemmer()
# Tweet-aware tokenizer configured to strip @handles and shorten
# over-long repeated-character runs.
tknzr = TweetTokenizer(reduce_len=True, strip_handles=True)
# English stop words (lower-case, contractions included), one row per initial letter.
# Tokens found in this list are dropped before stemming.
en_stop = [
    u'a', u'about', u'above', u'after', u'again', u'against', u'all', u'am', u'an', u'and', u'any', u'are', u"aren't", u'as', u'at',
    u'be', u'because', u'been', u'before', u'being', u'below', u'between', u'both', u'but', u'by',
    u"can't", u'cannot', u'could', u"couldn't",
    u'did', u"didn't", u'do', u'does', u"doesn't", u'doing', u"don't", u'down', u'during',
    u'each',
    u'few', u'for', u'from', u'further',
    u'had', u"hadn't", u'has', u"hasn't", u'have', u"haven't", u'having', u'he', u"he'd", u"he'll", u"he's",
    u'her', u'here', u"here's", u'hers', u'herself', u'him', u'himself', u'his', u'how', u"how's",
    u'i', u"i'd", u"i'll", u"i'm", u"i've", u'if', u'in', u'into', u'is', u"isn't", u'it', u"it's", u'its', u'itself',
    u"let's",
    u'me', u'more', u'most', u"mustn't", u'my', u'myself',
    u'no', u'nor', u'not',
    u'of', u'off', u'on', u'once', u'only', u'or', u'other', u'ought', u'our', u'ours', u'ourselves', u'out', u'over', u'own',
    u'same', u"shan't", u'she', u"she'd", u"she'll", u"she's", u'should', u"shouldn't", u'so', u'some', u'such',
    u'than', u'that', u"that's", u'the', u'their', u'theirs', u'them', u'themselves', u'then', u'there', u"there's",
    u'these', u'they', u"they'd", u"they'll", u"they're", u"they've", u'this', u'those', u'through', u'to', u'too',
    u'under', u'until', u'up',
    u'very',
    u'was', u"wasn't", u'we', u"we'd", u"we'll", u"we're", u"we've", u'were', u"weren't",
    u'what', u"what's", u'when', u"when's", u'where', u"where's", u'which', u'while',
    u'who', u"who's", u'whom', u'why', u"why's", u'with', u"won't", u'would', u"wouldn't",
    u'you', u"you'd", u"you'll", u"you're", u"you've", u'your', u'yours', u'yourself', u'yourselves',
]

In [24]:
# Build one bag of stemmed unigrams per user.
#   terms_all[i] -- list of cleaned terms for users[i]
#   users[i]     -- the account name ('query' value)

_stop_set = set(en_stop)  # set membership instead of scanning the list for every token


def _tweet_terms(text):
    """Return the cleaned unigram terms of a single tweet.

    The tweet is lower-cased and tokenized with `tknzr`. A token is dropped when
    it is a punctuation character, a stop word, starts with 'www', or matches any
    pattern in `remove_regex`. Remaining tokens are Porter-stemmed and stripped of
    non-ASCII characters; tokens left empty by that stripping are discarded.
    """
    kept = []
    for term in tknzr.tokenize(text.lower()):
        if term in punctuation or term in _stop_set or term.startswith('www'):
            continue
        if any(rr.search(term) for rr in remove_regex):
            continue
        ascii_term = p_stemmer.stem(term).encode('ascii', 'ignore')
        # A token made only of non-ASCII characters (e.g. an emoji) encodes to an
        # empty string; do not feed that into the dictionary as a "word".
        if ascii_term.strip():
            kept.append(ascii_term)
    return kept


terms_all = []
users = []

# sort=False keeps users in order of first appearance, so document order (and the
# ids the gensim dictionary assigns) is the same on every run. The previous
# set()-based iteration had no defined order.
for user, user_sub in tweets_subset.groupby('query', sort=False):
    # Use a local name here: the previous version rebound the global `tweets`
    # DataFrame to a list, which broke re-running the earlier cells.
    # dropna() skips rows with missing content (a NaN has no .lower());
    # drop_duplicates() de-duplicates tweet texts while preserving order.
    user_tweets = user_sub['content'].dropna().drop_duplicates()

    # unigrams only
    user_terms = []
    for tweet in user_tweets:
        user_terms.extend(_tweet_terms(tweet))

    terms_all.append(user_terms)
    users.append(user)

In [55]:
#print terms_all

In [25]:
# Map every distinct term across all users' term lists to an integer id.
dictionary = corpora.Dictionary(documents=terms_all)

In [26]:
# Convert each user's term list into bag-of-words form; one entry per user,
# in the same order as `terms_all`.
corpus = list(map(dictionary.doc2bow, terms_all))

In [54]:
#print(dictionary.token2id)

In [53]:
#print corpus

In [31]:
# Fit an LDA model on the per-user bag-of-words corpus (one document per user).
# LDA training is stochastic; a fixed seed makes the fitted model reproducible
# under Restart & Run All.
LDA_RANDOM_SEED = 42
ldamodel = models.ldamodel.LdaModel(
    corpus,
    num_topics=1,
    id2word=dictionary,
    passes=100,
    random_state=LDA_RANDOM_SEED,
)

In [35]:
# Persist the fitted LDA model; the path is relative to the current working directory.
model_path = 'savedModel'
ldamodel.save(model_path)