In [383]:
# Requires pymongo 3.6.0+
from pymongo import MongoClient
from bson.code import Code
import pandas as pd

# Connection settings, kept as named constants so they are easy to spot and change.
MONGO_URI = "mongodb://bigdata-mongodb-04.virtual.uniandes.edu.co:8087/"
DATABASE_NAME = "Grupo03"
COLLECTION_NAME = "COL_dataset"

# Retryable writes are explicitly turned off for this client.
client = MongoClient(MONGO_URI, retryWrites=False)
database = client[DATABASE_NAME]
collection = database[COLLECTION_NAME]

In [384]:
# Created with Studio 3T, the IDE for MongoDB - https://studio3t.com/

# Match every document, but only ask for the `reply_or_quote` field.
query = {}
projection = {"reply_or_quote": 1.0}

# Each row collected in `data` is [_id, reply_or_quote].
data = []
cursor = collection.find(query, projection=projection)
try:
    # extend() appends rows as the cursor is consumed, so rows read before a
    # failure are kept, exactly as with an explicit append loop.
    data.extend([doc['_id'], doc['reply_or_quote']] for doc in cursor)
finally:
    # The client is closed whether or not the iteration succeeded.
    client.close()

In [385]:
# Tabulate the fetched rows; the second column holds the raw tweet text.
column_names = ['_id', 'text']
df = pd.DataFrame(data=data, columns=column_names)
df.head()

Unnamed: 0,_id,text
0,5e95c25095e4b2cefb66fbf2,@ClaudiaLopez @infopresidencia @Bogota Para cu...
1,5e95c25095e4b2cefb66fbf3,@ClaudiaLopez @infopresidencia @Bogota La verd...
2,5e95c25095e4b2cefb66fbf4,"@ClaudiaLopez @infopresidencia @Bogota Pero, x..."
3,5e95c25095e4b2cefb66fbf5,@ClaudiaLopez @infopresidencia @Bogota Ingreso...
4,5e95c25095e4b2cefb66fbf6,@ClaudiaLopez @infopresidencia @Bogota Estimad...


In [386]:
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import re

In [387]:
def remove_links(tweet):
    '''Takes a string and removes web links from it.

    Removes anything starting with ``http`` up to the next whitespace,
    ``bit.ly/...`` short links, and every literal ``[link]`` placeholder.

    Args:
        tweet: the text to clean.

    Returns:
        The text with links removed. Surrounding whitespace is left untouched.
    '''
    tweet = re.sub(r'http\S+', '', tweet)  # remove http links
    # The dot is escaped so only a literal "bit.ly/" is matched.
    tweet = re.sub(r'bit\.ly/\S+', '', tweet)  # remove bitly links
    # str.strip('[link]') would strip the *characters* [, l, i, n, k, ] from both
    # ends of the text (eating real letters); replace() removes the token itself.
    tweet = tweet.replace('[link]', '')  # remove [link] placeholders
    return tweet

def remove_users(tweet):
    '''Takes a string and removes retweet and @user information.

    A handle is matched as ``@`` followed by one or more letters, digits or
    underscores, so one-character handles and handles starting with a digit or
    underscore are removed too.

    Args:
        tweet: the text to clean.

    Returns:
        The text without ``RT @user`` markers (including an optional trailing
        colon) and without ``@user`` mentions.
    '''
    # The previous character class "[A-Za-z0-9-_]" contained the accidental range
    # "9-_", which also matched characters such as ":", "<", "?" and "@" and so
    # swallowed text that followed a handle. The colon of "RT @user:" is now
    # handled explicitly instead.
    tweet = re.sub(r'\bRT\s@[A-Za-z0-9_]+:?', '', tweet)  # remove retweet
    tweet = re.sub(r'@[A-Za-z0-9_]+', '', tweet)  # remove tweeted at
    return tweet

In [388]:
# Spanish stopword list from NLTK, extended below with extra filler words.
my_stopwords = nltk.corpus.stopwords.words('spanish')
# NOTE: this rebinds the name `stopwords`, shadowing the `nltk.corpus.stopwords`
# object imported earlier; that is why the line above uses the fully qualified path.
stopwords = ['d', 'x', 'pa', 'q', 'si', 'usted', 'tan', 'solo', 'ser', 'bien', 'así', 'mas', 'va', 'van', 'señor', 'hace', 'hacer', 
             'siempre', 'gracias', 'favor', 'puede', 'dio', 'como', 'aquí', 'ahí']
my_stopwords.extend(stopwords)
# The corpus is Spanish, so use the Spanish Snowball stemmer. The Porter stemmer
# used before implements English suffix rules and does not reduce Spanish words
# to meaningful roots.
word_rooter = nltk.stem.snowball.SnowballStemmer('spanish', ignore_stopwords=False).stem
# Characters treated as punctuation. '#' is intentionally absent so hashtags
# survive the punctuation-stripping step.
my_punctuation = '!"$%&\'()*+,-./:;<=>?[\\]^_`{|}~•@'

# cleaning master function
def clean_tweet(tweet, bigrams=False):
    """Normalise a tweet into a space-separated string of stemmed tokens.

    Steps, in order: drop @user/RT markers and links, lower-case, replace
    punctuation with spaces, delete digits, drop stopwords, stem every token
    that does not contain '#', and optionally append bigrams.

    Args:
        tweet: raw tweet text.
        bigrams: when True, append ``tokenA_tokenB`` for every pair of
            adjacent tokens after the single tokens.

    Returns:
        The cleaned tokens joined by single spaces (may be an empty string).
    """
    tweet = remove_users(tweet)
    tweet = remove_links(tweet)
    tweet = tweet.lower()  # lower case
    # re.escape keeps every punctuation character literal inside the class.
    tweet = re.sub('[' + re.escape(my_punctuation) + ']+', ' ', tweet)  # strip punctuation
    tweet = re.sub(r'[0-9]+', '', tweet)  # remove numbers
    # split() without arguments collapses any run of whitespace and never yields
    # empty tokens. Splitting on a single ' ' after the digits were deleted left
    # '' tokens behind, which produced double spaces in the output and bogus
    # bigrams such as 'word_' and '_word'.
    tokens = [word for word in tweet.split()
              if word not in my_stopwords]  # remove stopwords

    # Hashtags are kept verbatim; everything else is stemmed.
    tokens = [word if '#' in word else word_rooter(word)
              for word in tokens]  # apply word rooter
    if bigrams:
        tokens = tokens + [first + '_' + second
                           for first, second in zip(tokens, tokens[1:])]
    return ' '.join(tokens)

In [389]:
# Run the cleaning pipeline over every raw tweet and store the result alongside it.
df['clean_tweet'] = df['text'].map(clean_tweet)

In [390]:
from sklearn.feature_extraction.text import CountVectorizer

# the vectorizer object will be used to transform text to vector form
# max_df=0.9 drops terms present in more than 90% of the documents and
# min_df=25 drops terms present in fewer than 25 documents.
# The token pattern is a raw string: the regex is unchanged, but it no longer
# relies on Python passing unknown escape sequences such as '\w' through.
vectorizer = CountVectorizer(max_df=0.9, min_df=25, token_pattern=r'\w+|\$[\d\.]+|\S+')

# apply transformation
# NOTE: .toarray() builds a dense matrix; it is kept so `tf` stays a NumPy array.
tf = vectorizer.fit_transform(df['clean_tweet']).toarray()

# tf_feature_names tells us what word each column in the matrix represents
# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# when it exists and fall back to the old method on older installations.
if hasattr(vectorizer, 'get_feature_names_out'):
    tf_feature_names = vectorizer.get_feature_names_out().tolist()
else:
    tf_feature_names = vectorizer.get_feature_names()

In [391]:
from sklearn.decomposition import LatentDirichletAllocation

# Number of topics the LDA model is asked to find.
number_of_topics = 5

# random_state is pinned so the fit uses a fixed seed.
model = LatentDirichletAllocation(n_components=number_of_topics, random_state=0)

In [392]:
# Fit the topic model on the document-term count matrix `tf`. As the last
# expression of the cell, the fitted estimator's repr is displayed.
model.fit(tf)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=5, n_jobs=None,
                          perp_tol=0.1, random_state=0, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

In [393]:
def display_topics(model, feature_names, no_top_words):
    """Tabulate the highest-weighted terms of every topic.

    Args:
        model: object exposing ``components_``, one weight array per topic.
        feature_names: sequence mapping a column index to its term.
        no_top_words: how many top terms to keep per topic.

    Returns:
        A DataFrame with two columns per topic, "Topic N words" and
        "Topic N weights", ordered from the highest weight down. Weights are
        strings formatted with one decimal place.
    """
    columns = {}
    for topic_number, weights in enumerate(model.components_):
        # Indices sorted by weight, largest first, truncated to the top terms.
        ranked = weights.argsort()[::-1][:no_top_words]
        columns["Topic %d words" % (topic_number)] = [
            '{}'.format(feature_names[k]) for k in ranked
        ]
        columns["Topic %d weights" % (topic_number)] = [
            '{:.1f}'.format(weights[k]) for k in ranked
        ]
    return pd.DataFrame(columns)

In [394]:
# Keep the ten highest-weighted terms of every topic.
no_top_words = 10
df_topics = display_topics(
    model=model,
    feature_names=tf_feature_names,
    no_top_words=no_top_words,
)

In [373]:
# Placeholder frame for the treemap data. NOTE(review): this cell's execution
# count (373) is out of sequence with its neighbours, so confirm the notebook
# still runs top to bottom on a fresh kernel.
nuevo_df = pd.DataFrame()

In [395]:
# Flatten the per-topic columns of df_topics into three parallel lists:
# the term, its weight, and the label of the topic it belongs to.
valores = []
pesos = []
temas = []
# df_topics holds two columns per topic ("words" and "weights"), so the number
# of topics is derived from the frame instead of being hard-coded to 5.
for i in range(len(df_topics.columns) // 2):
    topic_words = df_topics['Topic {} words'.format(i)].values.tolist()
    topic_weights = df_topics['Topic {} weights'.format(i)].values.tolist()
    valores += topic_words
    pesos += topic_weights
    # One topic label per weight, so the three lists stay aligned.
    temas += ['tema{}'.format(i)] * len(topic_weights)

In [396]:
# Build the frame in one step. Assigning columns onto a pre-existing frame fails
# with a length mismatch when this cell is re-run after the number of terms or
# topics has changed; constructing it fresh makes the cell safe to re-run.
nuevo_df = pd.DataFrame({'valores': valores, 'pesos': pesos, 'temas': temas})

In [397]:
# `px` was used without being imported anywhere in the notebook, so this cell
# raised NameError on a fresh kernel.
import plotly.express as px

# Treemap with topics as the outer level, terms nested inside, sized by weight.
fig = px.treemap(nuevo_df, path=['temas', 'valores'], values='pesos')
fig.show()